# Task 2

## Similar to Task 1, but emphasizing the use of method chaining and pipes


Imports:

In [1]:
import pandas as pd

## 2.1 Load the dataset

Load the dataset:

In [2]:
# The file is decoded as Latin-1, and cells containing ".." are parsed as NaN.
df = pd.read_csv(
    "JO0104D3_20250411-135149.csv",
    na_values=[".."],
    encoding="latin-1",
)
df.head(36)

Unnamed: 0,region,crop,1981,1985,1989,1990,1991,1992,1993,1994,1995,1999,2001,2002,2003,2005
0,0114 Upplands Väsby,total arable land,1715.0,1652.0,1614.0,1478.0,1419.0,1311.0,1288.0,1434.0,1398.0,1380.0,1310.0,1286.0,1281.0,1353.0
1,0114 Upplands Väsby,winter wheat,80.0,40.0,477.0,520.0,180.0,213.0,195.0,194.0,230.0,291.0,393.0,330.0,433.0,408.0
2,0114 Upplands Väsby,spring wheat,212.0,208.0,,,7.0,,,,12.0,28.0,,,,7.0
3,0114 Upplands Väsby,rye,,5.0,82.0,72.0,26.0,,,,,,,,,
4,0114 Upplands Väsby,winter barley,,,,,,,6.0,6.0,,,,,,
5,0114 Upplands Väsby,barley,500.0,586.0,264.0,213.0,328.0,106.0,,,,,,,,
6,0114 Upplands Väsby,spring barley,,,,,,,114.0,127.0,135.0,135.0,188.0,158.0,93.0,159.0
7,0114 Upplands Väsby,oats,225.0,199.0,142.0,194.0,111.0,92.0,70.0,175.0,85.0,103.0,104.0,79.0,47.0,
8,0114 Upplands Väsby,mixed grain and triticale,,,,7.0,,,,,,,,,,
9,0114 Upplands Väsby,ley for hay and forage plants,,,,,,,189.0,157.0,145.0,,,,,


## 2.2 Group by region and year to find top N crops per region per year, using pipes and lambda functions

We want to know which crops occupy the most land in each region in each year:

In [13]:
# Values of `crop` that describe land-use totals rather than actual crops.
NON_CROP_ROWS = [
    'total arable land',
    'other untilled arable land',
    'unspecified arable land',
]
ID_COLS = ['region', 'crop']
N_CROPS = 3

# Filter in a single, non-mutating step (no inplace=True) so the cell gives the
# same result every time it is re-run and `df` itself is never touched.
df_top_3 = df.loc[~df['crop'].isin(NON_CROP_ROWS)].copy()

# Every column that is not an identifier is a year column.
# `year_cols` is reused by the missing-value cells further down.
year_cols = df_top_3.columns.drop(ID_COLS)

# Pipeline: total land per (region, crop) -> long format (one row per
# region/crop/year) -> sort so the largest land values come first within each
# (region, year) -> keep the first N_CROPS rows of each (region, year) group.
#
# NOTE(review): sum() returns 0.0 for a group whose values are all NaN, so a
# crop with no recorded land can still show up in the top N with land == 0 when
# a region has fewer than N_CROPS crops with data — confirm this is acceptable.
top_crops_per_year_region = (
    df_top_3
    .pipe(lambda wide: wide.groupby(ID_COLS)[year_cols].sum())
    .pipe(lambda totals: totals.reset_index())
    .pipe(lambda totals: totals.melt(id_vars=ID_COLS, var_name='year', value_name='land'))
    .pipe(lambda long: long.sort_values(by=['region', 'year', 'land'], ascending=[True, True, False]))
    .pipe(lambda ranked: ranked.groupby(['region', 'year']).head(N_CROPS))
    # drop=True discards the old row labels instead of adding an "index" column
    # that would have to be removed again.
    .pipe(lambda top: top.reset_index(drop=True))
)

top_crops_per_year_region.head(50)

Unnamed: 0,region,crop,year,land
0,0114 Upplands Väsby,barley,1981,500.0
1,0114 Upplands Väsby,utilized ley for hay and pasture,1981,229.0
2,0114 Upplands Väsby,oats,1981,225.0
3,0114 Upplands Väsby,barley,1985,586.0
4,0114 Upplands Väsby,spring wheat,1985,208.0
5,0114 Upplands Väsby,utilized ley for hay and pasture,1985,201.0
6,0114 Upplands Väsby,winter wheat,1989,477.0
7,0114 Upplands Väsby,utilized ley for hay and pasture,1989,275.0
8,0114 Upplands Väsby,barley,1989,264.0
9,0114 Upplands Väsby,winter wheat,1990,520.0


## 2.3 Filling missing values using different methods

### 2.3.1 Approach 1: Interpolation

In [None]:
df_approach_1 = df.copy()

# Interpolate each row along the year axis. The year columns are not evenly
# spaced (1981, 1985, 1989, 1990, ...), and method='linear' ignores the labels
# and treats neighbouring columns as equidistant. Interpolating against the
# numeric year (method='index') weights each gap by its real length in years.
areas_by_year = df_approach_1[year_cols].T              # years become the row index
areas_by_year.index = areas_by_year.index.astype(int)   # '1981' -> 1981
areas_filled = areas_by_year.interpolate(method='index').T
areas_filled.columns = year_cols                        # restore the original column labels

# NOTE(review): with pandas' default limit_direction, gaps before a row's first
# observation stay NaN, while gaps after its last observation are filled with
# the last observed value — confirm that this trailing fill is intended.
df_approach_1[year_cols] = areas_filled

### 2.3.2 Approach 2: Imputation with Mean

In [None]:
# Mean imputation: gaps in each year column are filled with that column's mean.
# fillna() with a Series only touches the columns named in the Series index,
# so the non-year columns are left as they are.
df_approach_2 = df.pipe(
    lambda frame: frame.fillna(frame[year_cols].mean())
)

### 2.3.3 Approach 3: Fill with Zeroes

For this dataset, replacing NaN values with zeros is a realistic approach. The absence of a recorded value likely indicates that the crop measurement was truly zero rather than an error.

In [None]:
# Zero fill: NaN in the year columns becomes 0; other columns are untouched
# because only the year columns are listed in the fill mapping.
df_approach_3 = df.pipe(
    lambda frame: frame.fillna(dict.fromkeys(year_cols, 0))
)

## 2.4 Comparison summary after applying each method, using method chaining

In [None]:
# Column label -> frame whose missing values are counted under that label.
frames_by_label = {
    'Way-Before': df,
    'Interpolation': df_approach_1,
    'Mean Imputation': df_approach_2,
    'Filling with Zeros': df_approach_3,
}

# One column of per-column NaN counts for each frame, placed side by side.
comparison_summary = pd.concat(
    [
        frame.isna().sum().to_frame(name=label)
        for label, frame in frames_by_label.items()
    ],
    axis=1,
)

comparison_summary