# Task 2

## Similar to Task 1, but emphasizing the use of method chaining, pipes, and lambda functions


Imports:

In [1]:
import pandas as pd

## 2.1 Load the dataset

Load the dataset:

In [2]:
# Read the raw table: the file is decoded as Latin-1 and '..' entries are read as NaN.
df = pd.read_csv(
    "JO0104D3_20250411-135149.csv",
    na_values=['..'],
    encoding="latin-1",
)
df.head()

Unnamed: 0,region,crop,1981,1985,1989,1990,1991,1992,1993,1994,1995,1999,2001,2002,2003,2005
0,0114 Upplands Väsby,total arable land,1715.0,1652.0,1614.0,1478.0,1419.0,1311.0,1288.0,1434.0,1398.0,1380.0,1310.0,1286.0,1281.0,1353.0
1,0114 Upplands Väsby,winter wheat,80.0,40.0,477.0,520.0,180.0,213.0,195.0,194.0,230.0,291.0,393.0,330.0,433.0,408.0
2,0114 Upplands Väsby,spring wheat,212.0,208.0,,,7.0,,,,12.0,28.0,,,,7.0
3,0114 Upplands Väsby,rye,,5.0,82.0,72.0,26.0,,,,,,,,,
4,0114 Upplands Väsby,winter barley,,,,,,,6.0,6.0,,,,,,


## 2.2 Find top N crops per region per year, using pipes and lambda functions

We want to know which crops take up the most land in each region for each year:

In [3]:
# 'crop' labels that describe a land category rather than an actual crop
NON_CROP_LABELS = [
    'total arable land',
    'other untilled arable land',
    'unspecified arable land',
    'not utilized ley for hay and pasture',
]

# drop rows that are not really crops; the filter builds a new frame, so `df`
# is never mutated and re-running this cell always gives the same result
df_top_3 = df.loc[lambda x: ~x['crop'].isin(NON_CROP_LABELS)].copy()

# every column after 'region' and 'crop' is a year column
year_cols = df_top_3.columns[2:]

N_CROPS = 3

# NOTE: sum() skips NaN, so a crop with no recorded value for a year ends up
# with land == 0.0 and can still be picked when a region has fewer than
# N_CROPS recorded crops in that year.
top_crops_per_year_region = (
    df_top_3.pipe(lambda x: x.groupby(['region', 'crop'])[year_cols].sum())
       .pipe(lambda x: x.reset_index())
       .pipe(lambda x: x.melt(id_vars=['region', 'crop'], var_name='year', value_name='land')) # take the year columns and convert them into rows instead, with a new column 'land'
       .pipe(lambda x: x.sort_values(by=['region', 'year', 'land'], ascending=[True, True, False]))
       .pipe(lambda x: x.groupby(['region', 'year']).head(N_CROPS)) # for each combination of region and year, take top N crops
       .pipe(lambda x: x.reset_index(drop=True)) # renumber the rows without keeping the old index as a column
)

N_REGIONS = 1
# up to N_CROPS rows per (region, year); the number of years is taken from the
# data instead of being hard-coded
top_crops_per_year_region.head(N_CROPS * len(year_cols) * N_REGIONS)

Unnamed: 0,region,crop,year,land
0,0114 Upplands Väsby,barley,1981,500.0
1,0114 Upplands Väsby,utilized ley for hay and pasture,1981,229.0
2,0114 Upplands Väsby,oats,1981,225.0
3,0114 Upplands Väsby,barley,1985,586.0
4,0114 Upplands Väsby,spring wheat,1985,208.0
5,0114 Upplands Väsby,utilized ley for hay and pasture,1985,201.0
6,0114 Upplands Väsby,winter wheat,1989,477.0
7,0114 Upplands Väsby,utilized ley for hay and pasture,1989,275.0
8,0114 Upplands Väsby,barley,1989,264.0
9,0114 Upplands Väsby,winter wheat,1990,520.0


## 2.3 Filling missing values using different methods

### Approach 1: Interpolation

Interpolate missing values along each row, since each row's time series runs across the year columns.

In [4]:
# Linear interpolation along axis=1, i.e. across the year columns of each row.
row_wise_interpolated = df[year_cols].interpolate(method='linear', axis=1)

# Work on a copy so the original frame stays available for the other approaches.
df_approach_1 = df.copy()
df_approach_1[year_cols] = row_wise_interpolated

#### Approach 1.1: Interpolation with filling

Interpolation may not fill all NaN values, so complement with backward and forward fill

In [5]:
# Complement the interpolation: fill what it left missing from the right
# (backward fill) and then from the left (forward fill), row by row.
gap_filled_years = df_approach_1[year_cols].bfill(axis=1).ffill(axis=1)

df_approach_1_last = df_approach_1.copy()
df_approach_1_last[year_cols] = gap_filled_years

### Approach 2: Imputation with Mean

Fill NaN values in each year column with the mean of that column.

In [6]:
# One mean per year column; fillna matches each mean to its column by label.
year_means = df[year_cols].mean()

df_approach_2 = df.copy()
df_approach_2[year_cols] = df[year_cols].fillna(year_means)

### Approach 3: Fill with Zeroes

For this dataset, replacing NaN values with zeros is a realistic approach. The absence of a recorded value likely indicates that the crop measurement was truly zero rather than an error.

In [7]:
# Map every year column to the fill value 0; columns missing from the mapping
# ('region', 'crop') are left as they are. fillna returns a new frame.
df_approach_3 = df.fillna(dict.fromkeys(year_cols, 0))

## 2.4 Comparison summary after testing various approaches, using method chaining

Let's see how the different methods affected the number of NaN values. Method chaining helps a lot with readability.

In [8]:
# Summary column label -> frame produced by the corresponding approach.
frames_by_label = {
    'Original': df,
    'Interpolation': df_approach_1,
    'Interpolation filling': df_approach_1_last,
    'Mean Imputation': df_approach_2,
    'Filling with Zeros': df_approach_3,
}

# Count the NaN values per column of every frame and place the counts side by
# side; the mapping keys become the column names of the summary.
comparison_summary = pd.concat(
    {label: frame.isnull().sum() for label, frame in frames_by_label.items()},
    axis=1,
)

comparison_summary

Unnamed: 0,Original,Interpolation,Interpolation filling,Mean Imputation,Filling with Zeros
region,0,0,0,0,0
crop,0,0,0,0,0
1981,5427,5427,2198,0,0
1985,5369,5138,2198,0,0
1989,5396,5046,2198,0,0
1990,5407,4746,2198,0,0
1991,5486,4726,2198,0,0
1992,5509,4656,2198,0,0
1993,5122,3437,2198,0,0
1994,5169,3402,2198,0,0
