In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.neighbors import KNeighborsRegressor

from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.preprocessing import MinMaxScaler

In [2]:
df = pd.read_csv('../data/df_prepped.csv')
df.head()

Unnamed: 0,Year,Countries,Sand_1,Sand_2,Sand_3,Sand_4,Sand_5,Sand_6,Sand_7,Clay_1,...,spi_12mon_10,spi_12mon_11,spi_12mon_12,Y_maize_major,Farm,Sow_Maize_month_int,Harvest_Maize_month_int,maize_lag-1,maize_lag-2,maize_lag-3
0,2007,Angola,50,51,51,48,45,46,46,37,...,0.994139,1.051325,0.891661,0.615357,104_Angola,9,4,0.554392,0.721607,0.620005
1,2007,Angola,62,64,63,59,58,59,59,27,...,0.269691,-0.364644,-0.698008,0.257656,99_Angola,9,4,0.117051,0.300217,0.212699
2,2007,Angola,69,71,70,67,65,65,66,19,...,-0.574688,-1.053915,-1.099187,4.286831,108_Angola,9,4,3.093239,4.044452,2.295351
3,2007,Angola,60,63,61,57,53,53,53,29,...,0.433416,0.436678,0.3009,0.700384,102_Angola,9,4,0.677797,0.907431,0.783018
4,2007,Angola,67,69,68,63,61,61,61,22,...,1.139544,0.957729,0.12511,0.55345,43_Angola,9,4,0.412071,0.675967,0.605584


In [3]:
df.Countries.nunique()

30

In [4]:
df.Farm.nunique()

3887

One-hot for categorical variables

In [5]:
df_onehot = pd.get_dummies(df, columns=['Countries', 'Farm'], prefix=['Country', 'Farm'])
df_onehot

Unnamed: 0,Year,Sand_1,Sand_2,Sand_3,Sand_4,Sand_5,Sand_6,Sand_7,Clay_1,Clay_2,...,Farm_992_Democratic Republic of the Congo,Farm_993_Democratic Republic of the Congo,Farm_994_Democratic Republic of the Congo,Farm_995_Democratic Republic of the Congo,Farm_996_Democratic Republic of the Congo,Farm_997_Democratic Republic of the Congo,Farm_998_Democratic Republic of the Congo,Farm_999_Democratic Republic of the Congo,Farm_99_Angola,Farm_9_Angola
0,2007,50,51,51,48,45,46,46,37,35,...,False,False,False,False,False,False,False,False,False,False
1,2007,62,64,63,59,58,59,59,27,25,...,False,False,False,False,False,False,False,False,True,False
2,2007,69,71,70,67,65,65,66,19,16,...,False,False,False,False,False,False,False,False,False,False
3,2007,60,63,61,57,53,53,53,29,26,...,False,False,False,False,False,False,False,False,False,False
4,2007,67,69,68,63,61,61,61,22,19,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32354,2016,73,75,74,69,66,66,66,20,18,...,False,False,False,False,False,False,False,False,False,False
32355,2016,57,58,58,54,52,51,52,28,26,...,False,False,False,False,False,False,False,False,False,False
32356,2016,55,57,56,54,52,51,51,30,28,...,False,False,False,False,False,False,False,False,False,False
32357,2016,70,72,71,67,63,63,62,17,14,...,False,False,False,False,False,False,False,False,False,False


In [6]:
df_onehot.shape

(32359, 4000)

## Train-test split

In [7]:
# Separate a test set, the year 2016
test = df_onehot[df_onehot.Year == 2016]
df_onehot_train = df_onehot[df_onehot.Year != 2016]

## Cross-validation setup

In [8]:
# https://stackoverflow.com/questions/58069691/how-to-create-a-train-test-split-of-time-series-data-by-year

year_list = df_onehot_train['Year'].unique().tolist()
splits = {'train': [], 'val': []}

for idx, yr in enumerate(year_list[:-1]):
    if yr < 2010:
        # To get only the last 5 splits
        continue
    train_yr = year_list[:idx+1]
    test_yr = [year_list[idx+1]]
    print('TRAIN: ', train_yr, 'VAL: ',test_yr)
    
    splits['train'].append(df_onehot_train.loc[df_onehot_train.Year.isin(train_yr), :])
    splits['val'].append(df_onehot_train.loc[df_onehot_train.Year.isin(test_yr), :])

TRAIN:  [2007, 2008, 2009, 2010] VAL:  [2011]
TRAIN:  [2007, 2008, 2009, 2010, 2011] VAL:  [2012]
TRAIN:  [2007, 2008, 2009, 2010, 2011, 2012] VAL:  [2013]
TRAIN:  [2007, 2008, 2009, 2010, 2011, 2012, 2013] VAL:  [2014]
TRAIN:  [2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014] VAL:  [2015]


Baseline selection

In [9]:
SEED = 1

In [10]:
regressors = {
    "KNN": KNeighborsRegressor(n_neighbors=5), # 1
    "Random Forest": RandomForestRegressor(n_estimators=50, random_state=SEED), # 2
    "Gradient Boosting": AdaBoostRegressor(n_estimators=50, random_state=SEED) # 3
}

In [11]:
# This will take about 50 minutes

reg_names = []
cv_MAE = []
cv_RMSE = []
cv_folds = []

i = 1
for train, val in zip(splits['train'], splits['val']):

    print('Fold: ', i)
    i += 1

    # Shuffle
    train = train.sample(frac=1, random_state=SEED)
    val = val.sample(frac=1, random_state=SEED)
    
    # X and y
    X_train = train.drop(columns=['Y_maize_major','Year'], axis=1)
    y_train = train['Y_maize_major']
    X_val = val.drop(columns=['Y_maize_major','Year'], axis=1)
    y_val = val['Y_maize_major']

    # Scale to [0,1] range
    sc = MinMaxScaler()
    X_train = pd.DataFrame(sc.fit_transform(X_train), columns=X_train.columns)
    X_val = pd.DataFrame(sc.transform(X_val), columns=X_val.columns)

    # Fit and predict
    for reg_name, reg in regressors.items():
        reg_names.append(reg_name)
        reg.fit(X_train, y_train)
        y_val_pred = reg.predict(X_val)
        rmse = mean_squared_error(y_val, y_val_pred, squared=False)
        mae = mean_absolute_error(y_val, y_val_pred)
        cv_MAE.append(mae)
        cv_RMSE.append(rmse)
        cv_folds.append(i)

        print(reg_name)
        print('RMSE:', rmse)
        print('MAE:', mae)

    print('###')
    print()

Fold:  1
KNN
RMSE: 0.3819310056677642
MAE: 0.24173904248839592
Random Forest
RMSE: 0.39642733142417036
MAE: 0.2423667388154175
Gradient Boosting
RMSE: 0.5000855467877465
MAE: 0.4129980989777344
###

Fold:  2
KNN
RMSE: 0.3849340563407218
MAE: 0.2447192015488737
Random Forest
RMSE: 0.3912447027822915
MAE: 0.23087309833790448
Gradient Boosting
RMSE: 0.5235330671948287
MAE: 0.386980821826822
###

Fold:  3
KNN
RMSE: 0.3769166279858769
MAE: 0.23379424911092153


In [None]:
df_results = pd.DataFrame()
df_results['fold'] = cv_folds
df_results['regressor'] = reg_names
df_results['RMSE'] = cv_RMSE
df_results['MAE'] = cv_MAE
df_results

Unnamed: 0,fold,regressor,RMSE,MAE
0,2,KNN,0.381931,0.241739
1,2,Random Forest,0.396427,0.242367
2,2,Gradient Boosting,0.500086,0.412998
3,3,KNN,0.384934,0.244719
4,3,Random Forest,0.391245,0.230873
5,3,Gradient Boosting,0.523533,0.386981
6,4,KNN,0.376917,0.233794
7,4,Random Forest,0.335582,0.247103
8,4,Gradient Boosting,0.397042,0.317089
9,5,KNN,0.47351,0.311877


In [None]:
df_results.to_csv('baseline_selection_results.csv',index=False)

In [None]:
df_results.groupby('regressor').agg({'RMSE': ['min', 'max', 'mean'], 'MAE': ['min', 'max', 'mean']})

Unnamed: 0_level_0,RMSE,RMSE,RMSE,MAE,MAE,MAE
Unnamed: 0_level_1,min,max,mean,min,max,mean
regressor,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
Gradient Boosting,0.397042,0.587859,0.493389,0.317089,0.412998,0.368664
KNN,0.376917,0.571034,0.437665,0.233794,0.331075,0.272641
Random Forest,0.335582,0.509344,0.410461,0.230873,0.262936,0.244674


As we can see, Random Forest has the best performance in both RMSE and MAE. Mean CV RMSE is 0.410461. Mean CV MAE is 0.244674.

## Refit Random Forest on train and predict on test set

In [None]:
# Shuffle
train = df_onehot_train.sample(frac=1, random_state=SEED)
test = test.sample(frac=1, random_state=SEED)

# X and y
X_train = train.drop(columns=['Y_maize_major','Year'], axis=1)
y_train = train['Y_maize_major']
X_test = test.drop(columns=['Y_maize_major','Year'], axis=1)
y_test = test['Y_maize_major']

# Scale to [0,1] range
sc = MinMaxScaler()
X_train = pd.DataFrame(sc.fit_transform(X_train), columns=X_train.columns)
X_test = pd.DataFrame(sc.transform(X_test), columns=X_test.columns)

In [None]:
# Fit on train and predict on test
reg = regressors['Random Forest']
reg.fit(X_train, y_train)
y_test_pred = reg.predict(X_test)

In [None]:
# Calculate and print metrics
rmse = mean_squared_error(y_test, y_test_pred, squared=False)
mae = mean_absolute_error(y_test, y_test_pred)
print('Performance on test set (year 2016) (baseline results)')
print('RMSE:', rmse)
print('MAE:', mae)

Performance on test set (year 2016) (baseline results)
RMSE: 0.34957354478142083
MAE: 0.2037005796011297


In [None]:
result_example = X_test.copy(deep=True)
result_example.insert(0, "y_pred", y_test_pred)
result_example.insert(0, "y_true", list(y_test))
result_example = result_example[result_example['Country_Democratic Republic of the Congo']==True]
result_example = result_example.loc[:,~result_example.columns.str.startswith('Country')]
result_example = result_example.loc[:,~result_example.columns.str.startswith('Farm')]
print('An example of true harvest maize major and predicted value for farms in Congo for year 2016:')
result_example.head(10)

An example of true harvest maize major and predicted value for farms in Congo for year 2016:


Unnamed: 0,y_true,y_pred,Sand_1,Sand_2,Sand_3,Sand_4,Sand_5,Sand_6,Sand_7,Clay_1,...,spi_12mon_8,spi_12mon_9,spi_12mon_10,spi_12mon_11,spi_12mon_12,Sow_Maize_month_int,Harvest_Maize_month_int,maize_lag-1,maize_lag-2,maize_lag-3
8,0.743546,0.678138,0.707692,0.716418,0.701493,0.69697,0.691176,0.691176,0.69697,0.333333,...,0.514627,0.524026,0.547731,0.586617,0.552612,0.545455,0.0,0.044966,0.048402,0.046394
9,0.929391,0.889212,0.553846,0.567164,0.552239,0.530303,0.5,0.485294,0.484848,0.470588,...,0.286514,0.283456,0.275987,0.316065,0.292958,0.545455,0.0,0.064482,0.064086,0.065058
10,0.773282,0.630775,0.476923,0.507463,0.492537,0.469697,0.455882,0.455882,0.454545,0.54902,...,0.425017,0.421177,0.371739,0.418382,0.386626,0.545455,0.0,0.033285,0.059246,0.058882
11,1.196573,1.221125,0.507692,0.537313,0.522388,0.5,0.470588,0.485294,0.484848,0.45098,...,0.323139,0.336894,0.37406,0.417048,0.39514,0.545455,0.0,0.086362,0.08551,0.087031
13,0.627577,0.631528,0.661538,0.671642,0.686567,0.666667,0.617647,0.617647,0.606061,0.333333,...,0.324695,0.312411,0.323476,0.337261,0.339365,0.545455,0.0,0.042617,0.045255,0.044097
36,0.973832,0.998984,0.507692,0.522388,0.507463,0.454545,0.426471,0.411765,0.424242,0.490196,...,0.480912,0.475134,0.472899,0.523768,0.443548,0.545455,0.0,0.065079,0.070121,0.0732
40,1.035206,1.029772,0.676923,0.686567,0.671642,0.681818,0.647059,0.647059,0.651515,0.372549,...,0.393324,0.347569,0.351349,0.378513,0.349365,0.545455,0.0,0.072267,0.073191,0.072435
43,1.102481,1.089537,0.476923,0.492537,0.477612,0.454545,0.441176,0.441176,0.424242,0.54902,...,0.318601,0.307776,0.318799,0.341581,0.324272,0.545455,0.0,0.079209,0.077729,0.078492
44,0.95934,0.971366,0.538462,0.552239,0.537313,0.484848,0.470588,0.470588,0.469697,0.45098,...,0.34574,0.34138,0.375738,0.430789,0.414329,0.545455,0.0,0.069983,0.06705,0.06909
52,1.342746,1.643682,0.292308,0.313433,0.298507,0.287879,0.294118,0.294118,0.318182,0.607843,...,0.600942,0.608665,0.586783,0.549201,0.441641,0.545455,0.0,0.088017,0.171576,0.160413
