In [29]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.neighbors import KNeighborsRegressor

from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.preprocessing import MinMaxScaler

In [34]:
df = pd.read_csv('../data/df_prepped.csv')
pd.set_option('display.max_columns', None)

print('df.shape:', df.shape)
df.head()

df.shape: (32359, 46)


Unnamed: 0,Year,Countries,Sand_1,Sand_2,Sand_3,Sand_4,Sand_5,Sand_6,Sand_7,Clay_1,Clay_2,Clay_3,Clay_4,Clay_5,Clay_6,Clay_7,OC_1,OC_2,OC_3,OC_4,OC_5,OC_6,OC_7,PAW_1,PAW_2,PAW_3,PAW_4,PAW_5,PAW_6,PAW_7,Y_maize_major,Farm,Sow_Maize_month_int,Harvest_Maize_month_int,sow_to_harvest_months,pcp_mean,tmax_mean,tmin_mean,spi_mean,maize_lag-1,maize_lag-2,maize_lag-3,pcp_mean_lag-1,tmax_mean_lag-1,tmix_mean_lag-1,spi_mean_lag-1
0,2007,Angola,50,51,51,48,45,46,46,37,35,36,39,42,42,42,0.52,0.23,0.17,0.09,0.04,0.02,0.02,0.15,0.15,0.14,0.13,0.1,0.07,0.07,0.615357,104_Angola,9,4,7,114.212553,301.525359,292.421194,0.925277,0.554392,0.721607,0.620005,97.103755,301.939623,292.21402,0.093447
1,2007,Angola,62,64,63,59,58,59,59,27,25,26,29,31,30,30,0.11,0.05,0.07,0.04,0.02,0.02,0.01,0.11,0.1,0.1,0.09,0.07,0.07,0.03,0.257656,99_Angola,9,4,7,29.700994,304.262436,288.525057,0.685189,0.117051,0.300217,0.212699,59.292237,301.882929,288.092753,0.182926
2,2007,Angola,69,71,70,67,65,65,66,19,16,18,21,24,24,23,0.09,0.06,0.07,0.04,0.02,0.02,0.02,0.1,0.1,0.1,0.09,0.07,0.07,0.07,4.286831,108_Angola,9,4,7,20.730178,305.84436,290.321532,-0.117002,3.093239,4.044452,2.295351,58.196545,302.89142,289.377311,0.991663
3,2007,Angola,60,63,61,57,53,53,53,29,26,28,32,35,36,36,0.46,0.16,0.14,0.08,0.05,0.04,0.03,0.12,0.13,0.12,0.12,0.11,0.1,0.09,0.700384,102_Angola,9,4,7,150.898834,299.110089,287.426233,0.417313,0.677797,0.907431,0.783018,149.210195,298.973795,287.311403,0.206751
4,2007,Angola,67,69,68,63,61,61,61,22,19,21,25,28,28,29,0.15,0.09,0.09,0.05,0.02,0.01,0.01,0.11,0.11,0.11,0.11,0.08,0.04,0.04,0.55345,43_Angola,9,4,7,55.400451,304.329997,290.368481,1.314301,0.412071,0.675967,0.605584,74.556629,304.00686,290.606725,-0.075621


In [35]:
df.Countries.nunique()

30

In [36]:
df.Farm.nunique()

3887

## Train-test split

In [43]:
df_label = df.loc[:,['Countries','Farm']]
#df = df[df.Countries == 'Kenya']
df = df.drop(['Countries','Farm'], axis=1)

KeyError: "None of [Index(['Countries', 'Farm'], dtype='object')] are in the [columns]"

In [38]:
# Separate a test set, the year 2016
test = df[df.Year == 2016]
df_train = df[df.Year < 2016]

## Cross-validation setup

In [39]:
# https://stackoverflow.com/questions/58069691/how-to-create-a-train-test-split-of-time-series-data-by-year

year_list = df['Year'].unique().tolist()
splits = {'train': [], 'val': []}

for idx, yr in enumerate(year_list[:-1]):
    if yr < 2010:
        # To get only the last 5 splits
        continue
    train_yr = year_list[:idx+1]
    test_yr = [year_list[idx+1]]
    print('TRAIN: ', train_yr, 'VAL: ',test_yr)
    
    splits['train'].append(df.loc[df.Year.isin(train_yr), :])
    splits['val'].append(df.loc[df.Year.isin(test_yr), :])

TRAIN:  [2007, 2008, 2009, 2010] VAL:  [2011]
TRAIN:  [2007, 2008, 2009, 2010, 2011] VAL:  [2012]
TRAIN:  [2007, 2008, 2009, 2010, 2011, 2012] VAL:  [2013]
TRAIN:  [2007, 2008, 2009, 2010, 2011, 2012, 2013] VAL:  [2014]
TRAIN:  [2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014] VAL:  [2015]
TRAIN:  [2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015] VAL:  [2016]


Baseline selection

In [40]:
SEED = 1

In [41]:
regressors = {
    "KNN": KNeighborsRegressor(n_neighbors=5), # 1
    "Random Forest": RandomForestRegressor(n_estimators=50, random_state=SEED), # 2
    "Gradient Boosting": AdaBoostRegressor(n_estimators=50, random_state=SEED) # 3
}

In [42]:
# This will take about 50 minutes

reg_names = []
cv_MAE = []
cv_RMSE = []
cv_folds = []

i = 1
for train, val in zip(splits['train'], splits['val']):

    print('Fold: ', i)
    i += 1

    # Shuffle
    train = train.sample(frac=1, random_state=SEED)
    val = val.sample(frac=1, random_state=SEED)
    
    # X and y
    X_train = train.drop(columns=['Y_maize_major','Year'], axis=1)
    y_train = train['Y_maize_major']
    X_val = val.drop(columns=['Y_maize_major','Year'], axis=1)
    y_val = val['Y_maize_major']

    # Scale to [0,1] range
    sc = MinMaxScaler()
    X_train = pd.DataFrame(sc.fit_transform(X_train), columns=X_train.columns)
    X_val = pd.DataFrame(sc.transform(X_val), columns=X_val.columns)

    # Fit and predict
    for reg_name, reg in regressors.items():
        reg_names.append(reg_name)
        reg.fit(X_train, y_train)
        y_val_pred = reg.predict(X_val)
        rmse = mean_squared_error(y_val, y_val_pred, squared=False)
        mae = mean_absolute_error(y_val, y_val_pred)
        cv_MAE.append(mae)
        cv_RMSE.append(rmse)
        cv_folds.append(i)

        print(reg_name)
        print('RMSE:', rmse)
        print('MAE:', mae)

    print('###')
    print()

Fold:  1
KNN
RMSE: 0.4465806323595626
MAE: 0.28868668786784984
Random Forest
RMSE: 0.38437255963678657
MAE: 0.24324160184920535
Gradient Boosting
RMSE: 0.4881639031346736
MAE: 0.3979197209576341
###

Fold:  2
KNN
RMSE: 0.42722009842246067
MAE: 0.2687490135935836
Random Forest
RMSE: 0.39959561303281077
MAE: 0.23229678562401537
Gradient Boosting
RMSE: 0.5238886617186116
MAE: 0.396004782549813
###

Fold:  3
KNN
RMSE: 0.39912055116539635
MAE: 0.26739505304061434
Random Forest
RMSE: 0.34268828660347667
MAE: 0.25237963057903645
Gradient Boosting
RMSE: 0.40628242660175673
MAE: 0.32650477176005444
###

Fold:  4
KNN
RMSE: 0.48346694784439376
MAE: 0.3083752392950171
Random Forest
RMSE: 0.42932592874054964
MAE: 0.2400399162353698
Gradient Boosting
RMSE: 0.4786026311967016
MAE: 0.34664523545415565
###

Fold:  5
KNN
RMSE: 0.53934452014496
MAE: 0.3207416859771507
Random Forest
RMSE: 0.5431569653156771
MAE: 0.27480257737191516
Gradient Boosting
RMSE: 0.6223618966126824
MAE: 0.44517302393877406
###

F

In [44]:
df_results = pd.DataFrame()
df_results['fold'] = cv_folds
df_results['regressor'] = reg_names
df_results['RMSE'] = cv_RMSE
df_results['MAE'] = cv_MAE
df_results

Unnamed: 0,fold,regressor,RMSE,MAE
0,2,KNN,0.446581,0.288687
1,2,Random Forest,0.384373,0.243242
2,2,Gradient Boosting,0.488164,0.39792
3,3,KNN,0.42722,0.268749
4,3,Random Forest,0.399596,0.232297
5,3,Gradient Boosting,0.523889,0.396005
6,4,KNN,0.399121,0.267395
7,4,Random Forest,0.342688,0.25238
8,4,Gradient Boosting,0.406282,0.326505
9,5,KNN,0.483467,0.308375


In [45]:
df_results.to_csv('baseline_selection_results_Jan.csv',index=False)

In [46]:
df_results.groupby('regressor').agg({'RMSE': ['mean'], 'MAE': [ 'mean']})

Unnamed: 0_level_0,RMSE,MAE
Unnamed: 0_level_1,mean,mean
regressor,Unnamed: 1_level_2,Unnamed: 2_level_2
Gradient Boosting,0.509071,0.392796
KNN,0.435257,0.277573
Random Forest,0.410763,0.242607


In [51]:
## How to find from results mean RMSE for Random Forest 
mean_RMSE_random_forest = df_results[df_results['regressor'] == 'Random Forest'].agg({'RMSE': ['mean']}).values[0][0]
mean_MAE_random_forest = df_results[df_results['regressor'] == 'Random Forest'].agg({'MAE': ['mean']}).values[0][0]


In [62]:
print("As we can see, Random Forest has the best performance in both RMSE and MAE. \n\nMean RMSE is:\t {:.4f}".format(mean_RMSE_random_forest))
print("Mean MAE is:\t {:.4f}".format(mean_MAE_random_forest))


As we can see, Random Forest has the best performance in both RMSE and MAE. 

Mean RMSE is:	 0.4108
Mean MAE is:	 0.2426


## Refit Random Forest on train and predict on test set

In [63]:
# Shuffle
train = df.sample(frac=1, random_state=SEED)
test = test.sample(frac=1, random_state=SEED)

# X and y
X_train = train.drop(columns=['Y_maize_major','Year'], axis=1)
y_train = train['Y_maize_major']
X_test = test.drop(columns=['Y_maize_major','Year'], axis=1)
y_test = test['Y_maize_major']

# Scale to [0,1] range
sc = MinMaxScaler()
X_train = pd.DataFrame(sc.fit_transform(X_train), columns=X_train.columns)
X_test = pd.DataFrame(sc.transform(X_test), columns=X_test.columns)

In [65]:
# Fit on train and predict on test
reg = regressors['Random Forest']
reg.fit(X_train, y_train)
y_test_pred = reg.predict(X_test)

In [68]:
# Calculate and print metrics
rmse = mean_squared_error(y_test, y_test_pred, squared=False)
mae = mean_absolute_error(y_test, y_test_pred)
print('Performance on test set (year 2016) (baseline results)')
print('RMSE:',round(rmse,4))
print('MAE:',round(mae,4))

Performance on test set (year 2016) (baseline results)
RMSE: 0.1129
MAE: 0.0627
