In [32]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.neighbors import KNeighborsRegressor

from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.preprocessing import MinMaxScaler

In [33]:
df = pd.read_csv('../data/df_prepped.csv')
pd.set_option('display.max_columns', None)

print('df.shape:', df.shape)
df.head()

df.shape: (32359, 46)


Unnamed: 0,Year,Countries,Sand_1,Sand_2,Sand_3,Sand_4,Sand_5,Sand_6,Sand_7,Clay_1,Clay_2,Clay_3,Clay_4,Clay_5,Clay_6,Clay_7,OC_1,OC_2,OC_3,OC_4,OC_5,OC_6,OC_7,PAW_1,PAW_2,PAW_3,PAW_4,PAW_5,PAW_6,PAW_7,Y_maize_major,Farm,Sow_Maize_month_int,Harvest_Maize_month_int,sow_to_harvest_months,pcp_mean,tmax_mean,tmin_mean,spi_mean,maize_lag-1,maize_lag-2,maize_lag-3,pcp_mean_lag-1,tmax_mean_lag-1,tmix_mean_lag-1,spi_mean_lag-1
0,2007,Angola,50,51,51,48,45,46,46,37,35,36,39,42,42,42,0.52,0.23,0.17,0.09,0.04,0.02,0.02,0.15,0.15,0.14,0.13,0.1,0.07,0.07,0.615357,104_Angola,9,4,7,114.212553,301.525359,292.421194,0.925277,0.554392,0.721607,0.620005,97.103755,301.939623,292.21402,0.093447
1,2007,Angola,62,64,63,59,58,59,59,27,25,26,29,31,30,30,0.11,0.05,0.07,0.04,0.02,0.02,0.01,0.11,0.1,0.1,0.09,0.07,0.07,0.03,0.257656,99_Angola,9,4,7,29.700994,304.262436,288.525057,0.685189,0.117051,0.300217,0.212699,59.292237,301.882929,288.092753,0.182926
2,2007,Angola,69,71,70,67,65,65,66,19,16,18,21,24,24,23,0.09,0.06,0.07,0.04,0.02,0.02,0.02,0.1,0.1,0.1,0.09,0.07,0.07,0.07,4.286831,108_Angola,9,4,7,20.730178,305.84436,290.321532,-0.117002,3.093239,4.044452,2.295351,58.196545,302.89142,289.377311,0.991663
3,2007,Angola,60,63,61,57,53,53,53,29,26,28,32,35,36,36,0.46,0.16,0.14,0.08,0.05,0.04,0.03,0.12,0.13,0.12,0.12,0.11,0.1,0.09,0.700384,102_Angola,9,4,7,150.898834,299.110089,287.426233,0.417313,0.677797,0.907431,0.783018,149.210195,298.973795,287.311403,0.206751
4,2007,Angola,67,69,68,63,61,61,61,22,19,21,25,28,28,29,0.15,0.09,0.09,0.05,0.02,0.01,0.01,0.11,0.11,0.11,0.11,0.08,0.04,0.04,0.55345,43_Angola,9,4,7,55.400451,304.329997,290.368481,1.314301,0.412071,0.675967,0.605584,74.556629,304.00686,290.606725,-0.075621


### Prepare train and test sets

In [34]:
df_label = df.loc[:,['Countries','Farm']]
#df = df[df.Countries == 'Kenya']
df = df.drop(['Countries','Farm'], axis=1)

In [35]:
# Separate a test set, the year 2016
df = df[df.Year <= 2016]

test = df[df.Year == 2016]
df_train = df[df.Year < 2016]

#### Cross-valiadation set-up

In [36]:
# https://stackoverflow.com/questions/58069691/how-to-create-a-train-test-split-of-time-series-data-by-year

year_list = df['Year'].unique().tolist()
splits = {'train': [], 'val': []}

for idx, yr in enumerate(year_list[:-1]):
    if yr < 2010:
        # To get only the last 5 splits
        continue
    train_yr = year_list[:idx+1]
    test_yr = [year_list[idx+1]]
    print('TRAIN: ', train_yr, 'VAL: ',test_yr)
    
    splits['train'].append(df.loc[df.Year.isin(train_yr), :])
    splits['val'].append(df.loc[df.Year.isin(test_yr), :])

TRAIN:  [2007, 2008, 2009, 2010] VAL:  [2011]
TRAIN:  [2007, 2008, 2009, 2010, 2011] VAL:  [2012]
TRAIN:  [2007, 2008, 2009, 2010, 2011, 2012] VAL:  [2013]
TRAIN:  [2007, 2008, 2009, 2010, 2011, 2012, 2013] VAL:  [2014]
TRAIN:  [2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014] VAL:  [2015]
TRAIN:  [2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015] VAL:  [2016]


In [37]:
SEED = 1

### Find the best TPOT model on the whole df_train

In [38]:
from tpot import TPOTRegressor


In [39]:
train = df_train.copy()
X_train = train.drop(columns=['Y_maize_major','Year'], axis=1)
y_train = train['Y_maize_major']

In [40]:
tpot = TPOTRegressor(generations=10, population_size=10, scoring='neg_mean_squared_error',max_time_mins=10)

mod = tpot.fit(features=X_train, target=y_train)
pipeline = tpot.fitted_pipeline_



In [None]:
print(pipeline)

Pipeline(steps=[('randomforestregressor',
                 RandomForestRegressor(max_features=0.6000000000000001,
                                       min_samples_leaf=5,
                                       min_samples_split=3))])


#### Evaluate TPOT using CV

In [None]:
reg_names = []
cv_MAE = []
cv_RMSE = []
cv_folds = []

i = 1
for train, val in zip(splits['train'], splits['val']):

    print('Fold: ', i)
    i += 1

    # Shuffle
    train = train.sample(frac=1, random_state=SEED)
    val = val.sample(frac=1, random_state=SEED)
    
    # X and y
    X_train = train.drop(columns=['Y_maize_major','Year'], axis=1)
    y_train = train['Y_maize_major']
    X_val = val.drop(columns=['Y_maize_major','Year'], axis=1)
    y_val = val['Y_maize_major']

    # Scale to [0,1] range
    sc = MinMaxScaler()
    X_train = pd.DataFrame(sc.fit_transform(X_train), columns=X_train.columns)
    X_val = pd.DataFrame(sc.transform(X_val), columns=X_val.columns)

   
    pipeline.fit(X_train, y_train)
    y_val_pred = pipeline.predict(X_val)
    rmse = mean_squared_error(y_val, y_val_pred, squared=False)
    mae = mean_absolute_error(y_val, y_val_pred)
    cv_MAE.append(mae)
    cv_RMSE.append(rmse)
    cv_folds.append(i)

    print('RMSE:', rmse)
    print('MAE:', mae)

    print('###')
    print()

Fold:  1


KeyboardInterrupt: 

In [None]:
df_results = pd.DataFrame()
df_results['fold'] = cv_folds
df_results['regressor'] = 'TPOT'
df_results['RMSE'] = cv_RMSE
df_results['MAE'] = cv_MAE
df_results

In [None]:
df_results.to_csv('TPOT_results.csv',index=False)