In [1]:
# Import libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import pycaret
from pycaret.regression import *

In [2]:
# check installed version
pycaret.__version__

'3.2.0'

In [3]:
# Load prepared data
df = pd.read_csv('../data/df_prepped.csv')
df.head()

Unnamed: 0,Year,Countries,Sand_1,Sand_2,Sand_3,Sand_4,Sand_5,Sand_6,Sand_7,Clay_1,...,maize_lag-2,pcp_mean_lag-2,tmax_mean_lag-2,tmin_mean_lag-2,spi_mean_lag-2,maize_lag-3,pcp_mean_lag-3,tmax_mean_lag-3,tmin_mean_lag-3,spi_mean_lag-3
0,2007,Angola,50,51,51,48,45,46,46,37,...,0.721607,129.051864,301.518536,292.496579,1.644698,0.620005,109.983325,301.786056,292.204097,0.514275
1,2007,Angola,62,64,63,59,58,59,59,27,...,0.300217,47.697564,303.988747,288.916992,0.909295,0.212699,41.130026,303.298082,288.642853,0.588172
2,2007,Angola,69,71,70,67,65,65,66,19,...,4.044452,42.130629,305.494178,290.535403,0.952237,2.295351,35.049776,304.824778,290.284886,0.371446
3,2007,Angola,60,63,61,57,53,53,53,29,...,0.907431,159.454723,299.404975,287.724299,1.374616,0.783018,174.08826,298.908208,287.362407,0.643207
4,2007,Angola,67,69,68,63,61,61,61,22,...,0.675967,66.69867,304.644632,290.635254,1.144088,0.605584,67.404588,303.930955,290.564185,0.553079


In [4]:
print('Dataframe shape: ', df.shape)
print('Num unique countries: ', df.Countries.nunique())
print('Num unique farms: ', df.Farm.nunique())
print('Num unique years: ', df.Year.nunique())
print('Years: ', list(df.Year.unique()))

Dataframe shape:  (32359, 50)
Num unique countries:  30
Num unique farms:  3887
Num unique years:  10
Years:  [2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016]


In [5]:
# Drop Countries and Farm
df_label = df.loc[:,['Countries','Farm']]
df = df.drop(['Countries','Farm'], axis=1)

In [6]:
# Separate a test set, the year 2016
df_test = df[df.Year == 2016].sort_values('Year')
df_train = df[df.Year != 2016]

print('The training set has years: ', list(df_train.Year.unique()))
print('The test set has years: ', list(df_test.Year.unique()))

The training set has years:  [2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015]
The test set has years:  [2016]


In [7]:
df_test.shape, df_train.shape

((2930, 48), (29429, 48))

In [33]:
list(df.columns)

['Year',
 'Sand_1',
 'Sand_2',
 'Sand_3',
 'Sand_4',
 'Sand_5',
 'Sand_6',
 'Sand_7',
 'Clay_1',
 'Clay_2',
 'Clay_3',
 'Clay_4',
 'Clay_5',
 'Clay_6',
 'Clay_7',
 'OC_1',
 'OC_2',
 'OC_3',
 'OC_4',
 'OC_5',
 'OC_6',
 'OC_7',
 'PAW_1',
 'PAW_2',
 'PAW_3',
 'PAW_4',
 'PAW_5',
 'PAW_6',
 'PAW_7',
 'Y_maize_major',
 'Sow_Maize_month_int',
 'Harvest_Maize_month_int',
 'sow_to_harvest_months',
 'maize_lag-1',
 'pcp_mean_lag-1',
 'tmax_mean_lag-1',
 'tmin_mean_lag-1',
 'spi_mean_lag-1',
 'maize_lag-2',
 'pcp_mean_lag-2',
 'tmax_mean_lag-2',
 'tmin_mean_lag-2',
 'spi_mean_lag-2',
 'maize_lag-3',
 'pcp_mean_lag-3',
 'tmax_mean_lag-3',
 'tmin_mean_lag-3',
 'spi_mean_lag-3']

## Pycaret

In [8]:
# init setup
s = setup(data=df_train, 
          test_data = df_test, 
          target = 'Y_maize_major', 
          fold = 5, 
          normalize = True,
          normalize_method = 'robust',
          session_id = 123)

Unnamed: 0,Description,Value
0,Session id,123
1,Target,Y_maize_major
2,Target type,Regression
3,Original data shape,"(32359, 48)"
4,Transformed data shape,"(32359, 48)"
5,Transformed train set shape,"(29429, 48)"
6,Transformed test set shape,"(2930, 48)"
7,Numeric features,47
8,Preprocess,True
9,Imputation type,simple


In [9]:
# Select top N models (default hyperparameters, without tuning)
N = 5
best_N = compare_models(n_select = N, sort = 'MAE', exclude=['lar'])

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
huber,Huber Regressor,0.2416,0.1596,0.3945,0.9318,0.1223,0.1976,0.59
omp,Orthogonal Matching Pursuit,0.2496,0.165,0.401,0.9293,0.1252,0.214,0.084
br,Bayesian Ridge,0.2537,0.1695,0.407,0.9274,0.1271,0.2142,0.152
ridge,Ridge Regression,0.2539,0.1695,0.407,0.9274,0.1272,0.2146,0.176
lr,Linear Regression,0.2541,0.1695,0.4071,0.9274,0.1273,0.2148,2.716
et,Extra Trees Regressor,0.2662,0.1965,0.4383,0.9136,0.1303,0.2168,9.19
lightgbm,Light Gradient Boosting Machine,0.275,0.2124,0.4548,0.9018,0.134,0.2194,0.46
rf,Random Forest Regressor,0.2798,0.2201,0.4635,0.8992,0.1353,0.2214,43.572
gbr,Gradient Boosting Regressor,0.2803,0.2107,0.4546,0.9043,0.1337,0.2333,11.516
knn,K Neighbors Regressor,0.2833,0.2099,0.4542,0.9068,0.1384,0.2389,0.618


In [10]:
df_pycaret_results = pull()

In [11]:
df_pycaret_results.to_csv('../experiment_results/pycaret_cv.csv')

In [12]:
# Tune best models
best_N_tuned = [tune_model(model) for model in best_N]

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.2193,0.1219,0.3491,0.916,0.1245,0.2284
1,0.2962,0.2573,0.5072,0.9051,0.1394,0.2101
2,0.2412,0.1299,0.3604,0.9448,0.1218,0.1689
3,0.2221,0.1179,0.3433,0.9534,0.1114,0.1904
4,0.2233,0.1663,0.4078,0.9415,0.1127,0.1803
Mean,0.2404,0.1587,0.3936,0.9321,0.122,0.1956
Std,0.029,0.0522,0.0612,0.0184,0.0101,0.0213


Fitting 5 folds for each of 10 candidates, totalling 50 fits


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.2375,0.1301,0.3608,0.9103,0.13,0.264
1,0.3127,0.2767,0.526,0.898,0.1447,0.2262
2,0.2522,0.1381,0.3716,0.9413,0.1277,0.1933
3,0.2362,0.1363,0.3691,0.9461,0.1181,0.2023
4,0.2318,0.1664,0.4079,0.9415,0.1157,0.1885
Mean,0.2541,0.1695,0.4071,0.9274,0.1273,0.2148
Std,0.0301,0.055,0.0616,0.0195,0.0103,0.0278


Fitting 5 folds for each of 10 candidates, totalling 50 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.2374,0.1302,0.3608,0.9102,0.13,0.2643
1,0.3125,0.277,0.5264,0.8978,0.1448,0.2251
2,0.2514,0.1376,0.3709,0.9415,0.1273,0.1922
3,0.2361,0.1362,0.369,0.9462,0.1179,0.2016
4,0.2314,0.1665,0.408,0.9414,0.1156,0.1883
Mean,0.2538,0.1695,0.407,0.9274,0.1271,0.2143
Std,0.0301,0.0552,0.0618,0.0196,0.0104,0.0281


Fitting 5 folds for each of 10 candidates, totalling 50 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.2377,0.1305,0.3613,0.91,0.13,0.2664
1,0.3123,0.2776,0.5269,0.8976,0.1449,0.2238
2,0.2491,0.136,0.3688,0.9422,0.1259,0.1887
3,0.236,0.1361,0.3689,0.9462,0.1174,0.2002
4,0.2308,0.1667,0.4083,0.9414,0.1154,0.1877
Mean,0.2532,0.1694,0.4068,0.9275,0.1267,0.2134
Std,0.0301,0.0556,0.0623,0.0198,0.0105,0.0295


Fitting 5 folds for each of 10 candidates, totalling 50 fits


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.2375,0.1301,0.3608,0.9103,0.13,0.264
1,0.3127,0.2767,0.526,0.898,0.1447,0.2262
2,0.2522,0.1381,0.3716,0.9413,0.1277,0.1933
3,0.2362,0.1363,0.3691,0.9461,0.1181,0.2023
4,0.2318,0.1664,0.4079,0.9415,0.1157,0.1885
Mean,0.2541,0.1695,0.4071,0.9274,0.1273,0.2148
Std,0.0301,0.055,0.0616,0.0195,0.0103,0.0278


Fitting 5 folds for each of 2 candidates, totalling 10 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


In [13]:
top_3_models = compare_models(n_select = 3, sort = 'MAE', include=best_N_tuned)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
0,Huber Regressor,0.2404,0.1587,0.3936,0.9321,0.122,0.1956,0.514
1,Orthogonal Matching Pursuit,0.2496,0.165,0.401,0.9293,0.1252,0.214,0.068
3,Ridge Regression,0.2532,0.1694,0.4068,0.9275,0.1267,0.2134,0.078
2,Bayesian Ridge,0.2537,0.1695,0.407,0.9274,0.1271,0.2142,0.138
4,Linear Regression,0.2541,0.1695,0.4071,0.9274,0.1273,0.2148,0.112


In [14]:
df_pycaret_cv_with_tuning = pull()

In [15]:
df_pycaret_cv_with_tuning

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
0,Huber Regressor,0.2404,0.1587,0.3936,0.9321,0.122,0.1956,0.514
1,Orthogonal Matching Pursuit,0.2496,0.165,0.401,0.9293,0.1252,0.214,0.068
3,Ridge Regression,0.2532,0.1694,0.4068,0.9275,0.1267,0.2134,0.078
2,Bayesian Ridge,0.2537,0.1695,0.407,0.9274,0.1271,0.2142,0.138
4,Linear Regression,0.2541,0.1695,0.4071,0.9274,0.1273,0.2148,0.112


In [16]:
df_pycaret_cv_with_tuning.to_csv('../experiment_results/pycaret_cv_with_tuned_models.csv')

In [17]:
# blend top 3 models
top_3_blended = blend_models(top_3_models)
top_3_blended

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.2251,0.1227,0.3503,0.9154,0.1251,0.2512
1,0.3039,0.2689,0.5186,0.9008,0.1431,0.2164
2,0.2379,0.1259,0.3548,0.9465,0.1195,0.169
3,0.229,0.1289,0.359,0.949,0.1143,0.1959
4,0.2225,0.1632,0.404,0.9426,0.1124,0.1804
Mean,0.2437,0.1619,0.3973,0.9309,0.1229,0.2026
Std,0.0306,0.0554,0.0636,0.0193,0.011,0.029


In [18]:
# stack models
top_3_stacked = stack_models(top_3_models)
top_3_stacked

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.2173,0.1209,0.3476,0.9167,0.1249,0.2348
1,0.3164,0.2743,0.5237,0.8988,0.1471,0.2415
2,0.2528,0.1396,0.3737,0.9406,0.1275,0.1846
3,0.2396,0.1312,0.3623,0.9481,0.1165,0.1967
4,0.2255,0.1684,0.4103,0.9408,0.1144,0.183
Mean,0.2503,0.1669,0.4035,0.929,0.1261,0.2081
Std,0.0352,0.056,0.0636,0.0184,0.0116,0.0251


In [19]:
# get leaderboard
lb = get_leaderboard()
lb

Unnamed: 0_level_0,Model Name,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,Linear Regression,"(TransformerWrapper(include=['Year', 'Sand_1',...",0.2541,0.1695,0.4071,0.9274,0.1273,0.2148
1,Lasso Regression,"(TransformerWrapper(include=['Year', 'Sand_1',...",0.6614,0.9859,0.9832,0.5902,0.2884,0.7978
2,Ridge Regression,"(TransformerWrapper(include=['Year', 'Sand_1',...",0.2539,0.1695,0.407,0.9274,0.1272,0.2146
3,Elastic Net,"(TransformerWrapper(include=['Year', 'Sand_1',...",0.4647,0.4926,0.69,0.7973,0.2102,0.5564
4,Lasso Least Angle Regression,"(TransformerWrapper(include=['Year', 'Sand_1',...",0.6614,0.986,0.9832,0.5902,0.2884,0.7978
5,Orthogonal Matching Pursuit,"(TransformerWrapper(include=['Year', 'Sand_1',...",0.2496,0.165,0.401,0.9293,0.1252,0.214
6,Bayesian Ridge,"(TransformerWrapper(include=['Year', 'Sand_1',...",0.2537,0.1695,0.407,0.9274,0.1271,0.2142
7,Passive Aggressive Regressor,"(TransformerWrapper(include=['Year', 'Sand_1',...",0.4092,1.4394,0.9624,0.1147,0.2004,0.4718
8,Huber Regressor,"(TransformerWrapper(include=['Year', 'Sand_1',...",0.2416,0.1596,0.3945,0.9318,0.1223,0.1976
9,K Neighbors Regressor,"(TransformerWrapper(include=['Year', 'Sand_1',...",0.2833,0.2099,0.4542,0.9068,0.1384,0.2389


In [20]:
lb.sort_values(by='MAE', ascending=True)

Unnamed: 0_level_0,Model Name,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
27,Huber Regressor,"(TransformerWrapper(include=['Year', 'Sand_1',...",0.2404,0.1587,0.3936,0.9321,0.122,0.1956
17,Huber Regressor,"(TransformerWrapper(include=['Year', 'Sand_1',...",0.2404,0.1587,0.3936,0.9321,0.122,0.1956
18,Huber Regressor,"(TransformerWrapper(include=['Year', 'Sand_1',...",0.2416,0.1596,0.3945,0.9318,0.1223,0.1976
8,Huber Regressor,"(TransformerWrapper(include=['Year', 'Sand_1',...",0.2416,0.1596,0.3945,0.9318,0.1223,0.1976
32,Voting Regressor,"(TransformerWrapper(include=['Year', 'Sand_1',...",0.2437,0.1619,0.3973,0.9309,0.1229,0.2026
28,Orthogonal Matching Pursuit,"(TransformerWrapper(include=['Year', 'Sand_1',...",0.2496,0.165,0.401,0.9293,0.1252,0.214
20,Orthogonal Matching Pursuit,"(TransformerWrapper(include=['Year', 'Sand_1',...",0.2496,0.165,0.401,0.9293,0.1252,0.214
5,Orthogonal Matching Pursuit,"(TransformerWrapper(include=['Year', 'Sand_1',...",0.2496,0.165,0.401,0.9293,0.1252,0.214
33,Stacking Regressor,"(TransformerWrapper(include=['Year', 'Sand_1',...",0.2503,0.1669,0.4035,0.929,0.1261,0.2081
30,Ridge Regression,"(TransformerWrapper(include=['Year', 'Sand_1',...",0.2532,0.1694,0.4068,0.9275,0.1267,0.2134


In [21]:
# select the best model based on MAE
best_model = lb.sort_values(by='MAE', ascending=True)['Model'].iloc[0]

In [22]:
best_model

In [28]:
# predict on test set
holdout_pred = predict_model(best_model)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Huber Regressor,0.1994,0.1055,0.3248,0.9605,0.0989,0.1386


In [30]:
# predict on test set
holdout_pred_blended = predict_model(top_3_blended)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Voting Regressor,0.2017,0.1036,0.3219,0.9612,0.0994,0.1424


In [31]:
# predict on test set
holdout_pred_stacked = predict_model(top_3_stacked)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Stacking Regressor,0.1987,0.0922,0.3036,0.9655,0.1061,0.1519


In [26]:
best_model