In [1]:
# Import libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import pycaret
from pycaret.regression import *

In [2]:
# check installed version
pycaret.__version__

'3.2.0'

In [3]:
# Load prepared data
df = pd.read_csv('../data/df_prepped.csv')
df.head()

Unnamed: 0,Year,Countries,Sand_1,Sand_2,Sand_3,Sand_4,Sand_5,Sand_6,Sand_7,Clay_1,...,tmax_mean,tmin_mean,spi_mean,maize_lag-1,maize_lag-2,maize_lag-3,pcp_mean_lag-1,tmax_mean_lag-1,tmix_mean_lag-1,spi_mean_lag-1
0,2007,Angola,50,51,51,48,45,46,46,37,...,301.525359,292.421194,0.925277,0.554392,0.721607,0.620005,97.103755,301.939623,292.21402,0.093447
1,2007,Angola,62,64,63,59,58,59,59,27,...,304.262436,288.525057,0.685189,0.117051,0.300217,0.212699,59.292237,301.882929,288.092753,0.182926
2,2007,Angola,69,71,70,67,65,65,66,19,...,305.84436,290.321532,-0.117002,3.093239,4.044452,2.295351,58.196545,302.89142,289.377311,0.991663
3,2007,Angola,60,63,61,57,53,53,53,29,...,299.110089,287.426233,0.417313,0.677797,0.907431,0.783018,149.210195,298.973795,287.311403,0.206751
4,2007,Angola,67,69,68,63,61,61,61,22,...,304.329997,290.368481,1.314301,0.412071,0.675967,0.605584,74.556629,304.00686,290.606725,-0.075621


In [4]:
print('Dataframe shape: ', df.shape)
print('Num unique countries: ', df.Countries.nunique())
print('Num unique farms: ', df.Farm.nunique())
print('Num unique years: ', df.Year.nunique())
print('Years: ', list(df.Year.unique()))

Dataframe shape:  (32359, 46)
Num unique countries:  30
Num unique farms:  3887
Num unique years:  10
Years:  [2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016]


In [5]:
# Drop Countries and Farm
df_label = df.loc[:,['Countries','Farm']]
df = df.drop(['Countries','Farm'], axis=1)

In [6]:
# Separate a test set, the year 2016
df_test = df[df.Year == 2016].sort_values('Year')
df_train = df[df.Year != 2016]

print('The training set has years: ', list(df_train.Year.unique()))
print('The test set has years: ', list(df_test.Year.unique()))

The training set has years:  [2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015]
The test set has years:  [2016]


In [7]:
df_test.shape, df_train.shape

((2930, 44), (29429, 44))

## Pycaret

In [8]:
# init setup
s = setup(data=df_train, 
          test_data = df_test, 
          target = 'Y_maize_major', 
          fold = 5, 
          normalize = True,
          normalize_method = 'robust',
          session_id = 123)

Unnamed: 0,Description,Value
0,Session id,123
1,Target,Y_maize_major
2,Target type,Regression
3,Original data shape,"(32359, 44)"
4,Transformed data shape,"(32359, 44)"
5,Transformed train set shape,"(29429, 44)"
6,Transformed test set shape,"(2930, 44)"
7,Numeric features,43
8,Preprocess,True
9,Imputation type,simple


In [9]:
# Select top N models (default hyperparameters, without tuning)
N = 5
best_N = compare_models(n_select = N, sort = 'MAE', exclude=['lar'])

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
huber,Huber Regressor,0.2413,0.1624,0.3984,0.93,0.1222,0.1946,0.494
omp,Orthogonal Matching Pursuit,0.2513,0.1664,0.4026,0.9288,0.1256,0.2156,0.07
ridge,Ridge Regression,0.2526,0.1736,0.4131,0.9242,0.1255,0.2104,0.068
lr,Linear Regression,0.2527,0.1732,0.4125,0.9245,0.1255,0.2106,3.114
br,Bayesian Ridge,0.2527,0.1744,0.414,0.9236,0.1256,0.2104,0.11
et,Extra Trees Regressor,0.2656,0.1955,0.4373,0.914,0.1299,0.2176,8.778
lightgbm,Light Gradient Boosting Machine,0.2782,0.2146,0.456,0.9003,0.1342,0.2193,0.51
knn,K Neighbors Regressor,0.2791,0.2075,0.451,0.9083,0.135,0.2319,0.602
gbr,Gradient Boosting Regressor,0.2793,0.2075,0.4517,0.9059,0.1327,0.2318,7.284
rf,Random Forest Regressor,0.2809,0.2219,0.4653,0.8976,0.135,0.2241,28.188


In [10]:
df_pycaret_results = pull()

In [11]:
df_pycaret_results.to_csv('../experiment_results/pycaret_cv.csv')

In [12]:
# Tune best models
best_N_tuned = [tune_model(model) for model in best_N]

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.2126,0.1369,0.3701,0.9056,0.1233,0.2167
1,0.2951,0.2576,0.5076,0.905,0.1389,0.2063
2,0.2462,0.1326,0.3641,0.9436,0.1231,0.1766
3,0.2243,0.1153,0.3395,0.9544,0.1125,0.188
4,0.2218,0.1677,0.4095,0.941,0.1125,0.182
Mean,0.24,0.162,0.3982,0.9299,0.122,0.1939
Std,0.0297,0.0507,0.0591,0.0206,0.0097,0.0151


Fitting 5 folds for each of 10 candidates, totalling 50 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.2312,0.1557,0.3946,0.8926,0.1262,0.2517
1,0.3075,0.2668,0.5166,0.9016,0.1425,0.223
2,0.2518,0.1378,0.3713,0.9414,0.1232,0.1892
3,0.244,0.1352,0.3677,0.9465,0.1208,0.2027
4,0.2272,0.1677,0.4095,0.941,0.1144,0.1867
Mean,0.2523,0.1727,0.4119,0.9246,0.1254,0.2107
Std,0.029,0.0486,0.0545,0.0227,0.0094,0.0242


Fitting 5 folds for each of 10 candidates, totalling 50 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.2314,0.1566,0.3957,0.892,0.1263,0.2518
1,0.3076,0.2684,0.518,0.901,0.1425,0.2219
2,0.2522,0.1382,0.3718,0.9412,0.1234,0.1892
3,0.2447,0.1361,0.369,0.9462,0.121,0.203
4,0.2273,0.1677,0.4095,0.941,0.1144,0.1865
Mean,0.2526,0.1734,0.4128,0.9243,0.1255,0.2105
Std,0.0289,0.0489,0.0547,0.0229,0.0093,0.0242


Fitting 5 folds for each of 10 candidates, totalling 50 fits


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.2311,0.1553,0.394,0.8929,0.1262,0.2516
1,0.3076,0.2682,0.5179,0.9011,0.1424,0.222
2,0.2524,0.1385,0.3721,0.9411,0.1235,0.1895
3,0.2449,0.1362,0.369,0.9462,0.1211,0.2033
4,0.2274,0.1678,0.4096,0.941,0.1144,0.1866
Mean,0.2527,0.1732,0.4125,0.9245,0.1255,0.2106
Std,0.0289,0.0489,0.0547,0.0226,0.0093,0.024


Fitting 5 folds for each of 2 candidates, totalling 10 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.2322,0.1604,0.4005,0.8894,0.1267,0.2529
1,0.3077,0.2689,0.5186,0.9008,0.1425,0.2214
2,0.2519,0.138,0.3715,0.9413,0.1232,0.1889
3,0.2444,0.1361,0.3689,0.9462,0.1209,0.2026
4,0.2272,0.1677,0.4095,0.941,0.1144,0.1863
Mean,0.2527,0.1742,0.4138,0.9237,0.1255,0.2104
Std,0.0289,0.0489,0.0547,0.0237,0.0094,0.0246


Fitting 5 folds for each of 10 candidates, totalling 50 fits


In [14]:
top_3_models = compare_models(n_select = 3, sort = 'MAE', include=best_N_tuned)

In [15]:
df_pycaret_cv_with_tuning = pull()

In [16]:
df_pycaret_cv_with_tuning

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
0,Huber Regressor,0.2413,0.1624,0.3984,0.93,0.1222,0.1946,0.53
1,Orthogonal Matching Pursuit,0.2513,0.1664,0.4026,0.9288,0.1256,0.2156,0.068
2,Ridge Regression,0.2526,0.1734,0.4128,0.9243,0.1255,0.2105,0.072
3,Linear Regression,0.2527,0.1732,0.4125,0.9245,0.1255,0.2106,0.084
4,Bayesian Ridge,0.2527,0.1742,0.4138,0.9237,0.1255,0.2104,0.114


In [17]:
df_pycaret_cv_with_tuning.to_csv('../experiment_results/pycaret_cv_with_tuned_models.csv')

In [25]:
# blend top 3 models
top_3_blended = blend_models(top_3_models)
top_3_blended

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.2222,0.1293,0.3596,0.9108,0.1237,0.2358
1,0.3045,0.2681,0.5178,0.9011,0.1428,0.2164
2,0.2402,0.1271,0.3565,0.946,0.1189,0.1745
3,0.2337,0.1292,0.3594,0.9489,0.1163,0.1953
4,0.2216,0.1644,0.4054,0.9422,0.1122,0.1822
Mean,0.2444,0.1636,0.3997,0.9298,0.1227,0.2009
Std,0.0308,0.0541,0.0618,0.0198,0.0107,0.0225


Processing:   0%|          | 0/6 [00:00<?, ?it/s]

In [24]:
# stack models
top_3_stacked = stack_models(top_3_models)
top_3_stacked

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.2547,0.1645,0.4056,0.8866,0.1322,0.2809
1,0.3098,0.2613,0.5112,0.9036,0.1428,0.2373
2,0.2659,0.1456,0.3815,0.9381,0.1315,0.1998
3,0.2393,0.1254,0.3541,0.9504,0.1184,0.1959
4,0.2275,0.1702,0.4125,0.9401,0.1148,0.1868
Mean,0.2594,0.1734,0.413,0.9238,0.128,0.2201
Std,0.0284,0.0467,0.0532,0.0244,0.0102,0.0349


In [20]:
# get leaderboard
lb = get_leaderboard()
lb

Unnamed: 0_level_0,Model Name,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,Linear Regression,"(TransformerWrapper(include=['Year', 'Sand_1',...",0.2527,0.1732,0.4125,0.9245,0.1255,0.2106
1,Lasso Regression,"(TransformerWrapper(include=['Year', 'Sand_1',...",0.6614,0.9859,0.9832,0.5902,0.2884,0.7978
2,Ridge Regression,"(TransformerWrapper(include=['Year', 'Sand_1',...",0.2526,0.1736,0.4131,0.9242,0.1255,0.2104
3,Elastic Net,"(TransformerWrapper(include=['Year', 'Sand_1',...",0.4647,0.4926,0.69,0.7973,0.2102,0.5564
4,Lasso Least Angle Regression,"(TransformerWrapper(include=['Year', 'Sand_1',...",0.6614,0.986,0.9832,0.5902,0.2884,0.7978
5,Orthogonal Matching Pursuit,"(TransformerWrapper(include=['Year', 'Sand_1',...",0.2513,0.1664,0.4026,0.9288,0.1256,0.2156
6,Bayesian Ridge,"(TransformerWrapper(include=['Year', 'Sand_1',...",0.2527,0.1744,0.414,0.9236,0.1256,0.2104
7,Passive Aggressive Regressor,"(TransformerWrapper(include=['Year', 'Sand_1',...",0.3875,0.3242,0.5595,0.8619,0.1898,0.3946
8,Huber Regressor,"(TransformerWrapper(include=['Year', 'Sand_1',...",0.2413,0.1624,0.3984,0.93,0.1222,0.1946
9,K Neighbors Regressor,"(TransformerWrapper(include=['Year', 'Sand_1',...",0.2791,0.2075,0.451,0.9083,0.135,0.2319


In [23]:
lb.sort_values(by='MAE', ascending=True)

Unnamed: 0_level_0,Model Name,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
17,Huber Regressor,"(TransformerWrapper(include=['Year', 'Sand_1',...",0.24,0.162,0.3982,0.9299,0.122,0.1939
18,Huber Regressor,"(TransformerWrapper(include=['Year', 'Sand_1',...",0.2413,0.1624,0.3984,0.93,0.1222,0.1946
8,Huber Regressor,"(TransformerWrapper(include=['Year', 'Sand_1',...",0.2413,0.1624,0.3984,0.93,0.1222,0.1946
27,Huber Regressor,"(TransformerWrapper(include=['Year', 'Sand_1',...",0.2413,0.1624,0.3984,0.93,0.1222,0.1946
32,Voting Regressor,"(TransformerWrapper(include=['Year', 'Sand_1',...",0.2444,0.1636,0.3997,0.9298,0.1227,0.2009
20,Orthogonal Matching Pursuit,"(TransformerWrapper(include=['Year', 'Sand_1',...",0.2513,0.1664,0.4026,0.9288,0.1256,0.2156
5,Orthogonal Matching Pursuit,"(TransformerWrapper(include=['Year', 'Sand_1',...",0.2513,0.1664,0.4026,0.9288,0.1256,0.2156
28,Orthogonal Matching Pursuit,"(TransformerWrapper(include=['Year', 'Sand_1',...",0.2513,0.1664,0.4026,0.9288,0.1256,0.2156
19,Orthogonal Matching Pursuit,"(TransformerWrapper(include=['Year', 'Sand_1',...",0.2523,0.1727,0.4119,0.9246,0.1254,0.2107
2,Ridge Regression,"(TransformerWrapper(include=['Year', 'Sand_1',...",0.2526,0.1736,0.4131,0.9242,0.1255,0.2104


In [21]:
# select the best model based on MAE
best_model = lb.sort_values(by='MAE', ascending=True)['Model'].iloc[0]

In [22]:
best_model

In [37]:
# predict on test set
holdout_pred = predict_model(best_model)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Huber Regressor,0.1973,0.1064,0.3261,0.9602,0.0989,0.1354


In [38]:
# predict on test set
holdout_pred_stacked = predict_model(top_3_stacked)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Stacking Regressor,0.213,0.1005,0.317,0.9624,0.1156,0.1702


In [39]:
# predict on test set
holdout_pred_blended = predict_model(top_3_blended)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Voting Regressor,0.1987,0.1032,0.3213,0.9614,0.0996,0.1401


In [43]:
best_model