In [1]:
# Import libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import pycaret
from pycaret.regression import *

In [2]:
# check installed version
pycaret.__version__

'3.2.0'

In [3]:
# Load prepared data
df = pd.read_csv('../data/df_prepped.csv')
df.head()

Unnamed: 0,Year,Countries,lat,lon,Sand_1,Sand_2,Sand_3,Sand_4,Sand_5,Sand_6,...,maize_lag-2,pcp_mean_lag-2,tmax_mean_lag-2,tmin_mean_lag-2,spi_mean_lag-2,maize_lag-3,pcp_mean_lag-3,tmax_mean_lag-3,tmin_mean_lag-3,spi_mean_lag-3
0,2007,Angola,-8.75,14.75,50,51,51,48,45,46,...,0.721607,129.051864,301.518536,292.496579,1.644698,0.620005,109.983325,301.786056,292.204097,0.514275
1,2007,Angola,-16.25,14.25,62,64,63,59,58,59,...,0.300217,47.697564,303.988747,288.916992,0.909295,0.212699,41.130026,303.298082,288.642853,0.588172
2,2007,Angola,-17.25,14.25,69,71,70,67,65,65,...,4.044452,42.130629,305.494178,290.535403,0.952237,2.295351,35.049776,304.824778,290.284886,0.371446
3,2007,Angola,-11.75,14.75,60,63,61,57,53,53,...,0.907431,159.454723,299.404975,287.724299,1.374616,0.783018,174.08826,298.908208,287.362407,0.643207
4,2007,Angola,-14.25,13.75,67,69,68,63,61,61,...,0.675967,66.69867,304.644632,290.635254,1.144088,0.605584,67.404588,303.930955,290.564185,0.553079


In [4]:
print('Dataframe shape: ', df.shape)
print('Num unique countries: ', df.Countries.nunique())
print('Num unique farms: ', df.Farm.nunique())
print('Num unique years: ', df.Year.nunique())
print('Years: ', list(df.Year.unique()))

Dataframe shape:  (32330, 52)
Num unique countries:  30
Num unique farms:  3883
Num unique years:  10
Years:  [2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016]


In [5]:
# Drop Countries and Farm
df_label = df.loc[:,['Countries','Farm']]
df = df.drop(['Countries','Farm'], axis=1)

In [6]:
# Separate a test set, the year 2016
df_test = df[df.Year == 2016].sort_values('Year')
df_train = df[df.Year != 2016]

print('The training set has years: ', list(df_train.Year.unique()))
print('The test set has years: ', list(df_test.Year.unique()))

The training set has years:  [2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015]
The test set has years:  [2016]


In [7]:
df_test.shape, df_train.shape

((2930, 50), (29400, 50))

In [8]:
list(df.columns)

['Year',
 'lat',
 'lon',
 'Sand_1',
 'Sand_2',
 'Sand_3',
 'Sand_4',
 'Sand_5',
 'Sand_6',
 'Sand_7',
 'Clay_1',
 'Clay_2',
 'Clay_3',
 'Clay_4',
 'Clay_5',
 'Clay_6',
 'Clay_7',
 'OC_1',
 'OC_2',
 'OC_3',
 'OC_4',
 'OC_5',
 'OC_6',
 'OC_7',
 'PAW_1',
 'PAW_2',
 'PAW_3',
 'PAW_4',
 'PAW_5',
 'PAW_6',
 'PAW_7',
 'Y_maize_major',
 'Sow_Maize_month_int',
 'Harvest_Maize_month_int',
 'sow_to_harvest_months',
 'maize_lag-1',
 'pcp_mean_lag-1',
 'tmax_mean_lag-1',
 'tmin_mean_lag-1',
 'spi_mean_lag-1',
 'maize_lag-2',
 'pcp_mean_lag-2',
 'tmax_mean_lag-2',
 'tmin_mean_lag-2',
 'spi_mean_lag-2',
 'maize_lag-3',
 'pcp_mean_lag-3',
 'tmax_mean_lag-3',
 'tmin_mean_lag-3',
 'spi_mean_lag-3']

## Pycaret

In [9]:
# init setup
s = setup(data=df_train, 
          test_data = df_test, 
          target = 'Y_maize_major', 
          fold = 5, 
          normalize = True,
          normalize_method = 'robust',
          session_id = 123)

Unnamed: 0,Description,Value
0,Session id,123
1,Target,Y_maize_major
2,Target type,Regression
3,Original data shape,"(32330, 50)"
4,Transformed data shape,"(32330, 50)"
5,Transformed train set shape,"(29400, 50)"
6,Transformed test set shape,"(2930, 50)"
7,Numeric features,49
8,Preprocess,True
9,Imputation type,simple


In [10]:
# Select top N models (default hyperparameters, without tuning)
N = 5
best_N = compare_models(n_select = N, sort = 'MAE', exclude=['lar'])

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
huber,Huber Regressor,0.2411,0.1584,0.3928,0.9326,0.1215,0.1944,0.692
omp,Orthogonal Matching Pursuit,0.2505,0.1662,0.4022,0.9289,0.1252,0.2147,0.094
br,Bayesian Ridge,0.2547,0.1696,0.4069,0.9279,0.1271,0.2138,0.156
ridge,Ridge Regression,0.255,0.1698,0.4072,0.9278,0.1273,0.2145,0.11
lr,Linear Regression,0.2551,0.1699,0.4073,0.9277,0.1274,0.2148,3.644
et,Extra Trees Regressor,0.2663,0.1966,0.4389,0.9135,0.1303,0.2169,10.214
gbr,Gradient Boosting Regressor,0.2755,0.2069,0.45,0.9062,0.1325,0.2321,10.76
lightgbm,Light Gradient Boosting Machine,0.278,0.2177,0.4595,0.8997,0.1355,0.2215,0.95
rf,Random Forest Regressor,0.2809,0.222,0.4647,0.8976,0.1361,0.2238,34.954
knn,K Neighbors Regressor,0.2821,0.2092,0.4531,0.9078,0.1367,0.2326,0.602


In [11]:
df_pycaret_results = pull()

In [12]:
df_pycaret_results.to_csv('../experiment_results/pycaret_cv.csv')

In [13]:
# Tune best models
best_N_tuned = [tune_model(model) for model in best_N]

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.2153,0.1169,0.342,0.9194,0.1216,0.2057
1,0.2957,0.255,0.5049,0.9061,0.1384,0.2116
2,0.2423,0.1312,0.3622,0.9458,0.121,0.1726
3,0.2181,0.1118,0.3344,0.9553,0.1096,0.1876
4,0.2258,0.1692,0.4113,0.9398,0.1138,0.183
Mean,0.2394,0.1568,0.391,0.9333,0.1209,0.1921
Std,0.0297,0.053,0.063,0.018,0.0099,0.0145


Fitting 5 folds for each of 10 candidates, totalling 50 fits


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.2322,0.125,0.3535,0.9138,0.1291,0.2484
1,0.3123,0.2764,0.5258,0.8982,0.1445,0.2246
2,0.2542,0.1409,0.3754,0.9418,0.1273,0.1951
3,0.2355,0.1318,0.3631,0.9473,0.117,0.2075
4,0.2413,0.1754,0.4188,0.9376,0.1192,0.1983
Mean,0.2551,0.1699,0.4073,0.9277,0.1274,0.2148
Std,0.0296,0.056,0.0633,0.0187,0.0097,0.0197


Fitting 5 folds for each of 10 candidates, totalling 50 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.2317,0.1247,0.3531,0.914,0.1288,0.2473
1,0.3119,0.2763,0.5257,0.8982,0.1444,0.2239
2,0.2535,0.1405,0.3749,0.9419,0.1268,0.1937
3,0.2353,0.1314,0.3625,0.9474,0.1166,0.2062
4,0.2408,0.175,0.4183,0.9377,0.119,0.1977
Mean,0.2547,0.1696,0.4069,0.9279,0.1271,0.2137
Std,0.0296,0.0561,0.0634,0.0187,0.0098,0.0197


Fitting 5 folds for each of 10 candidates, totalling 50 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.2312,0.1244,0.3527,0.9142,0.1284,0.2463
1,0.3117,0.2763,0.5256,0.8982,0.1444,0.2235
2,0.2528,0.1401,0.3743,0.9421,0.1264,0.1924
3,0.2353,0.1312,0.3622,0.9475,0.1163,0.2053
4,0.2403,0.1746,0.4178,0.9379,0.1188,0.1971
Mean,0.2543,0.1693,0.4066,0.928,0.1269,0.2129
Std,0.0296,0.0562,0.0636,0.0187,0.0098,0.0198


Fitting 5 folds for each of 10 candidates, totalling 50 fits


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.2322,0.125,0.3535,0.9138,0.1291,0.2484
1,0.3123,0.2764,0.5258,0.8982,0.1445,0.2246
2,0.2542,0.1409,0.3754,0.9418,0.1273,0.1951
3,0.2355,0.1318,0.3631,0.9473,0.117,0.2075
4,0.2413,0.1754,0.4188,0.9376,0.1192,0.1983
Mean,0.2551,0.1699,0.4073,0.9277,0.1274,0.2148
Std,0.0296,0.056,0.0633,0.0187,0.0097,0.0197


Fitting 5 folds for each of 2 candidates, totalling 10 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


In [14]:
top_3_models = compare_models(n_select = 3, sort = 'MAE', include=best_N_tuned)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
0,Huber Regressor,0.2394,0.1568,0.391,0.9333,0.1209,0.1921,0.614
1,Orthogonal Matching Pursuit,0.2505,0.1662,0.4022,0.9289,0.1252,0.2147,0.092
3,Ridge Regression,0.2543,0.1693,0.4066,0.928,0.1269,0.2129,0.084
2,Bayesian Ridge,0.2547,0.1696,0.4069,0.9279,0.1271,0.2138,0.132
4,Linear Regression,0.2551,0.1699,0.4073,0.9277,0.1274,0.2148,0.124


In [15]:
df_pycaret_cv_with_tuning = pull()

In [16]:
df_pycaret_cv_with_tuning

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
0,Huber Regressor,0.2394,0.1568,0.391,0.9333,0.1209,0.1921,0.614
1,Orthogonal Matching Pursuit,0.2505,0.1662,0.4022,0.9289,0.1252,0.2147,0.092
3,Ridge Regression,0.2543,0.1693,0.4066,0.928,0.1269,0.2129,0.084
2,Bayesian Ridge,0.2547,0.1696,0.4069,0.9279,0.1271,0.2138,0.132
4,Linear Regression,0.2551,0.1699,0.4073,0.9277,0.1274,0.2148,0.124


In [17]:
df_pycaret_cv_with_tuning.to_csv('../experiment_results/pycaret_cv_with_tuned_models.csv')

In [18]:
# blend top 3 models
top_3_blended = blend_models(top_3_models)
top_3_blended

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.2188,0.118,0.3435,0.9187,0.122,0.23
1,0.3038,0.2686,0.5183,0.901,0.1426,0.2154
2,0.2374,0.1257,0.3545,0.9481,0.1181,0.1683
3,0.2282,0.1266,0.3558,0.9494,0.1136,0.1989
4,0.2269,0.167,0.4086,0.9406,0.1139,0.1843
Mean,0.243,0.1612,0.3961,0.9315,0.122,0.1994
Std,0.0309,0.0564,0.0651,0.0188,0.0107,0.0219


In [19]:
# stack models
top_3_stacked = stack_models(top_3_models)
top_3_stacked

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.2237,0.1215,0.3486,0.9162,0.1234,0.2408
1,0.3174,0.2715,0.5211,0.9,0.1461,0.2448
2,0.2643,0.1453,0.3812,0.94,0.1327,0.2026
3,0.2397,0.1266,0.3558,0.9494,0.1175,0.1998
4,0.2504,0.1798,0.424,0.936,0.1237,0.2185
Mean,0.2591,0.169,0.4061,0.9283,0.1287,0.2213
Std,0.032,0.0552,0.0632,0.0178,0.01,0.0187


In [20]:
# get leaderboard
lb = get_leaderboard()
lb

Unnamed: 0_level_0,Model Name,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,Linear Regression,"(TransformerWrapper(include=['Year', 'lat', 'l...",0.2551,0.1699,0.4073,0.9277,0.1274,0.2148
1,Lasso Regression,"(TransformerWrapper(include=['Year', 'lat', 'l...",0.6612,0.9841,0.9825,0.5911,0.2882,0.7962
2,Ridge Regression,"(TransformerWrapper(include=['Year', 'lat', 'l...",0.255,0.1698,0.4072,0.9278,0.1273,0.2145
3,Elastic Net,"(TransformerWrapper(include=['Year', 'lat', 'l...",0.4645,0.4922,0.6898,0.7976,0.21,0.5552
4,Lasso Least Angle Regression,"(TransformerWrapper(include=['Year', 'lat', 'l...",0.6612,0.9841,0.9825,0.5911,0.2882,0.7962
5,Orthogonal Matching Pursuit,"(TransformerWrapper(include=['Year', 'lat', 'l...",0.2505,0.1662,0.4022,0.9289,0.1252,0.2147
6,Bayesian Ridge,"(TransformerWrapper(include=['Year', 'lat', 'l...",0.2547,0.1696,0.4069,0.9279,0.1271,0.2138
7,Passive Aggressive Regressor,"(TransformerWrapper(include=['Year', 'lat', 'l...",0.3798,0.2979,0.5391,0.8751,0.1876,0.4351
8,Huber Regressor,"(TransformerWrapper(include=['Year', 'lat', 'l...",0.2411,0.1584,0.3928,0.9326,0.1215,0.1944
9,K Neighbors Regressor,"(TransformerWrapper(include=['Year', 'lat', 'l...",0.2821,0.2092,0.4531,0.9078,0.1367,0.2326


In [21]:
lb.sort_values(by='MAE', ascending=True)

Unnamed: 0_level_0,Model Name,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
18,Huber Regressor,"(TransformerWrapper(include=['Year', 'lat', 'l...",0.2394,0.1568,0.391,0.9333,0.1209,0.1921
28,Huber Regressor,"(TransformerWrapper(include=['Year', 'lat', 'l...",0.2394,0.1568,0.391,0.9333,0.1209,0.1921
8,Huber Regressor,"(TransformerWrapper(include=['Year', 'lat', 'l...",0.2411,0.1584,0.3928,0.9326,0.1215,0.1944
19,Huber Regressor,"(TransformerWrapper(include=['Year', 'lat', 'l...",0.2411,0.1584,0.3928,0.9326,0.1215,0.1944
33,Voting Regressor,"(TransformerWrapper(include=['Year', 'lat', 'l...",0.243,0.1612,0.3961,0.9315,0.122,0.1994
29,Orthogonal Matching Pursuit,"(TransformerWrapper(include=['Year', 'lat', 'l...",0.2505,0.1662,0.4022,0.9289,0.1252,0.2147
21,Orthogonal Matching Pursuit,"(TransformerWrapper(include=['Year', 'lat', 'l...",0.2505,0.1662,0.4022,0.9289,0.1252,0.2147
5,Orthogonal Matching Pursuit,"(TransformerWrapper(include=['Year', 'lat', 'l...",0.2505,0.1662,0.4022,0.9289,0.1252,0.2147
31,Ridge Regression,"(TransformerWrapper(include=['Year', 'lat', 'l...",0.2543,0.1693,0.4066,0.928,0.1269,0.2129
24,Ridge Regression,"(TransformerWrapper(include=['Year', 'lat', 'l...",0.2543,0.1693,0.4066,0.928,0.1269,0.2129


In [22]:
# select the best model based on MAE
best_model = lb.sort_values(by='MAE', ascending=True)['Model'].iloc[0]

In [23]:
best_model

In [28]:
# predict on test set
holdout_pred = predict_model(best_model)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Huber Regressor,0.2,0.1064,0.3262,0.9602,0.0991,0.1399


In [27]:
best_model