# Machine Learning: Regression

In [2]:
# Load the 'insurance' sample dataset through PyCaret's dataset helper.
# `data` is the frame that is later handed to setup() and predict_model().
from pycaret.datasets import get_data
data = get_data('insurance')

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [5]:
# Import only the PyCaret regression functions this notebook uses instead of
# `import *`, so it is clear where each name comes from and the namespace is
# not polluted. Extend this list if further PyCaret functions are needed.
from pycaret.regression import (
    blend_models,
    compare_models,
    create_model,
    ensemble_model,
    predict_model,
    setup,
    tune_model,
)

# Initialise the regression experiment with `charges` as the target.
# `session_id` pins the random seed so the run can be repeated.
s = setup(
    data,
    target='charges',
    session_id=123,
    # Scale the features and power-transform both features and target.
    normalize=True,
    transformation=True,
    transform_target=True,
    # Merge categorical levels that fall below the 5% threshold into one level.
    combine_rare_levels=True,
    rare_level_threshold=0.05,
    # Drop features whose correlation with another feature exceeds 0.95.
    remove_multicollinearity=True,
    multicollinearity_threshold=0.95,
)

Unnamed: 0,Description,Value
0,session_id,123
1,Target,charges
2,Original Data,"(1338, 7)"
3,Missing Values,False
4,Numeric Features,2
5,Categorical Features,4
6,Ordinal Features,False
7,High Cardinality Features,False
8,High Cardinality Method,
9,Transformed Train Set,"(936, 14)"


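The code above normalizes the numeric features (`normalize`), applies a power transformation so that the features (`transformation`) and the target variable (`transform_target`) are closer to normally distributed, and combines categorical levels that occur in less than 5% of the rows into a single group (`combine_rare_levels`, `rare_level_threshold`). It also removes collinear features whose correlation with another feature exceeds 0.95 (`remove_multicollinearity`, `multicollinearity_threshold`).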

In [11]:
# Train and score the available regressors, then keep the two top-ranked models.
# `best` is later passed to blend_models() as the list of models to blend.
best = compare_models(n_select=2)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
rf,Random Forest Regressor,2318.5636,23095753.5767,4784.4402,0.8346,0.4077,0.2123,0.117
gbr,Gradient Boosting Regressor,2321.4756,23737948.1983,4844.4855,0.832,0.3915,0.19,0.034
lightgbm,Light Gradient Boosting Machine,2595.0164,24737855.0132,4955.4331,0.8219,0.4149,0.2206,0.027
ada,AdaBoost Regressor,3400.0159,25443018.9667,5034.67,0.818,0.4996,0.4605,0.019
et,Extra Trees Regressor,2430.4478,26514493.2893,5109.6941,0.8132,0.4369,0.232,0.109
dt,Decision Tree Regressor,3044.4337,42375147.4035,6444.6387,0.6978,0.5139,0.3249,0.014
ridge,Ridge Regression,4160.5034,61356598.4,7759.2451,0.5764,0.4476,0.2781,0.012
omp,Orthogonal Matching Pursuit,5767.6338,61270291.3597,7801.8334,0.5761,0.7027,0.7093,0.012
br,Bayesian Ridge,4163.8218,61498567.9221,7768.0875,0.5754,0.4476,0.278,0.016
lr,Linear Regression,4181.8438,62269498.0,7816.3182,0.5701,0.4475,0.2777,0.013


- n_select can be used to select the top n models for further evaluation
- MAE (Mean Absolute Error) is the mean of the absolute prediction errors over all instances of the evaluated data set. The prediction error is the difference between the actual value and the predicted value for that instance.
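- MSE and RMSE measure the accuracy of the model's predictions (lower is better). MSE is the mean squared error: the mean of the squared differences between the observed and predicted values, so large errors are penalized more heavily. RMSE is the square root of the MSE, which expresses the error in the same units as the target.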
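- $R^2$ (the coefficient of determination) is another measure of fit. It indicates how well the model explains the target variable, i.e. the proportion of the target's variance that the model accounts for (closer to 1 is better).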
- RMSLE is similar to the RMSE, but the prediction error is the difference between the log of the actual value and the log of the predicted value (computed as $\log(1 + y)$). This helps reduce the effect of outliers.
- MAPE is the mean absolute percentage error: the mean of the absolute prediction errors, each divided by the corresponding observed value.

## Model Tuning

Model hyperparameters are tuned using a random grid search; `n_iter` sets the number of candidate parameter combinations that are tried.

### Random Forest

In [14]:
# Fit a baseline random forest with 10-fold cross-validation, then run a
# 50-iteration random search over its hyperparameters.
rf_base = create_model('rf', fold=10)
rf_tuned = tune_model(rf_base, n_iter=50)
rf_tuned

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,2351.1034,21218401.6876,4606.3436,0.8733,0.3915,0.1987
1,2515.7795,30279995.1808,5502.7262,0.8239,0.4371,0.2006
2,2449.9904,22751236.2196,4769.8256,0.7119,0.4613,0.2381
3,2528.6932,22982529.2723,4794.0097,0.8081,0.4448,0.2851
4,2715.542,30012612.9358,5478.3769,0.7768,0.526,0.247
5,2688.5658,22137573.0244,4705.0582,0.8576,0.3499,0.2152
6,2222.3523,20035546.7729,4476.1084,0.8637,0.3302,0.2124
7,2507.6663,25647611.119,5064.3471,0.859,0.4413,0.2069
8,2517.9404,22200435.2212,4711.7338,0.8566,0.3978,0.2763
9,2434.9556,26180748.5978,5116.7127,0.8314,0.445,0.2068


PowerTransformedTargetRegressor(bootstrap=False, ccp_alpha=0.0, criterion='mse',
                                max_depth=9, max_features=1.0,
                                max_leaf_nodes=None, max_samples=None,
                                min_impurity_decrease=0,
                                min_impurity_split=None, min_samples_leaf=6,
                                min_samples_split=5,
                                min_weight_fraction_leaf=0.0, n_estimators=180,
                                n_jobs=-1, oob_score=False,
                                power_transformer_method='box-cox',
                                power_transf...
                                regressor=RandomForestRegressor(bootstrap=False,
                                                                ccp_alpha=0.0,
                                                                criterion='mse',
                                                                max_depth=9,
                        

In [15]:
# Fit a baseline gradient boosting regressor with 10-fold cross-validation,
# then run a 50-iteration random search over its hyperparameters.
gbr_base = create_model('gbr', fold=10)
gbr_tuned = tune_model(gbr_base, n_iter=50)
gbr_tuned

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,2271.0525,19852454.6214,4455.6093,0.8814,0.4279,0.2326
1,2732.2898,30223163.0338,5497.5597,0.8243,0.4199,0.2216
2,2435.8055,22184428.6976,4710.0349,0.7191,0.505,0.2548
3,2292.2115,20402798.1439,4516.9457,0.8296,0.4088,0.2303
4,2796.0759,29453517.8737,5427.1095,0.7809,0.5235,0.2602
5,2355.8686,19483326.4883,4413.9921,0.8747,0.3208,0.1882
6,2265.3052,21235482.9934,4608.1974,0.8555,0.3271,0.2053
7,2615.5506,27265099.3173,5221.5993,0.8501,0.4459,0.1971
8,2083.0631,18378912.7405,4287.0634,0.8813,0.3493,0.2211
9,2462.0002,28161497.9386,5306.7408,0.8186,0.4717,0.2255


PowerTransformedTargetRegressor(alpha=0.9, ccp_alpha=0.0,
                                criterion='friedman_mse', init=None,
                                learning_rate=0.15, loss='ls', max_depth=9,
                                max_features=1.0, max_leaf_nodes=None,
                                min_impurity_decrease=0.2,
                                min_impurity_split=None, min_samples_leaf=4,
                                min_samples_split=7,
                                min_weight_fraction_leaf=0.0, n_estimators=260,
                                n_iter_no_change=None,
                                power_transformer_method...
                                                                    max_leaf_nodes=None,
                                                                    min_impurity_decrease=0.2,
                                                                    min_impurity_split=None,
                                                                 

## Ensembling

Ensembling is a common machine learning technique used to improve the performance of models (mostly tree based). There are various techniques for ensembling that we will cover in this section. These include Bagging and Boosting:

- Bagging is a parallel method that fits several base learners independently from each other, making it possible to train them simultaneously.
- Boosting is a sequential ensemble method that iteratively adjusts the weights of the observations according to the errors of the previous learner. If an observation is predicted poorly (in classification: misclassified), the weight of that observation is increased so that the next learner focuses on it.

In [18]:
# Build and tune a random forest, then wrap the tuned model in a boosting ensemble.
rf_base = create_model('rf', fold=10)
rf_tuned = tune_model(rf_base, n_iter=50)
boosted_rf = ensemble_model(rf_tuned, method='Boosting')
boosted_rf

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,2565.6495,23185737.5043,4815.1571,0.8615,0.4587,0.3018
1,3090.5624,33367050.5292,5776.4219,0.806,0.4384,0.2453
2,2741.0704,25184982.3541,5018.4641,0.6811,0.5397,0.3513
3,2264.6705,19274195.7302,4390.2387,0.8391,0.4034,0.2561
4,3014.9424,30628890.7933,5534.3374,0.7722,0.5426,0.2864
5,2625.5914,22532796.2671,4746.8723,0.8551,0.3637,0.2207
6,2309.7864,20552420.9735,4533.4778,0.8602,0.3213,0.2006
7,2437.6499,21721285.149,4660.61,0.8806,0.4158,0.1935
8,2241.5165,16867227.0201,4106.973,0.8911,0.3494,0.2571
9,2758.063,28460273.5246,5334.8171,0.8167,0.478,0.2745


PowerTransformedTargetRegressor(base_estimator=RandomForestRegressor(bootstrap=False,
                                                                     ccp_alpha=0.0,
                                                                     criterion='mse',
                                                                     max_depth=9,
                                                                     max_features=1.0,
                                                                     max_leaf_nodes=None,
                                                                     max_samples=None,
                                                                     min_impurity_decrease=0,
                                                                     min_impurity_split=None,
                                                                     min_samples_leaf=6,
                                                                     min_samples_split=5,
                               

In [20]:
# Build and tune a random forest, then wrap the tuned model in a bagging ensemble.
rf_base = create_model('rf', fold=10)
rf_tuned = tune_model(rf_base, n_iter=50)
bag_rf = ensemble_model(rf_tuned, method='Bagging')

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,2046.0012,18895215.1541,4346.8627,0.8871,0.3478,0.176
1,2664.7459,29539167.7864,5434.9947,0.8282,0.4074,0.2135
2,2185.9372,20192225.086,4493.576,0.7443,0.4364,0.2077
3,2234.8533,20675626.0625,4547.0459,0.8274,0.3804,0.2047
4,2484.3466,26950080.4383,5191.3467,0.7996,0.4769,0.1987
5,2403.1035,19646522.7407,4432.4398,0.8736,0.3259,0.18
6,1955.8332,16806270.6316,4099.5452,0.8856,0.2868,0.1702
7,2182.7888,23426404.5988,4840.0831,0.8712,0.4187,0.162
8,2102.5107,19148389.1427,4375.8872,0.8763,0.3333,0.1997
9,2203.0578,24268847.9598,4926.3422,0.8437,0.4023,0.1731


In [24]:
# Build and tune a gradient boosting regressor, then wrap the tuned model in a
# bagging ensemble.
gbr_base = create_model('gbr', fold=10)
gbr_tuned = tune_model(gbr_base, n_iter=50)
bag_gbf = ensemble_model(gbr_tuned, method='Bagging')

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,1985.3269,18241145.0503,4270.9654,0.891,0.3684,0.1707
1,2558.4165,29474139.4965,5429.0091,0.8286,0.3992,0.1913
2,2240.8041,20714937.2301,4551.3665,0.7377,0.4716,0.2232
3,2265.4004,19640515.7772,4431.7622,0.836,0.3881,0.2218
4,2570.0188,27555159.1231,5249.3008,0.7951,0.479,0.2063
5,2281.5703,19241216.302,4386.4811,0.8762,0.3113,0.1671
6,2060.079,18856859.5083,4342.4486,0.8717,0.3013,0.1665
7,2320.2895,24733850.7143,4973.3139,0.864,0.438,0.1691
8,1978.0656,18148263.9312,4260.0779,0.8828,0.3254,0.1944
9,2486.7521,26461623.0009,5144.0862,0.8296,0.4461,0.2133


## Blending

Blending is another common technique for ensembling that can be used in PyCaret. It creates multiple models and then averages the individual predictions to form a final prediction.

In [25]:
# Blend the two bagged ensembles (random forest and gradient boosting) into a
# single model.
models_to_blend = [bag_rf, bag_gbf]
blender = blend_models(estimator_list=models_to_blend)

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,1973.7741,18260499.2365,4273.2305,0.8909,0.3547,0.1692
1,2573.3746,29307832.2325,5413.6709,0.8296,0.4009,0.199
2,2195.4247,20211998.6822,4495.7756,0.7441,0.4512,0.2131
3,2214.792,19954271.1566,4467.0204,0.8334,0.3805,0.208
4,2480.6632,26986657.5185,5194.8684,0.7993,0.4759,0.1972
5,2290.3689,19147161.363,4375.7469,0.8768,0.3156,0.1687
6,1982.4177,17608056.4353,4196.1955,0.8802,0.2901,0.1645
7,2173.3479,23801433.4246,4878.6713,0.8691,0.4263,0.1607
8,1998.5357,18481400.0426,4298.9999,0.8806,0.3247,0.1924
9,2286.3961,24944374.997,4994.4344,0.8393,0.4208,0.1862


In [23]:
# Blend the two top-ranked models that compare_models() returned in `best`.
blender_top2 = blend_models(estimator_list=best)

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,2067.1071,18997725.9787,4358.6381,0.8865,0.369,0.185
1,2564.1394,30740524.927,5544.4138,0.8213,0.4124,0.1939
2,2195.5543,21825276.7383,4671.7531,0.7237,0.4776,0.2166
3,2256.0771,19853431.6083,4455.719,0.8342,0.3822,0.218
4,2571.2632,27852270.118,5277.525,0.7928,0.4906,0.2313
5,2276.2111,19531638.6566,4419.4614,0.8744,0.3186,0.165
6,1909.0689,17813007.1073,4220.5458,0.8788,0.2985,0.1659
7,2361.0858,25038426.6447,5003.8412,0.8623,0.4138,0.167
8,1890.1583,17187570.1075,4145.7894,0.889,0.3032,0.1816
9,2347.7271,25629443.6984,5062.5531,0.8349,0.4428,0.1994


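The model returned by the blend_models function is just like any other model that you would create using create_model() or tune_model(). You can use this model for predictions on unseen data using predict_model() in the same way you would for any other model. Note that the example below passes the full `data` frame, which is the same frame that was given to setup() and therefore includes the training rows, so the predictions shown are not purely out-of-sample.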

In [33]:
# Score the blended model on the `data` frame; the returned frame carries the
# predictions in an extra `Label` column next to the original columns.
# NOTE(review): `data` is the same frame that was passed to setup(), so it
# includes rows used for training. These are not purely out-of-sample
# predictions.
predictions = predict_model(blender, data=data)
predictions.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges,Label
0,19,female,27.9,0,yes,southwest,16884.924,17572.607536
1,18,male,33.77,1,no,southeast,1725.5523,2399.005146
2,28,male,33.0,3,no,southeast,4449.462,4941.884378
3,33,male,22.705,0,no,northwest,21984.47061,7206.539564
4,32,male,28.88,0,no,northwest,3866.8552,3772.359489
