## Installing Packages

In [268]:
# !pip install --pre pycaret
# !pip install scikit-learn-intelex

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting scikit-learn-intelex
  Downloading scikit_learn_intelex-2021.6.3-py37-none-manylinux1_x86_64.whl (87 kB)
[K     |████████████████████████████████| 87 kB 3.1 MB/s 
[?25hCollecting daal4py==2021.6.3
  Downloading daal4py-2021.6.3-py37-none-manylinux1_x86_64.whl (26.0 MB)
[K     |████████████████████████████████| 26.0 MB 29.4 MB/s 
Collecting daal==2021.6.0
  Downloading daal-2021.6.0-py2.py3-none-manylinux1_x86_64.whl (300.3 MB)
[K     |████████████████████████████████| 300.3 MB 17 kB/s 
Collecting tbb==2021.*
  Downloading tbb-2021.7.0-py2.py3-none-manylinux1_x86_64.whl (4.0 MB)
[K     |████████████████████████████████| 4.0 MB 48.7 MB/s 
Installing collected packages: tbb, daal, daal4py, scikit-learn-intelex
Successfully installed daal-2021.6.0 daal4py-2021.6.3 scikit-learn-intelex-2021.6.3 tbb-2021.7.0


In [269]:
from sklearnex import patch_sklearn
patch_sklearn()

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


## Importing Packages

In [270]:
import pandas as pd
from pycaret.regression import *

## Data PreProcessing

In [271]:
data = pd.read_excel('DATASET.xlsx')
data.head()

Unnamed: 0.1,Unnamed: 0,year,Para-1,Para-2,Para-3,Para-4,Para-5,Para-6,Para-7,Para-8,Para-9,Para-10,Para-11,Para-12,Para-13
0,Sec_1,1,324.0,354.5,17.0,0.0,3,117,2600,400,0.0,0.0,0.0,0,1.52
1,,2,324.0,161.1,18.2,0.0,4,106,5950,1190,0.0,3.4,0.0,0,1.62
2,,3,324.0,170.7,18.5,0.0,4,110,5950,1190,0.0,21.3,0.0,4,1.68
3,,4,324.0,223.9,18.9,0.0,3,110,6150,1340,0.0,21.3,0.0,5,1.78
4,,5,324.0,228.2,18.4,0.0,2,113,6340,1450,0.0,23.9,0.0,7,1.8


### Change column names


In [272]:
data.columns = data.columns.str.replace('Unnamed: 0', 'Sections')
data.columns

Index(['Sections', 'year', 'Para-1', 'Para-2', 'Para-3', 'Para-4', 'Para-5',
       'Para-6', 'Para-7', 'Para-8', 'Para-9', 'Para-10', 'Para-11', 'Para-12',
       'Para-13'],
      dtype='object')

### Populate "Sections" column

**Checking if every section contains data for 10 years**

In [273]:
data['year'].unique()

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10])

In [274]:
missing_rows = []
i = 0
while i < len(data):
  section_name = data.iloc[i][0] if data.iloc[i][0] else i
  count = 0
  missing_entries=0

  while count < 10:
    if data.iloc[i+count][1] != count+1:
      missing_entries = missing_entries + 1
      missing_rows.append(i+count)
    count = count + 1
  i = i + 10 - missing_entries

**Since we can not remove a section altogether and we only have 1 missing row, lets duplicate 9th year record for the 10th one**




In [275]:
missing_rows

[919]

In [276]:
for i in missing_rows:
  new_idx = i - 0.5
  dup_rec_idx = i - 1
  data.loc[new_idx] = data.loc[dup_rec_idx]
  data.at[new_idx, 'year'] = i % 10 + 1
  data = data.sort_index().reset_index(drop=True)
data

Unnamed: 0,Sections,year,Para-1,Para-2,Para-3,Para-4,Para-5,Para-6,Para-7,Para-8,Para-9,Para-10,Para-11,Para-12,Para-13
0,Sec_1,1,324.0,354.5,17.0,0.0,3,117,2600,400,0.0,0.0,0.0,0,1.520
1,,2,324.0,161.1,18.2,0.0,4,106,5950,1190,0.0,3.4,0.0,0,1.620
2,,3,324.0,170.7,18.5,0.0,4,110,5950,1190,0.0,21.3,0.0,4,1.680
3,,4,324.0,223.9,18.9,0.0,3,110,6150,1340,0.0,21.3,0.0,5,1.780
4,,5,324.0,228.2,18.4,0.0,2,113,6340,1450,0.0,23.9,0.0,7,1.800
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1005,,6,454.0,931.8,13.6,125.0,13,116,4150,830,15.1,1.8,1.0,15,1.681
1006,,7,454.0,703.0,14.5,17.0,13,118,4150,830,24.6,7.9,2.0,15,1.838
1007,,8,454.0,1124.1,14.9,45.0,18,117,4200,840,119.0,15.7,2.0,16,1.862
1008,,9,454.0,1462.3,13.2,62.0,27,117,4400,720,138.2,30.5,6.0,18,1.883


In [277]:
# Populating Sections column with values
j = 0
for i in range(int(len(data)/10)):
    data.iloc[j:j+10, 0] = data.iloc[j, 0]
    j = j + 10
data

Unnamed: 0,Sections,year,Para-1,Para-2,Para-3,Para-4,Para-5,Para-6,Para-7,Para-8,Para-9,Para-10,Para-11,Para-12,Para-13
0,Sec_1,1,324.0,354.5,17.0,0.0,3,117,2600,400,0.0,0.0,0.0,0,1.520
1,Sec_1,2,324.0,161.1,18.2,0.0,4,106,5950,1190,0.0,3.4,0.0,0,1.620
2,Sec_1,3,324.0,170.7,18.5,0.0,4,110,5950,1190,0.0,21.3,0.0,4,1.680
3,Sec_1,4,324.0,223.9,18.9,0.0,3,110,6150,1340,0.0,21.3,0.0,5,1.780
4,Sec_1,5,324.0,228.2,18.4,0.0,2,113,6340,1450,0.0,23.9,0.0,7,1.800
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1005,Sec_101,6,454.0,931.8,13.6,125.0,13,116,4150,830,15.1,1.8,1.0,15,1.681
1006,Sec_101,7,454.0,703.0,14.5,17.0,13,118,4150,830,24.6,7.9,2.0,15,1.838
1007,Sec_101,8,454.0,1124.1,14.9,45.0,18,117,4200,840,119.0,15.7,2.0,16,1.862
1008,Sec_101,9,454.0,1462.3,13.2,62.0,27,117,4400,720,138.2,30.5,6.0,18,1.883


### Handle Nan Values


In [278]:
data.isnull().any()

Sections    False
year        False
Para-1      False
Para-2      False
Para-3      False
Para-4      False
Para-5      False
Para-6      False
Para-7      False
Para-8      False
Para-9      False
Para-10     False
Para-11     False
Para-12     False
Para-13      True
dtype: bool

In [279]:
data[data['Para-13'].isnull()]

Unnamed: 0,Sections,year,Para-1,Para-2,Para-3,Para-4,Para-5,Para-6,Para-7,Para-8,Para-9,Para-10,Para-11,Para-12,Para-13
190,Sec_20,1,454.6,145.6,21.7,0.0,3,106,3650,730,0.0,0.0,0.0,0,
260,Sec_27,1,563.9,336.6,21.3,0.0,4,106,6800,1600,0.0,0.0,0.0,0,


In [280]:
data['Para-13'] = data['Para-13'].fillna(0)
data['Para-13'].isnull().any()

False

### Train - Test split

In [281]:
train_data = data[data.iloc[: , 1] != 10]
test_data = data[data.iloc[:, 1] == 10]

## Training

In [283]:
models = []
for i in range(9, 14):
  this_data = train_data.iloc[:, 0:10]
  this_data[f'Para-{i}'] = train_data[f'Para-{i}']
  s = setup(this_data, target=f'Para-{i}')
  best = compare_models(sort='RMSE', n_select=3)
  for model in best:
    models.append(tune_model(model))

Unnamed: 0,Description,Value
0,Session id,7853
1,Target,Para-9
2,Target type,Regression
3,Data shape,"(909, 11)"
4,Train data shape,"(636, 11)"
5,Test data shape,"(273, 11)"
6,Numeric features,9
7,Categorical features,1
8,Preprocess,True
9,Imputation type,simple


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
gbr,Gradient Boosting Regressor,12.0818,665.3203,25.1692,0.8193,1.1649,2.2484,0.153
rf,Random Forest Regressor,11.1715,744.876,26.0918,0.8264,0.9221,1.5227,0.158
et,Extra Trees Regressor,11.4206,749.9041,26.1927,0.8261,0.9414,1.7252,0.271
lightgbm,Light Gradient Boosting Machine,13.6743,923.4773,29.2674,0.7811,1.2228,2.3581,0.16
ada,AdaBoost Regressor,29.5827,1505.3387,38.3556,0.5938,2.487,9.7322,0.132
dt,Decision Tree Regressor,15.4355,1727.7947,39.1767,0.5991,1.0307,1.7257,0.048
en,Elastic Net,27.7414,1976.406,43.1953,0.5276,2.2445,6.7848,0.046
br,Bayesian Ridge,27.8816,1976.764,43.215,0.5269,2.2534,6.8696,0.045
lasso,Lasso Regression,28.3341,1977.3773,43.2679,0.5247,2.2804,7.0358,0.109
ridge,Ridge Regression,28.4863,1978.0846,43.2947,0.5237,2.2901,7.1442,0.046


Processing:   0%|          | 0/79 [00:00<?, ?it/s]

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,17.1544,1105.6456,33.2513,0.8929,1.4775,1.7313
1,15.9805,1183.571,34.4031,0.7528,1.4802,3.5168
2,15.1171,720.1226,26.8351,0.8035,1.4262,2.0238
3,23.1476,1805.0815,42.4863,0.7654,1.6951,4.553
4,19.6028,1431.7621,37.8386,0.7588,1.5687,3.902
5,12.6317,600.1645,24.4983,0.6081,1.5261,9.883
6,18.0627,909.609,30.1597,0.658,1.6299,5.6963
7,15.5262,686.3303,26.1979,0.8266,1.6911,4.3904
8,13.9391,484.7309,22.0166,0.831,1.6073,3.1483
9,9.8311,277.6285,16.6622,0.8491,1.3603,1.4949


Processing:   0%|          | 0/7 [00:00<?, ?it/s]

Fitting 10 folds for each of 10 candidates, totalling 100 fits


Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,18.9163,2207.011,46.9788,0.7862,1.2707,1.182
1,13.6581,852.1236,29.1912,0.822,1.3805,2.4289
2,12.1742,731.7119,27.0502,0.8003,1.21,1.3576
3,20.2251,1892.8392,43.5068,0.754,1.272,2.835
4,16.0504,1310.9075,36.2065,0.7791,1.2693,2.6044
5,10.1048,501.7428,22.3996,0.6724,1.3189,3.6537
6,10.3228,394.4432,19.8606,0.8517,1.3124,5.0216
7,12.2518,598.7345,24.4691,0.8487,1.4853,3.2264
8,12.887,692.6341,26.3179,0.7585,1.4349,2.9618
9,10.1608,301.6186,17.3672,0.8361,1.4366,1.612


Processing:   0%|          | 0/7 [00:00<?, ?it/s]

Fitting 10 folds for each of 10 candidates, totalling 100 fits


Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,15.6893,1866.7004,43.2053,0.8191,0.8983,1.0139
1,11.1764,750.2286,27.3903,0.8433,0.8997,0.9061
2,9.3826,510.5265,22.5948,0.8607,0.8939,0.6694
3,15.918,1145.5412,33.8458,0.8511,1.0726,2.467
4,12.7493,778.4148,27.9001,0.8689,0.9037,2.5555
5,8.0804,332.948,18.2469,0.7826,0.8491,1.6832
6,9.3114,348.4925,18.668,0.869,0.8434,1.2483
7,10.0736,471.1355,21.7057,0.881,1.0657,1.372
8,11.072,589.4107,24.2778,0.7945,1.0417,2.2263
9,9.9309,408.5692,20.2131,0.778,0.9373,1.6138


Processing:   0%|          | 0/7 [00:00<?, ?it/s]

Fitting 10 folds for each of 10 candidates, totalling 100 fits


Unnamed: 0,Description,Value
0,Session id,4637
1,Target,Para-10
2,Target type,Regression
3,Data shape,"(909, 11)"
4,Train data shape,"(636, 11)"
5,Test data shape,"(273, 11)"
6,Numeric features,9
7,Categorical features,1
8,Preprocess,True
9,Imputation type,simple


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
et,Extra Trees Regressor,21.4064,1265.6644,34.9,0.8678,1.1715,0.7005,0.273
lightgbm,Light Gradient Boosting Machine,22.2043,1310.2654,35.2711,0.8666,1.2404,0.7467,0.091
gbr,Gradient Boosting Regressor,23.5792,1301.5108,35.4993,0.8624,1.3243,0.8907,0.156
rf,Random Forest Regressor,22.1033,1386.0053,36.2994,0.8555,1.1894,0.7366,0.186
br,Bayesian Ridge,29.092,1758.8173,41.3762,0.8187,1.7302,1.666,0.048
lasso,Lasso Regression,29.1714,1758.4328,41.3824,0.8186,1.7334,1.6814,0.077
en,Elastic Net,28.6928,1764.41,41.3967,0.8184,1.7113,1.5886,0.047
ridge,Ridge Regression,29.2775,1759.0721,41.3983,0.8186,1.7359,1.6969,0.047
lr,Linear Regression,29.28,1759.086,41.3987,0.8186,1.736,1.6973,0.268
lar,Least Angle Regression,29.3826,1764.6645,41.4624,0.8181,1.7339,1.7067,0.048


Processing:   0%|          | 0/79 [00:00<?, ?it/s]

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,16.5421,533.7371,23.1028,0.9249,1.1157,1.0608
1,19.0547,710.9129,26.663,0.9209,1.3521,0.5442
2,26.3923,2220.2895,47.1199,0.8235,1.199,0.6756
3,17.8115,636.0886,25.2208,0.9179,1.354,0.3954
4,22.4855,1713.5762,41.3954,0.865,1.2807,1.4193
5,29.5497,2197.0571,46.8728,0.7281,1.3994,1.0511
6,23.5167,1547.9641,39.3442,0.8174,1.3018,0.8291
7,28.4194,1796.0582,42.3799,0.8741,1.388,0.6791
8,25.3159,1948.161,44.138,0.7422,1.7743,1.0466
9,19.5335,1099.6262,33.1606,0.9139,1.1007,0.46


Processing:   0%|          | 0/7 [00:00<?, ?it/s]

Fitting 10 folds for each of 10 candidates, totalling 100 fits


Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,13.8223,470.0644,21.681,0.9339,1.0863,0.5818
1,17.7297,805.786,28.3864,0.9103,1.2675,0.4133
2,22.5458,1506.7418,38.8168,0.8802,1.1023,0.5061
3,16.5388,671.5118,25.9135,0.9133,1.2545,0.4078
4,21.8926,1431.4905,37.835,0.8873,1.1228,1.2179
5,29.9017,2033.5952,45.0954,0.7484,1.3003,0.7176
6,23.6352,1431.121,37.8302,0.8311,1.4052,0.8704
7,30.527,1958.2184,44.2518,0.8627,1.4105,0.7466
8,26.7018,1816.4061,42.6193,0.7597,1.6052,1.1403
9,19.182,1278.6095,35.7576,0.8998,0.9199,0.3734


Processing:   0%|          | 0/7 [00:00<?, ?it/s]

Fitting 10 folds for each of 10 candidates, totalling 100 fits


Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,21.1451,728.236,26.9858,0.8976,1.2085,1.993
1,19.0578,674.765,25.9762,0.9249,1.461,0.6444
2,25.3339,1656.1507,40.6958,0.8684,1.3637,0.9338
3,21.0896,708.301,26.6139,0.9085,1.5505,0.7883
4,25.6994,1532.1192,39.1423,0.8793,1.4834,1.688
5,31.4654,2192.6308,46.8255,0.7287,1.4434,1.6184
6,26.4873,1610.4885,40.1309,0.81,1.4489,1.1291
7,25.6303,1535.6315,39.1871,0.8923,1.4067,0.9621
8,26.5678,1923.6486,43.8594,0.7455,1.8823,0.8752
9,21.2487,1115.7786,33.4033,0.9126,1.244,0.4367


Processing:   0%|          | 0/7 [00:00<?, ?it/s]

Fitting 10 folds for each of 10 candidates, totalling 100 fits


Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


Unnamed: 0,Description,Value
0,Session id,3495
1,Target,Para-11
2,Target type,Regression
3,Data shape,"(909, 11)"
4,Train data shape,"(636, 11)"
5,Test data shape,"(273, 11)"
6,Numeric features,9
7,Categorical features,1
8,Preprocess,True
9,Imputation type,simple


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
lightgbm,Light Gradient Boosting Machine,7.979,247.5705,15.1671,0.777,0.8784,0.8075,0.09
et,Extra Trees Regressor,7.2166,248.7033,15.2018,0.7753,0.7877,0.6275,0.267
rf,Random Forest Regressor,7.5489,259.0965,15.3613,0.7594,0.8017,0.7017,0.169
gbr,Gradient Boosting Regressor,8.1694,261.2634,15.4104,0.7669,0.9505,0.8051,0.155
br,Bayesian Ridge,11.2123,343.4391,18.0238,0.6974,1.3484,1.4349,0.046
en,Elastic Net,11.0552,343.8937,18.024,0.6978,1.326,1.3896,0.046
lasso,Lasso Regression,11.2594,343.5363,18.0305,0.6971,1.3546,1.4509,0.074
ridge,Ridge Regression,11.3732,343.6192,18.0407,0.6964,1.3653,1.4809,0.045
lar,Least Angle Regression,11.3743,343.622,18.0409,0.6964,1.3654,1.4812,0.048
lr,Linear Regression,11.3743,343.622,18.0409,0.6964,1.3654,1.4812,0.269


Processing:   0%|          | 0/79 [00:00<?, ?it/s]

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,11.7751,614.4819,24.7887,0.7092,0.771,0.6214
1,9.6449,225.1638,15.0055,0.767,0.8154,0.7499
2,10.3829,226.9521,15.0649,0.87,0.9032,0.8542
3,9.5407,562.3933,23.7148,0.5533,1.2673,0.6883
4,7.815,150.2612,12.2581,0.8043,1.1944,0.6344
5,6.9475,205.1376,14.3226,0.8672,0.8827,0.4873
6,6.6817,126.7223,11.2571,0.8147,0.9425,1.7415
7,9.5722,252.7775,15.899,0.5193,1.3015,1.1994
8,8.3287,215.2561,14.6716,0.8317,0.9591,0.5832
9,8.0106,175.4157,13.2445,0.8231,1.0828,1.1221


Processing:   0%|          | 0/7 [00:00<?, ?it/s]

Fitting 10 folds for each of 10 candidates, totalling 100 fits


Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,9.7199,599.8531,24.4919,0.7161,0.6425,0.3208
1,9.3081,265.0089,16.2791,0.7258,0.6421,0.5353
2,10.0405,295.3847,17.1868,0.8308,0.6822,0.3842
3,8.5491,722.1992,26.8738,0.4264,1.028,0.4947
4,6.715,123.0038,11.0907,0.8398,1.0754,0.4462
5,7.5507,240.7635,15.5166,0.8441,0.7561,0.729
6,5.0938,111.8168,10.5743,0.8365,0.7481,0.4439
7,7.7277,188.9803,13.747,0.6406,1.2039,0.7003
8,7.1145,255.4904,15.9841,0.8002,0.6744,0.2524
9,6.7655,145.7071,12.0709,0.853,0.7987,0.7859


Processing:   0%|          | 0/7 [00:00<?, ?it/s]

Fitting 10 folds for each of 10 candidates, totalling 100 fits


Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,9.803,592.7558,24.3466,0.7194,0.6531,0.4171
1,8.8563,233.6912,15.287,0.7582,0.6839,0.4949
2,9.2208,266.3466,16.3201,0.8474,0.7085,0.3864
3,8.9659,781.825,27.9611,0.379,1.0,0.5443
4,6.4408,120.296,10.968,0.8433,0.9559,0.5458
5,7.6128,277.4349,16.6564,0.8203,0.7142,0.7798
6,4.5536,92.0186,9.5926,0.8655,0.7516,0.4313
7,7.242,161.6813,12.7154,0.6925,1.2082,0.7911
8,7.217,236.7587,15.387,0.8148,0.7019,0.3265
9,5.8777,108.6509,10.4236,0.8904,0.7241,0.7944


Processing:   0%|          | 0/7 [00:00<?, ?it/s]

Fitting 10 folds for each of 10 candidates, totalling 100 fits


Unnamed: 0,Description,Value
0,Session id,481
1,Target,Para-12
2,Target type,Regression
3,Data shape,"(909, 11)"
4,Train data shape,"(636, 11)"
5,Test data shape,"(273, 11)"
6,Numeric features,9
7,Categorical features,1
8,Preprocess,True
9,Imputation type,simple


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
et,Extra Trees Regressor,1.7471,18.5423,3.842,0.6006,0.3794,0.2479,0.279
gbr,Gradient Boosting Regressor,1.7861,20.3741,3.9693,0.564,0.4034,0.2543,0.16
rf,Random Forest Regressor,1.734,20.952,4.0292,0.5258,0.3897,0.2367,0.17
lightgbm,Light Gradient Boosting Machine,1.8759,21.7873,4.1175,0.556,0.4033,0.266,0.094
br,Bayesian Ridge,2.0056,23.7792,4.4044,0.4649,0.4336,0.3169,0.047
lr,Linear Regression,2.0216,23.7675,4.4046,0.4626,0.4344,0.3233,0.284
ridge,Ridge Regression,2.0215,23.7676,4.4046,0.4626,0.4344,0.3232,0.049
lar,Least Angle Regression,2.0216,23.7675,4.4046,0.4626,0.4344,0.3233,0.049
en,Elastic Net,1.9967,23.9769,4.4174,0.4676,0.4347,0.3093,0.047
lasso,Lasso Regression,2.0011,24.0344,4.4225,0.4677,0.436,0.3096,0.078


Processing:   0%|          | 0/79 [00:00<?, ?it/s]

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,1.6792,8.8096,2.9681,0.3693,0.3723,0.2951
1,1.3092,3.9266,1.9816,0.8285,0.474,0.2438
2,2.4533,87.1259,9.3341,0.3963,0.4295,0.2606
3,1.4885,5.874,2.4236,0.7272,0.4867,0.2016
4,2.0442,41.6725,6.4554,0.2746,0.4346,0.2361
5,1.5165,13.5008,3.6743,0.6081,0.2458,0.2352
6,2.2241,13.1034,3.6199,0.5253,0.5914,0.3894
7,1.5665,7.0559,2.6563,0.7573,0.332,0.3974
8,1.5119,4.4921,2.1195,0.8387,0.2203,0.2153
9,2.116,14.0356,3.7464,0.6896,0.4954,0.2884


Processing:   0%|          | 0/7 [00:00<?, ?it/s]

Fitting 10 folds for each of 10 candidates, totalling 100 fits


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,1.623,7.5889,2.7548,0.4567,0.3652,0.2861
1,1.6348,12.5913,3.5484,0.4502,0.5452,0.2402
2,2.7156,85.6315,9.2537,0.4067,0.4776,0.274
3,1.5007,5.9192,2.433,0.7251,0.536,0.1786
4,1.9542,41.2512,6.4227,0.2819,0.4275,0.2227
5,1.4657,12.6697,3.5594,0.6323,0.2466,0.2325
6,2.2374,14.3067,3.7824,0.4817,0.6019,0.3691
7,1.5139,6.1847,2.4869,0.7873,0.3171,0.373
8,1.4255,4.0949,2.0236,0.8529,0.1936,0.1922
9,1.8389,13.5461,3.6805,0.7004,0.5166,0.2441


Processing:   0%|          | 0/7 [00:00<?, ?it/s]

Fitting 10 folds for each of 10 candidates, totalling 100 fits


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,1.8547,7.4754,2.7341,0.4648,0.411,0.3596
1,1.6409,6.3958,2.529,0.7207,0.5427,0.301
2,3.1454,107.7835,10.3819,0.2532,0.5043,0.355
3,1.6853,8.3992,2.8981,0.6099,0.5207,0.2128
4,2.4329,43.894,6.6253,0.2359,0.4862,0.3361
5,1.9733,17.9688,4.239,0.4785,0.3245,0.333
6,2.7433,16.4554,4.0565,0.4039,0.624,0.4503
7,1.9676,11.7681,3.4305,0.5952,0.3629,0.4747
8,2.0301,8.9636,2.9939,0.678,0.2754,0.2553
9,2.5162,20.4019,4.5168,0.5488,0.5361,0.3773


Processing:   0%|          | 0/7 [00:00<?, ?it/s]

Fitting 10 folds for each of 10 candidates, totalling 100 fits


Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


Unnamed: 0,Description,Value
0,Session id,3167
1,Target,Para-13
2,Target type,Regression
3,Data shape,"(909, 11)"
4,Train data shape,"(636, 11)"
5,Test data shape,"(273, 11)"
6,Numeric features,9
7,Categorical features,1
8,Preprocess,True
9,Imputation type,simple


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
et,Extra Trees Regressor,0.1202,0.1845,0.3025,0.783,0.1015,0.0933,0.274
lr,Linear Regression,0.1485,0.1919,0.3223,0.7511,0.1077,0.1161,0.274
lar,Least Angle Regression,0.1485,0.1919,0.3223,0.7511,0.1077,0.1161,0.049
lightgbm,Light Gradient Boosting Machine,0.1503,0.1922,0.3227,0.7554,0.1077,0.1213,0.093
ridge,Ridge Regression,0.1495,0.192,0.3228,0.7506,0.108,0.117,0.048
br,Bayesian Ridge,0.1517,0.1925,0.3241,0.7493,0.1087,0.1191,0.047
dt,Decision Tree Regressor,0.1473,0.2015,0.3381,0.7254,0.1176,0.1117,0.051
rf,Random Forest Regressor,0.1373,0.206,0.3425,0.7142,0.1114,0.102,0.202
omp,Orthogonal Matching Pursuit,0.182,0.2139,0.3621,0.6837,0.1243,0.1449,0.046
ada,AdaBoost Regressor,0.2085,0.2058,0.363,0.6878,0.1309,0.1819,0.127


Processing:   0%|          | 0/79 [00:00<?, ?it/s]

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.1592,0.0636,0.2523,0.8194,0.1209,0.1264
1,0.2461,1.4432,1.2013,0.1345,0.2324,0.0811
2,0.1271,0.0438,0.2093,0.8607,0.0821,0.0986
3,0.1182,0.0353,0.1878,0.8983,0.0901,0.1449
4,0.1041,0.0433,0.208,0.7894,0.0806,0.0899
5,0.1285,0.055,0.2346,0.8071,0.0898,0.1023
6,0.1173,0.0407,0.2016,0.8093,0.0822,0.1005
7,0.1407,0.0526,0.2294,0.8627,0.1058,0.1136
8,0.1511,0.0636,0.2521,0.8143,0.1046,0.1181
9,0.0908,0.0173,0.1315,0.9434,0.0581,0.088


Processing:   0%|          | 0/7 [00:00<?, ?it/s]

Fitting 10 folds for each of 10 candidates, totalling 100 fits


Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.1415,0.0761,0.2759,0.784,0.1242,0.1129
1,0.2587,1.4414,1.2006,0.1356,0.233,0.096
2,0.1401,0.0443,0.2104,0.8591,0.0828,0.1107
3,0.1332,0.0363,0.1904,0.8954,0.0935,0.1601
4,0.1276,0.0413,0.2033,0.7988,0.085,0.1141
5,0.1456,0.0724,0.2692,0.7461,0.0999,0.1178
6,0.1242,0.0611,0.2471,0.7135,0.0944,0.1099
7,0.1265,0.0368,0.1918,0.904,0.0878,0.0996
8,0.1893,0.0925,0.3042,0.7297,0.1205,0.1457
9,0.098,0.017,0.1304,0.9443,0.0562,0.0939


Processing:   0%|          | 0/7 [00:00<?, ?it/s]

Fitting 10 folds for each of 4 candidates, totalling 40 fits


Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.1415,0.0761,0.2759,0.784,0.1242,0.1129
1,0.2587,1.4414,1.2006,0.1356,0.233,0.096
2,0.1401,0.0443,0.2104,0.8591,0.0828,0.1107
3,0.1332,0.0363,0.1904,0.8954,0.0935,0.1601
4,0.1276,0.0413,0.2033,0.7988,0.085,0.1141
5,0.1456,0.0724,0.2692,0.7461,0.0999,0.1178
6,0.1242,0.0611,0.2471,0.7135,0.0944,0.1099
7,0.1265,0.0368,0.1918,0.904,0.0878,0.0996
8,0.1893,0.0925,0.3042,0.7297,0.1205,0.1457
9,0.098,0.017,0.1304,0.9443,0.0562,0.0939


Processing:   0%|          | 0/7 [00:00<?, ?it/s]

Fitting 10 folds for each of 10 candidates, totalling 100 fits


Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


In [None]:
# models_rmse_table contains total square deviation of different models for different "Para"
models_rmse_table = pd.DataFrame(range(len(models)), columns=['models']).set_index('models')

for i in range(10,14):
  this_data = train_data.iloc[:, 0:10]
  this_data[f'Para-{i}'] = train_data[f'Para-{i}']
  s = setup(this_data, target=f'Para-{i}')
  this_rmse = []
  for model in models:
    prediction = predict_model(model, test_data.iloc[:, 0:10])['prediction_label']
    this_rmse.append(sum((prediction - test_data[f'Para-{i}'].reset_index(drop=True))**2))
  models_rmse_table[f'Para-{i}'] = this_rmse
models_rmse_table

In [297]:
final_rmse = []
for i in range(len(models_rmse_table)):
  final_rmse.append((sum(models_rmse_table.iloc[i, :])/(5*len(test_data)))**0.5)
final_rmse

[47.34033208766921,
 41.575826924482826,
 46.78586265223186,
 37.37720719042681,
 38.07211707902503,
 39.54677483196249,
 52.05672842033515,
 52.888743458301406,
 57.89908013801939,
 87.95396232820195,
 88.29155131471478,
 88.3731568951671,
 96.40139430239479,
 57.18324587866645,
 57.18324587866625]

In [298]:
min_idx = final_rmse.index(min(final_rmse))
print(models[min_idx], 'with RMSE = ', final_rmse[min_idx])

ExtraTreesRegressor(n_jobs=-1, random_state=4637) with RMSE =  37.37720719042681
