In [1]:
from pycaret.regression import *

In [2]:
# importing dependecies
import pandas as pd
import numpy as np

In [3]:
def manipulate_data(dataset):


    # reading the data
    data = pd .read_csv(dataset)

    # convert hour to string
    data['hour'] = data.hour.astype('str')

    # create newdatetime data columns
    data["newdatetime"] = data['date'] + ' ' + data['hour'] + ':00'

    # convert the newdatetime column to data time
    data["newdatetime"] = pd.to_datetime(data["newdatetime"])

    # feature enginering
    data["hour_of_day"] = data["newdatetime"].dt.hour
    data["day_of_month"] = data["newdatetime"].dt.day
    data["month_of_year"] = data["newdatetime"].dt.month
    data["year"] = data["newdatetime"].dt.year
    data['quarter_of_year'] = data["newdatetime"].dt.quarter
    data['week_of_year'] = data["newdatetime"].dt.week
    data['day_of_year'] = data["newdatetime"].dt.dayofyear
    data['day_of_week'] = data["newdatetime"].dt.dayofweek

    # drop not needed columns 
    data.drop(["date", "hour", "newdatetime"], axis = 1, inplace = True)

    return data


In [4]:
df = manipulate_data("../data/train_E1GspfA.csv")
print(df.shape)

(18247, 9)


In [5]:
df.head(5)

Unnamed: 0,demand,hour_of_day,day_of_month,month_of_year,year,quarter_of_year,week_of_year,day_of_year,day_of_week
0,91,9,18,8,2018,3,33,230,5
1,21,10,18,8,2018,3,33,230,5
2,23,13,18,8,2018,3,33,230,5
3,104,14,18,8,2018,3,33,230,5
4,81,15,18,8,2018,3,33,230,5


In [6]:
exp_reg101 = setup(data = df, target = 'demand', session_id=123) 

Unnamed: 0,Description,Value
0,session_id,123
1,Target,demand
2,Original Data,"(18247, 9)"
3,Missing Values,False
4,Numeric Features,4
5,Categorical Features,4
6,Ordinal Features,False
7,High Cardinality Features,False
8,High Cardinality Method,
9,Transformed Train Set,"(12772, 31)"


In [7]:
compare_models()

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
lightgbm,Light Gradient Boosting Machine,27.0081,1036.4565,32.1924,0.4013,0.6125,0.7915,0.072
rf,Random Forest Regressor,27.941,1128.4264,33.5895,0.3483,0.6273,0.7996,0.895
gbr,Gradient Boosting Regressor,28.0735,1167.0421,34.1587,0.3261,0.6254,0.8189,0.331
knn,K Neighbors Regressor,28.5838,1203.4343,34.6858,0.3051,0.6384,0.815,0.039
et,Extra Trees Regressor,29.1813,1258.0167,35.466,0.2729,0.6526,0.8195,1.152
ada,AdaBoost Regressor,29.9445,1361.1729,36.889,0.2137,0.6728,0.9453,0.149
lr,Linear Regression,30.8858,1543.4053,39.2797,0.1091,0.6731,0.9283,0.372
ridge,Ridge Regression,30.8964,1543.8325,39.2852,0.1088,0.6733,0.9286,0.021
br,Bayesian Ridge,30.8838,1543.8172,39.285,0.1088,0.6733,0.9288,0.035
huber,Huber Regressor,30.8071,1560.3063,39.4942,0.0994,0.6679,0.9058,0.26


LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
              importance_type='split', learning_rate=0.1, max_depth=-1,
              min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
              n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
              random_state=123, reg_alpha=0.0, reg_lambda=0.0, silent='warn',
              subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [15]:
lightgbm = create_model('lightgbm')

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,26.6092,1023.1748,31.9871,0.3932,0.5941,0.7292
1,27.5629,1056.3429,32.5014,0.4135,0.5892,0.6847
2,27.3905,1033.0898,32.1417,0.3804,0.6319,0.8305
3,27.3276,1039.6028,32.2429,0.4375,0.6166,0.7944
4,26.9402,1021.5798,31.9622,0.4062,0.6389,0.8941
5,26.4364,992.7044,31.5072,0.3862,0.5983,0.7502
6,26.7823,1028.7644,32.0744,0.4203,0.6082,0.7828
7,26.835,1067.9816,32.68,0.4009,0.6182,0.7997
8,27.0023,1040.6168,32.2586,0.3934,0.6228,0.8652
9,27.1943,1060.7082,32.5685,0.3817,0.6066,0.7839


In [17]:
lightgbm_f10 = create_model('lightgbm', fold=10)

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,26.6092,1023.1748,31.9871,0.3932,0.5941,0.7292
1,27.5629,1056.3429,32.5014,0.4135,0.5892,0.6847
2,27.3905,1033.0898,32.1417,0.3804,0.6319,0.8305
3,27.3276,1039.6028,32.2429,0.4375,0.6166,0.7944
4,26.9402,1021.5798,31.9622,0.4062,0.6389,0.8941
5,26.4364,992.7044,31.5072,0.3862,0.5983,0.7502
6,26.7823,1028.7644,32.0744,0.4203,0.6082,0.7828
7,26.835,1067.9816,32.68,0.4009,0.6182,0.7997
8,27.0023,1040.6168,32.2586,0.3934,0.6228,0.8652
9,27.1943,1060.7082,32.5685,0.3817,0.6066,0.7839


In [18]:
print(lightgbm_f10)

LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
              importance_type='split', learning_rate=0.1, max_depth=-1,
              min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
              n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
              random_state=123, reg_alpha=0.0, reg_lambda=0.0, silent='warn',
              subsample=1.0, subsample_for_bin=200000, subsample_freq=0)


In [19]:
tuned_lightgbm = tune_model(lightgbm_f10)

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,26.5651,1044.1731,32.3137,0.3808,0.598,0.7346
1,27.9944,1111.7017,33.3422,0.3827,0.5968,0.6932
2,27.673,1068.3165,32.6851,0.3592,0.6345,0.8247
3,27.6724,1090.0672,33.0162,0.4102,0.619,0.7893
4,27.5392,1084.5625,32.9327,0.3696,0.641,0.8825
5,27.1195,1059.3681,32.5479,0.345,0.6107,0.7809
6,27.4943,1092.178,33.0481,0.3845,0.6174,0.796
7,27.5448,1131.662,33.6402,0.3651,0.6248,0.7993
8,27.4951,1096.0955,33.1073,0.3611,0.6318,0.8687
9,27.8673,1121.6905,33.4916,0.3461,0.6072,0.7649


In [20]:
tuned_lightgbm_iter10 = tune_model(lightgbm_f10, n_iter=10)

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,26.5651,1044.1731,32.3137,0.3808,0.598,0.7346
1,27.9944,1111.7017,33.3422,0.3827,0.5968,0.6932
2,27.673,1068.3165,32.6851,0.3592,0.6345,0.8247
3,27.6724,1090.0672,33.0162,0.4102,0.619,0.7893
4,27.5392,1084.5625,32.9327,0.3696,0.641,0.8825
5,27.1195,1059.3681,32.5479,0.345,0.6107,0.7809
6,27.4943,1092.178,33.0481,0.3845,0.6174,0.796
7,27.5448,1131.662,33.6402,0.3651,0.6248,0.7993
8,27.4951,1096.0955,33.1073,0.3611,0.6318,0.8687
9,27.8673,1121.6905,33.4916,0.3461,0.6072,0.7649


In [21]:
tuned_lightgbm_iter100 = tune_model(lightgbm_f10, n_iter=100)

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,26.5125,1033.5083,32.1482,0.3871,0.6016,0.7185
1,28.0135,1108.2291,33.2901,0.3846,0.5963,0.688
2,27.5471,1050.3094,32.4085,0.37,0.634,0.8218
3,27.8941,1098.8633,33.1491,0.4054,0.6229,0.7896
4,27.2857,1062.4251,32.5949,0.3825,0.6446,0.8862
5,27.0066,1054.8866,32.479,0.3478,0.6087,0.7724
6,27.2329,1065.5903,32.6434,0.3995,0.6223,0.791
7,27.1272,1095.581,33.0996,0.3854,0.6218,0.7909
8,27.3115,1079.0867,32.8495,0.371,0.6289,0.8536
9,27.4252,1079.0439,32.8488,0.371,0.6046,0.7605


In [22]:
tuned_lightgbm_iter100

LGBMRegressor(bagging_fraction=0.9, bagging_freq=2, boosting_type='gbdt',
              class_weight=None, colsample_bytree=1.0, feature_fraction=0.5,
              importance_type='split', learning_rate=0.1, max_depth=-1,
              min_child_samples=61, min_child_weight=0.001, min_split_gain=0.7,
              n_estimators=290, n_jobs=-1, num_leaves=40, objective=None,
              random_state=123, reg_alpha=0.2, reg_lambda=0.2, silent='warn',
              subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [23]:
models()

Unnamed: 0_level_0,Name,Reference,Turbo
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
lr,Linear Regression,sklearn.linear_model._base.LinearRegression,True
lasso,Lasso Regression,sklearn.linear_model._coordinate_descent.Lasso,True
ridge,Ridge Regression,sklearn.linear_model._ridge.Ridge,True
en,Elastic Net,sklearn.linear_model._coordinate_descent.Elast...,True
lar,Least Angle Regression,sklearn.linear_model._least_angle.Lars,True
llar,Lasso Least Angle Regression,sklearn.linear_model._least_angle.LassoLars,True
omp,Orthogonal Matching Pursuit,sklearn.linear_model._omp.OrthogonalMatchingPu...,True
br,Bayesian Ridge,sklearn.linear_model._bayes.BayesianRidge,True
ard,Automatic Relevance Determination,sklearn.linear_model._bayes.ARDRegression,False
par,Passive Aggressive Regressor,sklearn.linear_model._passive_aggressive.Passi...,True


In [24]:
xgboost = create_model('xgboost', max_depth = 10)

ValueError: Estimator xgboost not available. Please see docstring for list of available estimators.