In [24]:
from sklearn.linear_model import LinearRegression
import pandas as pd
import numpy as np 
import seaborn as sns 
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_predict, cross_val_score, GridSearchCV, ParameterGrid, KFold
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.metrics import accuracy_score, mean_absolute_error, mean_squared_error, r2_score, \
roc_curve, auc, precision_score, recall_score, confusion_matrix
import statsmodels.formula.api as smf
import statsmodels.api as sm
from patsy import dmatrix
#from pyearth import Earth
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import BaggingRegressor,BaggingClassifier,RandomForestRegressor,RandomForestClassifier
from sklearn.neighbors import KNeighborsRegressor
import warnings
warnings.filterwarnings('ignore')
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
import time
from sklearn.model_selection import RandomizedSearchCV

In [3]:
X = pd.read_csv('X.csv')
y = pd.read_csv('y.csv')

In [6]:
X.drop('Unnamed: 0', axis = 1, inplace = True)

In [45]:
y.drop('Unnamed: 0', axis = 1, inplace = True)

In [46]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 4)

# XGBoost

In [47]:
start_time = time.time()
param_grid = {'max_depth': [8],
              'learning_rate': [0.08,0.09,0.1],
               'reg_lambda':[0.09],
                'n_estimators':[510,515,520,530],
                'gamma': [0.1,0.2],
                'subsample': [0.75,0.8],
                'colsample_bytree': [0.75,0.8]}

cv = KFold(n_splits=5,shuffle=True,random_state=4)
optimal_params = GridSearchCV(estimator=xgb.XGBRegressor(random_state=4),                                                       
                             param_grid = param_grid,
                             verbose = 1,
                             n_jobs=-1,
                             cv = cv)
optimal_params.fit(X_train,y_train)
print("Optimal parameter values =", optimal_params.best_params_)
print("Optimal cross validation R-squared = ",optimal_params.best_score_)
print("Time taken = ", round((time.time()-start_time)/60), " minutes")

Fitting 5 folds for each of 96 candidates, totalling 480 fits
Optimal parameter values = {'colsample_bytree': 0.8, 'gamma': 0.2, 'learning_rate': 0.09, 'max_depth': 8, 'n_estimators': 510, 'reg_lambda': 0.09, 'subsample': 0.8}
Optimal cross validation R-squared =  0.9299728069693707
Time taken =  9  minutes


In [49]:
np.sqrt(mean_squared_error(optimal_params.best_estimator_.predict(X_test),y_test))

3216.4482763272426

In [54]:
XGBoost_model = xgb.XGBRegressor(random_state = 4, colsample_bytree =0.8, gamma = 0.2, learning_rate = 0.09, 
                                 max_depth = 8, n_estimators = 510, reg_lambda = 0.09, subsample = 0.8)
XGBoost_model.fit(X_train,y_train,eval_set = ([(X_test, y_test)]), early_stopping_rounds = 10)

[0]	validation_0-rmse:24614.82874
[1]	validation_0-rmse:22450.05895
[2]	validation_0-rmse:20508.28559
[3]	validation_0-rmse:18724.28131
[4]	validation_0-rmse:17150.27620
[5]	validation_0-rmse:15703.17855
[6]	validation_0-rmse:14377.14412
[7]	validation_0-rmse:13224.10053
[8]	validation_0-rmse:12147.81539
[9]	validation_0-rmse:11164.38947
[10]	validation_0-rmse:10292.88455
[11]	validation_0-rmse:9491.89669
[12]	validation_0-rmse:8779.45939
[13]	validation_0-rmse:8137.74476
[14]	validation_0-rmse:7539.39859
[15]	validation_0-rmse:7031.85127
[16]	validation_0-rmse:6573.58257
[17]	validation_0-rmse:6155.23913
[18]	validation_0-rmse:5789.79281
[19]	validation_0-rmse:5477.38791
[20]	validation_0-rmse:5196.71140
[21]	validation_0-rmse:4942.45453
[22]	validation_0-rmse:4719.12392
[23]	validation_0-rmse:4528.70100
[24]	validation_0-rmse:4362.38316
[25]	validation_0-rmse:4214.24203
[26]	validation_0-rmse:4090.78937
[27]	validation_0-rmse:3981.43226
[28]	validation_0-rmse:3885.65513
[29]	validati

XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=0.8, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=0.2, gpu_id=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=0.09, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=8, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             n_estimators=510, n_jobs=None, num_parallel_tree=None,
             predictor=None, random_state=4, ...)

In [55]:
np.sqrt(mean_squared_error(XGBoost_model.predict(X_test),y_test))

3215.2586419292516

In [68]:
XGB_bagged = BaggingRegressor(base_estimator=XGBoost_model, 
                             n_estimators=20,
                             bootstrap=False,
                            random_state=4,
                            n_jobs = -1).fit(X_train, y_train)

In [70]:
np.sqrt(mean_squared_error(XGB_bagged.predict(X_test),y_test))

3084.8782042684575

# LGBM

In [40]:
from lightgbm import LGBMRegressor

In [57]:
LGMB_model=LGBMRegressor(random_state=4).fit(X_train,y_train)

In [59]:
np.sqrt(mean_squared_error(LGMB_model.predict(X_test),y_test))

4334.862011666212

In [77]:
#K-fold cross validation to find optimal parameters for LightGBM regressor
start_time = time.time()
param_grid = {'max_depth': [4,5,6,7,8],
              'num_leaves': [50,60,70,80,90,100,110,120,130,170,200],
              'learning_rate': [0.05, 0.07,0.09,0.1,0.12,0.15],
               'reg_lambda':[4,5,6,7,8,9,10],
                'n_estimators':[800,900,1000,1100,1200,1300,1400,1500],
                'reg_alpha': [1,2,3,4,5,6,7,8,9,10],
                'subsample': [0.70, 0.75,0.8],
                'colsample_bytree': [0.70,0.75,0.8, 0.9]}

cv = KFold(n_splits=5,shuffle=True,random_state=1)
optimal_params = RandomizedSearchCV(estimator=LGBMRegressor(random_state=1),                                                       
                             param_distributions = param_grid, n_iter = 200,
                             verbose = 1,
                             n_jobs=-1,
                             cv = cv)
optimal_params.fit(X_train,y_train)
print("Optimal parameter values =", optimal_params.best_params_)
print("Optimal cross validation R-squared = ",optimal_params.best_score_)
print("Time taken = ", round((time.time()-start_time)/60), " minutes")

Fitting 5 folds for each of 200 candidates, totalling 1000 fits
Optimal parameter values = {'subsample': 0.7, 'reg_lambda': 4, 'reg_alpha': 5, 'num_leaves': 60, 'n_estimators': 1300, 'max_depth': 5, 'learning_rate': 0.15, 'colsample_bytree': 0.7}
Optimal cross validation R-squared =  0.8902672209757044
Time taken =  5  minutes


In [78]:
np.sqrt(mean_squared_error(optimal_params.best_estimator_.predict(X_test),y_test))

4651.048843648547

In [79]:
LGBM_bagged = BaggingRegressor(base_estimator=optimal_params.best_estimator_, 
                             n_estimators=20,
                             bootstrap=False,
                            random_state=4,
                            n_jobs = -1).fit(X_train, y_train)

In [80]:
np.sqrt(mean_squared_error(LGBM_bagged.predict(X_test),y_test))

4580.182816361993