## Machine Learning 

To find out which model works best, I first compare a range of estimators with their default settings, and then perform hyperparameter tuning. The metric I use is the mean R^2 across 5-fold cross-validation.

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
sns.set()

In [2]:
# Load the three pre-wrangled splits (train / validation / test) in one pass.
trainset, validationset, testset = map(
    pd.read_csv,
    (
        './wrangled_data/trainset.csv',
        './wrangled_data/validationset.csv',
        './wrangled_data/testset.csv',
    ),
)

In [3]:
# Dimensions (rows, columns) of the training set.
trainset.shape

(3367224, 10)

In [4]:
# Preview the first training rows, transposed so that each column reads as a row.
trainset.head().T

Unnamed: 0,0,1,2,3,4
date_block_num,12.0,12.0,12.0,12.0,12.0
shop_id,2.0,2.0,2.0,2.0,2.0
item_id,5572.0,5643.0,5583.0,7893.0,7894.0
item_category_id,2.0,2.0,0.0,6.0,6.0
item_cnt_month,2.0,1.0,0.0,6.0,5.0
item_cnt_month_lag1,2.0,6.0,0.0,10.0,2.0
item_cnt_month_lag2,2.0,3.0,0.0,3.0,1.0
item_cnt_month_lag3,1.0,2.0,0.0,4.0,3.0
item_cnt_month_lag6,2.0,3.0,0.0,6.0,1.0
item_cnt_month_lag12,0.0,0.0,0.0,3.0,0.0


In [5]:
# Dimensions (rows, columns) of the validation set.
validationset.shape

(990360, 10)

In [6]:
# Training features: everything except the target, the month index and the category id.
X_train = trainset.drop(columns=['item_cnt_month', 'date_block_num', 'item_category_id'])
X_train.shape

(3367224, 7)

In [7]:
# Preview the first rows of the training feature matrix.
X_train.head()

Unnamed: 0,shop_id,item_id,item_cnt_month_lag1,item_cnt_month_lag2,item_cnt_month_lag3,item_cnt_month_lag6,item_cnt_month_lag12
0,2,5572,2.0,2.0,1.0,2.0,0.0
1,2,5643,6.0,3.0,2.0,3.0,0.0
2,2,5583,0.0,0.0,0.0,0.0,0.0
3,2,7893,10.0,3.0,4.0,6.0,3.0
4,2,7894,2.0,1.0,3.0,1.0,0.0


In [8]:
# Training target: monthly item count, cast to integers.
y_train = trainset.loc[:, 'item_cnt_month'].astype(int)
y_train.shape

(3367224,)

In [9]:
# Validation features: same column selection as the training features.
X_validation = validationset.drop(columns=['item_cnt_month', 'date_block_num', 'item_category_id'])
X_validation.shape

(990360, 7)

In [10]:
# Preview the first rows of the validation feature matrix.
X_validation.head()

Unnamed: 0,shop_id,item_id,item_cnt_month_lag1,item_cnt_month_lag2,item_cnt_month_lag3,item_cnt_month_lag6,item_cnt_month_lag12
0,2,5572,0.0,0.0,0.0,0.0,0.0
1,2,5643,0.0,0.0,0.0,0.0,0.0
2,2,5583,0.0,0.0,0.0,0.0,0.0
3,2,7893,2.0,1.0,4.0,3.0,4.0
4,2,7894,2.0,1.0,5.0,6.0,2.0


In [11]:
# Validation target: monthly item count, cast to integers.
y_validation = validationset.loc[:, 'item_cnt_month'].astype(int)
y_validation.shape

(990360,)

In [12]:
# Test features: drop the same three columns as for the training features.
# Any other test-only column is kept at this point.
X_test = testset.drop(columns=['item_cnt_month', 'date_block_num', 'item_category_id'])
X_test.shape

(214200, 8)

In [13]:
# Preview the first rows of the test feature matrix.
X_test.head()

Unnamed: 0,ID,shop_id,item_id,item_cnt_month_lag1,item_cnt_month_lag2,item_cnt_month_lag3,item_cnt_month_lag6,item_cnt_month_lag12
0,0,5,5037,0.0,0.0,0.0,0.0,0.0
1,1,5,5320,0.0,0.0,0.0,0.0,0.0
2,2,5,5233,3.0,1.0,2.0,0.0,0.0
3,3,5,5232,0.0,0.0,0.0,0.0,0.0
4,4,5,5268,0.0,0.0,0.0,0.0,0.0


In [14]:
# Column names of the training feature matrix.
X_train.columns

Index(['shop_id', 'item_id', 'item_cnt_month_lag1', 'item_cnt_month_lag2',
       'item_cnt_month_lag3', 'item_cnt_month_lag6', 'item_cnt_month_lag12'],
      dtype='object')

In [15]:
# Column names of the test feature matrix, for comparison with X_train.columns.
X_test.columns

Index(['ID', 'shop_id', 'item_id', 'item_cnt_month_lag1',
       'item_cnt_month_lag2', 'item_cnt_month_lag3', 'item_cnt_month_lag6',
       'item_cnt_month_lag12'],
      dtype='object')

In [16]:
# Keep only the columns the models are trained on, in the training column order.
X_test = X_test.loc[:, X_train.columns]
X_test.shape

(214200, 7)

In [17]:
# Preview the test features after restricting them to the training columns.
X_test.head()

Unnamed: 0,shop_id,item_id,item_cnt_month_lag1,item_cnt_month_lag2,item_cnt_month_lag3,item_cnt_month_lag6,item_cnt_month_lag12
0,5,5037,0.0,0.0,0.0,0.0,0.0
1,5,5320,0.0,0.0,0.0,0.0,0.0
2,5,5233,3.0,1.0,2.0,0.0,0.0
3,5,5232,0.0,0.0,0.0,0.0,0.0
4,5,5268,0.0,0.0,0.0,0.0,0.0


## Linear Regression

In [18]:
# Import necessary modules
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score

In [19]:
# Containers filled by every estimator section: per-fold R^2 scores and their means.
cscores, cscores_means = {}, {}

# Baseline model: ordinary least squares fitted on the full training set.
# This fitted instance is the one used for the validation predictions.
linreg = LinearRegression().fit(X_train, y_train)

# 5-fold cross-validated R^2 on the training data.
# NOTE(review): the folds are not shuffled; if the rows are ordered by month,
# each fold is a different time period — confirm this is intended.
cross_validation_scores = cross_val_score(estimator=linreg, X=X_train, y=y_train, scoring="r2", cv=5)

In [20]:
# File the fold scores and their mean under this estimator's label.
cscores["Linear Regression"] = [*cross_validation_scores]
cscores_means["Linear Regression"] = cross_validation_scores.mean()

print("5-fold cross validation scores for Linear Regression: ", cross_validation_scores)
print("Mean R^2: {}".format(cscores_means["Linear Regression"]))

5-fold cross validation scores for Linear Regression:  [0.7347294  0.78872197 0.74010816 0.68682782 0.35517122]
Mean R^2: 0.661111714969716


In [21]:
# Validation RMSE of every estimator is collected in this dictionary.
rmse_dict = {}

# Score the fitted linear model on the held-out validation set.
y_pred = linreg.predict(X_validation)
rmse_dict["Linear Regression"] = np.sqrt(mean_squared_error(y_true=y_validation, y_pred=y_pred))

## Ridge Regression

In [22]:
from sklearn.linear_model import Ridge

In [23]:
# Ridge regression with the default regularisation strength.
# NOTE(review): the `normalize` argument was deprecated in scikit-learn 1.0 and
# removed in 1.2, so this line raises a TypeError on current releases. The
# documented replacement is a Pipeline with an explicit scaler; switching will
# change the scores recorded below — confirm the target scikit-learn version.
ridge = Ridge(normalize=True, random_state=42)

# Fit on the full training set; this fitted model is used for the validation predictions.
ridge.fit(X_train, y_train)

# 5-fold cross-validated R^2 on the training data.
cross_validation_scores= cross_val_score(ridge, X_train, y_train, cv=5, scoring="r2")

In [24]:
# File the fold scores and their mean under this estimator's label.
cscores['Ridge Regression'] = [*cross_validation_scores]
cscores_means['Ridge Regression'] = cross_validation_scores.mean()

print("5-fold cross validation scores for Ridge Regression: ", cross_validation_scores)
print("Mean R^2: {}".format(cscores_means['Ridge Regression']))

5-fold cross validation scores for Ridge Regression:  [0.72475486 0.75482484 0.68116293 0.61779844 0.36735335]
Mean R^2: 0.6291788834612495


In [25]:
# Score the fitted ridge model on the held-out validation set and store its RMSE.
y_pred = ridge.predict(X_validation)
rmse_dict["Ridge Regression"] = np.sqrt(mean_squared_error(y_true=y_validation, y_pred=y_pred))

## Lasso Regression

In [26]:
from sklearn.linear_model import Lasso

In [27]:
# Lasso regression with the default regularisation strength.
# NOTE(review): the `normalize` argument was deprecated in scikit-learn 1.0 and
# removed in 1.2, so this line raises a TypeError on current releases — confirm
# the target version before re-running.
# NOTE(review): the fold scores recorded below are all ~0, i.e. no better than
# predicting a constant; presumably the default alpha shrinks every coefficient
# to zero here — verify with lasso.coef_ and consider tuning alpha.
lasso = Lasso(normalize=True, random_state=42)

# Fit on the full training set; this fitted model is used for the validation predictions.
lasso.fit(X_train, y_train)

# 5-fold cross-validated R^2 on the training data.
cross_validation_scores= cross_val_score(lasso, X_train, y_train, cv=5, scoring="r2")

In [28]:
# File the fold scores and their mean under this estimator's label.
cscores['Lasso Regression'] = [*cross_validation_scores]
cscores_means['Lasso Regression'] = cross_validation_scores.mean()

print("5-fold cross validation scores for Lasso Regression: ", cross_validation_scores)
print("Mean R^2: {}".format(cscores_means['Lasso Regression']))

5-fold cross validation scores for Lasso Regression:  [-4.36382590e-04 -4.84159121e-04 -4.94544368e-05 -1.21216363e-03
 -3.50770878e-06]
Mean R^2: -0.0004371334970727059


In [29]:
# Score the fitted lasso model on the held-out validation set and store its RMSE.
y_pred = lasso.predict(X_validation)
rmse_dict["Lasso Regression"] = np.sqrt(mean_squared_error(y_true=y_validation, y_pred=y_pred))

## Decision Tree

In [30]:
from sklearn import tree

In [31]:
# Single decision tree with default settings, fitted on the full training set
# (this fitted instance is used for the validation predictions).
dt = tree.DecisionTreeRegressor(random_state=42).fit(X_train, y_train)

# 5-fold cross-validated R^2 on the training data.
cross_validation_scores = cross_val_score(estimator=dt, X=X_train, y=y_train, scoring="r2", cv=5)

In [32]:
# File the fold scores and their mean under this estimator's label.
cscores['Decision Tree'] = [*cross_validation_scores]
cscores_means['Decision Tree'] = cross_validation_scores.mean()

print("5-fold cross validation scores for Decison Tree: ", cross_validation_scores)
print("Mean R^2: {}".format(cscores_means['Decision Tree']))

5-fold cross validation scores for Decison Tree:  [0.5710347  0.61344177 0.53583615 0.60641432 0.18568558]
Mean R^2: 0.5024825050488818


In [33]:
# Score the fitted decision tree on the held-out validation set and store its RMSE.
y_pred = dt.predict(X_validation)
rmse_dict["Decision Tree"] = np.sqrt(mean_squared_error(y_true=y_validation, y_pred=y_pred))

## Bagging Regressor

In [34]:
from sklearn.ensemble import BaggingRegressor

In [35]:
# Bagging ensemble with default settings, fitted on the full training set
# (this fitted instance is used for the validation predictions).
bag = BaggingRegressor(random_state=42).fit(X_train, y_train)

# 5-fold cross-validated R^2 on the training data.
cross_validation_scores = cross_val_score(estimator=bag, X=X_train, y=y_train, scoring="r2", cv=5)

In [36]:
# File the fold scores and their mean under this estimator's label.
cscores['Bagging Tree'] = [*cross_validation_scores]
cscores_means['Bagging Tree'] = cross_validation_scores.mean()

print("5-fold cross validation scores for Bagging Tree: ", cross_validation_scores)
print("Mean R^2: {}".format(cscores_means['Bagging Tree']))

5-fold cross validation scores for Bagging Tree:  [0.6426729  0.73759897 0.71777498 0.68703943 0.34209103]
Mean R^2: 0.6254354604666371


In [37]:
# Score the fitted bagging ensemble on the held-out validation set and store its RMSE.
y_pred = bag.predict(X_validation)
rmse_dict["Bagging Tree"] = np.sqrt(mean_squared_error(y_true=y_validation, y_pred=y_pred))

## Random Forest 

In [38]:
from sklearn.ensemble import RandomForestRegressor

In [39]:
# Random forest with default settings, fitted on the full training set
# (this fitted instance is used for the validation predictions).
rfr = RandomForestRegressor(random_state=42).fit(X_train, y_train)

# 5-fold cross-validated R^2 on the training data.
cross_validation_scores = cross_val_score(estimator=rfr, X=X_train, y=y_train, scoring="r2", cv=5)



In [40]:
# File the fold scores and their mean under this estimator's label.
cscores['Random Forest'] = [*cross_validation_scores]
cscores_means['Random Forest'] = cross_validation_scores.mean()

print("5-fold cross validation scores for Random Forest: ", cross_validation_scores)
print("Mean R^2: {}".format(cscores_means['Random Forest']))

5-fold cross validation scores for Random Forest:  [0.64142188 0.74072247 0.71893137 0.68446417 0.34801286]
Mean R^2: 0.626710548598983


In [41]:
# Score the fitted random forest on the held-out validation set and store its RMSE.
y_pred = rfr.predict(X_validation)
rmse_dict["Random Forest"] = np.sqrt(mean_squared_error(y_true=y_validation, y_pred=y_pred))

## Adaptive Boost Regressor 

In [42]:
from sklearn.ensemble import AdaBoostRegressor

In [43]:
# AdaBoost with default settings, fitted on the full training set
# (this fitted instance is used for the validation predictions).
abr = AdaBoostRegressor(random_state=42).fit(X_train, y_train)

# 5-fold cross-validated R^2 on the training data.
cross_validation_scores = cross_val_score(estimator=abr, X=X_train, y=y_train, scoring="r2", cv=5)

In [44]:
# File the fold scores and their mean under this estimator's label.
cscores['Adaptive Boost'] = [*cross_validation_scores]
cscores_means['Adaptive Boost'] = cross_validation_scores.mean()

print("5-fold cross validation scores for Adaptive Boost Regression: ", cross_validation_scores)
print("Mean R^2: {}".format(cscores_means['Adaptive Boost']))

5-fold cross validation scores for Adaptive Boost Regression:  [  0.43299597   0.15127001   0.14416066   0.467402   -17.49830372]
Mean R^2: -3.260495016655485


In [45]:
# Score the fitted AdaBoost model on the held-out validation set and store its RMSE.
y_pred = abr.predict(X_validation)
rmse_dict["Adaptive Boost"] = np.sqrt(mean_squared_error(y_true=y_validation, y_pred=y_pred))

## Gradient Boost Regressor 

In [46]:
from sklearn.ensemble import GradientBoostingRegressor

In [47]:
# Gradient boosting with default settings, fitted on the full training set
# (this fitted instance is used for the validation predictions).
gbr = GradientBoostingRegressor(random_state=42).fit(X_train, y_train)

# 5-fold cross-validated R^2 on the training data.
cross_validation_scores = cross_val_score(estimator=gbr, X=X_train, y=y_train, scoring="r2", cv=5)

In [48]:
# File the fold scores and their mean under this estimator's label.
cscores['Gradient Boost'] = [*cross_validation_scores]
cscores_means['Gradient Boost'] = cross_validation_scores.mean()

print("5-fold cross validation scores for Gradient Boost Regression: ", cross_validation_scores)
print("Mean R^2: {}".format(cscores_means['Gradient Boost']))

5-fold cross validation scores for Gradient Boost Regression:  [0.75128461 0.8071292  0.7535773  0.69568235 0.38060628]
Mean R^2: 0.6776559475632237


In [49]:
# Score the fitted gradient-boosting model on the held-out validation set and store its RMSE.
y_pred = gbr.predict(X_validation)
rmse_dict["Gradient Boost"] = np.sqrt(mean_squared_error(y_true=y_validation, y_pred=y_pred))

## XGBoost

In [50]:
import xgboost as xgb

In [51]:
# XGBoost with default settings, fitted on the full training set
# (this fitted instance is used for the validation predictions).
xg_reg = xgb.XGBRegressor(random_state=42).fit(X_train, y_train)

# 5-fold cross-validated R^2 on the training data.
cross_validation_scores = cross_val_score(estimator=xg_reg, X=X_train, y=y_train, scoring="r2", cv=5)

  if getattr(data, 'base', None) is not None and \
  data.base is not None and isinstance(data, np.ndarray) \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \


In [52]:
# File the fold scores and their mean under this estimator's label.
cscores['XGBoosting'] = [*cross_validation_scores]
cscores_means['XGBoosting'] = cross_validation_scores.mean()

print("5-fold cross validation scores for XGBoosting: ", cross_validation_scores)
print("Mean R^2: {}".format(cscores_means['XGBoosting']))

5-fold cross validation scores for XGBoosting:  [0.74139231 0.7998444  0.75198391 0.69813629 0.3814746 ]
Mean R^2: 0.6745663011061109


In [53]:
# Score the fitted XGBoost model on the held-out validation set and store its RMSE.
y_pred = xg_reg.predict(X_validation)
rmse_dict["XGBoosting"] = np.sqrt(mean_squared_error(y_true=y_validation, y_pred=y_pred))

## LightGBM

In [54]:
import lightgbm as lgb

In [55]:
# LightGBM with default settings, fitted on the full training set
# (this fitted instance is used for the validation predictions).
lgb_reg = lgb.LGBMRegressor(random_state=42).fit(X_train, y_train)

# 5-fold cross-validated R^2 on the training data.
cross_validation_scores = cross_val_score(estimator=lgb_reg, X=X_train, y=y_train, scoring="r2", cv=5)

In [56]:
# File the fold scores and their mean under this estimator's label.
cscores['LightGBM'] = [*cross_validation_scores]
cscores_means['LightGBM'] = cross_validation_scores.mean()

print("5-fold cross validation scores for LightGBM: ", cross_validation_scores)
print("Mean R^2: {}".format(cscores_means['LightGBM']))

5-fold cross validation scores for LightGBM:  [0.69463449 0.77211926 0.73183541 0.68423465 0.38452759]
Mean R^2: 0.6534702808829487


In [57]:
# Score the fitted LightGBM model on the held-out validation set and store its RMSE.
y_pred = lgb_reg.predict(X_validation)
rmse_dict["LightGBM"] = np.sqrt(mean_squared_error(y_true=y_validation, y_pred=y_pred))

In [58]:
# Per-fold R^2 scores collected for every default-parameter estimator.
cscores

{'Linear Regression': [0.7347294008523426,
  0.7887219719256653,
  0.7401081625046291,
  0.6868278184930344,
  0.35517122107290866],
 'Ridge Regression': [0.7247548611721013,
  0.7548248385077589,
  0.6811629285424119,
  0.6177984367660057,
  0.36735335231796973],
 'Lasso Regression': [-0.00043638259007772184,
  -0.00048415912117372173,
  -4.945443682990991e-05,
  -0.0012121636285038484,
  -3.507708778327512e-06],
 'Decision Tree': [0.5710347026767146,
  0.6134417718162082,
  0.5358361544942194,
  0.6064143175481598,
  0.1856855787091073],
 'Bagging Tree': [0.6426728967383186,
  0.7375989717423909,
  0.7177749756404138,
  0.6870394280384875,
  0.3420910301735749],
 'Random Forest': [0.6414218807133164,
  0.7407224675994993,
  0.7189313661515362,
  0.6844641678411972,
  0.3480128606893659],
 'Adaptive Boost': [0.4329959654011327,
  0.15127000589003392,
  0.1441606636740076,
  0.46740200308317925,
  -17.49830372132578],
 'Gradient Boost': [0.7512846067033102,
  0.8071292007736355,
  0.75

In [59]:
# Validation RMSE collected for every default-parameter estimator.
rmse_dict

{'Linear Regression': 1.349140588520255,
 'Ridge Regression': 1.3516762715640174,
 'Lasso Regression': 2.4531768511027647,
 'Decision Tree': 1.934200057872973,
 'Bagging Tree': 1.5428340401124976,
 'Random Forest': 1.5317249671710738,
 'Adaptive Boost': 2.512667756572795,
 'Gradient Boost': 1.5058270741864581,
 'XGBoosting': 1.5005163855410986,
 'LightGBM': 1.5463498989370228}

In [60]:
# Turn both result dictionaries into one-row frames (one column per estimator).
index = [0]
cscores_means_df, rmse_df = (
    pd.DataFrame(results, index=index) for results in (cscores_means, rmse_dict)
)

In [61]:
# Mean cross-validated R^2 per estimator, still in wide (one-row) form.
cscores_means_df

Unnamed: 0,Linear Regression,Ridge Regression,Lasso Regression,Decision Tree,Bagging Tree,Random Forest,Adaptive Boost,Gradient Boost,XGBoosting,LightGBM
0,0.661112,0.629179,-0.000437,0.502483,0.625435,0.626711,-3.260495,0.677656,0.674566,0.65347


In [62]:
# Validation RMSE per estimator, still in wide (one-row) form.
rmse_df

Unnamed: 0,Linear Regression,Ridge Regression,Lasso Regression,Decision Tree,Bagging Tree,Random Forest,Adaptive Boost,Gradient Boost,XGBoosting,LightGBM
0,1.349141,1.351676,2.453177,1.9342,1.542834,1.531725,2.512668,1.505827,1.500516,1.54635


In [63]:
# One row per estimator. The frame is rebuilt from the source dictionary rather
# than transposed in place: `x = x.T` flips the table back every time the cell
# is re-run, whereas this form always yields the same result.
cscores_means_df = pd.DataFrame(cscores_means, index=[0]).T
cscores_means_df

Unnamed: 0,0
Linear Regression,0.661112
Ridge Regression,0.629179
Lasso Regression,-0.000437
Decision Tree,0.502483
Bagging Tree,0.625435
Random Forest,0.626711
Adaptive Boost,-3.260495
Gradient Boost,0.677656
XGBoosting,0.674566
LightGBM,0.65347


In [64]:
# Give the single score column (labelled 0 after the transpose) a meaningful name.
cscores_means_df = cscores_means_df.rename(columns={0: 'r2'})
cscores_means_df

Unnamed: 0,r2
Linear Regression,0.661112
Ridge Regression,0.629179
Lasso Regression,-0.000437
Decision Tree,0.502483
Bagging Tree,0.625435
Random Forest,0.626711
Adaptive Boost,-3.260495
Gradient Boost,0.677656
XGBoosting,0.674566
LightGBM,0.65347


In [65]:
# One row per estimator. The frame is rebuilt from the source dictionary rather
# than transposed in place: `x = x.T` flips the table back every time the cell
# is re-run, whereas this form always yields the same result.
rmse_df = pd.DataFrame(rmse_dict, index=[0]).T
rmse_df

Unnamed: 0,0
Linear Regression,1.349141
Ridge Regression,1.351676
Lasso Regression,2.453177
Decision Tree,1.9342
Bagging Tree,1.542834
Random Forest,1.531725
Adaptive Boost,2.512668
Gradient Boost,1.505827
XGBoosting,1.500516
LightGBM,1.54635


In [66]:
# Give the single RMSE column (labelled 0 after the transpose) a meaningful name.
rmse_df = rmse_df.rename(columns={0: 'rmse'})
rmse_df

Unnamed: 0,rmse
Linear Regression,1.349141
Ridge Regression,1.351676
Lasso Regression,2.453177
Decision Tree,1.9342
Bagging Tree,1.542834
Random Forest,1.531725
Adaptive Boost,2.512668
Gradient Boost,1.505827
XGBoosting,1.500516
LightGBM,1.54635


In [67]:
# Put R^2 and RMSE side by side, aligned on the estimator name
# (left join on the index, which is DataFrame.join's default).
score_df = cscores_means_df.join(other=rmse_df, how='left')
score_df

Unnamed: 0,r2,rmse
Linear Regression,0.661112,1.349141
Ridge Regression,0.629179,1.351676
Lasso Regression,-0.000437,2.453177
Decision Tree,0.502483,1.9342
Bagging Tree,0.625435,1.542834
Random Forest,0.626711,1.531725
Adaptive Boost,-3.260495,2.512668
Gradient Boost,0.677656,1.505827
XGBoosting,0.674566,1.500516
LightGBM,0.65347,1.54635


In [68]:
# Label the index so the table header reads "Algorithm".
score_df = score_df.rename_axis('Algorithm')
score_df

Unnamed: 0_level_0,r2,rmse
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1
Linear Regression,0.661112,1.349141
Ridge Regression,0.629179,1.351676
Lasso Regression,-0.000437,2.453177
Decision Tree,0.502483,1.9342
Bagging Tree,0.625435,1.542834
Random Forest,0.626711,1.531725
Adaptive Boost,-3.260495,2.512668
Gradient Boost,0.677656,1.505827
XGBoosting,0.674566,1.500516
LightGBM,0.65347,1.54635


In [69]:
# Rank the default-parameter models best-first: highest mean R^2 on top, and
# when two models tie on R^2 the one with the LOWER validation RMSE wins.
# (A single ascending=False sorted RMSE descending too, which would have
# favoured the worse model in a tie.)
df = score_df.sort_values(['r2', 'rmse'], ascending=[False, True])
df

Unnamed: 0_level_0,r2,rmse
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1
Gradient Boost,0.677656,1.505827
XGBoosting,0.674566,1.500516
Linear Regression,0.661112,1.349141
LightGBM,0.65347,1.54635
Ridge Regression,0.629179,1.351676
Random Forest,0.626711,1.531725
Bagging Tree,0.625435,1.542834
Decision Tree,0.502483,1.9342
Lasso Regression,-0.000437,2.453177
Adaptive Boost,-3.260495,2.512668


#  With tuned parameters

## Tuned Random Forest

__RandomizedSearchCV gave the following best parameters:__

`{'n_estimators': 60,
 'min_samples_split': 2,
 'min_samples_leaf': 1,
 'max_features': 'auto',
 'max_depth': 6,
 'bootstrap': True}`

In [70]:
# Containers for the tuned models, mirroring the ones used for the defaults:
# per-fold R^2 scores, their means, and validation RMSE.
tuned_cscores, tuned_cscores_means, tuned_rmse_dict = {}, {}, {}

# Random forest with the tuned tree count and depth, fitted on the full
# training set (this fitted instance is used for the validation predictions).
rfr = RandomForestRegressor(
    n_estimators=60,
    max_depth=6,
    random_state=42,
    n_jobs=-1,
).fit(X_train, y_train)

# 5-fold cross-validated R^2 on the training data.
cross_validation_scores = cross_val_score(estimator=rfr, X=X_train, y=y_train, scoring="r2", cv=5)

In [71]:
# File the tuned model's fold scores and their mean under its label.
tuned_cscores['Random Forest'] = [*cross_validation_scores]
tuned_cscores_means['Random Forest'] = cross_validation_scores.mean()

print("5-fold cross validation scores for Random Forest: ", cross_validation_scores)
print("Mean R^2: {}".format(tuned_cscores_means['Random Forest']))

5-fold cross validation scores for Random Forest:  [0.71820147 0.79087646 0.74483862 0.68472169 0.3862905 ]
Mean R^2: 0.6649857473712506


In [72]:
# Score the tuned random forest on the held-out validation set and store its RMSE.
y_pred = rfr.predict(X_validation)
tuned_rmse_dict["Random Forest"] = np.sqrt(mean_squared_error(y_true=y_validation, y_pred=y_pred))

## Tuned Bagging Tree

__RandomizedSearchCV gave the following best parameters:__

`{'n_estimators': 50, 'max_features': 4}`

In [73]:
# Bagging ensemble with the tuned estimator count and per-estimator feature
# count, fitted on the full training set (used for the validation predictions).
bag = BaggingRegressor(
    n_estimators=50,
    max_features=4,
    random_state=42,
    n_jobs=-1,
).fit(X_train, y_train)

# 5-fold cross-validated R^2 on the training data.
cross_validation_scores = cross_val_score(estimator=bag, X=X_train, y=y_train, scoring="r2", cv=5)

In [74]:
# File the tuned model's fold scores and their mean under its label.
tuned_cscores['Bagging Tree'] = [*cross_validation_scores]
tuned_cscores_means['Bagging Tree'] = cross_validation_scores.mean()

print("5-fold cross validation scores for Bagging Tree: ", cross_validation_scores)
print("Mean R^2: {}".format(tuned_cscores_means['Bagging Tree']))

5-fold cross validation scores for Bagging Tree:  [0.73259884 0.77646986 0.74874723 0.67885589 0.37285165]
Mean R^2: 0.661904693655823


In [75]:
# Score the tuned bagging ensemble on the held-out validation set and store its RMSE.
y_pred = bag.predict(X_validation)
tuned_rmse_dict["Bagging Tree"] = np.sqrt(mean_squared_error(y_true=y_validation, y_pred=y_pred))

## Tuned Gradient Boosting

__RandomizedSearchCV gave the following best parameters:__

`{'n_estimators': 50,
 'min_samples_split': 5,
 'min_samples_leaf': 2,
 'max_features': 'sqrt',
 'max_depth': 6,
 'learning_rate': 0.1}`

In [76]:
# Gradient boosting with the tuned parameters, fitted on the full training set
# (this fitted instance is used for the validation predictions).
gbr = GradientBoostingRegressor(
    n_estimators=50,
    learning_rate=0.1,
    max_depth=6,
    max_features='sqrt',
    min_samples_split=5,
    min_samples_leaf=2,
    random_state=42,
).fit(X_train, y_train)

# 5-fold cross-validated R^2 on the training data.
cross_validation_scores = cross_val_score(estimator=gbr, X=X_train, y=y_train, scoring="r2", cv=5)

In [77]:
# File the tuned model's fold scores and their mean under its label.
tuned_cscores['Gradient Boost'] = [*cross_validation_scores]
tuned_cscores_means['Gradient Boost'] = cross_validation_scores.mean()

print("5-fold cross validation scores for Gradient Boost Regression: ", cross_validation_scores)
print("Mean R^2: {}".format(tuned_cscores_means['Gradient Boost']))

5-fold cross validation scores for Gradient Boost Regression:  [0.73720368 0.79156272 0.76345063 0.69115986 0.39275907]
Mean R^2: 0.6752271908834839


In [78]:
# Score the tuned gradient-boosting model on the held-out validation set and store its RMSE.
y_pred = gbr.predict(X_validation)
tuned_rmse_dict["Gradient Boost"] = np.sqrt(mean_squared_error(y_true=y_validation, y_pred=y_pred))

## Tuned XGBoost

__RandomizedSearchCV gave the following best parameters:__

`{'silent': False,
 'reg_lambda': 5,
 'n_estimators': 90,
 'min_child_weight': 10,
 'max_depth': 6,
 'learning_rate': 0.1,
 'gamma': 0,
 'colsample_bytree': 0.7}`

In [79]:
# XGBoost configured with exactly the parameters reported by RandomizedSearchCV.
# The former min_samples_split / min_samples_leaf arguments were removed: they
# are scikit-learn tree arguments rather than XGBoost parameters, and they were
# not part of the search result.
xg_reg = xgb.XGBRegressor(n_estimators=90, silent=False, reg_lambda=5, min_child_weight=10,
                          max_depth=6, learning_rate=0.1, gamma=0,
                          colsample_bytree=0.7, random_state=42, n_jobs=-1)

# Fit on the full training set; this fitted model is used for the validation predictions.
xg_reg.fit(X_train, y_train)

# 5-fold cross-validated R^2 on the training data.
cross_validation_scores = cross_val_score(xg_reg, X_train, y_train, cv=5, scoring="r2")

  if getattr(data, 'base', None) is not None and \
  data.base is not None and isinstance(data, np.ndarray) \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \


In [80]:
# File the tuned model's fold scores and their mean under its label.
tuned_cscores['XGBoosting'] = [*cross_validation_scores]
tuned_cscores_means['XGBoosting'] = cross_validation_scores.mean()

print("5-fold cross validation scores for XGBoosting: ", cross_validation_scores)
print("Mean R^2: {}".format(tuned_cscores_means['XGBoosting']))

5-fold cross validation scores for XGBoosting:  [0.74093974 0.78866608 0.74247665 0.6989674  0.37457214]
Mean R^2: 0.6691244027317169


In [81]:
# Score the tuned XGBoost model on the held-out validation set and store its RMSE.
y_pred = xg_reg.predict(X_validation)
tuned_rmse_dict["XGBoosting"] = np.sqrt(mean_squared_error(y_true=y_validation, y_pred=y_pred))

## Tuned LightGBM

__RandomizedSearchCV gave the following best parameters:__

`{'subsample': 0.5,
 'silent': False,
 'num_leaves': 100,
 'n_estimators': 150,
 'max_depth': 4,
 'learning_rate': 0.1,
 'colsample_bytree': 0.7}`

In [82]:
# LightGBM with the parameters reported by RandomizedSearchCV, fitted on the
# full training set (this fitted instance is used for the validation predictions).
# NOTE(review): a tree limited to max_depth=4 can have at most 2**4 = 16 leaves,
# so num_leaves=100 is presumably never the binding limit — verify.
# NOTE(review): LightGBM's subsample is documented as taking effect only when
# subsample_freq > 0, which is not set here — confirm whether 0.5 is applied.
lgb_reg = lgb.LGBMRegressor(
    n_estimators=150,
    learning_rate=0.1,
    max_depth=4,
    num_leaves=100,
    subsample=0.5,
    colsample_bytree=0.7,
    silent=False,
    random_state=42,
    n_jobs=-1,
).fit(X_train, y_train)

# 5-fold cross-validated R^2 on the training data.
cross_validation_scores = cross_val_score(estimator=lgb_reg, X=X_train, y=y_train, scoring="r2", cv=5)

In [83]:
# File the tuned model's fold scores and their mean under its label.
tuned_cscores['LightGBM'] = [*cross_validation_scores]
tuned_cscores_means['LightGBM'] = cross_validation_scores.mean()

print("5-fold cross validation scores for LightGBM: ", cross_validation_scores)
print("Mean R^2: {}".format(tuned_cscores_means['LightGBM']))

5-fold cross validation scores for LightGBM:  [0.68003598 0.76674543 0.74670287 0.69016668 0.40003498]
Mean R^2: 0.6567371881781721


In [84]:
# Score the tuned LightGBM model on the held-out validation set and store its RMSE.
y_pred = lgb_reg.predict(X_validation)
tuned_rmse_dict["LightGBM"] = np.sqrt(mean_squared_error(y_true=y_validation, y_pred=y_pred))

In [85]:
# Build the tuned-model summary: one row per estimator with mean CV R^2 and
# validation RMSE side by side.
index = [0]

# One-row frames from the dictionaries, transposed to one row per estimator.
tuned_cscores_means_df = pd.DataFrame(tuned_cscores_means, index=index).T
tuned_rmse_df = pd.DataFrame(tuned_rmse_dict, index=index).T

# Name the single value column of each frame.
tuned_cscores_means_df.columns = ['r2']
tuned_rmse_df.columns = ['rmse']

# Align on the estimator name and label the index.
tuned_score_df = tuned_cscores_means_df.join(tuned_rmse_df)
tuned_score_df.index.name = 'Algorithm'
tuned_score_df

Unnamed: 0_level_0,r2,rmse
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1
Random Forest,0.664986,1.457858
Bagging Tree,0.661905,1.425319
Gradient Boost,0.675227,1.461838
XGBoosting,0.669124,1.468157
LightGBM,0.656737,1.533018


In [86]:
# Rank the tuned models best-first: highest mean R^2 on top, and when two
# models tie on R^2 the one with the LOWER validation RMSE wins.
# (A single ascending=False sorted RMSE descending too, which would have
# favoured the worse model in a tie.)
tuned_df = tuned_score_df.sort_values(['r2', 'rmse'], ascending=[False, True])
tuned_df

Unnamed: 0_level_0,r2,rmse
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1
Gradient Boost,0.675227,1.461838
XGBoosting,0.669124,1.468157
Random Forest,0.664986,1.457858
Bagging Tree,0.661905,1.425319
LightGBM,0.656737,1.533018


In [87]:
# Default-parameter ranking again, for comparison with the tuned ranking.
df

Unnamed: 0_level_0,r2,rmse
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1
Gradient Boost,0.677656,1.505827
XGBoosting,0.674566,1.500516
Linear Regression,0.661112,1.349141
LightGBM,0.65347,1.54635
Ridge Regression,0.629179,1.351676
Random Forest,0.626711,1.531725
Bagging Tree,0.625435,1.542834
Decision Tree,0.502483,1.9342
Lasso Regression,-0.000437,2.453177
Adaptive Boost,-3.260495,2.512668


In [88]:
# Persist both ranking tables as CSV files in the current working directory.
# The tuned table is written here with its original 'r2' / 'rmse' column names.
df.to_csv('scores_default_parameters.csv')
tuned_df.to_csv('scores_tuned_parameters.csv')

In [89]:
# Prefix the tuned metrics so they stay distinguishable from the default ones
# once both tables are joined.
tuned_df = tuned_df.rename(columns={'r2': 'tuned_r2', 'rmse': 'tuned_rmse'})
tuned_df

Unnamed: 0_level_0,tuned_r2,tuned_rmse
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1
Gradient Boost,0.675227,1.461838
XGBoosting,0.669124,1.468157
Random Forest,0.664986,1.457858
Bagging Tree,0.661905,1.425319
LightGBM,0.656737,1.533018


In [90]:
# Combine default and tuned metrics per algorithm. The outer join keeps every
# algorithm from either table; those without tuned results get NaN in the
# tuned_* columns.
combined_df = df.join(tuned_df,how='outer')
combined_df

Unnamed: 0_level_0,r2,rmse,tuned_r2,tuned_rmse
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Adaptive Boost,-3.260495,2.512668,,
Bagging Tree,0.625435,1.542834,0.661905,1.425319
Decision Tree,0.502483,1.9342,,
Gradient Boost,0.677656,1.505827,0.675227,1.461838
Lasso Regression,-0.000437,2.453177,,
LightGBM,0.65347,1.54635,0.656737,1.533018
Linear Regression,0.661112,1.349141,,
Random Forest,0.626711,1.531725,0.664986,1.457858
Ridge Regression,0.629179,1.351676,,
XGBoosting,0.674566,1.500516,0.669124,1.468157


In [91]:
# Order the combined table best-first by default R^2, then by tuned R^2.
combined_df = combined_df.sort_values(by=['r2', 'tuned_r2'], ascending=[False, False])
combined_df

Unnamed: 0_level_0,r2,rmse,tuned_r2,tuned_rmse
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Gradient Boost,0.677656,1.505827,0.675227,1.461838
XGBoosting,0.674566,1.500516,0.669124,1.468157
Linear Regression,0.661112,1.349141,,
LightGBM,0.65347,1.54635,0.656737,1.533018
Ridge Regression,0.629179,1.351676,,
Random Forest,0.626711,1.531725,0.664986,1.457858
Bagging Tree,0.625435,1.542834,0.661905,1.425319
Decision Tree,0.502483,1.9342,,
Lasso Regression,-0.000437,2.453177,,
Adaptive Boost,-3.260495,2.512668,,


In [105]:
def _fold_score_table(fold_scores):
    """Build a per-estimator table of fold scores with two summary means.

    Parameters
    ----------
    fold_scores : dict
        Maps an estimator label to its list of five cross-validation scores.

    Returns
    -------
    pandas.DataFrame
        One row per estimator with columns 1-5 (the individual folds),
        ``mean_5`` (mean over all five folds) and ``mean_4`` (mean over
        folds 1-4 only), sorted best-first by ``mean_5`` then ``mean_4``.
    """
    folds = [1, 2, 3, 4, 5]
    table = pd.DataFrame(fold_scores).T
    table.columns = folds
    # Select the fold columns explicitly so neither mean can pick up the other.
    table["mean_5"] = table[folds].mean(axis=1)
    table["mean_4"] = table[folds[:-1]].mean(axis=1)
    return table.sort_values(['mean_5', 'mean_4'], ascending=False)


# Same summary for the default-parameter and the tuned models.
scores_df = _fold_score_table(cscores)
tuned_scores_df = _fold_score_table(tuned_cscores)

In [106]:
# Fold-by-fold R^2 for the default models, with the 5-fold and first-4-fold means.
scores_df

Unnamed: 0,1,2,3,4,5,mean_5,mean_4
Gradient Boost,0.751285,0.807129,0.753577,0.695682,0.380606,0.677656,0.751918
XGBoosting,0.741392,0.799844,0.751984,0.698136,0.381475,0.674566,0.747839
Linear Regression,0.734729,0.788722,0.740108,0.686828,0.355171,0.661112,0.737597
LightGBM,0.694634,0.772119,0.731835,0.684235,0.384528,0.65347,0.720706
Ridge Regression,0.724755,0.754825,0.681163,0.617798,0.367353,0.629179,0.694635
Random Forest,0.641422,0.740722,0.718931,0.684464,0.348013,0.626711,0.696385
Bagging Tree,0.642673,0.737599,0.717775,0.687039,0.342091,0.625435,0.696272
Decision Tree,0.571035,0.613442,0.535836,0.606414,0.185686,0.502483,0.581682
Lasso Regression,-0.000436,-0.000484,-4.9e-05,-0.001212,-4e-06,-0.000437,-0.000546
Adaptive Boost,0.432996,0.15127,0.144161,0.467402,-17.498304,-3.260495,0.298957


In [107]:
# Fold-by-fold R^2 for the tuned models, with the 5-fold and first-4-fold means.
tuned_scores_df

Unnamed: 0,1,2,3,4,5,mean_5,mean_4
Gradient Boost,0.737204,0.791563,0.763451,0.69116,0.392759,0.675227,0.745844
XGBoosting,0.74094,0.788666,0.742477,0.698967,0.374572,0.669124,0.742762
Random Forest,0.718201,0.790876,0.744839,0.684722,0.38629,0.664986,0.73466
Bagging Tree,0.732599,0.77647,0.748747,0.678856,0.372852,0.661905,0.734168
LightGBM,0.680036,0.766745,0.746703,0.690167,0.400035,0.656737,0.720913
