In [1]:
import warnings
warnings.filterwarnings("ignore")

import csv
import os
import pickle
import re

import numpy as np
import pandas as pd
import requests

from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, LassoCV, RidgeCV, ElasticNetCV
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.svm import SVR
from xgboost import XGBRegressor

In [2]:
# Path of the CSV dataset that is read with pd.read_csv further down.
store_sharing_filename = 'store_sharing.csv'

# Base name and extension of the pickle file the selected model is written to
# at the end of the notebook ("<model_filename>.<model_extension>").
model_filename = "model"
model_extension = "pkl"

## Loading and Preparing the Data

- Let's load the data from the CSV file:

In [3]:
# Read the raw dataset into a DataFrame and report which file was used.
df = pd.read_csv(store_sharing_filename)

load_message = "Dataset read from file: {}".format(store_sharing_filename)
print(load_message)

Dataset read from file: store_sharing.csv


In [4]:
# Preview the first three raw rows to check the columns that were loaded.
df.head(3)

Unnamed: 0,timestamp,cnt,t1,t2,hum,wind_speed,weather_code,is_holiday,is_weekend,season
0,2015-01-04 00:00:00,182,3.0,2.0,93.0,6.0,3.0,0.0,1.0,3.0
1,2015-01-04 01:00:00,138,3.0,2.5,93.0,5.0,1.0,0.0,1.0,3.0
2,2015-01-04 02:00:00,134,2.5,2.5,96.5,0.0,1.0,0.0,1.0,3.0


In [5]:
# Parse the timestamp strings into datetimes. The values carry a time of day
# (e.g. "2015-01-04 00:00:00"), so the format has to describe the time part
# too: a date-only format leaves " 00:00:00" unparsed and raises
# "unconverted data remains" on pandas versions that match the format strictly.
df['timestamp'] = pd.to_datetime(df['timestamp'], format='%Y-%m-%d %H:%M:%S')

In [6]:
# Split the parsed timestamp into calendar components so each one becomes a
# plain numeric feature column.
timestamp_index = pd.DatetimeIndex(df['timestamp'])
for date_part in ('year', 'month', 'day', 'hour'):
    df[date_part] = getattr(timestamp_index, date_part)

In [7]:
# The timestamp has been expanded into year/month/day/hour, so the original
# column is no longer needed.
df = df.drop(columns=['timestamp'])

In [8]:
# Preview the frame after the timestamp was replaced by its date parts.
df.head(3)

Unnamed: 0,cnt,t1,t2,hum,wind_speed,weather_code,is_holiday,is_weekend,season,year,month,day,hour
0,182,3.0,2.0,93.0,6.0,3.0,0.0,1.0,3.0,2015,1,4,0
1,138,3.0,2.5,93.0,5.0,1.0,0.0,1.0,3.0,2015,1,4,1
2,134,2.5,2.5,96.5,0.0,1.0,0.0,1.0,3.0,2015,1,4,2


In [9]:
# Standardise the continuous weather measurements (zero mean, unit variance).
# NOTE(review): the scaler is fitted on the full dataset, before the
# train/test split performed later, so test rows influence the scaling
# statistics; the fitted scaler is also not saved with the model - confirm
# both are acceptable.
columns = ['t1', 't2', 'hum', 'wind_speed']
scaler = StandardScaler()
scaler.fit(df[columns])
df.loc[:, columns] = scaler.transform(df[columns])
df.head()

Unnamed: 0,cnt,t1,t2,hum,wind_speed,weather_code,is_holiday,is_weekend,season,year,month,day,hour
0,182,-1.699331,-1.43929,1.444517,-1.255717,3.0,0.0,1.0,3.0,2015,1,4,0
1,138,-1.699331,-1.363703,1.444517,-1.38239,1.0,0.0,1.0,3.0,2015,1,4,1
2,134,-1.789071,-1.363703,1.689054,-2.015755,1.0,0.0,1.0,3.0,2015,1,4,2
3,72,-1.878811,-1.43929,1.93359,-2.015755,1.0,0.0,1.0,3.0,2015,1,4,3
4,47,-1.878811,-1.741635,1.444517,-1.192381,1.0,0.0,1.0,3.0,2015,1,4,4


#### Creating a helper function for one-hot encoding

In [10]:
def OneHotEncoder(df, column, prefix):
    """Replace a categorical column with one-hot indicator columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains ``column``.
    column : str
        Name of the categorical column to encode.
    prefix : str
        Prefix of the generated columns; each is named ``<prefix>-<value>``.

    Returns
    -------
    pandas.DataFrame
        A new frame in which the indicator columns are appended after the
        existing columns and ``column`` itself is removed. The frame passed
        in is not modified.
    """
    # Encode only the requested column, then attach the indicators by index.
    indicators = pd.get_dummies(
        df[[column]],
        columns=[column],
        prefix=[prefix],
        prefix_sep='-',
    )
    encoded = df.join(indicators)
    return encoded.drop(columns=[column])

In [11]:
# One-hot encode the two categorical code columns, weather first and then
# season, so the generated columns keep that order.
for categorical_column, dummy_prefix in (('weather_code', 'W'), ('season', 'Season')):
    df = OneHotEncoder(df, categorical_column, dummy_prefix)
df.head(3)

Unnamed: 0,cnt,t1,t2,hum,wind_speed,is_holiday,is_weekend,year,month,day,...,W-2.0,W-3.0,W-4.0,W-7.0,W-10.0,W-26.0,Season-0.0,Season-1.0,Season-2.0,Season-3.0
0,182,-1.699331,-1.43929,1.444517,-1.255717,0.0,1.0,2015,1,4,...,0,1,0,0,0,0,0,0,0,1
1,138,-1.699331,-1.363703,1.444517,-1.38239,0.0,1.0,2015,1,4,...,0,0,0,0,0,0,0,0,0,1
2,134,-1.789071,-1.363703,1.689054,-2.015755,0.0,1.0,2015,1,4,...,0,0,0,0,0,0,0,0,0,1


In [12]:
# Rename the target from 'cnt' to 'count' and move it to the last position.
# Popping the column and assigning it back under the new name always appends
# it at the end, so the result no longer depends on a hard-coded column index
# that only holds for one specific number of one-hot columns.
df['count'] = df.pop('cnt')

In [13]:
# Check that 'count' is now the last column.
df.head(2)

Unnamed: 0,t1,t2,hum,wind_speed,is_holiday,is_weekend,year,month,day,hour,...,W-3.0,W-4.0,W-7.0,W-10.0,W-26.0,Season-0.0,Season-1.0,Season-2.0,Season-3.0,count
0,-1.699331,-1.43929,1.444517,-1.255717,0.0,1.0,2015,1,4,0,...,1,0,0,0,0,0,0,0,1,182
1,-1.699331,-1.363703,1.444517,-1.38239,0.0,1.0,2015,1,4,1,...,0,0,0,0,0,0,0,0,1,138


- Let's get our X and y vectors (the `count` column, renamed from `cnt` above, is the target):

In [14]:
# Features are every column except the target; the target is 'count'.
# Selecting by name instead of by a hard-coded number of columns keeps X and y
# correct when the one-hot encoding yields a different number of columns.
# The explicit float dtype gives a numeric feature matrix whatever dtype the
# indicator columns have.
X = df.drop(columns=['count']).to_numpy(dtype=float)
y = df['count'].to_numpy()

Let's now split the data into training and test sets:

In [15]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split
# identical between runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

In [16]:
# Summarise the final frame: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17414 entries, 0 to 17413
Data columns (total 22 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   t1          17414 non-null  float64
 1   t2          17414 non-null  float64
 2   hum         17414 non-null  float64
 3   wind_speed  17414 non-null  float64
 4   is_holiday  17414 non-null  float64
 5   is_weekend  17414 non-null  float64
 6   year        17414 non-null  int64  
 7   month       17414 non-null  int64  
 8   day         17414 non-null  int64  
 9   hour        17414 non-null  int64  
 10  W-1.0       17414 non-null  uint8  
 11  W-2.0       17414 non-null  uint8  
 12  W-3.0       17414 non-null  uint8  
 13  W-4.0       17414 non-null  uint8  
 14  W-7.0       17414 non-null  uint8  
 15  W-10.0      17414 non-null  uint8  
 16  W-26.0      17414 non-null  uint8  
 17  Season-0.0  17414 non-null  uint8  
 18  Season-1.0  17414 non-null  uint8  
 19  Season-2.0  17414 non-nul

Let's check the dimensions:

In [17]:
# Record the size of each split and the number of feature columns.
train_size, n_columns = X_train.shape
test_size = X_test.shape[0]

# Both splits must expose the same number of feature columns.
assert X_test.shape[1] == n_columns

# Number of distinct target values seen across the two splits.
n_digits = len(set(Y_train) | set(Y_test))

print(" train_size=%d \n test_size=%d \n n_columns=%d" % (train_size, test_size, n_columns))

 train_size=12189 
 test_size=5225 
 n_columns=21


### Create and Train Models

I will create a few models and pick the best based on the metric "mean_squared_error"
- Let's start with Linear Regression and its variants

In [18]:
# Baseline: ordinary least squares fitted on the training split.
linear_regressor = LinearRegression()
linear_regressor.fit(X_train, Y_train)

# Predict once on the held-out split and derive both metrics from it.
lr_predictions = linear_regressor.predict(X_test)
MSE_LR = mean_squared_error(Y_test, lr_predictions)
RMSE_LR = np.sqrt(MSE_LR)

print(" MSE_Linear_Regression: %d \n RMSE_Linear_Regression: %d " % (MSE_LR, RMSE_LR) )

 MSE_Linear_Regression: 789266 
 RMSE_Linear_Regression: 888 


In [19]:
# Lasso with the regularisation strength chosen by 5-fold cross-validation.
lasso_regressor = LassoCV(cv=5, random_state=42)
lasso_regressor.fit(X_train, Y_train)

# Predict once on the held-out split and derive both metrics from it.
lasso_predictions = lasso_regressor.predict(X_test)
MSE_Lasso = mean_squared_error(Y_test, lasso_predictions)
RMSE_Lasso = np.sqrt(MSE_Lasso)

print(" MSE_Lasso: %d \n RMSE_Lasso: %d " % (MSE_Lasso, RMSE_Lasso) )

 MSE_Lasso: 790027 
 RMSE_Lasso: 888 


In [20]:
# Ridge with the regularisation strength chosen by 5-fold cross-validation.
ridge_regressor = RidgeCV(cv=5)
ridge_regressor.fit(X_train, Y_train)

# Predict once on the held-out split and derive both metrics from it.
ridge_predictions = ridge_regressor.predict(X_test)
MSE_Ridge = mean_squared_error(Y_test, ridge_predictions)
RMSE_Ridge = np.sqrt(MSE_Ridge)

print(" MSE_Ridge: %d \n RMSE_Ridge: %d " % (MSE_Ridge, RMSE_Ridge) )

 MSE_Ridge: 789261 
 RMSE_Ridge: 888 


In [21]:
# Elastic net with its regularisation chosen by 5-fold cross-validation.
elastic_net_regressor = ElasticNetCV(cv=5, random_state=42)
elastic_net_regressor.fit(X_train, Y_train)

# Predict once on the held-out split and derive both metrics from it.
elastic_net_predictions = elastic_net_regressor.predict(X_test)
MSE_elastic_net = mean_squared_error(Y_test, elastic_net_predictions)
RMSE_elastic_net = np.sqrt(MSE_elastic_net)

print(" MSE_elastic_net: %d \n RMSE_elastic_net: %d " % (MSE_elastic_net, RMSE_elastic_net) )

 MSE_elastic_net: 898258 
 RMSE_elastic_net: 947 


- Among the linear models, **Ridge Regression** has the lowest MSE, although plain **Linear Regression** is practically identical (MSE 789,261 vs. 789,266).

Let's now try **KNeighborsRegressor** using **GridSearchCV** to find the best hyperparameters

In [22]:
# Search over the neighbour weighting scheme and the number of neighbours.
tuned_parameters = [
    {
        'weights': ['uniform', 'distance'],
        'n_neighbors': range(2, 100),
    }
]

# 5-fold grid search scored by (negated) mean squared error.
knn_regressor = GridSearchCV(
    estimator=KNeighborsRegressor(),
    param_grid=tuned_parameters,
    cv=5,
    scoring='neg_mean_squared_error',
)
knn_regressor.fit(X_train, Y_train)

# Predict once on the held-out split and derive both metrics from it.
knn_predictions = knn_regressor.predict(X_test)
MSE_kNN = mean_squared_error(Y_test, knn_predictions)
RMSE_kNN = np.sqrt(MSE_kNN)

print(" MSE_kNN: %d \n RMSE_kNN: %d " % (MSE_kNN, RMSE_kNN) )


 MSE_kNN: 233813 
 RMSE_kNN: 483 


Now, let's try with the **RandomForestRegressor**

In [23]:
# Search over tree depth and forest size.
tuned_parameters = [{'max_depth': range(2,10),
                     'n_estimators': range(10,12)}]

# Random forests are stochastic; fixing random_state makes the selected
# hyperparameters and the reported error reproducible between runs. The seed
# 42 matches the one used for the LassoCV and ElasticNetCV models.
rf_regressor = GridSearchCV(RandomForestRegressor(random_state=42), tuned_parameters,
                            cv=5, scoring='neg_mean_squared_error')

rf_regressor.fit(X_train, Y_train)

# Predict once on the held-out split and derive both metrics from it.
rf_predictions = rf_regressor.predict(X_test)
MSE_RF = mean_squared_error(Y_test, rf_predictions)
RMSE_RF = np.sqrt(MSE_RF)

print(" MSE_RF: %d \n RMSE_RF: %d " % (MSE_RF, RMSE_RF) )

 MSE_RF: 83115 
 RMSE_RF: 288 


Now, let's try with the **XGBoostRegressor**

In [24]:
# Search over tree depth and the minimum child weight.
tuned_parameters = [{'max_depth': range(9,12), 
                     'min_child_weight':range(5,8)}]

# 5-fold grid search scored by (negated) mean squared error.
xgb_regressor = GridSearchCV(XGBRegressor(), tuned_parameters, cv=5, scoring='neg_mean_squared_error')

xgb_regressor.fit(X_train, Y_train)

MSE_xgb = mean_squared_error(Y_test, xgb_regressor.predict(X_test))
RMSE_xgb = np.sqrt(MSE_xgb)

# Label the metrics as XGBoost results; the labels previously read "RF",
# copied from the random-forest cell.
print(" MSE_xgb: %d \n RMSE_xgb: %d " % (MSE_xgb, RMSE_xgb) )

 MSE_RF: 39525 
 RMSE_RF: 198 


Now, let's try with the **Support Vector Regressor (SVR)**

In [24]:
# SVR and GridSearchCV are imported in the notebook's import cell at the top.
# Search over the penalty C and the RBF kernel width gamma. gamma is not set
# on the estimator itself because the grid overrides it for every candidate.
svr = GridSearchCV(SVR(kernel='rbf'),
      param_grid={"C": [1e0, 1e1, 1e2, 1e3], "gamma": np.logspace(-2, 2, 5)}, cv=5, scoring='neg_mean_squared_error')

svr.fit(X_train, Y_train)

# Predict once on the held-out split and derive both metrics from it.
svr_predictions = svr.predict(X_test)
MSE_SVR = mean_squared_error(Y_test, svr_predictions)
RMSE_SVR = np.sqrt(MSE_SVR)

print(" MSE_SVR: %d \n RMSE_SVR: %d " % (MSE_SVR, RMSE_SVR) )

 MSE_SVR: 183141 
 RMSE_SVR: 427 


In [25]:
# Collect every model's test-set metrics and build the comparison table in a
# single step. Growing the frame with DataFrame.append is avoided: that method
# was deprecated in pandas 1.4 and removed in pandas 2.0, and it copies the
# whole frame for each added row.
model_scores = [
    ('Linear Regression', MSE_LR, RMSE_LR),
    ('Lasso Regression', MSE_Lasso, RMSE_Lasso),
    ('Ridge Regression', MSE_Ridge, RMSE_Ridge),
    ('ElasticNet Regression', MSE_elastic_net, RMSE_elastic_net),
    ('KNeighbours Regression', MSE_kNN, RMSE_kNN),
    ('Random Forest Regression', MSE_RF, RMSE_RF),
    ('XGBoost Regression', MSE_xgb, RMSE_xgb),
    # NOTE(review): the SVR metrics are computed earlier but left out of this
    # table - confirm whether the omission is intentional.
    # ('Support Vector Regression', MSE_SVR, RMSE_SVR),
]
models_df = pd.DataFrame(model_scores, columns=['Model', 'MSE', 'RMSE'])

models_df

Unnamed: 0,Model,MSE,RMSE
0,Linear Regression,789266.595151,888.406773
1,Lasso Regression,790027.816258,888.835089
2,Ridge Regression,789261.712145,888.404025
3,ElasticNet Regression,898258.951731,947.765241
4,KNeighbours Regression,233813.49396,483.54265
5,Random Forest Regression,83115.971594,288.298407
6,XGBoost Regression,39525.701924,198.810719


It looks like XGBoost Regressor results are by far the best.

In [26]:
# Persist the fitted XGBoost grid search to "<model_filename>.<model_extension>".
# The context manager closes the file handle even if pickling fails, so the
# file is always flushed and released.
with open(model_filename + "." + model_extension, 'wb') as model_file:
    pickle.dump(xgb_regressor, model_file)