# OOP Bike Model

In [275]:
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_regression

#regression
from sklearn.linear_model import LinearRegression, Ridge, Lasso, RidgeCV
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.svm import SVR, LinearSVC, SVC
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.gaussian_process.kernels import ExpSineSquared
from sklearn.kernel_ridge import KernelRidge
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB

# data analysis stack
import numpy as np
import pandas as pd

# data visualization stack
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set_style('whitegrid')

# data pre-processing stack
from sklearn.preprocessing import MinMaxScaler,OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import ElasticNet


# miscellaneous
import scipy.stats as ss
# Ignore  the warnings
import warnings
warnings.filterwarnings('always')
warnings.filterwarnings('ignore')

# data visualisation and manipulation
from matplotlib import style

style.use('fivethirtyeight')
sns.set(style='whitegrid',color_codes=True)

#import the necessary modelling algos.
#model selection
from sklearn.model_selection import train_test_split,cross_validate
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV

#evaluation metrics
from sklearn.metrics import mean_squared_log_error,mean_squared_error, r2_score,mean_absolute_error # for regression
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score  # for classification
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import (
    StandardScaler,
    OneHotEncoder,
    PolynomialFeatures
)
import time

In [276]:
from sklearn.pipeline import make_pipeline
import sklearn as skl 

In [277]:
df = pd.read_csv('train_bike.csv', parse_dates=True)
df.columns


Index(['datetime', 'season', 'holiday', 'workingday', 'weather', 'temp',
       'atemp', 'humidity', 'windspeed', 'casual', 'registered', 'count'],
      dtype='object')

In [289]:
def convert_date(dft):
    
    
    dft['datetime'] = pd.to_datetime(dft['datetime'])
    dft['month'] = dft['datetime'].dt.month
    dft['hour'] = dft['datetime'].dt.hour
    dft['weekday'] = dft['datetime'].dt.dayofweek
    dft["day"]=dft["datetime"].dt.day
    dft["year"]=dft["datetime"].dt.year
    dft['dayofweek'] = dft['datetime'].dt.dayofweek
    dft['month_start'] = dft['datetime'].dt.is_month_start
    dft['month'] =  dft['month'].astype(int)+(dft["year"]-dft["year"].min())*12
    dft = dft.drop(columns = ["datetime"])

    return dft

df_new = convert_date(df)
df_new.head()

Unnamed: 0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count,month,hour,weekday,day,year,dayofweek,month_start
0,1,0,0,1,9.84,14.395,81,0.0,3,13,16,1,0,5,1,2011,5,True
1,1,0,0,1,9.02,13.635,80,0.0,8,32,40,1,1,5,1,2011,5,True
2,1,0,0,1,9.02,13.635,80,0.0,5,27,32,1,2,5,1,2011,5,True
3,1,0,0,1,9.84,14.395,75,0.0,3,10,13,1,3,5,1,2011,5,True
4,1,0,0,1,9.84,14.395,75,0.0,0,1,1,1,4,5,1,2011,5,True


In [291]:
df_new.columns

Index(['season', 'holiday', 'workingday', 'weather', 'temp', 'atemp',
       'humidity', 'windspeed', 'casual', 'registered', 'count', 'month',
       'hour', 'weekday', 'day', 'year', 'dayofweek', 'month_start'],
      dtype='object')

In [302]:
numerical_features = ['atemp', 'windspeed', 'humidity', 'day']
categorical_features = ['holiday', 'workingday', 'weather', 'month', 'hour', 'weekday', 'dayofweek']


#start_features = ['datetime', 'season', 'holiday', 'workingday', 'weather', 'temp', 'atemp', 'humidity', 'windspeed', 'casual', 'registered', 'count']
test_features = ['datetime', 'season', 'holiday', 'workingday', 'weather', 'temp', 'atemp', 'humidity', 'windspeed']

features = numerical_features + categorical_features

target = 'count'

X,y = df_new[features],df_new[target]

X1,y1 = df[features], df[target]

X.head()

Unnamed: 0,atemp,windspeed,humidity,day,holiday,workingday,weather,month,hour,weekday,dayofweek
0,14.395,0.0,81,1,0,0,1,1,0,5,5
1,13.635,0.0,80,1,0,0,1,1,1,5,5
2,13.635,0.0,80,1,0,0,1,1,2,5,5
3,14.395,0.0,75,1,0,0,1,1,3,5,5
4,14.395,0.0,75,1,0,0,1,1,4,5,5


In [283]:
X1_train, X1_test, y1_train, y1_test = train_test_split(X1, y1, test_size=0.2, random_state=42)
X1_train.shape, X1_test.shape

((8708, 7), (2178, 7))

In [303]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train.shape, X_test.shape

((8708, 11), (2178, 11))

In [296]:
class DateImputer(BaseEstimator, TransformerMixin):
    """
    imputer for date-time columns
    """
    def __init__(self):
        print('>>>>>>> Date-time init() called.')
    def fit(self, X, y = None):
        print('>>>>>>> Date-time fit() called.')
        return self
    def transform(self, X, y = None):
        X_ = X.copy() # creating a copy to avoid changes to original dataset
        X_ = convert_date(X_)
        print('>>>>>>> Date-time transform() called.', X_[features])
        return X_[features]

###  **Preprocessing**

In [304]:
# scaling and polynomial features
numerical_transformer = Pipeline(
    steps=[
       # ('datetime', DateImputer()),
        ('scaler', StandardScaler())
    ]
)

# one-hot encoding
categorical_transformer = Pipeline(
    steps=[
        ('ohe', OneHotEncoder(drop='first'))
    ]
) 
preprocessor = ColumnTransformer(
    transformers = [
        # ("datetime", date_transformer, features), 
        ("num", numerical_transformer, numerical_features), 
        ("cat", categorical_transformer, categorical_features)
    ]
)


estimator = Pipeline(
    steps = [
        ('preprocessor', preprocessor), # preprocessing step
        ('rf', RandomForestRegressor()) # random forest regression
    ]
)

In [305]:
X_train

Unnamed: 0,atemp,windspeed,humidity,day,holiday,workingday,weather,month,hour,weekday,dayofweek
2815,31.820,6.0032,83,6,0,1,1,7,5,2,2
8695,40.910,19.9995,39,4,0,0,1,20,16,5,5
8406,35.605,16.9979,41,11,0,1,1,19,15,2,2
1543,18.180,7.0015,93,10,0,0,2,4,4,6,6
4952,15.150,16.9979,45,19,0,0,1,11,10,5,5
...,...,...,...,...,...,...,...,...,...,...,...
5734,8.335,11.0014,47,14,0,0,1,13,2,5,5
5191,12.880,19.0012,61,10,0,0,1,12,9,5,5
5390,13.635,16.9979,48,18,0,0,1,12,16,6,6
860,19.695,35.0008,17,19,0,0,1,2,7,5,5


In [306]:
estimator.fit(X_train, y_train)
test_pred = estimator.predict(X_test)
test_pred

array([127.96,  34.82, 141.42, ..., 441.02,   9.71, 183.49])

In [133]:
models={'RandomForestRegressor': [RandomForestRegressor(), 0],
        'Lasso LinearRegressor': [Lasso(), 0],
        'Ridge LinearRegressor': [Ridge(), 0],
        'Elastic LinearRegressor': [ElasticNet(), 0],
        'BaggingRegressor': [BaggingRegressor(), 0], 
        'KNeighborsRegressor()': [KNeighborsRegressor(), 0]}


for model in models:
    clf = models[model][0]
   # pipe_model = make_pipeline(DateImputer(), OneHotEncoder(drop='first'), clf)
    pipe = Pipeline(steps=[('datetime', DateImputer()), ("scaler", OneHotEncoder(drop='first')),  ("regressor", clf)])
    clf.fit(X_train, y_train)
    test_pred = clf.predict(X_test)
    models[model][1] = (np.sqrt(mean_squared_error(y_test, test_pred)))

models

>>>>>>> Date-time init() called.
>>>>>>> Date-time init() called.
>>>>>>> Date-time init() called.
>>>>>>> Date-time init() called.
>>>>>>> Date-time init() called.
>>>>>>> Date-time init() called.


{'RandomForestRegressor': [RandomForestRegressor(), 39.387878054082904],
 'Lasso LinearRegressor': [Lasso(), 141.35522688241565],
 'Ridge LinearRegressor': [Ridge(), 141.31937034086312],
 'Elastic LinearRegressor': [ElasticNet(), 141.3321374485149],
 'BaggingRegressor': [BaggingRegressor(), 43.2969874592756],
 'KNeighborsRegressor()': [KNeighborsRegressor(), 121.06622826099917]}

In [134]:
from sklearn.model_selection import RandomizedSearchCV# Number of trees in random forest



 ## As  we can see, best result we have with the RandomForestRegressor  

### RandomForestRegressor: 39.71424383006143

In [309]:

rf__n_estimators = [int(x) for x in np.linspace(start = 300, stop = 600, num = 3)]
# Number of features to consider at every split
rf__max_features = ['sqrt']  #['sqrt', 'log', 0.2]
# Maximum number of levels in tree
rf__max_depth = [] # [int(x) for x in np.linspace(5, 20, num = 3)]
rf__max_depth.append(None)
# Minimum number of samples required to split a node
rf__min_samples_split = [2, 5]
# Minimum number of samples required at each leaf node
rf__min_samples_leaf = [1, 2]
# Method of selecting samples for training each tree
rf__bootstrap = [False]# Create the random grid
param_grid = {'rf__n_estimators': rf__n_estimators,
               'rf__max_features': rf__max_features,
               'rf__max_depth': rf__max_depth,
               'rf__min_samples_split': rf__min_samples_split,
               'rf__min_samples_leaf': rf__min_samples_leaf,
               'rf__bootstrap': rf__bootstrap,
               'rf__criterion' : ['squared_error']
               }
print(param_grid)

{'rf__n_estimators': [300, 450, 600], 'rf__max_features': ['sqrt'], 'rf__max_depth': [None], 'rf__min_samples_split': [2, 5], 'rf__min_samples_leaf': [1, 2], 'rf__bootstrap': [False], 'rf__criterion': ['squared_error']}


In [148]:
gscv = GridSearchCV(
    estimator=estimator,
    param_grid=param_grid,
    scoring='r2',
    cv=5, 
    n_jobs=-1,
    verbose=1
)

In [149]:
# initial time
ti = time.time()

# grid-search cross-validation
gscv.fit(X1_train, y1_train)

# final time 
tf = time.time()

# time taken
print(f"time taken: {round(tf-ti,2)} sec")

>>>>>>> Date-time init() called.
Fitting 5 folds for each of 12 candidates, totalling 60 fits
>>>>>>> Date-time init() called.
>>>>>>> Date-time init() called.
>>>>>>> Date-time init() called.
>>>>>>> Date-time init() called.
>>>>>>> Date-time init() called.
>>>>>>> Date-time init() called.
>>>>>>> Date-time init() called.
>>>>>>> Date-time init() called.
>>>>>>> Date-time init() called.
>>>>>>> Date-time init() called.
>>>>>>> Date-time init() called.
>>>>>>> Date-time init() called.
>>>>>>> Date-time init() called.
>>>>>>> Date-time init() called.
>>>>>>> Date-time init() called.
>>>>>>> Date-time init() called.
>>>>>>> Date-time init() called.
>>>>>>> Date-time init() called.
>>>>>>> Date-time init() called.
>>>>>>> Date-time init() called.
>>>>>>> Date-time init() called.
>>>>>>> Date-time init() called.
>>>>>>> Date-time init() called.
>>>>>>> Date-time init() called.
>>>>>>> Date-time init() called.
>>>>>>> Date-time init() called.
>>>>>>> Date-time init() called.
>>>>>>> Date-ti

In [150]:
# all results
gscv.cv_results_


{'mean_fit_time': array([ 7.28390803, 11.03341041, 15.14670281,  5.64789281,  9.28990617,
        11.29834094,  5.45814939,  8.54467201, 10.85773134,  5.49485431,
         7.37724347,  8.03912177]),
 'std_fit_time': array([0.16272099, 0.49576669, 0.67737641, 0.43294127, 0.60061487,
        0.69020939, 0.21076939, 0.46343533, 0.19076724, 0.29438403,
        0.28723419, 0.31482222]),
 'mean_score_time': array([0.31296172, 0.42282863, 0.53461432, 0.20565124, 0.3049077 ,
        0.4135283 , 0.20978565, 0.32012072, 0.36783581, 0.20054131,
        0.20783081, 0.21138458]),
 'std_score_time': array([0.05865269, 0.03237353, 0.06729327, 0.01183156, 0.01546358,
        0.06914843, 0.01155444, 0.00989675, 0.01718811, 0.02109577,
        0.01442957, 0.0106165 ]),
 'param_rf__bootstrap': masked_array(data=[False, False, False, False, False, False, False, False,
                    False, False, False, False],
              mask=[False, False, False, False, False, False, False, False,
              

In [151]:
gscv.best_params_

{'rf__bootstrap': False,
 'rf__criterion': 'squared_error',
 'rf__max_depth': None,
 'rf__max_features': 'sqrt',
 'rf__min_samples_leaf': 1,
 'rf__min_samples_split': 2,
 'rf__n_estimators': 300}

In [152]:
round(gscv.best_score_,6)

0.886337

In [153]:
best_model = gscv.best_estimator_
best_model

In [154]:
test_data = pd.read_csv('test_bike.csv', parse_dates=True)
test_data.columns

Index(['datetime', 'season', 'holiday', 'workingday', 'weather', 'temp',
       'atemp', 'humidity', 'windspeed'],
      dtype='object')

In [300]:
test_data_new =  convert_date(test_data)

In [310]:
pred_bike = estimator.predict(test_data_new)
prediction = pd.DataFrame(pred_bike.ravel(), columns=["count"])
prediction = pd.concat([test_data["datetime"], prediction], axis=1)
prediction.to_csv("prediction_bike_few_base.csv", index=False)

In [155]:
pred_bike = best_model.predict(test_data)

>>>>>>> Date-time transform() called.       workingday  weather   temp   atemp  humidity  windspeed  month  day  \
0              1        1  10.66  11.365        56    26.0027      1   20   
1              1        1  10.66  13.635        56     0.0000      1   20   
2              1        1  10.66  13.635        56     0.0000      1   20   
3              1        1  10.66  12.880        56    11.0014      1   20   
4              1        1  10.66  12.880        56    11.0014      1   20   
...          ...      ...    ...     ...       ...        ...    ...  ...   
6488           1        2  10.66  12.880        60    11.0014     12   31   
6489           1        2  10.66  12.880        60    11.0014     12   31   
6490           1        1  10.66  12.880        60    11.0014     12   31   
6491           1        1  10.66  13.635        56     8.9981     12   31   
6492           1        1  10.66  13.635        65     8.9981     12   31   

      season  hour  dayofweek  woy  


In [156]:
prediction = pd.DataFrame(pred_bike.ravel(), columns=["count"])
prediction = pd.concat([test_data["datetime"], prediction], axis=1)
prediction.head(5)

Unnamed: 0,datetime,count
0,2011-01-20 00:00:00,15.943333
1,2011-01-20 01:00:00,10.52
2,2011-01-20 02:00:00,9.893333
3,2011-01-20 03:00:00,7.686667
4,2011-01-20 04:00:00,7.463333


In [157]:
prediction.to_csv("prediction_bike_few.csv", index=False)