# OOP Bike Model

In [10]:
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_regression

#regression
from sklearn.linear_model import LinearRegression, Ridge, Lasso, RidgeCV
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.svm import SVR, LinearSVC, SVC
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.gaussian_process.kernels import ExpSineSquared
from sklearn.kernel_ridge import KernelRidge
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB

# data analysis stack
import numpy as np
import pandas as pd

# data visualization stack
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set_style('whitegrid')

# data pre-processing stack
from sklearn.preprocessing import MinMaxScaler,OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import ElasticNet


# miscellaneous
import scipy.stats as ss
# Ignore  the warnings
import warnings
warnings.filterwarnings('always')
warnings.filterwarnings('ignore')

# data visualisation and manipulation
from matplotlib import style

style.use('fivethirtyeight')
sns.set(style='whitegrid',color_codes=True)

#import the necessary modelling algos.
#model selection
from sklearn.model_selection import train_test_split,cross_validate
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV

#evaluation metrics
from sklearn.metrics import mean_squared_log_error,mean_squared_error, r2_score,mean_absolute_error # for regression
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score  # for classification
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import (
    StandardScaler,
    OneHotEncoder,
    PolynomialFeatures
)
import time

In [11]:
df = pd.read_csv('../data/train_bike.csv', parse_dates=True)
df.columns


Index(['datetime', 'season', 'holiday', 'workingday', 'weather', 'temp',
       'atemp', 'humidity', 'windspeed', 'casual', 'registered', 'count'],
      dtype='object')

In [12]:
def convert_date(dft):
    
    
    dft['datetime'] = pd.to_datetime(dft['datetime'])
    dft['month'] = dft['datetime'].dt.month
    dft['hour'] = dft['datetime'].dt.hour
    dft['weekday'] = dft['datetime'].dt.dayofweek
    dft["day"]=dft["datetime"].dt.day
    dft["year"]=dft["datetime"].dt.year
    dft['dayofweek'] = dft['datetime'].dt.dayofweek
    dft['month_start'] = dft['datetime'].dt.is_month_start
    dft['woy'] = dft['datetime'].dt.isocalendar().week.astype(int)
    
    return dft

df_new = convert_date(df)
df_new.head()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count,month,hour,weekday,day,year,dayofweek,month_start,woy
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16,1,0,5,1,2011,5,True,52
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,40,1,1,5,1,2011,5,True,52
2,2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0,5,27,32,1,2,5,1,2011,5,True,52
3,2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0,3,10,13,1,3,5,1,2011,5,True,52
4,2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0,0,1,1,1,4,5,1,2011,5,True,52


In [13]:
df.describe().T

Unnamed: 0,count,mean,min,25%,50%,75%,max,std
datetime,10886.0,2011-12-27 05:56:22.399411968,2011-01-01 00:00:00,2011-07-02 07:15:00,2012-01-01 20:30:00,2012-07-01 12:45:00,2012-12-19 23:00:00,
season,10886.0,2.506614,1.0,2.0,3.0,4.0,4.0,1.116174
holiday,10886.0,0.028569,0.0,0.0,0.0,0.0,1.0,0.166599
workingday,10886.0,0.680875,0.0,0.0,1.0,1.0,1.0,0.466159
weather,10886.0,1.418427,1.0,1.0,1.0,2.0,4.0,0.633839
temp,10886.0,20.23086,0.82,13.94,20.5,26.24,41.0,7.79159
atemp,10886.0,23.655084,0.76,16.665,24.24,31.06,45.455,8.474601
humidity,10886.0,61.88646,0.0,47.0,62.0,77.0,100.0,19.245033
windspeed,10886.0,12.799395,0.0,7.0015,12.998,16.9979,56.9969,8.164537
casual,10886.0,36.021955,0.0,4.0,17.0,49.0,367.0,49.960477


In [14]:
numerical_features = ["holiday", "workingday", "weather", "temp", "atemp", "humidity", "windspeed", "year", "month", 'day', 'season',  "hour", "dayofweek"]
start_features = ['datetime', 'season', 'holiday', 'workingday', 'weather', 'temp', 'atemp', 'humidity', 'windspeed', 'casual', 'registered', 'count']
test_features = ['datetime', 'season', 'holiday', 'workingday', 'weather', 'temp', 'atemp', 'humidity', 'windspeed']

categorical_features = []

features = numerical_features + categorical_features

target = 'count'

X,y = df[features],df[target]

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train.shape, X_test.shape

((8708, 13), (2178, 13))

###  **Preprocessing**

In [16]:
# scaling and polynomial features
numerical_transformer = Pipeline(
    steps=[
        ('scaler', StandardScaler()),
        ('polynomial', PolynomialFeatures())
    ]
)
# one-hot encoding
categorical_transformer = Pipeline(
    steps=[
        ('ohe', OneHotEncoder(drop='first'))
    ]
)
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numerical_transformer, test_features)
        #("cat", categorical_transformer, categorical_features)
    ]
)


In [17]:

estimator = Pipeline(
    steps=[
        ('preprocessor', preprocessor),   # preprocessing step
        ('model', RandomForestRegressor(random_state=42)) # RF regression
    ]
)
rfr = RandomForestRegressor(random_state=42)


In [18]:
from sklearn.model_selection import RandomizedSearchCV# Number of trees in random forest


In [19]:

n_estimators = [int(x) for x in np.linspace(start = 10, stop = 800, num = 5)]
# Number of features to consider at every split
max_features = ['sqrt', 'log', 0.2]
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(5, 40, num = 5)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap,
               'criterion' : ['squared_error','poisson']
               }
print(random_grid)

{'n_estimators': [10, 207, 405, 602, 800], 'max_features': ['sqrt', 'log', 0.2], 'max_depth': [5, 13, 22, 31, 40, None], 'min_samples_split': [2, 5], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False], 'criterion': ['squared_error', 'poisson']}


In [20]:

gscv = GridSearchCV(
    estimator=rfr,
    param_grid=random_grid,
    scoring='r2',
    cv=5, 
    n_jobs=-1,
    verbose=1
)


In [21]:
# initial time
ti = time.time()

# grid-search cross-validation
gscv.fit(X_train, y_train)

# final time 
tf = time.time()

# time taken
print(f"time taken: {round(tf-ti,2)} sec")

Fitting 5 folds for each of 2160 candidates, totalling 10800 fits


In [1]:

# all results
gscv.cv_results_


NameError: name 'gscv' is not defined

In [None]:
# list of columns to show
column_list = ['param_preprocessor__num__polynomial__degree',
               'param_preprocessor__num__polynomial__interaction_only',
               'param_lasso__alpha',
               'param_lasso__max_iter',
               'mean_test_score',
               'std_test_score',
               'rank_test_score'
              ]
# create result dataframe
result_df = pd.DataFrame(gscv.cv_results_)[column_list]

# rename columns
result_df.rename(
    columns=lambda name: name.split('__')[-1],inplace=True
)

# order by rank
result_df.sort_values(
    by='rank_test_score', ascending=True, inplace=True, ignore_index=True
)

result_df