In [62]:
import pandas as pd
import numpy as np
import os
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import __version__ as sklearn_version
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV, learning_curve
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest, f_regression
import datetime
from sklearn.feature_selection import mutual_info_regression
from sklearn.model_selection import cross_val_score


In [63]:
bike_data = pd.read_csv('/Users/esrasaydam/Documents/Springboard/GitHub/Capstone Project #2/day.csv')

In [64]:
bike_data.head()

Unnamed: 0,instant,dteday,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,1,2011-01-01,1,0,1,0,6,0,2,0.344167,0.363625,0.805833,0.160446,331,654,985
1,2,2011-01-02,1,0,1,0,0,0,2,0.363478,0.353739,0.696087,0.248539,131,670,801
2,3,2011-01-03,1,0,1,0,1,1,1,0.196364,0.189405,0.437273,0.248309,120,1229,1349
3,4,2011-01-04,1,0,1,0,2,1,1,0.2,0.212122,0.590435,0.160296,108,1454,1562
4,5,2011-01-05,1,0,1,0,3,1,1,0.226957,0.22927,0.436957,0.1869,82,1518,1600


In [65]:
# let's make some numbers categorical...
bike_data['yr'] = bike_data.yr.astype('category')
bike_data['season'] = bike_data.season.astype('category')
bike_data['mnth'] = bike_data.mnth.astype('category')
bike_data['holiday'] = bike_data.holiday.astype('category')
bike_data['weekday'] = bike_data.weekday.astype('category')
bike_data['workingday'] = bike_data.workingday.astype('category')
bike_data['weathersit'] = bike_data.weathersit.astype('category')


In [66]:
#Create dummy or indicator features for categorical variables
df1 = bike_data.copy()

df1 = pd.get_dummies(df1,
                     columns = ['season', 'weathersit', 'season', 'yr', 'mnth', 'holiday', 'weekday', 'workingday'])
display(df1)


Unnamed: 0,instant,dteday,temp,atemp,hum,windspeed,casual,registered,cnt,season_1,...,holiday_1,weekday_0,weekday_1,weekday_2,weekday_3,weekday_4,weekday_5,weekday_6,workingday_0,workingday_1
0,1,2011-01-01,0.344167,0.363625,0.805833,0.160446,331,654,985,1,...,0,0,0,0,0,0,0,1,1,0
1,2,2011-01-02,0.363478,0.353739,0.696087,0.248539,131,670,801,1,...,0,1,0,0,0,0,0,0,1,0
2,3,2011-01-03,0.196364,0.189405,0.437273,0.248309,120,1229,1349,1,...,0,0,1,0,0,0,0,0,0,1
3,4,2011-01-04,0.200000,0.212122,0.590435,0.160296,108,1454,1562,1,...,0,0,0,1,0,0,0,0,0,1
4,5,2011-01-05,0.226957,0.229270,0.436957,0.186900,82,1518,1600,1,...,0,0,0,0,1,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
726,727,2012-12-27,0.254167,0.226642,0.652917,0.350133,247,1867,2114,1,...,0,0,0,0,0,1,0,0,0,1
727,728,2012-12-28,0.253333,0.255046,0.590000,0.155471,644,2451,3095,1,...,0,0,0,0,0,0,1,0,0,1
728,729,2012-12-29,0.253333,0.242400,0.752917,0.124383,159,1182,1341,1,...,0,0,0,0,0,0,0,1,1,0
729,730,2012-12-30,0.255833,0.231700,0.483333,0.350754,364,1432,1796,1,...,0,1,0,0,0,0,0,0,1,0


In [67]:
#let's make sure of the feature types.
bike_data.dtypes

instant          int64
dteday          object
season        category
yr            category
mnth          category
holiday       category
weekday       category
workingday    category
weathersit    category
temp           float64
atemp          float64
hum            float64
windspeed      float64
casual           int64
registered       int64
cnt              int64
dtype: object

In [68]:
# Standardize the magnitude of numeric features using a scaler
#drop the 'dteday' column which is an object and also 'registered', 'casual' and 'atemp' to simplify the numbers
df1.drop(['dteday', 'registered', 'casual', 'atemp'],  axis=1, inplace=True)

df1.head()

Unnamed: 0,instant,temp,hum,windspeed,cnt,season_1,season_2,season_3,season_4,weathersit_1,...,holiday_1,weekday_0,weekday_1,weekday_2,weekday_3,weekday_4,weekday_5,weekday_6,workingday_0,workingday_1
0,1,0.344167,0.805833,0.160446,985,1,0,0,0,0,...,0,0,0,0,0,0,0,1,1,0
1,2,0.363478,0.696087,0.248539,801,1,0,0,0,0,...,0,1,0,0,0,0,0,0,1,0
2,3,0.196364,0.437273,0.248309,1349,1,0,0,0,1,...,0,0,1,0,0,0,0,0,0,1
3,4,0.2,0.590435,0.160296,1562,1,0,0,0,1,...,0,0,0,1,0,0,0,0,0,1
4,5,0.226957,0.436957,0.1869,1600,1,0,0,0,1,...,0,0,0,0,1,0,0,0,0,1


In [69]:
# Making a Scaler object
scaler = StandardScaler()
# Fitting data to the scaler object
scaled_df = scaler.fit_transform(df1)
scaled_df = pd.DataFrame(scaled_df, columns= df1.columns)


In [70]:
#Split into testing and training datasets 
X_train, X_test, y_train, y_test = train_test_split(df1.drop(columns='cnt'), 
                                                   df1.cnt, test_size=0.3, 
                                                    random_state=47)


In [71]:
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import auc
from sklearn.linear_model import LogisticRegression
from matplotlib import pyplot
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report,confusion_matrix,roc_curve,roc_auc_score
from sklearn.metrics import accuracy_score,log_loss
from matplotlib import pyplot
from sklearn.model_selection import cross_val_score


Models to try
1 - Linear Regression model
2 - Elastic Net
3 - Random Forest Regression
4 - Gradient Boosting
5 - Decision Tree
6- Extra Tree Regressor
7 - Bagging Regressor
8- Ada Boost Regressor
9- Huber Regressor

In [75]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import ElasticNetCV
from sklearn.svm import SVR
from sklearn.ensemble import BaggingRegressor
from sklearn.datasets import make_regression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.linear_model import Ridge
from sklearn.ensemble import AdaBoostRegressor
from sklearn import model_selection
from sklearn.linear_model import HuberRegressor, LinearRegression
from sklearn.model_selection import train_test_split
from datetime import datetime as dt
from scipy.stats import loguniform



In [83]:


models = [LinearRegression(),
          AdaBoostRegressor(),
          Ridge(),
          HuberRegressor(),
          ElasticNetCV(),
          DecisionTreeRegressor(), 
          ExtraTreesRegressor(),
          GradientBoostingRegressor(),
          RandomForestRegressor(),
          BaggingRegressor()]


# A function is wrtten to find out the cross validation score based on mean absolute error

def compare_models(model):
    #also counting running time
    start_time = dt.now()
    kfold = model_selection.KFold(n_splits=10)
    mean_dev_scores = model_selection.cross_val_score(model, X_train, y_train, cv=kfold, scoring='neg_mean_absolute_error')
    r2_scores = model_selection.cross_val_score(model, X_train, y_train, cv=kfold, scoring='r2')
    Scores= pd.DataFrame({'Mean deviation':[np.mean(mean_dev_scores)],'R Square':[np.mean(r2_scores)]})
    print(Scores)
    end_time = dt.now()
    print('Running time: {}'.format(end_time - start_time))

for model in models:
    compare_models(model)

   Mean deviation  R Square
0     -601.039529   0.80564
Running time: 0:00:00.076800
   Mean deviation  R Square
0     -696.262702   0.79137
Running time: 0:00:01.075941
   Mean deviation  R Square
0     -600.992095  0.803111
Running time: 0:00:00.040511


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("

   Mean deviation  R Square
0     -663.536781   0.75955
Running time: 0:00:00.928161
   Mean deviation  R Square
0    -1240.342298  0.365735
Running time: 0:00:00.491159
   Mean deviation  R Square
0     -670.641667  0.725621
Running time: 0:00:00.086439
   Mean deviation  R Square
0     -497.833518  0.853449
Running time: 0:00:03.358704
   Mean deviation  R Square
0     -461.229775   0.87482
Running time: 0:00:01.286478
   Mean deviation  R Square
0     -476.710663  0.860604
Running time: 0:00:03.978140
   Mean deviation  R Square
0     -493.476395  0.852314
Running time: 0:00:00.481864


GradientBoostingRegressor() is the model with the highest R score of 0.87443.
The second best would be BaggingRegressor() with 0.852314 of score because of its fast running time. ExtraTreesRegressor() has also the score of 0.85 but its running time is over 3 seconds!

NameError: name 'model' is not defined