In [28]:
import pandas as pd
import numpy as np
import os
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import __version__ as sklearn_version
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV, learning_curve
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest, f_regression
import datetime
from sklearn.feature_selection import mutual_info_regression
from sklearn.model_selection import cross_val_score


In [29]:
# Path to the day.csv dataset. Set the BIKE_DATA_PATH environment variable to
# load the file from another location; when it is unset, the original
# hardcoded location is used so existing setups keep working.
bike_data_path = os.environ.get(
    'BIKE_DATA_PATH',
    '/Users/esrasaydam/Documents/Springboard/GitHub/Capstone Project #2/day.csv',
)
bike_data = pd.read_csv(bike_data_path)

In [30]:
# Preview the first five rows of the raw frame.
bike_data.head(n=5)

Unnamed: 0,instant,dteday,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,1,2011-01-01,1,0,1,0,6,0,2,0.344167,0.363625,0.805833,0.160446,331,654,985
1,2,2011-01-02,1,0,1,0,0,0,2,0.363478,0.353739,0.696087,0.248539,131,670,801
2,3,2011-01-03,1,0,1,0,1,1,1,0.196364,0.189405,0.437273,0.248309,120,1229,1349
3,4,2011-01-04,1,0,1,0,2,1,1,0.2,0.212122,0.590435,0.160296,108,1454,1562
4,5,2011-01-05,1,0,1,0,3,1,1,0.226957,0.22927,0.436957,0.1869,82,1518,1600


In [49]:
# Create dummy (indicator) features for the categorical variables.
# Each column is listed exactly once: a name repeated in `columns` makes
# pd.get_dummies encode that column again, which leaves duplicate indicator
# columns (and duplicate column labels) in the result.
categorical_columns = ['season', 'weathersit', 'yr', 'mnth', 'holiday', 'weekday', 'workingday']

# Work on a copy so the raw bike_data frame stays untouched.
df1 = bike_data.copy()
df1 = pd.get_dummies(df1, columns=categorical_columns)
display(df1)


Unnamed: 0,instant,dteday,temp,atemp,hum,windspeed,casual,registered,cnt,season_1,...,holiday_1,weekday_0,weekday_1,weekday_2,weekday_3,weekday_4,weekday_5,weekday_6,workingday_0,workingday_1
0,1,2011-01-01,0.344167,0.363625,0.805833,0.160446,331,654,985,1,...,0,0,0,0,0,0,0,1,1,0
1,2,2011-01-02,0.363478,0.353739,0.696087,0.248539,131,670,801,1,...,0,1,0,0,0,0,0,0,1,0
2,3,2011-01-03,0.196364,0.189405,0.437273,0.248309,120,1229,1349,1,...,0,0,1,0,0,0,0,0,0,1
3,4,2011-01-04,0.200000,0.212122,0.590435,0.160296,108,1454,1562,1,...,0,0,0,1,0,0,0,0,0,1
4,5,2011-01-05,0.226957,0.229270,0.436957,0.186900,82,1518,1600,1,...,0,0,0,0,1,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
726,727,2012-12-27,0.254167,0.226642,0.652917,0.350133,247,1867,2114,1,...,0,0,0,0,0,1,0,0,0,1
727,728,2012-12-28,0.253333,0.255046,0.590000,0.155471,644,2451,3095,1,...,0,0,0,0,0,0,1,0,0,1
728,729,2012-12-29,0.253333,0.242400,0.752917,0.124383,159,1182,1341,1,...,0,0,0,0,0,0,0,1,1,0
729,730,2012-12-30,0.255833,0.231700,0.483333,0.350754,364,1432,1796,1,...,0,1,0,0,0,0,0,0,1,0


In [43]:
# Inspect the dtype of every column in the raw frame. In the output below,
# 'dteday' is the only object column; all other columns are int64 or float64.
bike_data.dtypes

instant         int64
dteday         object
season          int64
yr              int64
mnth            int64
holiday         int64
weekday         int64
workingday      int64
weathersit      int64
temp          float64
atemp         float64
hum           float64
windspeed     float64
casual          int64
registered      int64
cnt             int64
dtype: object

In [50]:
# Prepare df1 for scaling/modelling by removing columns that must not be used
# as features:
#   - 'dteday' is an object (string) column, so it cannot be fed to a numeric scaler.
#   - 'casual' and 'registered' add up to 'cnt' in the rows shown by
#     bike_data.head() (e.g. 331 + 654 = 985), so keeping them would leak the
#     target into the features.
# The frame is reassigned instead of mutated in place, and errors='ignore'
# makes the cell safe to re-run after the columns are already gone.
df1 = df1.drop(columns=['dteday', 'registered', 'casual'], errors='ignore')

df1.head()

Unnamed: 0,instant,temp,atemp,hum,windspeed,cnt,season_1,season_2,season_3,season_4,...,holiday_1,weekday_0,weekday_1,weekday_2,weekday_3,weekday_4,weekday_5,weekday_6,workingday_0,workingday_1
0,1,0.344167,0.363625,0.805833,0.160446,985,1,0,0,0,...,0,0,0,0,0,0,0,1,1,0
1,2,0.363478,0.353739,0.696087,0.248539,801,1,0,0,0,...,0,1,0,0,0,0,0,0,1,0
2,3,0.196364,0.189405,0.437273,0.248309,1349,1,0,0,0,...,0,0,1,0,0,0,0,0,0,1
3,4,0.2,0.212122,0.590435,0.160296,1562,1,0,0,0,...,0,0,0,1,0,0,0,0,0,1
4,5,0.226957,0.22927,0.436957,0.1869,1600,1,0,0,0,...,0,0,0,0,1,0,0,0,0,1


In [45]:
# Standardize every column of df1 with a StandardScaler and wrap the result
# in a DataFrame that carries df1's column labels.
# NOTE(review): the scaler is fitted on the whole of df1 (including 'cnt' if it
# is still a column) and ahead of the train/test split, and the split cell
# reads df1 rather than scaled_df — confirm this is intended.
scaler = StandardScaler()
scaled_df = pd.DataFrame(scaler.fit_transform(df1), columns=df1.columns)


In [51]:
# Hold out 30% of the rows as the test set. 'cnt' is the target; every other
# column of df1 is used as a predictor. random_state fixes the shuffle so the
# split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df1.drop('cnt', axis=1),
    df1['cnt'],
    test_size=0.3,
    random_state=47,
)
