In [3]:
import pandas as pd 
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.tree import DecisionTreeClassifier
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
# import mglearn
from sklearn import tree
from sklearn.tree import DecisionTreeRegressor
from sklearn.kernel_ridge import KernelRidge

In [4]:
bikedata = pd.read_csv("bikeShareHour.csv")
# Checking for missing values (found none in any method)
bikedata.info()
bikedata.isnull().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17379 entries, 0 to 17378
Data columns (total 17 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   instant     17379 non-null  int64  
 1   dteday      17379 non-null  object 
 2   season      17379 non-null  int64  
 3   yr          17379 non-null  int64  
 4   mnth        17379 non-null  int64  
 5   hr          17379 non-null  int64  
 6   holiday     17379 non-null  int64  
 7   weekday     17379 non-null  int64  
 8   workingday  17379 non-null  int64  
 9   weathersit  17379 non-null  int64  
 10  temp        17379 non-null  float64
 11  atemp       17379 non-null  float64
 12  hum         17379 non-null  float64
 13  windspeed   17379 non-null  float64
 14  casual      17379 non-null  int64  
 15  registered  17379 non-null  int64  
 16  cnt         17379 non-null  int64  
dtypes: float64(4), int64(12), object(1)
memory usage: 2.3+ MB


instant       0
dteday        0
season        0
yr            0
mnth          0
hr            0
holiday       0
weekday       0
workingday    0
weathersit    0
temp          0
atemp         0
hum           0
windspeed     0
casual        0
registered    0
cnt           0
dtype: int64

In [5]:
# Create a peak hour dummy
# This clever code converts the boolean to an integer by multiplying by 1
bikedata['peakDummy'] = 1*(((bikedata.hr>=7) & (bikedata.hr<=9))|((bikedata.hr>=17)&(bikedata.hr<=19)))

In [6]:
# X = bikedata.filter(['hum', 'hr', 'holiday', 'weekday', 'workingday', 'atemp', 'windspeed','peakDummy'], axis=1)
y = bikedata['cnt']
# All data for regression and decision tree
X = bikedata.filter(['yr', 'hr', 'season', 'holiday', 'weekday', 'workingday', 'weathersit', 'atemp', 'windspeed', 'WeekendDummy', 'peakDummy'], axis=1)
X.head()
print(len(y))

17379


In [7]:
X.describe()
y.head()

0    16
1    40
2    32
3    13
4     1
Name: cnt, dtype: int64

In [8]:
# Linear regression
nmc = 250
lrtrainscore = np.zeros(nmc)
lrtestscore = np.zeros(nmc)
for i in range(nmc):
        X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2)
        lrm = LinearRegression()
        trainFitlr = lrm.fit(X_train, y_train)
        lrtrainscore[i] = trainFitlr.score(X_train,y_train)
        lrtestscore[i] =  trainFitlr.score(X_test,y_test)      
print("Train score using Linear Reg", np.mean(lrtrainscore))
print("Test score using Linear Reg", np.mean(lrtestscore))

Train score using Linear Reg 0.5295155722148212
Test score using Linear Reg 0.5294832546717476


In [9]:
# Ridge
from sklearn.preprocessing import StandardScaler

nmc = 250
rtrainscore = []
rtestscore = []
rtrainstd = []
rteststd = []
alphas = np.array([0.5,1.,2.])
for a in alphas:
    trainScore = np.zeros(nmc)
    testScore = np.zeros(nmc)
    for i in range(nmc):
          X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2)   
          rr = Ridge(alpha=a)
          scaler = StandardScaler().fit(X_train)
          X_train2 = scaler.transform(X_train)
          X_test2  = scaler.transform(X_test)
          trainFit = rr.fit(X_train2,y_train)
          trainScore[i] = trainFit.score(X_train2,y_train)
          testScore[i] =  trainFit.score(X_test2,y_test)
    rtrainscore.append(np.mean(trainScore))
    rtestscore.append(np.mean(testScore))
    rteststd.append(np.std(testScore))
    rtrainstd.append(np.std(trainScore))
print("Train score using Ridge and alpha = 0.5, 1 and 2 \n ", rtrainscore)
print("Test score using Ridge and alpha = 0.5, 1 and 2 \n ", rtestscore)

Train score using Ridge and alpha = 0.5, 1 and 2 
  [0.5298099235842673, 0.5297501975557354, 0.5293990423114927]
Test score using Ridge and alpha = 0.5, 1 and 2 
  [0.5282880213638405, 0.5285336469643208, 0.529978563102125]


### Do the same thing with a pipeline

In [10]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import ShuffleSplit
from sklearn.pipeline import make_pipeline

In [11]:
nmc = 250
alphas = np.array([0.5,1.0,2.0])
shuffle = ShuffleSplit(n_splits=nmc, test_size=.2)
for a in alphas:
    fullModel = make_pipeline(StandardScaler(), Ridge(alpha=a))
    CVInfo = cross_validate(fullModel, X, y, cv=shuffle,return_train_score=True)
    print("Alpha=",a)
    print(np.mean(CVInfo['train_score']))
    print(np.mean(CVInfo['test_score']))

Alpha= 0.5
0.5296555646550598
0.5289095944013901
Alpha= 1.0
0.5298799584871667
0.5280302617076609
Alpha= 2.0
0.5297694749419468
0.5284658615443248


### Lasso

In [12]:
nmc = 250
alphas = np.array([0.001,0.03,0.05])
shuffle = ShuffleSplit(n_splits=nmc, test_size=.2,)
for a in alphas:
    fullModel = make_pipeline(StandardScaler(), Lasso(alpha=a))
    CVInfo = cross_validate(fullModel, X, y, cv=shuffle,return_train_score=True)
    print("Alpha=",a)
    print(np.mean(CVInfo['train_score']))
    print(np.mean(CVInfo['test_score']))

Alpha= 0.001
0.5299098857102539
0.5278889816180402
Alpha= 0.03
0.5297724698486017
0.5284394149132027
Alpha= 0.05
0.5298151994653717
0.5283016839891815


In [13]:
# Use lasso to find important variables
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2)   
scaler = StandardScaler().fit(X_train)
X_train2 = scaler.transform(X_train)
lasso = Lasso(alpha=0.03)
lasso.fit(X_train2,y_train)
print(lasso.coef_)

[ 44.08473199  53.08656437  16.65627757  -3.14715181   5.20183989
   1.10423663 -18.59398193  57.67043514  12.28209672  75.99399901]


In [14]:
nmc = 100
shuffle = ShuffleSplit(n_splits=nmc, test_size=.2,)
from sklearn.neighbors import KNeighborsRegressor
kneighbors = [1, 5, 25, 100]
for k in kneighbors:
        fullModel = make_pipeline(StandardScaler(), KNeighborsRegressor(n_neighbors=k))
        CVInfo = cross_validate(fullModel, X, y, cv=shuffle,return_train_score=True)
        print("k=",k)
        print(np.mean(CVInfo['train_score']))
        print(np.mean(CVInfo['test_score']))

k= 1
0.9987948927478663
0.7480481397122452
k= 5
0.8883503940738177
0.8295894769104086
k= 25
0.8282213381305245
0.8114429257654061
k= 100
0.7544829753116723
0.7480294060302586


In [15]:
# tree regression
from sklearn.tree import DecisionTreeRegressor
nmc = 100
shuffle = ShuffleSplit(n_splits=nmc, test_size=.2,)
depth = np.array([5,10,25,50])
for d in depth:
        treeModel= DecisionTreeRegressor(max_depth=d)
        CVInfo = cross_validate(treeModel, X, y, cv=shuffle,return_train_score=True)
        print("depth=",d)
        print(np.mean(CVInfo['train_score']))
        print(np.mean(CVInfo['test_score']))

depth= 5
0.7077135690536243
0.7037506051969136
depth= 10
0.9292189505014998
0.8982237558891073
depth= 25
0.9993776440650288
0.8781184409760742
depth= 50
0.9993761039234619
0.8798869626008816


In [16]:
# Random forest regression
from sklearn.ensemble import RandomForestRegressor
nmc = 100
shuffle = ShuffleSplit(n_splits=nmc, test_size=.2,)
depth = np.array([5,10,25,50])
for d in depth:
        forestModel= RandomForestRegressor(n_estimators=25,max_depth=d)
        CVInfo = cross_validate(forestModel, X, y, cv=shuffle,return_train_score=True)
        print("depth=",d)
        print(np.mean(CVInfo['train_score']))
        print(np.mean(CVInfo['test_score']))

depth= 5
0.7455035599392309
0.7413059718118135
depth= 10
0.9426641655035742
0.9201907859884656
depth= 25
0.9889801563446126
0.932294086612515
depth= 50
0.9890191155462498
0.9307449223939099


In [27]:
from sklearn.kernel_ridge import KernelRidge
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ShuffleSplit
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
import pandas as pd
nmc = 25
shuffle = ShuffleSplit(n_splits=nmc, test_size=.25,)
# for a in [0.5, 1.0, 1.5, 2.0]:
for g in [0.01,0.05,0.5,1.0, 1.5]:
    fullModel = make_pipeline(StandardScaler(),KernelRidge(alpha=1.5, gamma=g))
    CVInfo = cross_validate(forestModel, X, y, cv=shuffle,return_train_score=True)
    print("param=",g)
    print(np.mean(CVInfo['train_score']))
    print(np.mean(CVInfo['test_score']))

param= 0.01
0.9888054422035379
0.9317961522161906
param= 0.05
0.9889462263388944
0.9301198103034638
param= 0.5
0.9888423063496496
0.9305537411370689
param= 1.0
0.9890321788218138
0.9303093433015007
param= 1.5
0.9889695925877711
0.9295069250041821


In [19]:
from sklearn.kernel_ridge import KernelRidge
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ShuffleSplit
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
import pandas as pd
nmc = 100
score_used = 'r2' 

shuffle = ShuffleSplit(n_splits=nmc, test_size=.2,)
# Set up pipeline with StandardScaler
fullModel = make_pipeline(StandardScaler(),KernelRidge())

# set up dictionary for grid search
param_grid={'kernelridge__alpha':[0.005,0.05,0.1,0.2, 0.3, 0.5,1.],'kernelridge__gamma':[0.001, 0.01,0.1,0.2,0.5,1.0,2.0],'kernelridge__kernel':['rbf']}

# set up cross-validation shuffles
shuffle_split = ShuffleSplit(test_size=0.25, n_splits=25)

# set up search
grid_search=GridSearchCV(fullModel,param_grid,cv=shuffle_split,scoring=score_used, 
                              return_train_score=True)
# implement search
grid_search.fit(X,y)
# move results into DataFrame
results = pd.DataFrame(grid_search.cv_results_)
presults = results[['rank_test_score','mean_test_score','param_kernelridge__alpha','param_kernelridge__gamma']]
print(presults)

KeyboardInterrupt: 

In [18]:
y

0         16
1         40
2         32
3         13
4          1
        ... 
17374    119
17375     89
17376     90
17377     61
17378     49
Name: cnt, Length: 17379, dtype: int64