# Bike sharing data set (source: UCI Machine Learning Repository)

In [1]:
import pandas as pd 
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Load the hourly bike-share records from the CSV in the working directory.
bikedata = pd.read_csv("bikeShareHour.csv")

# Check for missing values in two ways: info() prints the non-null count and
# dtype of every column, and the per-column null totals are shown as the
# cell's result. Both report no missing values.
bikedata.info()
bikedata.isnull().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17379 entries, 0 to 17378
Data columns (total 17 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   instant     17379 non-null  int64  
 1   dteday      17379 non-null  object 
 2   season      17379 non-null  int64  
 3   yr          17379 non-null  int64  
 4   mnth        17379 non-null  int64  
 5   hr          17379 non-null  int64  
 6   holiday     17379 non-null  int64  
 7   weekday     17379 non-null  int64  
 8   workingday  17379 non-null  int64  
 9   weathersit  17379 non-null  int64  
 10  temp        17379 non-null  float64
 11  atemp       17379 non-null  float64
 12  hum         17379 non-null  float64
 13  windspeed   17379 non-null  float64
 14  casual      17379 non-null  int64  
 15  registered  17379 non-null  int64  
 16  cnt         17379 non-null  int64  
dtypes: float64(4), int64(12), object(1)
memory usage: 2.3+ MB


instant       0
dteday        0
season        0
yr            0
mnth          0
hr            0
holiday       0
weekday       0
workingday    0
weathersit    0
temp          0
atemp         0
hum           0
windspeed     0
casual        0
registered    0
cnt           0
dtype: int64

In [3]:
# Peak-hour indicator: 1 when the hour falls in 7-9 or 17-19 (both ends
# inclusive), 0 otherwise. The boolean mask is cast to integers.
bikedata['peakDummy'] = (
    bikedata.hr.between(7, 9) | bikedata.hr.between(17, 19)
).astype(int)

In [4]:
# Predictor columns for the regression.
FEATURE_COLUMNS = ['hum', 'hr', 'holiday', 'weekday', 'workingday',
                   'atemp', 'windspeed', 'peakDummy']

# Select by label rather than with DataFrame.filter: filter silently skips
# labels that are not present, so a misspelled name would quietly drop a
# feature, whereas label indexing raises a KeyError. The copy keeps X
# independent of bikedata.
X = bikedata[FEATURE_COLUMNS].copy()

# Target: the 'cnt' column.
Y = bikedata['cnt']
X.head()

Unnamed: 0,hum,hr,holiday,weekday,workingday,atemp,windspeed,peakDummy
0,0.81,0,0,6,0,0.2879,0.0,0
1,0.8,1,0,6,0,0.2727,0.0,0
2,0.8,2,0,6,0,0.2727,0.0,0
3,0.75,3,0,6,0,0.2879,0.0,0
4,0.75,4,0,6,0,0.2879,0.0,0


In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) per feature column.
feature_summary = X.describe()
feature_summary

Unnamed: 0,hum,hr,holiday,weekday,workingday,atemp,windspeed,peakDummy
count,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0
mean,0.627229,11.546752,0.02877,3.003683,0.682721,0.475775,0.190098,0.25128
std,0.19293,6.914405,0.167165,2.005771,0.465431,0.17185,0.12234,0.433762
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.48,6.0,0.0,1.0,0.0,0.3333,0.1045,0.0
50%,0.63,12.0,0.0,3.0,1.0,0.4848,0.194,0.0
75%,0.78,18.0,0.0,5.0,1.0,0.6212,0.2537,1.0
max,1.0,23.0,1.0,6.0,1.0,1.0,0.8507,1.0


In [25]:
# Linear regression evaluated over repeated random splits:
# draw a random 75/25 train/test split nmc times, fit on the training part,
# score both parts with LinearRegression.score, and report the mean scores.
nmc = 1000   # number of random train/test splits
SEED = 42    # base seed so the experiment is reproducible after a kernel restart

# Size the score arrays from nmc: with a hard-coded length, changing nmc would
# either overflow the arrays or leave unused zeros that bias the means.
lrtrainscore = np.zeros(nmc)
lrtestscore = np.zeros(nmc)

for i in range(nmc):
    # A distinct but fixed seed per iteration: every split differs, yet a
    # re-run reproduces exactly the same sequence of splits.
    X_train, X_test, y_train, y_test = train_test_split(
        X, Y, test_size=0.25, random_state=SEED + i
    )
    lrm = LinearRegression()
    trainFitlr = lrm.fit(X_train, y_train)  # fit() returns the estimator itself
    lrtrainscore[i] = trainFitlr.score(X_train, y_train)
    lrtestscore[i] = trainFitlr.score(X_test, y_test)

print("Train score using Linear Reg", np.mean(lrtrainscore))
print("Test score using Linear Reg", np.mean(lrtestscore))

Train score using Linear Reg 0.49994373815701704
Test score using Linear Reg 0.499273627401233
