In [8]:
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [9]:
df_train = pd.read_csv('./train.csv')

In [10]:
#Extend the df by splitting the datetime into new categories
df_train['year'] = df_train.datetime.apply(lambda x: x.split()[0].split('-')[0])
df_train['month'] = df_train.datetime.apply(lambda x: x.split()[0].split('-')[1])
df_train['day'] = df_train.datetime.apply(lambda x: x.split()[0].split('-')[2])
df_train['hour'] = df_train.datetime.apply(lambda x : x.split()[1].split(":")[0])

#Infer Day Name
sr = pd.to_datetime(df_train['datetime']) 
df_train['Day_name'] = sr.dt.day_name()


#Season List 
season_dict = {1: 'Spring', 2: 'Summer', 3: 'Fall', 4: 'Winter'}
df_train['Season_Exp'] = df_train['season'].map(season_dict)

#Weather List
weather_dict = {1:'clear' , 2:'mist' , 3:'light_precip' , 4: 'heavy_precip' }
df_train['Weather_Exp'] = df_train['weather'].map(weather_dict)

# Dummy a Subset of Columns # 

In [15]:
df_te_train = df_train

#Dummify everything
column_list_to_dummify = ['Season_Exp', 'Weather_Exp', 'year', 'month', 'Day_name', 'hour']

#Dummify everything
df_dummy_train = pd.get_dummies(df_te_train[column_list_to_dummify], drop_first = True )
df_dummy_train.head()

#Concatenate with df_train
dummy_train = pd.concat([df_train,df_dummy_train], axis = 1)
dummy_train.head()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,...,hour_14,hour_15,hour_16,hour_17,hour_18,hour_19,hour_20,hour_21,hour_22,hour_23
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3,...,0,0,0,0,0,0,0,0,0,0
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8,...,0,0,0,0,0,0,0,0,0,0
2,2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0,5,...,0,0,0,0,0,0,0,0,0,0
3,2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0,3,...,0,0,0,0,0,0,0,0,0,0
4,2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0,0,...,0,0,0,0,0,0,0,0,0,0


In [16]:
dummy_train.columns

Index(['datetime', 'season', 'holiday', 'workingday', 'weather', 'temp',
       'atemp', 'humidity', 'windspeed', 'casual', 'registered', 'count',
       'year', 'month', 'day', 'hour', 'Day_name', 'Season_Exp', 'Weather_Exp',
       'Season_Exp_Spring', 'Season_Exp_Summer', 'Season_Exp_Winter',
       'Weather_Exp_heavy_precip', 'Weather_Exp_light_precip',
       'Weather_Exp_mist', 'year_2012', 'month_02', 'month_03', 'month_04',
       'month_05', 'month_06', 'month_07', 'month_08', 'month_09', 'month_10',
       'month_11', 'month_12', 'Day_name_Monday', 'Day_name_Saturday',
       'Day_name_Sunday', 'Day_name_Thursday', 'Day_name_Tuesday',
       'Day_name_Wednesday', 'hour_01', 'hour_02', 'hour_03', 'hour_04',
       'hour_05', 'hour_06', 'hour_07', 'hour_08', 'hour_09', 'hour_10',
       'hour_11', 'hour_12', 'hour_13', 'hour_14', 'hour_15', 'hour_16',
       'hour_17', 'hour_18', 'hour_19', 'hour_20', 'hour_21', 'hour_22',
       'hour_23'],
      dtype='object')

In [17]:
#Define data and train sets

Xtrain = dummy_train.loc[:,'Season_Exp_Spring':'hour_23']
ytrain = dummy_train['count']


In [18]:
Xtrain.shape, ytrain.shape

((10886, 47), (10886,))

In [19]:
#Scikit Linear Reg
m = LinearRegression()


In [20]:
m.fit(Xtrain, ytrain), m.score(Xtrain, ytrain)

(LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
          normalize=False), 0.6828150545274851)

In [21]:
ypred = m.predict(Xtrain)
mean_squared_error(ytrain, ypred) #Scikit

10406.932863167141

In [22]:
m.coef_

array([-3.00098458e+14, -9.45107362e+13, -1.13974851e+15, -2.10405680e+02,
       -8.98443103e+01, -1.85637099e+01,  9.61577840e+01,  2.40425340e+01,
        6.01291369e+01, -2.05587722e+14, -2.05587722e+14, -2.05587722e+14,
       -3.00098458e+14, -3.00098458e+14, -3.00098458e+14,  8.39650057e+14,
        8.39650057e+14,  8.39650057e+14, -9.85043800e+00,  2.81871338e+00,
       -1.81845859e+01,  3.97390838e-01, -5.17236873e+00, -4.16040963e+00,
       -1.93572434e+01, -3.25735957e+01, -4.70093083e+01, -5.01357232e+01,
       -3.52514110e+01,  2.37831923e+01,  1.61659868e+02,  3.10543726e+02,
        1.68427176e+02,  1.21252722e+02,  1.56061409e+02,  2.03134507e+02,
        2.03266834e+02,  1.89381134e+02,  2.01655670e+02,  2.62571861e+02,
        4.16535215e+02,  3.79050410e+02,  2.60624389e+02,  1.74029461e+02,
        1.18445627e+02,  7.89625790e+01,  3.68681868e+01])