In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
sns.set()

In [None]:
# Load the pre-processed dataset. Column 0 of the CSV becomes the initial index, and
# parse_dates=True asks pandas to try parsing that index as dates.
# NOTE(review): later cells convert and index by a separate 'Date' column, so this
# index_col/parse_dates pair may be redundant — confirm against the CSV layout.
df = pd.read_csv('data/processed_data.csv', index_col=0, parse_dates=True)

In [None]:
# Preview the first five rows to sanity-check the load.
df.head(5)

In [None]:
# Convert the 'Date' column to datetime values so that, once it becomes the index,
# time-series plots get a proper date axis.
df['Date'] = pd.to_datetime(df['Date'])
# Show the dtypes to confirm the conversion took effect.
df.dtypes

In [None]:
# Use the parsed dates as the index so plots get a time axis.
# The guard makes this cell safe to re-run: once 'Date' has been moved into the
# index it is no longer a column, and an unconditional set_index('Date') would
# raise KeyError on a second execution. If 'Date' is neither the index nor a
# column, set_index still raises, so genuine data problems are not hidden.
if df.index.name != 'Date':
    df = df.set_index('Date')
df.head(5)

In [None]:
# Number of rows and columns in the frame.
df.shape

In [None]:
# List the column names available as candidate features and target.
df.columns

In [None]:
# The regression below is fitted with fit_intercept=False, i.e. scikit-learn estimates
# no intercept term (which normally presumes the data is already centred).
# That is appropriate in this use case because the day-of-week flags essentially
# operate as their own day-specific intercepts.
indep_cols = [
    # day-of-week flags
    'Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun',
    # remaining explanatory columns
    'holiday', 'DayLightHrs', 'AvgTempInC', 'PRCP_IN', 'DryDay', 'YearsCount',
]
y = df.loc[:, 'TotalBikesCount']  # regression target
x = df.loc[:, indep_cols]         # explanatory variables, in the order listed above

In [None]:
from sklearn.linear_model import LinearRegression

# Least-squares fit without an intercept term. fit() returns the estimator itself,
# so construction and fitting are chained; ending the cell with `model` displays
# the same estimator repr that a bare model.fit(...) call would.
model = LinearRegression(fit_intercept=False).fit(x, y)
model

In [None]:
# In-sample predictions: the model is evaluated on the same x it was fitted on.
predictions = model.predict(x)
predictions

In [None]:
# Pair the observed counts with the fitted values in a fresh frame (df itself is
# left untouched) so both series can be inspected and plotted together.
df1 = df[['TotalBikesCount']].assign(Predictions=predictions)
df1.head(5)

In [None]:
import datetime as dt  # NOTE(review): not used in this cell; kept in case other cells rely on `dt`

# Overlay observed and predicted counts. Working with the Axes returned by
# DataFrame.plot (rather than pyplot's implicit "current axes") keeps the label,
# title and legend calls tied to this figure, and the label/title let the figure
# stand on its own when the notebook is skimmed.
ax = df1.plot(alpha=0.4, figsize=(18, 5), color=['r', 'g'])
ax.set_ylabel('Daily bicycle count')
ax.set_title('Observed vs. predicted daily bicycle count')
ax.legend(loc='upper left')
plt.show()
# If x-axis doesn't show years, make sure the 'Date' column was converted to datetime from String, before it was set as index

It is evident that we have missed some key features, especially during the summer time. Either our features are not complete (i.e., people decide whether to ride to work based on more than just these) or there are some nonlinear relationships that we have failed to take into account (e.g., perhaps people ride less at both high and low temperatures).

Nevertheless, our rough approximation is enough to give us some insights, and we can take a look at the coefficients of the linear model to estimate how much each feature contributes to the daily bicycle count:

In [None]:
# Raw coefficient array of the fitted model, one entry per column of x.
model.coef_

In [None]:
# Label each coefficient with its feature name so the values are readable.
coeffs = pd.Series(model.coef_, index=x.columns)
coeffs

These numbers are difficult to interpret without some measure of their uncertainty. 

We can compute these uncertainties quickly using bootstrap resamplings of the data:

In [None]:
# Karthik's Playground
# The bootstrap/resampling is mainly used for test statistics rather than regression coefficients.
# This cell shows why refitting on a resample gives different coefficients.
from sklearn.base import clone
from sklearn.utils import resample

# Baseline: fit on the full data set. `model` stays fitted on the full data for
# the rest of this cell.
model.fit(x, y)
print('model.coef_',model.coef_)

# Refit an unfitted copy (same hyper-parameters) on bootstrap resamples instead of
# the shared `model`, so this experiment cannot leave `model` fitted to a random
# subset. Fixed, distinct random_state values make the printed coefficients
# reproducible while still differing from one draw to the next.
demo_model = clone(model)
demo_model.fit(*resample(x, y, random_state=0))
print('model.coef_',demo_model.coef_)
demo_model.fit(*resample(x, y, random_state=1))
print('model.coef_',demo_model.coef_)
print('Why are the  coefficients different with data resampling/shuffling? Only the order of the data is changed and not the data itself, right?')


# Small illustration: resample() draws rows WITH replacement by default, so some
# rows are repeated and others dropped — the data is not merely reordered.
temp = np.asarray([
                [1,1,2,3,4], # 1st row
                [2,6,7,8,9], # 2nd row
                [3,6,7,8,9], # 3rd row
                [4,6,7,8,9], # 4th row
                [5,6,7,8,9]  # 5th row
              ])
print('temp :',temp)
print('temp resampled :',resample(temp, random_state=0))
print('Resampled data is different and seems close to the original one. And now you know why the coefficients are different.')

In [None]:
from sklearn.base import clone
from sklearn.utils import resample

N_BOOTSTRAP = 1000  # number of bootstrap refits used to estimate the spread
np.random.seed(1)   # resample() draws from NumPy's global RNG when random_state is None

# Refit an unfitted copy of the estimator (same hyper-parameters) on each bootstrap
# sample. Using the shared `model` here would leave it fitted to the last random
# resample, so re-running any earlier cell that reads model.coef_ would silently
# report bootstrap-sample coefficients instead of the full-data fit.
boot_model = clone(model)
coeffs_tmp = [boot_model.fit(*resample(x, y)).coef_ for _ in range(N_BOOTSTRAP)]
# Standard deviation down the rows (axis 0): one uncertainty value per coefficient.
err = np.std(coeffs_tmp, 0)

In [None]:
# With these errors estimated, let's again look at the results.
# The frame is left as the cell's last expression so Jupyter renders it as a
# formatted table rather than the plain-text dump that print() produces.
# `err` is a plain array, so it is matched to `coeffs` by position.
pd.DataFrame({'effect': coeffs.round(0),
              'error': err.round(0)})

#### Observations
* We first see that there is a relatively stable trend in the weekly baseline: there are many more riders on weekdays than on weekends and holidays.
* For each additional hour of daylight, 121 ± 7 more people choose to ride.
* A temperature increase of one degree Celsius encourages 390 ± 16 more people to grab their bicycle.
* A dry day means an average of 513 ± 25 more riders.
* Each inch of precipitation means 180406 ± 11018 more people leave their bike at home. (This effect is orders of magnitude larger than the others, which suggests `PRCP_IN` may not actually be expressed in inches; the unit conversion in the preprocessing step should be verified.)
* Once all these effects are accounted for, we see a modest increase of 30 ± 6 new daily riders each year.