In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

import seaborn as sns

# Show floats with 3 decimals in both NumPy array and pandas object output.
np.set_printoptions(precision=3)
# Use the fully-qualified option key: since pandas 1.4 the short pattern
# 'precision' also matches 'styler.format.precision', so it is ambiguous and
# raises OptionError ("Pattern matched multiple keys").
pd.set_option('display.precision', 3)

# Apply seaborn's default plot styling to the figures drawn below.
sns.set()

### London bike sharing

##### Dataset
https://www.kaggle.com/hmavrodiev/london-bike-sharing-dataset

"timestamp" - timestamp field for grouping the data</br>
"cnt" - the count of new bike shares</br>
"t1" - real temperature in C</br>
"t2" - temperature in C "feels like"</br>
"hum" - humidity in percentage</br>
"wind_speed" - wind speed in km/h</br>
"weather_code" - category of the weather</br>
"is_holiday" - boolean field - 1 holiday / 0 non holiday</br>
"is_weekend" - boolean field - 1 if the day is weekend</br>
"season" - category field meteorological seasons: 0-spring ; 1-summer; 2-fall; 3-winter.</br>

"weather_code" category description:</br>
1 = Clear; mostly clear but have some values with haze/fog/patches of fog/fog in vicinity</br>
2 = Scattered clouds / few clouds</br>
3 = Broken clouds</br>
4 = Cloudy</br>
7 = Rain / light rain shower / light rain</br>
10 = Rain with thunderstorm</br>
26 = Snowfall</br>
94 = Freezing fog


In [None]:
# Read the sampled London bike-sharing records from a CSV given by relative
# path, then preview the first five rows.
bike_data = pd.read_csv(filepath_or_buffer='london_bikes_sample.csv')
bike_data.head(n=5)

In [None]:
# Print the column names, dtypes and non-null counts of the loaded frame.
bike_data.info()

In [None]:
# Derive calendar features (year, month, hour) from the raw timestamp and then
# remove the timestamp column itself.
# The guard keeps the cell safe to re-run: once 'timestamp' has been dropped,
# executing the cell again leaves the frame untouched instead of raising KeyError.
if 'timestamp' in bike_data.columns:
    # Parse the timestamp column once and reuse it for all three features.
    timestamps = pd.DatetimeIndex(bike_data['timestamp'])
    bike_data = bike_data.assign(
        year=timestamps.year,
        month=timestamps.month,
        hour=timestamps.hour,
    ).drop(columns='timestamp')

bike_data.head()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
bike_data.describe()

In [None]:
# Distribution of the bike-share count within each season, drawn on an
# explicitly created 12x7 inch axes.
fig, ax = plt.subplots(figsize=(12, 7))
sns.boxplot(data=bike_data, x='season', y='cnt', ax=ax)

In [None]:
# Pairwise scatter plots of all numeric columns, with each column's own
# distribution on the diagonal.
sns.pairplot(bike_data)

In [None]:
# Scatter every column against the target 'cnt', one panel per column,
# laid out on a 4x4 grid.
fig = plt.figure(figsize=(18, 20))
for position, feature in enumerate(bike_data.columns, start=1):
    axis = fig.add_subplot(4, 4, position)
    axis.scatter(bike_data[feature], bike_data['cnt'])
    axis.set_xlabel(feature)
    axis.set_ylabel('count')

In [None]:
# Rank (Spearman) correlation between all columns, shown as an annotated heatmap.
spearman_corr = bike_data.corr(method='spearman')

fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(spearman_corr, annot=True, linewidths=0.7, ax=ax)

In [None]:
# Target is the bike-share count; every other column is a feature.
Y = bike_data['cnt']
X = bike_data.drop(columns='cnt')
X.head()

In [None]:
import sklearn.metrics as metrics

from sklearn.linear_model import LinearRegression, Lasso, Ridge, LassoCV, RidgeCV
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split

In [None]:
# Hold out the last 30% of the rows for testing; shuffle=False keeps the
# original row order, so the first 70% of the rows form the training set.
split_parts = train_test_split(X, Y, test_size=0.3, shuffle=False)
X_train, X_test, Y_train, Y_test = split_parts
X_train

In [None]:
# Standardise the features. The scaler learns mean/variance from the training
# rows only and the same transformation is then applied to both splits.
std_scaler = StandardScaler().fit(X_train)

X_train_scaled = std_scaler.transform(X_train)
X_test_scaled = std_scaler.transform(X_test)

X_train_scaled

In [None]:
# Baseline: ordinary least squares on the standardised features,
# evaluated with the mean squared error on the held-out rows.
simple_LR = LinearRegression().fit(X_train_scaled, Y_train)
simple_LR_predictions = simple_LR.predict(X_test_scaled)

simple_LR_MSE = metrics.mean_squared_error(
    y_true=Y_test,
    y_pred=simple_LR_predictions,
)

print(f'MSE = {round(simple_LR_MSE,3)}')

In [None]:
# Root mean squared error of the baseline model, in the same units as 'cnt'.
# Computed as sqrt(MSE) rather than through the `squared=False` flag, which
# scikit-learn deprecated in 1.4 and removed in 1.6.
simple_LR_RMSE = np.sqrt(metrics.mean_squared_error(Y_test, simple_LR_predictions))

print(f'RMSE = {round(simple_LR_RMSE,3)}')

In [None]:
# Standard deviation of the test target, printed as a reference scale for the RMSE above.
print (f'Test std: {round(Y_test.std(),3)}')

In [None]:
# Predicted vs. actual counts for the baseline linear regression.
fig, ax = plt.subplots(figsize=(8, 8))
ax.scatter(simple_LR_predictions, Y_test)
ax.set_xlabel('Predictions')
ax.set_ylabel('Target')

In [None]:
# Fitted intercept and per-feature coefficients of the baseline model
# (the model was fit on the standardised features).
simple_LR.intercept_, simple_LR.coef_

In [None]:
# Coefficient table labelled with the columns of the matrix the model was
# trained on, instead of assuming that 'cnt' is the first column of bike_data.
pd.DataFrame(simple_LR.coef_, X_train.columns, columns=['Coefficients'])

In [None]:
# Lasso regression with scikit-learn's default regularisation strength.
model = Lasso()

model.fit(X_train_scaled, Y_train)
predictions = model.predict(X_test_scaled)

# sqrt(MSE) replaces `squared=False`, which scikit-learn deprecated in 1.4
# and removed in 1.6.
rmse = np.sqrt(metrics.mean_squared_error(Y_test, predictions))
print(f'RMSE = {round(rmse, 3)}')

# Label coefficients with the training columns rather than relying on 'cnt'
# being the first column of bike_data.
pd.DataFrame(model.coef_, X_train.columns, columns=['Coefficients'])

In [None]:
# Lasso with the regularisation strength chosen by cross-validation
# over scikit-learn's default alpha grid.
model = LassoCV(random_state=10)

model.fit(X_train_scaled, Y_train)
predictions = model.predict(X_test_scaled)

# sqrt(MSE) replaces `squared=False`, which scikit-learn deprecated in 1.4
# and removed in 1.6.
rmse = np.sqrt(metrics.mean_squared_error(Y_test, predictions))
print(f'RMSE = {round(rmse, 3)}')

# Label coefficients with the training columns rather than relying on 'cnt'
# being the first column of bike_data.
pd.DataFrame(model.coef_, X_train.columns, columns=['Coefficients'])

In [None]:
# Ridge regression with scikit-learn's default regularisation strength.
model = Ridge()

model.fit(X_train_scaled, Y_train)
predictions = model.predict(X_test_scaled)

# sqrt(MSE) replaces `squared=False`, which scikit-learn deprecated in 1.4
# and removed in 1.6.
rmse = np.sqrt(metrics.mean_squared_error(Y_test, predictions))
print(f'RMSE = {round(rmse, 3)}')

# Label coefficients with the training columns rather than relying on 'cnt'
# being the first column of bike_data.
pd.DataFrame(model.coef_, X_train.columns, columns=['Coefficients'])

In [None]:
# Ridge with the regularisation strength chosen by cross-validation
# over scikit-learn's default alpha candidates.
model = RidgeCV()

model.fit(X_train_scaled, Y_train)
predictions = model.predict(X_test_scaled)

# sqrt(MSE) replaces `squared=False`, which scikit-learn deprecated in 1.4
# and removed in 1.6.
rmse = np.sqrt(metrics.mean_squared_error(Y_test, predictions))
print(f'RMSE = {round(rmse, 3)}')

# Label coefficients with the training columns rather than relying on 'cnt'
# being the first column of bike_data.
pd.DataFrame(model.coef_, X_train.columns, columns=['Coefficients'])

In [None]:
# Candidate regularisation strengths: 20 evenly spaced values from 1 to 10 (inclusive).
alphas = np.linspace(start=1, stop=10, num=20)
alphas

In [None]:
# Cross-validated Lasso restricted to the explicit `alphas` grid.
model = LassoCV(random_state=10, alphas=alphas)

model.fit(X_train_scaled, Y_train)
predictions = model.predict(X_test_scaled)

# sqrt(MSE) replaces `squared=False`, which scikit-learn deprecated in 1.4
# and removed in 1.6.
rmse = np.sqrt(metrics.mean_squared_error(Y_test, predictions))
print(f'RMSE = {round(rmse, 3)}')

# Label coefficients with the training columns rather than relying on 'cnt'
# being the first column of bike_data.
pd.DataFrame(model.coef_, X_train.columns, columns=['Coefficients'])

In [None]:
# One-hot encode the categorical weather code.
# The prefix gives every indicator column a string name. Without it the new
# columns are named by the raw code values, so the frame mixes string and
# non-string column names, which scikit-learn >= 1.2 rejects with a TypeError
# when the frame is passed to an estimator or transformer.
weather_dummies = pd.get_dummies(bike_data['weather_code'], prefix='weather_code')

bike_data_dummy = pd.concat(
    [bike_data.drop(columns='weather_code'), weather_dummies],
    axis=1,
)
bike_data_dummy.head()

In [None]:
# Refit the cross-validated Lasso on the one-hot encoded feature set.
# Column names are cast to str so that scikit-learn (>= 1.2) accepts the frame
# even if some column names are not strings; this is a no-op when they
# already are.
X = bike_data_dummy.drop(['cnt'], axis=1).rename(columns=str)
Y = bike_data_dummy['cnt']

# shuffle=False keeps the original row order: first 70% train, last 30% test.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, shuffle=False)

# Fit the scaler on the training rows only, then apply it to both splits.
std_scaler = StandardScaler()

X_train_scaled = std_scaler.fit_transform(X_train)
X_test_scaled = std_scaler.transform(X_test)

model = LassoCV(alphas=alphas)

model.fit(X_train_scaled, Y_train)
predictions = model.predict(X_test_scaled)

# sqrt(MSE) replaces `squared=False`, which scikit-learn deprecated in 1.4
# and removed in 1.6.
rmse = np.sqrt(metrics.mean_squared_error(Y_test, predictions))
print(f'RMSE = {round(rmse, 3)}')

# Label coefficients with the training columns rather than relying on 'cnt'
# being the first column of bike_data_dummy.
pd.DataFrame(model.coef_, X_train.columns,
             columns=['Coefficients']).sort_values('Coefficients')

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Random forest trained on the unscaled feature matrix.
model = RandomForestRegressor(random_state=10, n_estimators=100)

model.fit(X_train, Y_train)
predictions = model.predict(X_test)

# sqrt(MSE) replaces `squared=False`, which scikit-learn deprecated in 1.4
# and removed in 1.6.
rmse = np.sqrt(metrics.mean_squared_error(Y_test, predictions))
print(f'RMSE = {round(rmse, 3)}')

# Label importances with the training columns rather than relying on 'cnt'
# being the first column of bike_data_dummy.
pd.DataFrame(model.feature_importances_, X_train.columns,
             columns=['Importance']).sort_values('Importance')

In [None]:
# Predicted vs. actual counts for the most recently fitted model.
fig, ax = plt.subplots(figsize=(8, 8))
ax.scatter(predictions, Y_test)
ax.set_xlabel('Predictions')
ax.set_ylabel('Target')