# Bike sharing demand in Washington, D.C.

Data source: https://www.kaggle.com/c/bike-sharing-demand/data


- hourly rental data spanning two years
- the training set consists of the first 19 days of each month; the test set covers the 20th through the end of each month
- goal: predict the total **count** of bikes rented during each hour covered by the test set, using only information available prior to the rental period
- evaluation based on [Root Mean Squared Logarithmic Error](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.mean_squared_log_error.html)

In [None]:
import pandas as pd
import numpy as np
from sklearn import metrics
from sklearn.preprocessing import FunctionTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import PoissonRegressor
from sklearn.model_selection import train_test_split
from sklearn.compose import make_column_transformer, TransformedTargetRegressor
from sklearn.pipeline import make_pipeline
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import PolynomialFeatures, OneHotEncoder, StandardScaler

In [None]:
# Read the three competition CSVs in one pass; column 0 of every file is
# parsed as datetimes.
csv_paths = (
    '../data/bicycles/train.csv',
    '../data/bicycles/test.csv',
    '../data/bicycles/sampleSubmission.csv',
)
train, test, submission = (
    pd.read_csv(csv_path, parse_dates=[0]) for csv_path in csv_paths
)
train.head()

In [None]:
def create_date_features(df):
    """Derive calendar features from the ``datetime`` column of *df*.

    Returns a new DataFrame, indexed like *df*, with the columns ``month``,
    ``hour``, ``week`` (ISO week number) and ``weekday`` (Monday is 0).
    """
    timestamps = df['datetime'].dt

    # Insertion order fixes the column order of the resulting frame.
    features = {}
    features['month'] = timestamps.month
    features['hour'] = timestamps.hour
    features['week'] = timestamps.isocalendar().week
    features['weekday'] = timestamps.weekday
    return pd.DataFrame(features)

In [None]:
# Calendar features extracted from the timestamp are one-hot encoded, i.e.
# treated as categories rather than as ordered numbers.
date_steps = (
    FunctionTransformer(create_date_features, validate=False),
    OneHotEncoder(handle_unknown='ignore'),
)
datetime_pipeline = make_pipeline(*date_steps)

# Column groups, each routed to its own preprocessing step below.
scaled_columns = ['temp', 'atemp', 'humidity', 'windspeed']
encoded_columns = ['season', 'weather']
untouched_columns = ['workingday', 'holiday']

transformer = make_column_transformer(
    (datetime_pipeline, ['datetime']),
    (StandardScaler(), scaled_columns),
    (OneHotEncoder(handle_unknown='ignore'), encoded_columns),
    ('passthrough', untouched_columns),
)

In [None]:
# Full model: preprocessing, polynomial feature expansion, then a Poisson
# regressor (iteration cap raised to 1000).
model_steps = [
    transformer,
    PolynomialFeatures(),
    PoissonRegressor(max_iter=1000),
]
model_pipeline = make_pipeline(*model_steps)

In [None]:
# to see all parameters use:
# model_pipeline.get_params().keys()

In [None]:
# Hyper-parameter candidates for the grid search; keys follow the
# "<step name>__<parameter>" convention of the pipeline.
polynomial_degrees = [1, 2]
poisson_alphas = [0.0001, 0.001, 0.01, 0.1, 1, 10, 100]

param_grid = {
    'polynomialfeatures__degree': polynomial_degrees,
    'poissonregressor__alpha': poisson_alphas,
}

In [None]:
# Predictors: every column except the target and the 'casual'/'registered'
# columns (presumably the components of 'count' — confirm against the data
# description).
non_feature_columns = ['casual', 'registered', 'count']
X_train = train.drop(columns=non_feature_columns)
y_train = train['count']

In [None]:
def rmsle(y, y_pred):
    """Return the root mean squared logarithmic error of *y_pred* against *y*.

    Negative predictions are floored at zero before scoring, because the
    squared logarithmic error cannot be computed for negative values. The
    flooring happens on a copy, so the caller's ``y_pred`` is left untouched.

    Parameters
    ----------
    y : array-like
        Observed target values.
    y_pred : array-like
        Predicted values; may contain negatives.

    Returns
    -------
    float
        Square root of ``mean_squared_log_error(y, floored predictions)``.
    """
    # np.clip builds a new array instead of overwriting the estimator's
    # output in place, and it also accepts plain sequences such as lists.
    floored_pred = np.clip(y_pred, 0, None)

    return np.sqrt(metrics.mean_squared_log_error(y, floored_pred))


# With greater_is_better=False the scorer returns the negated RMSLE, so a
# search that maximises the score minimises the error.
neg_rmsle_score = metrics.make_scorer(rmsle, greater_is_better=False)

In [None]:
# Exhaustive search over param_grid with 5-fold cross-validation, scored by
# negated RMSLE; the best configuration is refitted on the full training set.
search_options = {
    'return_train_score': True,
    'scoring': neg_rmsle_score,
    'cv': 5,
    'n_jobs': 4,
    'refit': True,
    'verbose': 1,
}
cv = GridSearchCV(model_pipeline, param_grid, **search_options)
cv.fit(X_train, y_train)

# Tabular view of the per-candidate results for inspection below.
cv_res = pd.DataFrame(cv.cv_results_)

In [None]:
# Scores are negated RMSLE (the scorer uses greater_is_better=False); rows are
# sorted on the raw score first and abs() then shows them as positive errors.
score_overview_columns = [
    'param_polynomialfeatures__degree',
    'param_poissonregressor__alpha',
    'mean_train_score',
    'mean_test_score',
]
cv_res[score_overview_columns].sort_values('mean_test_score').abs()

In [None]:
# Train vs. cross-validated error as a function of alpha (log-scaled x axis),
# restricted to the candidates with polynomial degree 2.
row_filter = cv_res['param_polynomialfeatures__degree'] == 2
alpha_curve_columns = [
    'param_poissonregressor__alpha',
    'mean_train_score',
    'mean_test_score',
]
degree_two_errors = cv_res.loc[row_filter, alpha_curve_columns].abs()
degree_two_errors.plot(x='param_poissonregressor__alpha', logx=True)

In [None]:
# Parameter combination that achieved the best mean cross-validated score.
cv.best_params_

In [None]:
# Predict with the best pipeline found by the search, then floor any negative
# prediction at zero.
y_pred = cv.best_estimator_.predict(test)
below_zero = y_pred < 0
y_pred[below_zero] = 0

# Sanity check; it also trips on NaN predictions because NaN >= 0 is False.
assert np.all(y_pred >= 0)

In [None]:
# Store the predictions in the submission frame's 'count' column
# (assumes its rows are in the same order as `test` — TODO confirm).
submission['count'] = y_pred

In [None]:
# Write the submission file; index=False keeps the DataFrame index out of the CSV.
submission.to_csv('submission.csv', index=False)