# Utilizing Expert Knowledge

In [None]:
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt

In [None]:
# Load the Citi Bike data from the local CSV file into a DataFrame.
citibike = pd.read_csv("data/citibike.csv")

In [None]:
# Preview the first rows of the freshly loaded data.
citibike.head()

In [None]:
# List the distinct values found in the "start station name" column.
citibike["start station name"].unique()

In [None]:
# Give every row a weight of 1 so that summing per time window counts rows.
citibike['one'] = 1
citibike['starttime'] = pd.to_datetime(citibike['starttime'])
citibike = citibike.set_index("starttime")
# Aggregate only the counter column over three-hour windows. Summing the whole
# frame would also try to "sum" the non-numeric columns (e.g. station names),
# which is wasteful and can fail on pandas versions that no longer drop them
# silently. The result stays a DataFrame so that ["one"] still selects it.
citibike = citibike[["one"]].resample("3h").sum().fillna(0)

In [None]:
# Keep only the per-window count column; from here on `citibike` is a Series.
citibike = citibike.loc[:, "one"]

In [None]:
# Preview the first entries of the selected "one" column.
citibike.head()

In [None]:
import matplotlib.dates as mdates

# Line plot of the resampled counts over time.
plt.figure(figsize=(10, 3))
date_axis = plt.gca().xaxis
# Label ticks as "weekday month-day" and place one major tick every two days.
date_axis.set_major_formatter(mdates.DateFormatter("%a %m-%d"))
date_axis.set_major_locator(mdates.DayLocator(interval=2))
plt.plot(citibike, linewidth=1)
# Rotate and align the date labels so they do not overlap.
plt.gcf().autofmt_xdate()
plt.xlabel("Date")
plt.ylabel("Rentals")
plt.show()

In [None]:
# extract the target values (number of rentals) as a NumPy array
y = citibike.values

In [None]:
# Convert the timestamps to POSIX time (whole seconds since 1970-01-01) and
# shape them as a single-column feature matrix.
# Dividing the offset from the epoch by one second is independent of the
# index's internal resolution; dividing the raw integer view by 10**9 is only
# correct when the timestamps are stored with nanosecond resolution.
# NOTE(review): assumes the index is timezone-naive -- confirm against the data.
X = ((citibike.index - pd.Timestamp("1970-01-01")) // pd.Timedelta("1s")).values.reshape(-1, 1)

In [None]:
# Dimensions of the feature matrix X.
X.shape

In [None]:
# First five rows of X.
X[:5]

In [None]:
# Train on the first 184 samples; everything after that is held out for testing.
n_train = 184
# One timestamp per day across the whole series, used to label the x axis.
xticks = pd.date_range(citibike.index.min(), citibike.index.max(), freq="D")

# function to evaluate and plot a regressor on a given feature set
def eval_on_features(features, target, regressor):
    """Fit ``regressor`` on the leading samples and plot its predictions.

    The first ``n_train`` rows (module-level setting) of ``features`` and
    ``target`` form the training set and the remaining rows the test set.
    The test-set R^2 is printed, and the true values and the predictions for
    both parts are drawn on a new matplotlib figure.

    Parameters
    ----------
    features : sliceable, one row per sample
        Feature matrix; rows must be in the same order as ``target``.
    target : sliceable, one entry per sample
        Target values.
    regressor : estimator exposing ``fit``, ``predict`` and ``score``
        Fitted in place on the training part.

    Returns
    -------
    None
    """
    # split the given features into a training and a test set
    X_train, X_test = features[:n_train], features[n_train:]
    # also split the target array
    y_train, y_test = target[:n_train], target[n_train:]

    regressor.fit(X_train, y_train)
    print(f"Test-set R^2: {regressor.score(X_test, y_test):.2f}")
    y_pred = regressor.predict(X_test)
    y_pred_train = regressor.predict(X_train)

    train_positions = range(n_train)
    test_positions = range(n_train, len(y_test) + n_train)

    plt.figure(figsize=(10, 3))
    # Tick every 8th sample and label it from the module-level daily `xticks`.
    # The positions are derived from the `features` argument itself rather
    # than from the unrelated global `X`, so the function only depends on
    # what it is given.
    plt.xticks(range(0, len(features), 8), xticks.strftime("%a %m-%d"), rotation=90, ha="left")
    plt.plot(train_positions, y_train, label="train")
    plt.plot(test_positions, y_test, '-', label="test")
    plt.plot(train_positions, y_pred_train, '--', label="prediction train")
    plt.plot(test_positions, y_pred, '--', label="prediction test")
    plt.legend(loc=(1.01, 0))
    plt.xlabel("Date")
    plt.ylabel("Rentals")

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Baseline: a 100-tree random forest trained on the POSIX-time feature alone.
regressor = RandomForestRegressor(random_state=0, n_estimators=100)
eval_on_features(features=X, target=y, regressor=regressor)

In [None]:
# Hour of day of every sample, as a single-column feature matrix.
X_hour = citibike.index.hour.values[:, np.newaxis]

In [None]:
# Print the first 12 rows of the hour-of-day feature.
print(X_hour[:12])

In [None]:
# Re-fit the same regressor object, this time on the hour-of-day feature only.
eval_on_features(features=X_hour, target=y, regressor=regressor)

In [None]:
# Two-column feature matrix: day of week first, hour of day second.
X_hour_week = np.hstack(
    [
        citibike.index.dayofweek.values[:, np.newaxis],
        citibike.index.hour.values[:, np.newaxis],
    ]
)

In [None]:
# First ten rows of the (day of week, hour of day) feature matrix.
X_hour_week[:10]

In [None]:
# Re-fit the same regressor object on day of week plus hour of day.
eval_on_features(features=X_hour_week, target=y, regressor=regressor)

In [None]:
from sklearn.linear_model import LinearRegression

# Same two integer features, now with an ordinary least-squares linear model.
eval_on_features(features=X_hour_week, target=y, regressor=LinearRegression())

In [None]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode both integer columns so that each weekday and each hour
# value gets its own indicator column.
enc = OneHotEncoder()
enc.fit(X_hour_week)

# The encoder output is sparse; convert it to a dense array.
X_hour_week_onehot = enc.transform(X_hour_week).toarray()

In [None]:
# First five rows of the one-hot encoded feature matrix.
X_hour_week_onehot[:5]

In [None]:
from sklearn.linear_model import Ridge

# Ridge regression (default settings) on the one-hot encoded features.
eval_on_features(features=X_hour_week_onehot, target=y, regressor=Ridge())

In [None]:
from sklearn.preprocessing import PolynomialFeatures

# Extend the one-hot columns with their pairwise interaction terms
# (degree 2, interactions only, no bias column).
poly_transformer = PolynomialFeatures(
    degree=2,
    interaction_only=True,
    include_bias=False,
)
poly_transformer.fit(X_hour_week_onehot)
X_hour_week_onehot_poly = poly_transformer.transform(X_hour_week_onehot)

# Keep a handle on the fitted model: its coefficients are inspected later.
lr = Ridge()
eval_on_features(features=X_hour_week_onehot_poly, target=y, regressor=lr)

In [None]:
# Human-readable names for the one-hot columns: the seven weekdays followed
# by the eight three-hour slots of a day.
hour = [f"{h:02d}:00" for h in range(0, 24, 3)]
day = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]
features = [*day, *hour]

In [None]:
# Names of all generated polynomial features, then keep only those whose
# learned coefficient is not exactly zero.
features_poly = poly_transformer.get_feature_names_out(features)
nonzero_mask = lr.coef_ != 0
features_nonzero = np.array(features_poly)[nonzero_mask]
coef_nonzero = lr.coef_[nonzero_mask]

In [None]:
# Dot plot of the non-zero coefficients, one tick per feature name.
plt.figure(figsize=(15, 2))
plt.plot(coef_nonzero, 'o')
tick_positions = np.arange(len(coef_nonzero))
plt.xticks(tick_positions, features_nonzero, rotation=90)
plt.xlabel("Feature name")
plt.ylabel("Feature magnitude")
plt.show()