# Bike Sharing Demand Prediction

In [None]:
import pandas as pd

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
import mlflow

## Configuration

In [None]:
# Columns sent through the numeric (scaling) branch of the preprocessing
# step: three measurement columns followed by the lagged target columns
# cnt_lag_1 through cnt_lag_6.
NUMERIC_FEATURES = [
    "temp",
    "hum",
    "windspeed",
    *(f"cnt_lag_{lag}" for lag in range(1, 7)),
]
# Columns sent through the one-hot-encoding branch.
CATEGORICAL_FEATURES = [
    "season",
    "mnth",
    "holiday",
    "weekday",
    "workingday",
    "weathersit",
]

# Name of the column the models are trained to predict.
TARGET = "cnt"

# Seed shared by the train/test split and the XGBoost estimator.
RANDOM_STATE = 42

In [None]:
# Point MLflow at the tracking server and select the experiment under
# which the runs below are recorded.
mlflow.set_tracking_uri(uri="http://localhost:4000")
mlflow.set_experiment(experiment_name="Bike sharing demand")

# Enable autologging for both model flavors used in this notebook, with
# dataset logging switched off.
for flavor in (mlflow.sklearn, mlflow.xgboost):
    flavor.autolog(log_datasets=False)

## Data Ingestion & Preparation

In [None]:
# Read the raw CSV from the sibling data directory and preview its first rows.
data = pd.read_csv(filepath_or_buffer="../data/day.csv")
data.head(n=5)

In [None]:
# Add six lagged copies of the target: cnt_lag_k holds the "cnt" value
# from k rows earlier, for k = 1..6.
# NOTE(review): this presumes the rows are already in chronological
# order — confirm against the CSV.
for lag in range(1, 7):
    data[f"cnt_lag_{lag}"] = data["cnt"].shift(lag)

# Shifting leaves NaN in the leading rows; dropna() removes every row
# that has a missing value in any column.
data = data.dropna()

In [None]:
# Model inputs: the numeric columns followed by the categorical ones.
feature_columns = [*NUMERIC_FEATURES, *CATEGORICAL_FEATURES]
# Keep only the inputs plus the target column, then preview the result.
features = data[[*feature_columns, TARGET]]
features.head(n=5)

In [None]:
# Hold out part of the rows for evaluation. No test_size is given, so
# scikit-learn's default hold-out fraction applies; the seed keeps the
# split reproducible.
# NOTE(review): train_test_split shuffles rows by default, while the inputs
# include lagged values of the target. Assuming the rows are in date order,
# the target of a held-out day can therefore show up as a lag input of a
# training day — consider a chronological split if that matters.
train_data, test_data = train_test_split(features, random_state=RANDOM_STATE)

# Separate the training rows into model inputs and the target series.
train_input, train_output = train_data[feature_columns], train_data[TARGET]

## Linear Regression

In [None]:
# Preprocessing + estimator pipeline for the linear-regression baseline.

# Numeric columns are standardized.
numeric_transformer = Pipeline([
    ("scaler", StandardScaler()),
])

# Categorical columns are one-hot encoded. handle_unknown="ignore" makes a
# category that was absent when the encoder was fitted encode as an
# all-zero row instead of raising at transform/predict time (the default,
# handle_unknown="error", aborts on any unseen value). Output is unchanged
# whenever every category was seen during fit.
categorical_transformer = Pipeline([
    ("encoder", OneHotEncoder(handle_unknown="ignore")),
])

# Route each column group to its transformer; columns not listed in
# either group are dropped (ColumnTransformer's default remainder).
preprocessor = ColumnTransformer([
    ("numeric", numeric_transformer, NUMERIC_FEATURES),
    ("categorical", categorical_transformer, CATEGORICAL_FEATURES),
])

# Full model: preprocessing followed by ordinary least squares.
pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("estimator", LinearRegression()),
])

In [None]:
# Fit the current pipeline inside its own MLflow run, then score it on
# the held-out rows.
with mlflow.start_run():
    pipeline.fit(train_input, train_output)

    # test_data still contains the target column; `targets` tells MLflow
    # which column holds the ground truth.
    evaluation_kwargs = dict(
        model=pipeline.predict,
        data=test_data,
        targets=TARGET,
        model_type="regressor",
    )
    mlflow.evaluate(**evaluation_kwargs)

## XGBoost with Hyperparameter Tuning

In [None]:
# Preprocessing + estimator pipeline for the XGBoost model. This rebinds
# the names used for the linear-regression pipeline above.

# Numeric columns are standardized.
numeric_transformer = Pipeline([
    ("scaler", StandardScaler()),
])

# Categorical columns are one-hot encoded. handle_unknown="ignore" makes a
# category that was absent when the encoder was fitted encode as an
# all-zero row instead of raising. This matters during cross-validation,
# where the encoder is refitted on each fold's training part and a rare
# category may appear only in the validation part. Output is unchanged
# whenever every category was seen during fit.
categorical_transformer = Pipeline([
    ("encoder", OneHotEncoder(handle_unknown="ignore")),
])

# Route each column group to its transformer; columns not listed in
# either group are dropped (ColumnTransformer's default remainder).
preprocessor = ColumnTransformer([
    ("numeric", numeric_transformer, NUMERIC_FEATURES),
    ("categorical", categorical_transformer, CATEGORICAL_FEATURES),
])

# Full model: preprocessing followed by a seeded gradient-boosted regressor.
pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("estimator", XGBRegressor(random_state=RANDOM_STATE)),
])

In [None]:
# Candidate values for each XGBoost hyperparameter. The "estimator__"
# prefix addresses the pipeline step named "estimator".
# 3 * 3 * 3 * 2 * 2 = 108 combinations, each fitted on 3 folds.
param_grid = {
    "estimator__n_estimators": [100, 200, 300],
    "estimator__learning_rate": [0.01, 0.1, 0.2],
    "estimator__max_depth": [3, 5, 7],
    "estimator__subsample": [0.8, 1.0],
    "estimator__colsample_bytree": [0.8, 1.0],
}

# Run the exhaustive search inside its own MLflow run, then score the
# best refitted pipeline on the held-out rows.
with mlflow.start_run():
    grid_search = GridSearchCV(
        estimator=pipeline,
        param_grid=param_grid,
        scoring="neg_mean_absolute_error",  # candidates ranked by (negated) MAE
        cv=3,
        n_jobs=-1,  # use all available cores
        verbose=2,
    )
    grid_search.fit(train_input, train_output)

    tuned_pipeline = grid_search.best_estimator_
    mlflow.evaluate(
        model=tuned_pipeline.predict,
        data=test_data,
        targets=TARGET,
        model_type="regressor",
    )