## Model Selection
This notebook tests, evaluates, and tunes a set of models that predict the number of bike rentals for each hour of the 20th day of each month.

In [4]:
# setup
import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import (
    GridSearchCV,
    KFold,
    cross_val_score,
    cross_validate,
)
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor

# Read the cleaned training and test splits from disk.
# NOTE(review): parse_dates=True only asks pandas to parse the index; with no
# index_col given it looks like no column is converted to datetime - confirm
# whether a specific column was meant to be parsed.
train = pd.read_csv("data/train_clean.csv", parse_dates=True)
test = pd.read_csv("data/test_clean.csv", parse_dates=True)

# The target is the "count" column; the features are every remaining column
# except "casual" and "registered".
y_train = train["count"]
x_train = train.drop(columns=["casual", "registered", "count"])

# The test frame is used as the feature matrix without modification.
x_test = test

In [None]:
# Hyper-parameter grid for the random forest. Keys follow the
# "<pipeline step name>__<parameter>" convention so that GridSearchCV routes
# them to the "regressor" step of the pipeline defined below.
param_grid = {
    'regressor__n_estimators': [10, 50, 100],
    'regressor__max_depth': [None, 10, 20, 30],
}

# Candidate regressors, keyed by a human-readable name.
models = {
    'Linear Regression': LinearRegression(),
    'Decision Tree': DecisionTreeRegressor(),
    'Random Forest': RandomForestRegressor(),
    'Support Vector Machine': SVR(),
}

# The target is a number of rentals, so the forest has to be a regressor
# (a classifier would treat every distinct count as its own class).
# Features are standardized first, matching the per-model pipelines that
# this notebook builds with StandardScaler.
pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('regressor', RandomForestRegressor()),
])
grid_search = GridSearchCV(pipeline, param_grid, cv=5)

# Perform grid search on the training features/target loaded in the setup cell
grid_search.fit(x_train, y_train)

# No `y_test` is defined anywhere in the visible notebook (the test split is
# assigned to `x_test` as-is), so scoring against it raises NameError.
# Each model is instead scored with K-fold cross-validation on the training
# data; a fixed seed keeps the folds identical for every model.
cv = KFold(n_splits=5, shuffle=True, random_state=42)

for key, model in models.items():

    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('model', model),
    ])

    # One pass over the folds yields both metrics.
    scores = cross_validate(
        pipeline,
        x_train,
        y_train,
        cv=cv,
        scoring=('neg_mean_squared_error', 'r2'),
    )

    # Refit on the full training set, then predict the test rows.
    pipeline.fit(x_train, y_train)

    y_pred = pipeline.predict(x_test)

    # scikit-learn reports MSE negated (higher is better); flip the sign back.
    mse = np.round(-scores['test_neg_mean_squared_error'].mean(), 2)
    print(f'{key} - Mean Squared Error: {mse}')

    # NOTE(review): r2score is not printed within the visible part of this
    # loop - confirm it is reported further down.
    r2score = np.round(scores['test_r2'].mean(), 2)