In [41]:
import pandas as pd
import seaborn as sns
import joblib
import os
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import Lasso, Ridge, ElasticNet
from xgboost import XGBClassifier, XGBRegressor

In [30]:
df = pd.read_csv("./outputs/train_processed.csv")
df.sample(20)

Unnamed: 0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count,month,day,hour
682,1,0,0,1,12.3,13.635,28,22.0028,30,84,114,2,12,11
7587,2,0,1,1,25.42,31.06,38,27.9993,91,554,645,6,4,19
2947,3,0,1,1,28.7,32.575,51,15.0013,92,251,343,7,15,20
6335,1,0,1,1,17.22,21.21,77,15.0013,1,28,29,3,8,5
6907,2,0,0,1,14.76,18.18,66,7.0015,2,5,7,4,14,5
3589,3,0,0,1,30.34,34.85,70,19.9995,104,125,229,9,4,20
2882,3,0,1,1,30.34,34.09,58,12.998,0,4,4,7,13,3
4235,4,0,1,3,22.14,25.76,94,6.0032,13,41,54,10,12,23
5048,4,0,1,2,12.3,15.15,70,11.0014,11,115,126,12,9,10
8698,3,0,1,1,31.98,34.85,40,11.0014,79,175,254,8,13,13


In [43]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

In [None]:
regressors = {
    "lasso": {
        "model": Lasso(),
        "params": {
            "alpha": [0.1, 0.5, 1.0]
        }
    },
    "ridge": {
        "model": Ridge(),
        "params": {
            "alpha": [0.1, 0.5, 1.0]
        }
    },
    "elasticnet": {
        "model": ElasticNet(),
        "params": {
            "alpha": [0.1, 0.5, 1.0]
        }
    },
    "xgb": {
        "model": XGBRegressor(),
        "params": {
            "n_estimators": [100, 200, 300],
            "learning_rate": [0.1, 0.01, 0.001]
        }
    },
    "random_forest": {
        "model": RandomForestRegressor(),
        "params": {
            "n_estimators": [100, 200, 300],
            "max_depth": [5, 10, 15]
        }
    }
}

In [44]:
def run_regressors(
    regressors: dict[str, dict], X_train: pd.DataFrame, y_train: pd.Series
) -> Pipeline:

    for regressor, regressor_content in regressors.items():
        if regressor == "xgboost":
            clf = GridSearchCV(
                estimator=regressor_content["model"],
                param_grid=regressor_content["params"],
                cv=2,
                verbose=2,
            )
            le = LabelEncoder()
            le_y_train = le.fit_transform(y_train)
            clf.fit(X_train, le_y_train)
        else:
            clf = GridSearchCV(
                estimator=regressor_content["model"],
                param_grid=regressor_content["params"],
                cv=2,
                verbose=2,
            )
            clf.fit(X_train, y_train)

    return clf.best_estimator_

In [45]:
clf = run_regressors(regressors=regressors, X_train, y_train)

Fitting 5 folds for each of 18 candidates, totalling 90 fits




[CV] END ..........svc__C=1, svc__gamma=0.1, svc__kernel=rbf; total time=  11.0s
[CV] END ..........svc__C=1, svc__gamma=0.1, svc__kernel=rbf; total time=  10.4s
[CV] END ..........svc__C=1, svc__gamma=0.1, svc__kernel=rbf; total time=  10.4s
[CV] END ..........svc__C=1, svc__gamma=0.1, svc__kernel=rbf; total time=  10.3s
[CV] END ..........svc__C=1, svc__gamma=0.1, svc__kernel=rbf; total time=  10.5s
[CV] END .......svc__C=1, svc__gamma=0.1, svc__kernel=linear; total time=  13.8s
[CV] END .......svc__C=1, svc__gamma=0.1, svc__kernel=linear; total time=  13.6s
[CV] END .......svc__C=1, svc__gamma=0.1, svc__kernel=linear; total time=  13.5s
[CV] END .......svc__C=1, svc__gamma=0.1, svc__kernel=linear; total time=  13.8s
[CV] END .......svc__C=1, svc__gamma=0.1, svc__kernel=linear; total time=  13.9s
[CV] END .........svc__C=1, svc__gamma=0.01, svc__kernel=rbf; total time=   9.4s
[CV] END .........svc__C=1, svc__gamma=0.01, svc__kernel=rbf; total time=   9.3s
[CV] END .........svc__C=1, 

In [None]:
if not os.path.exists("./models"):
    os.makedirs("./models")

joblib.dump(clf.best_estimator_, "./models/classifier.pkl")