## Pipeline: Model Training and Testing

In [None]:
import sys
import os
from dotenv import load_dotenv
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

#1. load environment variables and data

# load environment variables via python-dotenv
load_dotenv()

# Add the working directory to sys.path so that utils/dataset.py can be imported.
# WORKING_DIRECTORY may be unset, in which case os.environ.get returns None;
# only a real path is inserted so that sys.path never ends up containing None.
working_dir = os.environ.get("WORKING_DIRECTORY")
if working_dir:
    sys.path.insert(0, working_dir)
from utils.dataset import get_data
df = get_data()
# Show the first 10 rows as the cell output.
df.head(10)

In [None]:
from sklearn.model_selection import train_test_split
# Clean the frame in one chain: drop rows with missing values, keep only rows
# whose quality is at most 10, and one-hot encode the "wine type" column.
df = (
    df.dropna()
    .loc[lambda frame: frame["quality"] <= 10]
    .pipe(pd.get_dummies, columns=["wine type"])
)

# Separate the target ("quality") from the feature columns.
y = df["quality"]
X = df.drop(columns=["quality"])

# Hold out 20% of the rows for testing; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
#import pipeline, function transformer, candidate regressors and grid search
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn import svm
from sklearn.model_selection import GridSearchCV

# Placeholder pipeline without steps; the training loop rebinds `pipeline` for each model.
pipeline = Pipeline([])

In [None]:
# Candidate regressors together with the hyperparameter grid searched for each.
# Every grid key carries the "model__" prefix because the estimator is wrapped in
# a Pipeline whose single step is named "model".
# Estimators with internal randomness get a fixed random_state (the same seed as
# the train/test split) so that repeated runs select the same best parameters.
models = [
    {
        "name": "LinearRegression",
        "estimator": LinearRegression(),
        # NOTE(review): copy_X and n_jobs are not expected to change the fitted
        # model - confirm whether they belong in the grid.
        "hyperparameters": {
            "model__fit_intercept": [True, False],
            "model__copy_X": [True, False],
            "model__n_jobs": [1, 5, 10, 20],
        },
    },
    {
        "name": "DecisionTreeRegressor",
        "estimator": DecisionTreeRegressor(random_state=42),
        "hyperparameters": {
            "model__criterion": ["squared_error"],
            "model__splitter": ["best", "random"],
            "model__max_depth": [None, 2, 5, 10],
            "model__min_samples_split": [2, 5, 10],
            "model__min_samples_leaf": [1, 5, 10],
        },
    },
    {
        "name": "RandomForestRegressor",
        "estimator": RandomForestRegressor(random_state=42),
        "hyperparameters": {
            "model__n_estimators": [100, 200],
            "model__criterion": ["squared_error"],
            "model__max_depth": [None, 2, 5, 10],
            "model__min_samples_split": [2, 5, 10],
            "model__min_samples_leaf": [1, 5, 10],
        },
    },
    {
        "name": "Gradient Boosting Regressor",
        "estimator": GradientBoostingRegressor(random_state=42),
        "hyperparameters": {
            "model__n_estimators": [100, 200, 500],
            "model__max_depth": [None, 2, 5, 10],
            "model__min_samples_split": [2, 5, 10],
            "model__learning_rate": [0.01, 0.011, 0.012],
            "model__loss": ["squared_error"],
        },
    },
    {
        "name": "Support Vector Machine",
        "estimator": svm.SVR(),
        "hyperparameters": {
            "model__kernel": ["linear", "poly", "rbf", "sigmoid"],
            "model__degree": [1, 2, 3, 4, 5],
            "model__gamma": ["scale", "auto"],
            "model__C": [0.1, 1, 10, 100, 1000],
            "model__epsilon": [0.1, 0.2, 0.3, 0.4, 0.5],
        },
    },
]
# Run a 2-fold grid search for every candidate and store the winner on its dict.
for model in models:
    # Section header: the model name underlined with dashes.
    print(model["name"], "-" * len(model["name"]), sep="\n")

    # Single-step pipeline; the step name "model" matches the "model__" grid prefix.
    pipeline = Pipeline(steps=[("model", model["estimator"])])

    grid = GridSearchCV(
        estimator=pipeline,
        param_grid=model["hyperparameters"],
        cv=2,
    )
    grid.fit(X_train, y_train)

    print("Best Parameters:", grid.best_params_, "", sep="\n")

    # Record the search outcome: parameters, score, the fitted pipeline and
    # the bare estimator taken from the pipeline's "model" step.
    model.update(
        best_params=grid.best_params_,
        best_score=grid.best_score_,
        best_estimator=grid.best_estimator_,
        best_model=grid.best_estimator_.named_steps["model"],
    )

In [None]:
import json
# Keep only the name, best parameters and best score of each model;
# the estimator objects stored on the dicts are left out of the saved file.
models_to_save = [
    {key: model[key] for key in ("name", "best_params", "best_score")}
    for model in models
]

# Write the summaries to best_models.json.
with open("best_models.json", "w") as f:
    json.dump(models_to_save, f, indent=4)


In [None]:
def _make_estimator(name: str):
    """Return a fresh, unfitted estimator for the model called *name*.

    Raises:
        ValueError: If *name* is not one of the known model names.
    """
    if name == "LinearRegression":
        return LinearRegression()
    if name == "DecisionTreeRegressor":
        return DecisionTreeRegressor()
    if name == "RandomForestRegressor":
        return RandomForestRegressor()
    if name == "Gradient Boosting Regressor":
        return GradientBoostingRegressor()
    if name == "Support Vector Machine":
        return svm.SVR()
    raise ValueError(f"Model {name} not found")


def load_best_models(file: str = "best_models.json") -> "list[dict] | None":
    """Load saved model summaries and attach a new estimator to each one.

    Args:
        file: Path of the JSON file to read. It must contain a list of
            dicts that each have a "name" key.

    Returns:
        The list read from *file*, where every dict additionally has an
        "estimator" key holding a default-constructed estimator selected by
        the dict's "name". Returns None (after printing a message) when
        *file* does not exist.

    Raises:
        ValueError: If an entry's "name" is not a known model name.
    """
    # Imported locally so the function does not depend on another notebook
    # cell having imported json first (e.g. when run in a fresh kernel).
    import json

    try:
        with open(file, "r", encoding="utf-8") as f:
            best_models = json.load(f)
    except FileNotFoundError:
        print(f"File {file} not found")
        return None

    for model in best_models:
        model["estimator"] = _make_estimator(model["name"])
    return best_models