## Pipeline Model-Training/-Testing

In [8]:
import sys
import os
from dotenv import load_dotenv
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

#1. load environment variables and data

# load environment variables
load_dotenv()

#add working directory to sys path to execute utils/dataset.py
working_dir = os.environ.get("WORKING_DIRECTORY")
sys.path.insert(0, working_dir)
from utils.dataset import get_data 
df = get_data()
df.head(10)

Loading data from wines: 8000it [00:00, 14933.86it/s]


Unnamed: 0,wine type,fixed acidity,volatile acidity,citric acid,residual sugar,magnesium,flavanoids,minerals,calcium,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,Pinot noir,5.8,0.15,0.49,1.1,76.729301,894.94,186.639301,109.91,0.048,21.0,98.0,0.9929,3.19,0.48,9.2,5
1,Merlot,6.6,0.25,0.32,5.6,4.795712,1160.95,251.875712,247.08,0.039,15.0,68.0,0.99163,2.96,0.52,11.1,6
2,Chardonnay,6.7,0.21,0.34,1.5,85.19371,789.82,304.70371,219.51,0.035,45.0,123.0,0.98949,3.24,0.36,12.6,7
3,Merlot,8.3,0.28,0.27,17.5,11.976525,777.86,237.586525,225.61,0.045,48.0,253.0,1.00014,3.02,0.56,9.1,6
4,Merlot,7.5,0.42,0.19,6.9,5.599673,785.72,95.399673,89.8,0.041,62.0,150.0,0.99508,3.23,0.37,10.0,6
5,Merlot,7.3,0.34,0.3,1.3,22.403749,1044.95,289.523749,267.12,0.057,25.0,173.0,0.9948,3.26,0.51,9.1,6
6,Merlot,7.6,0.21,0.49,2.5,23.875866,888.61,133.545866,109.67,0.047,20.0,130.0,0.99178,3.15,0.48,11.1,5
7,Chardonnay,6.0,0.25,0.4,5.7,23.309699,1381.79,266.529699,243.22,0.052,56.0,152.0,0.99398,3.16,0.88,10.5,6
8,Cabernet Sauvignon,6.7,0.18,0.19,4.7,49.165745,1456.41,269.915745,220.75,0.046,57.0,161.0,0.9946,3.32,0.66,10.5,6
9,Gamay,7.7,0.28,0.39,8.9,54.450579,929.44,377.690579,323.24,0.036,8.0,117.0,0.9935,3.06,0.38,12.0,2


In [12]:
from sklearn.model_selection import train_test_split
df = df.dropna()
#delete all data from df where quality > 10
df = df[df["quality"] <= 10]
#onehot encode column "wine type"
df = pd.get_dummies(df, columns=["wine type"])

X = df.drop(columns=["quality"])
y = df["quality"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [36]:
#import pipeline and standard scaler
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
#import LinearRegression and DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn import svm
from sklearn.model_selection import GridSearchCV



In [39]:
models = [
    {
        "name": "LinearRegression",
        "estimator": LinearRegression(),
        "hyperparameters":
            {
                "model__fit_intercept": [True, False],
                "model__copy_X": [True, False],
                "model__n_jobs": [1,5,10,20],
            }
    },
    {
        "name": "DecisionTreeRegressor",
        "estimator": DecisionTreeRegressor(),
        "hyperparameters":
            {
                "model__criterion": ["squared_error"],
                "model__splitter": ["best", "random"],
                "model__max_depth": [None, 2, 5, 10],
                "model__min_samples_split": [2, 5, 10],
                "model__min_samples_leaf": [1, 5, 10]
            }
    },
    {
        "name": "RandomForestRegressor",
        "estimator": RandomForestRegressor(),
        "hyperparameters":
            {
                "model__n_estimators": [100, 200],
                "model__criterion": ["squared_error"],
                "model__max_depth": [None, 2, 5, 10],
                "model__min_samples_split": [2, 5, 10],
                "model__min_samples_leaf": [1, 5, 10]
            }
    },
    {
        "name": "Gradient Boosting Regressor",
        "estimator": GradientBoostingRegressor(),
        "hyperparameters":
        {
                "model__n_estimators": [100, 200, 500],
                "model__max_depth": [None, 2, 5, 10],
                "model__min_samples_split": [2, 5, 10],
                "model__learning_rate": [0.01, 0.011, 0.012],
                "model__loss": "squared_error",
        }
    },
    {
        "name": "Support Vector Machine",
        "estimator": svm.SVR(),
        "hyperparameters":
        {
            "model__kernel": ["linear", "poly", "rbf", "sigmoid"],
            "model__degree": [1, 2, 3, 4, 5],
            "model__gamma": ["scale", "auto"],
            "model__C": [0.1, 1, 10, 100, 1000],
            "model__epsilon": [0.1, 0.2, 0.3, 0.4, 0.5]
    }
    }
]

In [41]:
models = [
    {
        "name": "Gradient Boosting Regressor",
        "estimator": GradientBoostingRegressor(),
        "hyperparameters":
        {
                "model__n_estimators": [100, 200, 500],
                "model__max_depth": [None, 2, 5, 10],
                "model__min_samples_split": [2, 5, 10],
                "model__learning_rate": [0.01, 0.011, 0.012],
                "model__loss": ["squared_error"],
        }
    },
    {
        "name": "Support Vector Machine",
        "estimator": svm.SVR(),
        "hyperparameters":
        {
            "model__kernel": ["linear", "poly", "rbf", "sigmoid"],
            "model__degree": [1, 2, 3, 4, 5],
            "model__gamma": ["scale", "auto"],
            "model__C": [0.1, 1, 10, 100, 1000],
            "model__epsilon": [0.1, 0.2, 0.3, 0.4, 0.5]
    }}
]

for model in models:
    print(model["name"])
    print("-"*len(model["name"]))

    pipeline = Pipeline([
        ("scaler", StandardScaler()),
        ("model", model["estimator"])
    ])

    grid = GridSearchCV(pipeline, model["hyperparameters"], cv=2)
    grid.fit(X_train, y_train)

    print("Best Parameters:")
    print(grid.best_params_)
    print("")

    model["best_params"] = grid.best_params_
    model["best_score"] = grid.best_score_
    model["best_estimator"] = grid.best_estimator_
    model["best_model"] = grid.best_estimator_.named_steps["model"]

Gradient Boosting Regressor
---------------------------


In [29]:
import json
models_to_save = []
for model in models:
    m = {
        "name": model["name"],
        "best_params": model["best_params"],
        "best_score": model["best_score"],
    }
    models_to_save.append(m)

with open("best_models.json", "w") as f:
    json.dump(models_to_save, f, indent=4)


In [31]:
def load_best_models(file: str = "best_models.json"):
    try:
        with open(file, "r") as f:
            best_models = json.load(f)
    except FileNotFoundError:
        print(f"File {file} not found")
        return None
    
    for model in best_models:
        if model["name"] == "LinearRegression":
            model["estimator"] = LinearRegression()
        elif model["name"] == "DecisionTreeRegressor":
            model["estimator"] = DecisionTreeRegressor()
        elif model["name"] == "RandomForestRegressor":
            model["estimator"] = RandomForestRegressor()
        elif model["name"] == "Gradient Boosting Regressor":
            model["estimator"] = GradientBoostingRegressor()
        elif model["name"] == "Support Vector Machine":
            model["estimator"] = svm.SVR()
        else:
            raise ValueError(f"Model {model['name']} not found")
    return best_models

In [33]:
best_models = load_best_models()
best_models[0]["estimator"].fit(X_train, y_train)
best_models[0]["estimator"].score(X_test, y_test)

0.5658985686532998