# God Pipeline

In [1]:
import sys


import os
from dotenv import load_dotenv
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# 1. Load environment variables and data

# Read variables from a local .env file into the process environment.
load_dotenv()

# Put the project directory on sys.path so that the `utils` package resolves.
# Guarded on purpose: when WORKING_DIRECTORY is unset, os.environ.get returns
# None, which is not a usable import location, and re-running this cell
# should not keep stacking the same entry onto sys.path.
working_dir = os.environ.get("WORKING_DIRECTORY")
if working_dir and working_dir not in sys.path:
    sys.path.insert(0, working_dir)

from utils.dataset import get_data 

from utils.pipeline_moduls import fs_colinearity, fs_vif, outlier_label, outlier_num, dim_reduction

# Raw (uncleaned) data set; every later cell starts from this frame.
df = get_data()

df.head(10)


Loading data from wines: 8000it [00:00, 22563.27it/s]


Unnamed: 0,wine type,fixed acidity,volatile acidity,citric acid,residual sugar,magnesium,flavanoids,minerals,calcium,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,Pinot noir,5.8,0.15,0.49,1.1,76.729301,894.94,186.639301,109.91,0.048,21.0,98.0,0.9929,3.19,0.48,9.2,5
1,Merlot,6.6,0.25,0.32,5.6,4.795712,1160.95,251.875712,247.08,0.039,15.0,68.0,0.99163,2.96,0.52,11.1,6
2,Chardonnay,6.7,0.21,0.34,1.5,85.19371,789.82,304.70371,219.51,0.035,45.0,123.0,0.98949,3.24,0.36,12.6,7
3,Merlot,8.3,0.28,0.27,17.5,11.976525,777.86,237.586525,225.61,0.045,48.0,253.0,1.00014,3.02,0.56,9.1,6
4,Merlot,7.5,0.42,0.19,6.9,5.599673,785.72,95.399673,89.8,0.041,62.0,150.0,0.99508,3.23,0.37,10.0,6
5,Merlot,7.3,0.34,0.3,1.3,22.403749,1044.95,289.523749,267.12,0.057,25.0,173.0,0.9948,3.26,0.51,9.1,6
6,Merlot,7.6,0.21,0.49,2.5,23.875866,888.61,133.545866,109.67,0.047,20.0,130.0,0.99178,3.15,0.48,11.1,5
7,Chardonnay,6.0,0.25,0.4,5.7,23.309699,1381.79,266.529699,243.22,0.052,56.0,152.0,0.99398,3.16,0.88,10.5,6
8,Cabernet Sauvignon,6.7,0.18,0.19,4.7,49.165745,1456.41,269.915745,220.75,0.046,57.0,161.0,0.9946,3.32,0.66,10.5,6
9,Gamay,7.7,0.28,0.39,8.9,54.450579,929.44,377.690579,323.24,0.036,8.0,117.0,0.9935,3.06,0.38,12.0,2


In [2]:
# Column groups used by the ColumnTransformer further down.
# Text (object dtype) columns are treated as categorical features.
categorical_features = df.select_dtypes(include=['object']).columns
# Every numeric column except the target 'quality' is a numerical feature.
numerical_features = df.select_dtypes(include=[np.number]).columns.drop('quality')
# The target column name, wrapped in a Series.
label = pd.Series('quality')

In [3]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

## Cleaning Pipeline

### Modules

#### Label Outlier Detection

In [4]:
# Wrap the project helper `outlier_label` (imported from utils.pipeline_moduls)
# as a transformer step that returns pandas objects.
outlier_detection_label = FunctionTransformer(outlier_label).set_output(transform="pandas")




#### Feature Outlier Detection

In [5]:
# Wrap the project helper `outlier_num` (imported from utils.pipeline_moduls)
# as a transformer step that returns pandas objects.
outlier_detection = FunctionTransformer(outlier_num).set_output(transform="pandas")

#### Feature Selection

In [6]:
import json

def feature_selection(df,colinearity_threshold=0.5, correlation_threshold=0.1, vif_threshold=5):
    """Drop the columns flagged by ``fs_colinearity`` and ``fs_vif``.

    The union of both helpers' results is removed from ``df``, printed, and
    written to ``dropped_features.json`` in the current working directory.
    The list is sorted so that the log line and the JSON file are identical
    from run to run (plain set iteration order is not stable for strings).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter; it is not modified in place.
    colinearity_threshold : float
        Forwarded to ``fs_colinearity``.
    correlation_threshold : float
        Forwarded to both ``fs_colinearity`` and ``fs_vif``.
    vif_threshold : float
        Forwarded to ``fs_vif``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` without the dropped columns.
    """
    # Both helpers may flag the same column, so merge their results as sets.
    flagged = set(fs_colinearity(df, colinearity_threshold, correlation_threshold))
    flagged |= set(fs_vif(df, correlation_threshold, vif_threshold))

    # key=str keeps the sort well-defined even if column labels are not all strings.
    dropped_features = sorted(flagged, key=str)
    print("Dropping Features: ", dropped_features)
    df = df.drop(columns=dropped_features)

    # Persist the decision so it can be inspected after the run.
    with open('dropped_features.json', 'w') as f:
        json.dump(dropped_features, f)
    return df
feature_selection = FunctionTransformer(feature_selection).set_output(transform="pandas")

### Feature Scaling

In [7]:
# Empty top-level pipelines; their steps are filled in by a later cell.
# `cleaning_pipeline` is the unscaled variant.
cleaning_pipeline = Pipeline(steps=[
])

# Variant whose numeric sub-pipeline ends with the min-max scaler below.
cleaning_pipeline_scaled = Pipeline(steps=[
])

# Scaler used as the last step of the scaled numeric sub-pipeline.
scaler_minmax = MinMaxScaler()

#### Sub-Pipeline: Categorical Features

In [8]:
# Missing categorical values are replaced by the most frequent value of the column.
categorical_imputer = SimpleImputer(strategy="most_frequent").set_output(transform="pandas")
# Dense one-hot encoding; handle_unknown='ignore' avoids errors on unseen categories.
onehot_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False).set_output(transform="pandas")

# Sub-pipeline for the categorical columns: impute first, then encode.
categorical_pipeline = Pipeline(steps=[
    ('imputer', categorical_imputer),
    ('onehot', onehot_encoder),
])


#### Sub-Pipeline: Numerical Features

In [9]:
# Missing numeric values are replaced by the column mean.
numerical_imputer = SimpleImputer(strategy="mean").set_output(transform="pandas")

# Sub-pipeline for numeric columns: impute, then apply the outlier step.
numeric_pipeline = Pipeline(steps=[
    ('imputer', numerical_imputer),
    ('outlier_detection', outlier_detection),
])

# Same sub-pipeline with min-max scaling appended as the final step.
numeric_pipeline_scaled = Pipeline(steps=[
    ('imputer', numerical_imputer),
    ('outlier_detection', outlier_detection),
    ('scaler', scaler_minmax),
])

#### Sub-Pipeline: Label

In [10]:
# Sub-pipeline for the label column: only mean imputation, reusing the numeric imputer.
label_pipeline = Pipeline(steps=[('imputer', numerical_imputer)])


### Pipeline

In [11]:
# Route every column group through its sub-pipeline. Output columns carry the
# transformer name as a prefix (e.g. 'num__...', 'cat__...', 'label__quality').
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_pipeline, numerical_features),
        ('cat', categorical_pipeline, categorical_features),
        ('label', label_pipeline, label)
    ]).set_output(transform="pandas")
# Assign the whole step list instead of appending to it: appending made this
# cell non-idempotent, because a second run added the same step names again.
cleaning_pipeline.steps = [
    ('preprocessor', preprocessor),
    ("outlier_detection_label", outlier_detection_label),
    ('feature_selection', feature_selection),
]

# Scaled variant: identical, except that numeric columns go through
# numeric_pipeline_scaled.
preprocessor_scaled = ColumnTransformer(
    transformers=[
        ('num', numeric_pipeline_scaled, numerical_features),
        ('cat', categorical_pipeline, categorical_features),
        ('label', label_pipeline, label)
    ]).set_output(transform="pandas")
cleaning_pipeline_scaled.steps = [
    ('preprocessor', preprocessor_scaled),
    ("outlier_detection_label", outlier_detection_label),
    ('feature_selection', feature_selection),
]
cleaning_pipeline_scaled

## Training Pipeline

In [12]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.neighbors import KNeighborsRegressor

# Model zoo for the grid searches below. Each entry provides:
#   "name"            - label used in the printed report and in output file names
#   "estimator"       - unfitted scikit-learn regressor
#   "hyperparameters" - parameter grid handed to GridSearchCV
#   "scalable"        - 1: train on the output of cleaning_pipeline_scaled,
#                       0: train on the output of cleaning_pipeline
# NOTE(review): the hard-coded "best_score" keys on the last three entries are
# not read by any cell visible in this notebook - confirm whether they are
# still needed.
models = [
    {
        "name": "LinearRegression",
        "estimator": LinearRegression(),
        "hyperparameters":
            {
                "fit_intercept": [True, False],
                "copy_X": [True, False]
            },
        "scalable": 0
    },
    {
        "name": "DecisionTreeRegressor",
        "estimator": DecisionTreeRegressor(),
        "hyperparameters":
            {
                "criterion": ["squared_error", "friedman_mse"],
                "splitter": ["best", "random"],
                "max_depth": [None, 2, 5, 10],
                "min_samples_split": [2, 5, 10],
                "min_samples_leaf": [1, 5, 10]
            },
        "scalable": 0
    },
    {
        "name": "RandomForestRegressor",
        "estimator": RandomForestRegressor(),
        "hyperparameters":
            {
                "n_estimators": [100, 200],
                "criterion": ["squared_error", "friedman_mse"],
                "max_depth": [None, 2, 5, 10],
                "min_samples_split": [2, 5, 10],
                "min_samples_leaf": [1, 5, 10]
            },
        "scalable": 0
    },
    {
        "name": "Gradient Boosting Regressor",
        "estimator": GradientBoostingRegressor(),
        "hyperparameters":
        {       
                "n_estimators": [100, 200, 500],
                "max_depth": [None, 3, 5, 10],
                "min_samples_split": [2, 5, 10],
                "learning_rate": [0.1, 0.05, 0.001],
                "loss": ['squared_error', 'absolute_error', 'huber'],
        },
        "scalable": 0
    }, 
     {
        "name": "Support Vector Machine",
        "estimator": SVR(),
        "hyperparameters": {
            "C": [1, 10, 100],
            "kernel": ["rbf", "linear", "poly"]
        },
        "best_score": 0.5567442927702857,
        "scalable": 1
    },
    {
        "name": "ANN",
        "estimator": MLPRegressor(),
        "hyperparameters": {
            'activation': ['identity', 'logistic', 'tanh', 'relu'],
            'alpha': [0.0001, 0.001, 0.01],
            'hidden_layer_sizes': [(100, ),(100, 50), (100, 50, 25), (100, 75, 50, 25)],
            'learning_rate': ['constant', 'invscaling', 'adaptive'],
            'solver': ['adam', 'lbfgs']
        },
        "best_score": 0.5567442927702857,
        "scalable": 1
    }, 
    {
        "name": "KNN",
        "estimator": KNeighborsRegressor(),
        "hyperparameters": {
            'n_neighbors': [3, 5, 7, 9, 11, 13, 15],
            'weights' : ['uniform', 'distance']
        },
        "best_score": 0.5567442927702857,
        "scalable": 1
    }
]

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
import pickle


best_models = []

def god_function(dirty_df):
    """Clean the data, grid-search every entry of ``models`` and save the results.

    For each model the matching cleaning pipeline is fitted on ``dirty_df``,
    the cleaned frame is split 80/20 (stratified on the label), a 10-fold
    ``GridSearchCV`` is run on the training part, and the outcome is printed.
    A summary record is appended to the module-level ``best_models`` list,
    ``./models/best_models.json`` is rewritten after every model, and the best
    estimator is pickled to ``./models/<name>.pkl``.

    NOTE(review): the cleaning pipeline is fitted on the whole frame before
    the train/test split, so the held-out rows take part in fitting the
    preprocessing - the reported test scores may be optimistic.

    Parameters
    ----------
    dirty_df : pandas.DataFrame
        Raw data containing the feature columns and the 'quality' label.
    """
    # The output directory is not guaranteed to exist on a fresh checkout.
    os.makedirs("./models", exist_ok=True)

    for model in models:
        print(model["name"])
        print("-"*len(model["name"]))

        # Pick the pipeline explicitly. Previously an unexpected "scalable"
        # value silently reused the frame cleaned for the previous model
        # (or raised NameError on the first one).
        if model.get("scalable") == 1:
            pipeline = cleaning_pipeline_scaled
        else:
            pipeline = cleaning_pipeline
        clean_df = pd.DataFrame(pipeline.fit_transform(dirty_df))

        X = clean_df.drop('label__quality', axis=1)
        y = clean_df['label__quality']
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0, stratify=y)
        grid = GridSearchCV(model["estimator"], model["hyperparameters"], cv=10, n_jobs=-1)
        grid = grid.fit(X_train, y_train)

        # Score the hold-out set once and reuse the value for log and record.
        test_score = grid.score(X_test, y_test)

        print("Best Parameters:")
        print(grid.best_params_)
        print("")
        print("Best Score:", grid.best_score_, "\t", "Test Score:", test_score)
        print("Fit Time:", grid.refit_time_)
        print("")
        m = {
            "name": model["name"],
            "best_params": grid.best_params_,
            "best_score": grid.best_score_,
            "fit_time": grid.refit_time_,
            "test_score":  test_score
        }
        best_models.append(m)

        # Rewrite the summary after every model so that partial results
        # survive an interrupted run.
        with open("./models/best_models.json", "w") as f:
            json.dump(best_models, f, indent=4)

        # Persist the refitted best estimator of this grid search.
        with open("./models/" + model["name"] + '.pkl', 'wb') as f:
            pickle.dump(grid.best_estimator_, f)

god_function(df)

### Training with 7000 samples

In [16]:
import pickle

best_models = []
def god_function_goes_server(dirty_df):
    """Grid-search every entry of ``models`` on the data minus 1000 held-out rows.

    1000 rows are sampled (``random_state=42``) and removed from ``dirty_df``
    before any fitting. For each model the matching cleaning pipeline is
    fitted on the remaining rows, the cleaned frame is split 80/20 (stratified
    on the label) and a 5-fold ``GridSearchCV`` is run on the training part.
    Best estimators are pickled to ``./models/7000_samples/<name>.pkl`` and,
    once all models are done, the summaries collected in the module-level
    ``best_models`` list are written to
    ``./models/best_models_7000_samples.json``.

    NOTE(review): the cleaning pipeline is fitted before the train/test split,
    so the inner test rows take part in fitting the preprocessing.

    Parameters
    ----------
    dirty_df : pandas.DataFrame
        Raw data containing the feature columns and the 'quality' label.
    """
    # The output directories are not guaranteed to exist on a fresh checkout.
    os.makedirs("./models/7000_samples", exist_ok=True)

    # Remove the validation rows so that no step below ever sees them.
    df_validation = dirty_df.sample(n=1000, random_state=42)
    dirty_df = dirty_df.drop(df_validation.index)

    for model in models:
        print(model["name"])
        print("-"*len(model["name"]))

        # Pick the pipeline explicitly. Previously an unexpected "scalable"
        # value silently reused the frame cleaned for the previous model
        # (or raised NameError on the first one).
        if model.get("scalable") == 1:
            pipeline = cleaning_pipeline_scaled
        else:
            pipeline = cleaning_pipeline
        clean_df = pd.DataFrame(pipeline.fit_transform(dirty_df))

        X = clean_df.drop('label__quality', axis=1)
        y = clean_df['label__quality']
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0, stratify=y)
        grid = GridSearchCV(model["estimator"], model["hyperparameters"], cv=5, n_jobs=-1)
        grid = grid.fit(X_train, y_train)

        # Score the hold-out set once and reuse the value for log and record.
        test_score = grid.score(X_test, y_test)

        print("Best Parameters:")
        print(grid.best_params_)
        print("")
        print("Best Score:", grid.best_score_, "\t", "Test Score:", test_score)
        print("Fit Time:", grid.refit_time_)
        print("")
        m = {
            "name": model["name"],
            "best_params": grid.best_params_,
            "best_score": grid.best_score_,
            "fit_time": grid.refit_time_,
            # Recorded for parity with the summary written by god_function.
            "test_score": test_score,
        }
        best_models.append(m)

        # Persist the refitted best estimator of this grid search.
        with open("./models/7000_samples/" + model["name"] + '.pkl', 'wb') as f:
            pickle.dump(grid.best_estimator_, f)

    # Write the summary once, after all models have been processed.
    print("Saving best models to json…", end=" ")
    with open("./models/best_models_7000_samples.json", "w") as f:
        json.dump(best_models, f, indent=4)
    print("Done")

god_function_goes_server(df)

LinearRegression
----------------


  vif = 1. / (1. - r_squared_i)


Dropping Features:  ['num__free sulfur dioxide', 'num__calcium', 'num__minerals', 'cat__wine type_Pinot noir', 'num__residual sugar']
Best Parameters:
{'copy_X': True, 'fit_intercept': True}

Best Score: 0.5327285592690533 	 Test Score: 0.5375349921589769
Fit Time: 0.009988546371459961

DecisionTreeRegressor
---------------------


  vif = 1. / (1. - r_squared_i)


Dropping Features:  ['num__free sulfur dioxide', 'num__calcium', 'num__minerals', 'cat__wine type_Pinot noir', 'num__residual sugar']
Best Parameters:
{'criterion': 'squared_error', 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'splitter': 'best'}

Best Score: 0.7073883989242278 	 Test Score: 0.7744701384175121
Fit Time: 0.07996821403503418

RandomForestRegressor
---------------------


  vif = 1. / (1. - r_squared_i)


Dropping Features:  ['num__free sulfur dioxide', 'num__calcium', 'num__minerals', 'cat__wine type_Pinot noir', 'num__residual sugar']
Best Parameters:
{'criterion': 'friedman_mse', 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}

Best Score: 0.8172918706792054 	 Test Score: 0.8486744346798154
Fit Time: 13.01500129699707

Gradient Boosting Regressor
---------------------------


  vif = 1. / (1. - r_squared_i)


Dropping Features:  ['num__free sulfur dioxide', 'num__calcium', 'num__minerals', 'cat__wine type_Pinot noir', 'num__residual sugar']
Best Parameters:
{'learning_rate': 0.05, 'loss': 'absolute_error', 'max_depth': None, 'min_samples_split': 10, 'n_estimators': 500}

Best Score: 0.848391925827347 	 Test Score: 0.8800458159047987
Fit Time: 198.70731782913208

Support Vector Machine
----------------------


  vif = 1. / (1. - r_squared_i)


Dropping Features:  ['num__free sulfur dioxide', 'num__calcium', 'num__minerals', 'cat__wine type_Pinot noir', 'num__residual sugar']
Best Parameters:
{'C': 100, 'kernel': 'rbf'}

Best Score: 0.6261900880869085 	 Test Score: 0.6410895596826263
Fit Time: 38.09536671638489

ANN
---


  vif = 1. / (1. - r_squared_i)


Dropping Features:  ['num__free sulfur dioxide', 'num__calcium', 'num__minerals', 'cat__wine type_Pinot noir', 'num__residual sugar']




Best Parameters:
{'activation': 'relu', 'alpha': 0.0001, 'hidden_layer_sizes': (100, 75, 50, 25), 'learning_rate': 'constant', 'solver': 'adam'}

Best Score: 0.6141883495943115 	 Test Score: 0.6303336825710211
Fit Time: 43.284379959106445

KNN
---


  vif = 1. / (1. - r_squared_i)


Dropping Features:  ['num__free sulfur dioxide', 'num__calcium', 'num__minerals', 'cat__wine type_Pinot noir', 'num__residual sugar']
Best Parameters:
{'n_neighbors': 15, 'weights': 'distance'}

Best Score: 0.7970711096671389 	 Test Score: 0.8511037942759834
Fit Time: 0.033989667892456055

Saving best models to json… Done


In [17]:
import numpy as np
# Show the NumPy version installed in the running kernel.
print(np.__version__)

1.24.3


# TESTING

In [None]:
def calculate_significant_features(X, y, model, alpha=0.05):
    """Return the features of a fitted linear model whose coefficients are significant.

    A two-sided t-test is run on every coefficient, using the ordinary
    least-squares covariance estimate ``mse * (Z'Z)^-1``. ``Z`` is the design
    matrix, i.e. ``X`` plus a leading column of ones when the model has an
    intercept, so that the standard errors and the residual degrees of
    freedom refer to the same model.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix the model was fitted on; its column names label the rows
        of the result.
    y : array-like
        Target values aligned with the rows of ``X``.
    model : fitted estimator
        Must provide ``coef_`` and ``predict``. If it has a ``fit_intercept``
        attribute that is false, no intercept column is added; otherwise an
        intercept is assumed.
    alpha : float, default 0.05
        Significance level; features with ``p-value <= alpha`` are kept.

    Returns
    -------
    pandas.DataFrame
        Columns 'Feature', 'Coefficient', 'Standard Error', 't-value',
        'p-value', restricted to the significant features.

    Raises
    ------
    ValueError
        If there are not more samples than estimated parameters.
    AttributeError
        If ``model`` exposes no ``coef_``.
    """
    # Local import: scipy is not imported at the top of this notebook, and the
    # bare name `stats` used here previously raised NameError.
    from scipy import stats

    feature_names = list(X.columns)
    features = np.asarray(X, dtype=float)
    target = np.asarray(y, dtype=float)
    # ravel() also accepts estimators that store coef_ with shape (1, n_features).
    coefficients = np.ravel(model.coef_).astype(float)

    n_samples = features.shape[0]
    has_intercept = bool(getattr(model, "fit_intercept", True))
    if has_intercept:
        design = np.column_stack([np.ones(n_samples), features])
    else:
        design = features

    # Residual degrees of freedom: samples minus estimated parameters.
    dof = n_samples - design.shape[1]
    if dof <= 0:
        raise ValueError(
            "need more samples than estimated parameters to compute standard errors"
        )

    residuals = target - np.ravel(model.predict(X))
    mse = np.sum(residuals ** 2) / dof
    # pinv equals inv for a full-rank design and does not raise on a
    # rank-deficient one (e.g. a complete set of one-hot columns plus intercept).
    variance_covariance_matrix = mse * np.linalg.pinv(design.T @ design)
    standard_errors = np.sqrt(np.diagonal(variance_covariance_matrix))
    if has_intercept:
        # Entry 0 belongs to the intercept, which is not reported.
        standard_errors = standard_errors[1:]

    t_values = coefficients / standard_errors
    # sf(t) == 1 - cdf(t), but keeps precision for very small p-values.
    p_values = 2 * stats.t.sf(np.abs(t_values), dof)

    # Build the table in one go instead of growing it row by row.
    prediction_metrics = pd.DataFrame({
        'Feature': feature_names,
        'Coefficient': coefficients,
        'Standard Error': standard_errors,
        't-value': t_values,
        'p-value': p_values,
    })

    # One mask for both the report and the result, so that no feature can be
    # silently dropped without being listed (previously p == 0.05 was).
    is_significant = prediction_metrics['p-value'] <= alpha
    features_to_remove = prediction_metrics.loc[~is_significant, 'Feature'].values
    print("Removing features: ", features_to_remove)
    return prediction_metrics[is_significant]

In [None]:
# Models for the significance experiment: grid-search, drop the features whose
# coefficients are not significant, then grid-search again on the reduced set.
test_models = [{
        "name": "LinearRegression",
        "estimator": LinearRegression(),
        "hyperparameters":
            {
                "fit_intercept": [True, False],
                "copy_X": [True, False],
                "n_jobs": [-1]
            }
    },
    {
        "name": "Support Vector Machine",
        "estimator": SVR(),
        "hyperparameters":
        {
            "kernel": ["linear", "poly", "rbf", "sigmoid"],
            "degree": [1, 2, 3, 4, 5],
            "gamma": ["scale", "auto"],
            "C": [0.1, 1, 10, 100, 1000],
            "epsilon": [0.1, 0.2, 0.3, 0.4, 0.5]
    }
    }
    ]
# Work on a copy so the raw frame stays untouched.
dirty_df = df.copy(deep=True)


def _grid_search_and_report(model, X_train, y_train, X_test, y_test):
    """Run a 5-fold grid search for one model entry, print the report, return the grid."""
    grid = GridSearchCV(model["estimator"], model["hyperparameters"], cv=5, n_jobs=-1)
    grid = grid.fit(X_train, y_train)
    print("Best Parameters:")
    print(grid.best_params_)
    print("")
    print("Best Score:", grid.best_score_, "\t", "Test Score:", grid.score(X_test, y_test))
    print("Fit Time:", grid.refit_time_)
    print("")
    return grid


for model in test_models:
    print(model["name"])
    print("-"*len(model["name"]))
    pipeline = cleaning_pipeline
    clean_df = pd.DataFrame(pipeline.fit_transform(dirty_df))
    X = clean_df.drop('label__quality', axis=1)
    y = clean_df['label__quality']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0, stratify=y)

    # First pass: all features.
    grid = _grid_search_and_report(model, X_train, y_train, X_test, y_test)
    best_model = grid.best_estimator_

    # The significance test needs linear coefficients. SVR only exposes coef_
    # with the linear kernel, so skip the second pass instead of crashing
    # when another kernel wins the grid search.
    if not hasattr(best_model, "coef_"):
        print("Skipping significance test: best estimator exposes no coef_")
        print("")
        continue

    significant_features = calculate_significant_features(X_train, y_train, best_model)
    # Keep only the columns that passed the significance test.
    X_train = X_train[significant_features['Feature'].values]
    X_test = X_test[significant_features['Feature'].values]

    # Second pass: significant features only.
    grid = _grid_search_and_report(model, X_train, y_train, X_test, y_test)


In [None]:
# `score` needs data: calling it without arguments raises TypeError.
# Score through `grid`, the most recent search from the loop above, because it
# was fitted on the same columns that X_test currently holds.
grid.score(X_test, y_test)

# END TESTING

# Simulation Meth-Daten

In [None]:
# Load the pickled best model. The context manager closes the file handle.
# Only unpickle files you produced yourself: pickle.load can execute arbitrary code.
with open('best_model__random_forest.pkl', 'rb') as model_file:
    best_model = pickle.load(model_file)

# Hold out 1000 random rows and keep the rest.
df_validation = df.sample(n=1000, random_state=1)
# The raw frame only has a 'quality' column; 'label__quality' exists only after
# the cleaning pipeline has run, so clean the remaining rows before splitting.
clean_df = pd.DataFrame(cleaning_pipeline.fit_transform(df.drop(df_validation.index)))

X = clean_df.drop('label__quality', axis=1)
y = clean_df['label__quality']

X_train, X_validation, y_train, y_validation = train_test_split(X, y, test_size=0.2, random_state=200, stratify=y)

print(best_model.score(X_validation, y_validation))

In [None]:
# Hold out 1000 raw rows for the final validation.
df_validation = df.sample(n=1000, random_state=42)

# Fit the cleaning pipeline on the remaining rows only.
clean_df = df.drop(df_validation.index)
clean_df = pd.DataFrame(cleaning_pipeline.fit_transform(clean_df))

X_clean = clean_df.drop('label__quality', axis=1)
y_clean = clean_df['label__quality']

X_train_clean, X_test_clean, y_train_clean, y_test_clean = train_test_split(X_clean, y_clean, test_size=0.2, random_state=200, stratify=y_clean)
# random_state pins the forest's bootstrap and feature sampling, so the scores
# in the following cells are reproducible across kernel restarts.
best_model = RandomForestRegressor(
    criterion='squared_error',
    max_depth=None,
    min_samples_leaf=1,
    min_samples_split=2,
    n_estimators=200,
    random_state=0,
)
best_model = best_model.fit(X_train_clean, y_train_clean)


In [None]:
# Score of the refitted model on the inner 20 % test split of the cleaned data.
best_model.score(X_test_clean, y_test_clean)

In [None]:
# Apply the already fitted cleaning pipeline to the held-out rows. Use
# transform(), not fit_transform(), so that nothing is re-fitted on them, and
# store the result under a new name so that this cell can be re-run.
# NOTE(review): the FunctionTransformer steps (outlier handling, feature
# selection) are recomputed on whatever frame they receive, so they may act
# differently here than on the training rows - confirm that this is intended.
validation_clean = pd.DataFrame(cleaning_pipeline.transform(df_validation))
# Feed the model exactly the columns it was fitted on, in the same order.
X_validation = validation_clean[list(best_model.feature_names_in_)]
y_validation = validation_clean['label__quality']

best_model.score(X_validation, y_validation)