# God Pipeline

In [110]:
import sys


import os
from dotenv import load_dotenv
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
#1. load environment variables and data

# load environment variables
load_dotenv()

#add working directory to sys path to execute utils/dataset.py
working_dir = os.environ.get("WORKING_DIRECTORY")
sys.path.insert(0, working_dir)

from utils.dataset import get_data 

from utils.pipeline_moduls import fs_colinearity, fs_vif, outlier_label, outlier_num, dim_reduction

df = get_data()

df.head(10)


Loading data from wines: 8000it [00:00, 17481.72it/s]


Unnamed: 0,wine type,fixed acidity,volatile acidity,citric acid,residual sugar,magnesium,flavanoids,minerals,calcium,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,Pinot noir,5.8,0.15,0.49,1.1,76.729301,894.94,186.639301,109.91,0.048,21.0,98.0,0.9929,3.19,0.48,9.2,5
1,Merlot,6.6,0.25,0.32,5.6,4.795712,1160.95,251.875712,247.08,0.039,15.0,68.0,0.99163,2.96,0.52,11.1,6
2,Chardonnay,6.7,0.21,0.34,1.5,85.19371,789.82,304.70371,219.51,0.035,45.0,123.0,0.98949,3.24,0.36,12.6,7
3,Merlot,8.3,0.28,0.27,17.5,11.976525,777.86,237.586525,225.61,0.045,48.0,253.0,1.00014,3.02,0.56,9.1,6
4,Merlot,7.5,0.42,0.19,6.9,5.599673,785.72,95.399673,89.8,0.041,62.0,150.0,0.99508,3.23,0.37,10.0,6
5,Merlot,7.3,0.34,0.3,1.3,22.403749,1044.95,289.523749,267.12,0.057,25.0,173.0,0.9948,3.26,0.51,9.1,6
6,Merlot,7.6,0.21,0.49,2.5,23.875866,888.61,133.545866,109.67,0.047,20.0,130.0,0.99178,3.15,0.48,11.1,5
7,Chardonnay,6.0,0.25,0.4,5.7,23.309699,1381.79,266.529699,243.22,0.052,56.0,152.0,0.99398,3.16,0.88,10.5,6
8,Cabernet Sauvignon,6.7,0.18,0.19,4.7,49.165745,1456.41,269.915745,220.75,0.046,57.0,161.0,0.9946,3.32,0.66,10.5,6
9,Gamay,7.7,0.28,0.39,8.9,54.450579,929.44,377.690579,323.24,0.036,8.0,117.0,0.9935,3.06,0.38,12.0,2


In [None]:
categorical_features = df.select_dtypes(include=['object']).columns
numerical_features = df.select_dtypes(include=[np.number]).columns
#drop 'quality' from numerical features (its a series)
numerical_features = numerical_features.drop('quality')
label = pd.Series('quality')

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

## Cleaning Pipeline

### Modules

#### Label Outlier Detection

In [None]:
outlier_detection_label = FunctionTransformer(outlier_label).set_output(transform="pandas")


#### Feature Outlier Detection

In [None]:
outlier_detection = FunctionTransformer(outlier_num).set_output(transform="pandas")

#### Feature Selection

In [None]:
import json

def feature_selection(df,colinearity_threshold=0.5, correlation_threshold=0.1, vif_threshold=5):
    dropped_features = []
    dropped_features_set = set(dropped_features)

    # Add elements from fs_colinearity to dropped_features_set
    dropped_features_set.update(fs_colinearity(df, colinearity_threshold, correlation_threshold))

    # Add elements from fs_vif to dropped_features_set
    dropped_features_set.update(fs_vif(df, correlation_threshold, vif_threshold))

    # Convert dropped_features_set back to a list
    dropped_features = list(dropped_features_set)
      # Save dropped features list to a JSON file
    with open('dropped_features.json', 'w') as f:
        json.dump(dropped_features, f)
    return df
feature_selection = FunctionTransformer(feature_selection).set_output(transform="pandas")

### Pipeline Building

In [None]:
cleaning_pipeline = Pipeline(steps=[
])

#### Sub-Pipeline: Categorical Features

In [None]:
categorical_imputer = SimpleImputer(strategy="most_frequent").set_output(transform="pandas")

#pipeline for categorical features
categorical_pipeline = Pipeline(steps=[])
categorical_pipeline.steps.append(('imputer', categorical_imputer))
categorical_pipeline.steps.append(('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False).set_output(transform="pandas")))

#### Sub-Pipeline: Numerical Features

In [None]:
#pipeline for numerical features
numeric_pipeline = Pipeline(steps=[])
numerical_imputer = SimpleImputer(strategy="mean").set_output(transform="pandas")

numeric_pipeline.steps.append(('imputer', numerical_imputer))
numeric_pipeline.steps.append(('outlier_detection', outlier_detection))

#### Sub-Pipeline: Label

In [None]:
#pipeline for label
label_pipeline = Pipeline(steps=[])
label_pipeline.steps.append(('imputer', numerical_imputer))


### Pipeline

In [None]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_pipeline, numerical_features),
        ('cat', categorical_pipeline, categorical_features),
        ('label', label_pipeline, label)
    ]).set_output(transform="pandas")
cleaning_pipeline.steps.append(('preprocessor', preprocessor))
cleaning_pipeline.steps.append(("outlier_detection_label", outlier_detection_label))
cleaning_pipeline.steps.append(('feature_selection', feature_selection))
cleaning_pipeline

## Training Pipeline

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR

models = [
    {
        "name": "LinearRegression",
        "estimator": LinearRegression(),
        "hyperparameters":
            {
                "fit_intercept": [True, False],
                "copy_X": [True, False],
                "n_jobs": [-1]
            }
    },
    {
        "name": "DecisionTreeRegressor",
        "estimator": DecisionTreeRegressor(),
        "hyperparameters":
            {
                "criterion": ["squared_error"],
                "splitter": ["best", "random"],
                "max_depth": [None, 2, 5, 10],
                "min_samples_split": [2, 5, 10],
                "min_samples_leaf": [1, 5, 10]
            }
    },
    {
        "name": "RandomForestRegressor",
        "estimator": RandomForestRegressor(),
        "hyperparameters":
            {
                "n_estimators": [100, 200],
                "criterion": ["squared_error"],
                "max_depth": [None, 2, 5, 10],
                "min_samples_split": [2, 5, 10],
                "min_samples_leaf": [1, 5, 10],
                "n_jobs": [-1]
            }
    },
    {
        "name": "Gradient Boosting Regressor",
        "estimator": GradientBoostingRegressor(),
        "hyperparameters":
        {
                "n_estimators": [100, 200, 500],
                "max_depth": [None, 2, 5, 10],
                "min_samples_split": [2, 5, 10],
                "learning_rate": [0.01, 0.011, 0.012],
                "loss": ["squared_error"],
        }
    },
    {
        "name": "Support Vector Machine",
        "estimator": SVR(),
        "hyperparameters":
        {
            "kernel": ["linear", "poly", "rbf", "sigmoid"],
            "degree": [1, 2, 3, 4, 5],
            "gamma": ["scale", "auto"],
            "C": [0.1, 1, 10, 100, 1000],
            "epsilon": [0.1, 0.2, 0.3, 0.4, 0.5]
    }
    }
]

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
import pickle


best_models_runtime = []

def god_function(dirty_df):
    for model in models:
        print(model["name"])
        print("-"*len(model["name"]))
        clean_df = pd.DataFrame(cleaning_pipeline.fit_transform(dirty_df))
        X = clean_df.drop('label__quality', axis=1)
        y = clean_df['label__quality']
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0, stratify=y)
        grid = GridSearchCV(model["estimator"], model["hyperparameters"], cv=5, n_jobs=-1)
        grid = grid.fit(X_train, y_train)
        print("Best Parameters:")
        print(grid.best_params_)
        print("")
        print("Best Score:", grid.best_score_, "\t", "Test Score:", grid.score(X_test, y_test))
        print("Fit Time:", grid.refit_time_)
        print("")
        #save best estimator from grid with pickle
        with open("./models/" + model["name"] + '.pkl', 'wb') as f:
            pickle.dump(grid.best_estimator_, f)
        
        #save best model runtime
        best_models_runtime.append({
            "name": model["name"], 
            "runtime": grid.refit_time_,
            "best_score": grid.best_score_,
            "best_params": grid.best_params_
        })

god_function(df)

# TESTING

In [113]:
def calculate_significant_features(X, y, model):
    coefficients = model.coef_
    intercept = model.intercept_


    residuals = y - model.predict(X)

    n = len(y)
    p = X.shape[1]
    df = n - p - 1

    mse = np.sum(residuals ** 2) / df
    variance_covariance_matrix = mse * np.linalg.inv(np.dot(X.T, X))
    standard_errors = np.sqrt(np.diagonal(variance_covariance_matrix))


    t_values = coefficients / standard_errors
    p_values = 2 * (1 - stats.t.cdf(np.abs(t_values), df))

    headers = ['Feature', 'Coefficient', 'Standard Error', 't-value', 'p-value']

    prediction_metrics = pd.DataFrame(columns=headers)
    for i in range(len(coefficients)):
        prediction_metrics.loc[i] = [X.columns.values[i], coefficients[i], standard_errors[i], t_values[i], p_values[i]]

    #remove rows with p-value > 0.05
    features_to_remove = prediction_metrics[prediction_metrics['p-value'] > 0.05]['Feature'].values
    print("Removing features: ", features_to_remove)
    prediction_metrics = prediction_metrics[prediction_metrics['p-value'] < 0.05]
    return prediction_metrics

In [115]:
test_models = [{
        "name": "LinearRegression",
        "estimator": LinearRegression(),
        "hyperparameters":
            {
                "fit_intercept": [True, False],
                "copy_X": [True, False],
                "n_jobs": [-1]
            }
    },
    {
        "name": "Support Vector Machine",
        "estimator": SVR(),
        "hyperparameters":
        {
            "kernel": ["linear", "poly", "rbf", "sigmoid"],
            "degree": [1, 2, 3, 4, 5],
            "gamma": ["scale", "auto"],
            "C": [0.1, 1, 10, 100, 1000],
            "epsilon": [0.1, 0.2, 0.3, 0.4, 0.5]
    }
    }
    ]
dirty_df = df.copy(deep=True)
for model in test_models:
    print(model["name"])
    print("-"*len(model["name"]))
    pipeline = cleaning_pipeline
    #pipeline.steps.pop(2)
    clean_df = pd.DataFrame(pipeline.fit_transform(dirty_df))
    X = clean_df.drop('label__quality', axis=1)
    y = clean_df['label__quality']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0, stratify=y)
    grid = GridSearchCV(model["estimator"], model["hyperparameters"], cv=5, n_jobs=-1)
    grid = grid.fit(X_train, y_train)
    print("Best Parameters:")
    print(grid.best_params_)
    print("")
    print("Best Score:", grid.best_score_, "\t", "Test Score:", grid.score(X_test, y_test))
    print("Fit Time:", grid.refit_time_)
    print("")
    best_model = grid.best_estimator_
    significant_features = calculate_significant_features(X_train, y_train, best_model)
    #keep columns of X only if they are present in significant_features
    X_train = X_train[significant_features['Feature'].values]
    X_test = X_test[significant_features['Feature'].values]
    grid = GridSearchCV(model["estimator"], model["hyperparameters"], cv=5, n_jobs=-1)
    grid = grid.fit(X_train, y_train)
    print("Best Parameters:")
    print(grid.best_params_)
    print("")
    print("Best Score:", grid.best_score_, "\t", "Test Score:", grid.score(X_test, y_test))
    print("Fit Time:", grid.refit_time_)
    print("")


LinearRegression
----------------
Best Parameters:
{'copy_X': True, 'fit_intercept': False, 'n_jobs': -1}

Best Score: 0.5536345285850655 	 Test Score: 0.5586981830127015
Fit Time: 0.007969141006469727

Removing features:  ['num__citric acid' 'num__magnesium' 'num__flavanoids' 'num__minerals'
 'num__calcium' 'num__chlorides' 'num__total sulfur dioxide']
Best Parameters:
{'copy_X': True, 'fit_intercept': False, 'n_jobs': -1}

Best Score: 0.5534053988619101 	 Test Score: 0.5563535229323042
Fit Time: 0.004335880279541016

DecisionTreeRegressor
---------------------
Best Parameters:
{'criterion': 'squared_error', 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'splitter': 'best'}

Best Score: 0.7421017128377512 	 Test Score: 0.8814416670502254
Fit Time: 0.10364890098571777



AttributeError: 'DecisionTreeRegressor' object has no attribute 'coef_'

In [101]:
best_model.score()

Removing features:  ['num__citric acid' 'num__magnesium' 'num__flavanoids' 'num__minerals'
 'num__calcium' 'num__chlorides' 'num__total sulfur dioxide']


Unnamed: 0,Feature,Coefficient,Standard Error,t-value,p-value
0,num__fixed acidity,0.103786,0.017743,5.849269,5.181988e-09
1,num__volatile acidity,-1.893428,0.126769,-14.936076,0.0
3,num__residual sugar,0.08817,0.005914,14.907957,0.0
9,num__free sulfur dioxide,0.006213,0.000796,7.805723,6.883383e-15
11,num__density,-183.99863,15.936008,-11.546093,0.0
12,num__pH,0.850246,0.086788,9.796794,0.0
13,num__sulphates,0.621911,0.10046,6.19064,6.364433e-10
14,num__alcohol,0.134464,0.021098,6.373369,1.978397e-10
15,cat__wine type_Cabernet Sauvignon,183.454918,15.762737,11.638519,0.0
16,cat__wine type_Chardonnay,183.473879,15.762451,11.639933,0.0


# END TESTING

# Simulation Meth-Daten

In [None]:
#load best model with pickle
best_model = pickle.load(open('best_model__random_forest.pkl','rb'))
#select randomly 1000 data points from df and drop selected ones
df_validation = df.sample(n=1000, random_state=1)
clean_df = df.drop(df_validation.index)

X = clean_df.drop('label__quality', axis=1)
y = clean_df['label__quality']

X_train, X_validation, y_train, y_validation = train_test_split(X, y, test_size=0.2, random_state=200, stratify=y)

print(best_model.score(X_validation, y_validation))

In [None]:
df_validation = df.sample(n=1000, random_state=42)

clean_df = df.drop(df_validation.index)
clean_df = pd.DataFrame(cleaning_pipeline.fit_transform(clean_df))

X_clean = clean_df.drop('label__quality', axis=1)
y_clean = clean_df['label__quality']

X_train_clean, X_test_clean, y_train_clean, y_test_clean = train_test_split(X_clean, y_clean, test_size=0.2, random_state=200, stratify=y_clean)
best_model = RandomForestRegressor(criterion = 'squared_error', max_depth = None, min_samples_leaf = 1, min_samples_split = 2, n_estimators = 200)
best_model = best_model.fit(X_train_clean, y_train_clean)


In [None]:
best_model.score(X_test_clean, y_test_clean)

In [None]:
df_validation = pd.DataFrame(cleaning_pipeline.fit_transform(df_validation))
X_validation = df_validation.drop('label__quality', axis=1)
y_validation = df_validation['label__quality']

best_model.score(X_validation, y_validation)