# God Pipeline

In [4]:
import sys


import os
from dotenv import load_dotenv
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
#1. load environment variables and data

# load environment variables
load_dotenv()

#add working directory to sys path to execute utils/dataset.py
working_dir = os.environ.get("WORKING_DIRECTORY")
sys.path.insert(0, working_dir)

from utils.dataset import get_data 

from utils.pipeline_moduls import fs_colinearity, fs_vif

df = get_data()

df.head(10)


Loading data from wines: 8000it [00:00, 17212.28it/s]


Unnamed: 0,wine type,fixed acidity,volatile acidity,citric acid,residual sugar,magnesium,flavanoids,minerals,calcium,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,Pinot noir,5.8,0.15,0.49,1.1,76.729301,894.94,186.639301,109.91,0.048,21.0,98.0,0.9929,3.19,0.48,9.2,5
1,Merlot,6.6,0.25,0.32,5.6,4.795712,1160.95,251.875712,247.08,0.039,15.0,68.0,0.99163,2.96,0.52,11.1,6
2,Chardonnay,6.7,0.21,0.34,1.5,85.19371,789.82,304.70371,219.51,0.035,45.0,123.0,0.98949,3.24,0.36,12.6,7
3,Merlot,8.3,0.28,0.27,17.5,11.976525,777.86,237.586525,225.61,0.045,48.0,253.0,1.00014,3.02,0.56,9.1,6
4,Merlot,7.5,0.42,0.19,6.9,5.599673,785.72,95.399673,89.8,0.041,62.0,150.0,0.99508,3.23,0.37,10.0,6
5,Merlot,7.3,0.34,0.3,1.3,22.403749,1044.95,289.523749,267.12,0.057,25.0,173.0,0.9948,3.26,0.51,9.1,6
6,Merlot,7.6,0.21,0.49,2.5,23.875866,888.61,133.545866,109.67,0.047,20.0,130.0,0.99178,3.15,0.48,11.1,5
7,Chardonnay,6.0,0.25,0.4,5.7,23.309699,1381.79,266.529699,243.22,0.052,56.0,152.0,0.99398,3.16,0.88,10.5,6
8,Cabernet Sauvignon,6.7,0.18,0.19,4.7,49.165745,1456.41,269.915745,220.75,0.046,57.0,161.0,0.9946,3.32,0.66,10.5,6
9,Gamay,7.7,0.28,0.39,8.9,54.450579,929.44,377.690579,323.24,0.036,8.0,117.0,0.9935,3.06,0.38,12.0,2


In [5]:
categorical_features = df.select_dtypes(include=['object']).columns
numerical_features = df.select_dtypes(include=[np.number]).columns
#drop 'quality' from numerical features (its a series)
numerical_features = numerical_features.drop('quality')
label = pd.Series('quality')

## God Pipeline

In [6]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import OneHotEncoder
#import FunctionTransformer

### Cleaning Pipeline

#### Label Outlier Detection

In [7]:
def outlier_detection_label(df):
    #detect outliers and impute them with the simple imputer
    from sklearn.impute import KNNImputer

    #detect outliers with z-score and set them to NaN with = np.nan
    from scipy import stats
    z = np.abs(stats.zscore(df.iloc[:, df.shape[1]-1]))
    df.iloc[:, df.shape[1]-1][(z >= 3)] = np.nan


    #impute outliers with linear regression
    imputer = KNNImputer(n_neighbors=5).set_output(transform="pandas")
    df = imputer.fit_transform(df)
    
    return df

outlier_detection_label = FunctionTransformer(outlier_detection_label).set_output(transform="pandas")




#### Feature Outlier Detection

In [8]:
def outlier_detection(df):
    #detect outliers and impute them with the simple imputer
    from sklearn.impute import SimpleImputer

    #detect outliers
    Q1 = df.quantile(0.25)
    Q3 = df.quantile(0.75)
    IQR = Q3 - Q1

    #detect outliers for each column and set them to NaN
    for col in df.columns:
        df.loc[(df[col] < (Q1[col] - 1.5 * IQR[col])) | (df[col] > (Q3[col] + 1.5 * IQR[col])), col] = np.nan

    #impute outliers with median
    imputer = SimpleImputer(strategy='median').set_output(transform="pandas")
    df = imputer.fit_transform(df)

    return df
outlier_detection = FunctionTransformer(outlier_detection).set_output(transform="pandas")

#### Feature Selection

In [21]:
import json

def feature_selection(df,colinearity_threshold=0.5, correlation_threshold=0.1, vif_threshold=5):
    dropped_features = []
    dropped_features_set = set(dropped_features)

    # Add elements from fs_colinearity to dropped_features_set
    dropped_features_set.update(fs_colinearity(df, colinearity_threshold, correlation_threshold))

    # Add elements from fs_vif to dropped_features_set
    dropped_features_set.update(fs_vif(df, correlation_threshold, vif_threshold))

    # Convert dropped_features_set back to a list
    dropped_features = list(dropped_features_set)
      # Save dropped features list to a JSON file
    with open('dropped_features.json', 'w') as f:
        json.dump(dropped_features, f)
    return df
feature_selection = FunctionTransformer(feature_selection).set_output(transform="pandas")



In [22]:
cleaning_pipeline = Pipeline(steps=[
])

categorical_imputer = SimpleImputer(strategy="most_frequent").set_output(transform="pandas")

numerical_imputer = SimpleImputer(strategy="mean").set_output(transform="pandas")

#pipeline for categorical features
categorical_pipeline = Pipeline(steps=[])
categorical_pipeline.steps.append(('imputer', categorical_imputer))
categorical_pipeline.steps.append(('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False).set_output(transform="pandas")))

#pipeline for numerical features
numeric_pipeline = Pipeline(steps=[])
numeric_pipeline.steps.append(('imputer', numerical_imputer))
numeric_pipeline.steps.append(('outlier_detection', outlier_detection))

#pipeline for label
label_pipeline = Pipeline(steps=[])
label_pipeline.steps.append(('imputer', numerical_imputer))

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_pipeline, numerical_features),
        ('cat', categorical_pipeline, categorical_features),
        ('label', label_pipeline, label)
    ]).set_output(transform="pandas")
cleaning_pipeline.steps.append(('preprocessor', preprocessor))
cleaning_pipeline.steps.append(("outlier_detection_label", outlier_detection_label))
cleaning_pipeline.steps.append(('feature_selection', feature_selection))
cleaning_pipeline

In [23]:
from sklearn.ensemble import RandomForestRegressor

training_pipeline_random_forst = Pipeline(
    steps=[("model", RandomForestRegressor())]
)
model = {
        "name": "RandomForestRegressor",
        "estimator": RandomForestRegressor(),
        "hyperparameters":
            {
                "model__n_estimators": [200],
                "model__criterion": ["squared_error"],
                "model__max_depth": [None],
                "model__min_samples_split": [2],
                "model__min_samples_leaf": [1]
            }
    }

In [24]:
#God Function based on example: LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

def god_function(dirty_df):
    
    clean_df = pd.DataFrame(cleaning_pipeline.fit_transform(dirty_df))
    X = clean_df.drop('label__quality', axis=1)
    y = clean_df['label__quality']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0, stratify=y)
    grid = GridSearchCV(training_pipeline_random_forst, model["hyperparameters"], cv=5)
    grid = grid.fit(X_train, y_train)
    print(grid.best_params_)
    print(grid.best_score_)
    print(grid.score(X_test, y_test))

god_function(df)

In [25]:
clean_df = pd.DataFrame(cleaning_pipeline.fit_transform(df))
X = clean_df.drop('label__quality', axis=1)
y = clean_df['label__quality']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0, stratify=y)
best_model = RandomForestRegressor(criterion = 'squared_error', max_depth = None, min_samples_leaf = 1, min_samples_split = 2, n_estimators = 200)
best_model = best_model.fit(X_train, y_train)
print(best_model.score(X_test, y_test))

#export best_model with pickle
import pickle
pickle.dump(best_model, open('best_model__random_forest.pkl','wb'))











--------Colinearity--------

High Colinearity between num__residual sugar and num__density with a value of 0.8316095689674599
Dropping num__residual sugar because of low correlation -0.07760006766406449 with quality

High Colinearity between num__minerals and num__calcium with a value of 0.9032394680532317
Dropping num__calcium because of low correlation -0.021263570937899847 with quality

High Colinearity between num__chlorides and num__alcohol with a value of 0.5298204778821368
Not Dropping num__chlorides because of high correlation -0.1936620103051994 with quality

High Colinearity between num__free sulfur dioxide and num__total sulfur dioxide with a value of 0.6048632677816177
Dropping num__free sulfur dioxide because of low correlation 0.019648020910944565 with quality

High Colinearity between num__total sulfur dioxide and num__density with a value of 0.5523519514518478
Not Dropping num__total sulfur dioxide because of high correlation -0.10790569243506677 with quality


  vif = 1. / (1. - r_squared_i)


Highest VIF Value, Feature: num__density with a value of 24.847234348058475
Not Dropping num__density because of high correlation -0.22713404725253997 with quality

Highest VIF Value, Feature: num__minerals with a value of 21.884148542872783
Dropping num__minerals because of low correlation -0.02917335964360098 with quality

Highest VIF Value, Feature: num__residual sugar with a value of 9.962782774181546
Dropping num__residual sugar because of low correlation -0.07760006766406449 with quality

Every VIF is below the threshold of 5! 

0.9152185586784313


In [26]:
#load best model with pickle
best_model = pickle.load(open('best_model__random_forest.pkl','rb'))
#select randomly 1000 data points from df and drop selected ones
df_validation = df.sample(n=1000, random_state=1)
clean_df = df.drop(df_validation.index)

X = clean_df.drop('label__quality', axis=1)
y = clean_df['label__quality']

X_train, X_validation, y_train, y_validation = train_test_split(X, y, test_size=0.2, random_state=200, stratify=y)

print(best_model.score(X_validation, y_validation))

KeyError: "['label__quality'] not found in axis"

# Simulation Meth-Daten

In [27]:
df_validation = df.sample(n=1000, random_state=42)

clean_df = df.drop(df_validation.index)
clean_df = pd.DataFrame(cleaning_pipeline.fit_transform(clean_df))

X_clean = clean_df.drop('label__quality', axis=1)
y_clean = clean_df['label__quality']

X_train_clean, X_test_clean, y_train_clean, y_test_clean = train_test_split(X_clean, y_clean, test_size=0.2, random_state=200, stratify=y_clean)
best_model = RandomForestRegressor(criterion = 'squared_error', max_depth = None, min_samples_leaf = 1, min_samples_split = 2, n_estimators = 200)
best_model = best_model.fit(X_train_clean, y_train_clean)












--------Colinearity--------

High Colinearity between num__residual sugar and num__density with a value of 0.830780406761989
Dropping num__residual sugar because of low correlation -0.08364822160050354 with quality

High Colinearity between num__minerals and num__calcium with a value of 0.9030737715248937
Dropping num__calcium because of low correlation -0.02806247904531667 with quality

High Colinearity between num__chlorides and num__alcohol with a value of 0.5286770193126731
Not Dropping num__chlorides because of high correlation -0.19340534323337413 with quality

High Colinearity between num__free sulfur dioxide and num__total sulfur dioxide with a value of 0.6009950373633854
Dropping num__free sulfur dioxide because of low correlation 0.013309191783902878 with quality

High Colinearity between num__total sulfur dioxide and num__density with a value of 0.5511630654263218
Not Dropping num__total sulfur dioxide because of high correlation -0.12018177495915404 with quality



  vif = 1. / (1. - r_squared_i)


Highest VIF Value, Feature: num__density with a value of 24.870624007480075
Not Dropping num__density because of high correlation -0.23155100567997977 with quality

Highest VIF Value, Feature: num__minerals with a value of 20.933597405227317
Dropping num__minerals because of low correlation -0.037717742004088416 with quality

Highest VIF Value, Feature: num__residual sugar with a value of 9.969598621051034
Dropping num__residual sugar because of low correlation -0.08364822160050354 with quality

Every VIF is below the threshold of 5! 



In [19]:
best_model.score(X_test_clean, y_test_clean)

0.8644157603942098

In [20]:
df_validation = pd.DataFrame(cleaning_pipeline.fit_transform(df_validation))
X_validation = df_validation.drop('label__quality', axis=1)
y_validation = df_validation['label__quality']

best_model.score(X_validation, y_validation)











--------Colinearity--------

High Colinearity between num__residual sugar and num__density with a value of 0.83747432504066
Dropping num__residual sugar because of low correlation -0.03841622427373332 with quality

High Colinearity between num__minerals and num__calcium with a value of 0.8940828932837327
Dropping num__minerals because of low correlation 0.02073140012831619 with quality

High Colinearity between num__chlorides and num__alcohol with a value of 0.5384231056394151
Not Dropping num__chlorides because of high correlation -0.20146010910088724 with quality

High Colinearity between num__free sulfur dioxide and num__total sulfur dioxide with a value of 0.6284853559467134
Dropping num__total sulfur dioxide because of low correlation -0.02969884939978695 with quality



KeyError: 'num__total sulfur dioxide'