In [79]:
import sys
import os
from dotenv import load_dotenv

#1. load environment variables and data

# load environment variables
load_dotenv()

#add working directory to sys path to execute utils/dataset.py
working_dir = os.environ.get("WORKING_DIRECTORY")
sys.path.insert(0, working_dir)

In [80]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from utils.dataset import get_data 
df = get_data()
df.head(10)

Loading data from wines: 8000it [00:00, 15938.09it/s]


Unnamed: 0,wine type,fixed acidity,volatile acidity,citric acid,residual sugar,magnesium,flavanoids,minerals,calcium,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,Pinot noir,5.8,0.15,0.49,1.1,76.729301,894.94,186.639301,109.91,0.048,21.0,98.0,0.9929,3.19,0.48,9.2,5
1,Merlot,6.6,0.25,0.32,5.6,4.795712,1160.95,251.875712,247.08,0.039,15.0,68.0,0.99163,2.96,0.52,11.1,6
2,Chardonnay,6.7,0.21,0.34,1.5,85.19371,789.82,304.70371,219.51,0.035,45.0,123.0,0.98949,3.24,0.36,12.6,7
3,Merlot,8.3,0.28,0.27,17.5,11.976525,777.86,237.586525,225.61,0.045,48.0,253.0,1.00014,3.02,0.56,9.1,6
4,Merlot,7.5,0.42,0.19,6.9,5.599673,785.72,95.399673,89.8,0.041,62.0,150.0,0.99508,3.23,0.37,10.0,6
5,Merlot,7.3,0.34,0.3,1.3,22.403749,1044.95,289.523749,267.12,0.057,25.0,173.0,0.9948,3.26,0.51,9.1,6
6,Merlot,7.6,0.21,0.49,2.5,23.875866,888.61,133.545866,109.67,0.047,20.0,130.0,0.99178,3.15,0.48,11.1,5
7,Chardonnay,6.0,0.25,0.4,5.7,23.309699,1381.79,266.529699,243.22,0.052,56.0,152.0,0.99398,3.16,0.88,10.5,6
8,Cabernet Sauvignon,6.7,0.18,0.19,4.7,49.165745,1456.41,269.915745,220.75,0.046,57.0,161.0,0.9946,3.32,0.66,10.5,6
9,Gamay,7.7,0.28,0.39,8.9,54.450579,929.44,377.690579,323.24,0.036,8.0,117.0,0.9935,3.06,0.38,12.0,2


In [81]:
#import pipeline from scikit
from sklearn.pipeline import Pipeline
pipeline = Pipeline(steps=[])

categorical_features = df.select_dtypes(include=['object']).columns
numerical_features = df.select_dtypes(include=[np.number]).columns

In [82]:
from sklearn.preprocessing import FunctionTransformer
from statsmodels.stats.outliers_influence import variance_inflation_factor

def feature_selection(df,threshold=10.0):

    # exclude non numeric columns
    df = df.select_dtypes(exclude=['object'])
    
    # Drop rows with any non-finite values
    df = df.dropna()
    # Calculate the correlation between columns
    corr_matrix = df.corr().abs()
    
    # Create a mask to select the upper triangle of the correlation matrix
    mask = np.triu(np.ones(corr_matrix.shape), k=1).astype(bool)
    
    # Apply the mask to get the upper triangle of the correlation matrix
    upper_triangle = corr_matrix.where(mask)
    
    # Find the columns with colinearity greater than 0.7
    colinear_columns = upper_triangle[upper_triangle > 0.7].stack().index
    
    # Filter the colinear_columns based on their correlation to quality and select the one with lower correlation
    selected_columns_colinearity = []
    for col1, col2 in colinear_columns:
        corr_col1 = df[col1].corr(df['quality'])
        corr_col2 = df[col2].corr(df['quality'])
        colinearity = corr_matrix.loc[col1, col2]
        
        if colinearity > 0.7:
            if corr_col1 < corr_col2:
                if col1 not in selected_columns_colinearity:
                    selected_columns_colinearity.append(col1)
            else:
                if col2 not in selected_columns_colinearity:
                    selected_columns_colinearity.append(col2)
    

    # Calculate the VIF for each feature expect the target variable
    vif = pd.DataFrame()
    vif["Feature"] = df.drop('quality', axis=1).columns
    vif["VIF"] = [variance_inflation_factor(df.drop('quality', axis=1).values, i) for i in range(len(df.drop('quality', axis=1).columns))]

 
    
    # Select the columns with VIF greater than the threshold
    selected_columns_vif = vif[vif['VIF'] > threshold]['Feature']
    
    # Filter the selected columns based on their correlation to quality and select the one with lower correlation
    selected_features_vif = []
    for col in selected_columns_vif:
        corr_col = df[col].corr(df['quality'])
        if corr_col < 0:
            selected_features_vif.append(col)

    all_features_to_drop = list(selected_columns_colinearity) + list(selected_features_vif)
    # remove duplicates in the list
    all_features_to_drop = list(dict.fromkeys(all_features_to_drop))

    # drop the columns in df that are  in all_features
    #df = df.drop(all_features_to_drop, axis=1)

    print(all_features_to_drop)
    return df

feature_selection(df, 10)


feature_selection = FunctionTransformer(feature_selection)

pipeline.steps.append(('outlier_detection', feature_selection))


['density', 'minerals', 'magnesium', 'total sulfur dioxide', 'pH']


  vif = 1. / (1. - r_squared_i)
  vif = 1. / (1. - r_squared_i)
  vif = 1. / (1. - r_squared_i)


In [150]:
from sklearn.preprocessing import FunctionTransformer
from statsmodels.stats.outliers_influence import variance_inflation_factor
import pandas as pd
import numpy as np
import statsmodels.api as sm
import json

def feature_selection(df, colinearity_threshold=0.5,correlation_threshold=0.1, vif_threshold=10.0):
    dropped_features = []  # List to store dropped features
    #-------------------------------------------------Cleaning-------------------------------------------------

    # exclude non numeric columns
    df = df.select_dtypes(exclude=['object'])
    
    # Drop rows with any non-finite values
    df = df.dropna()
    
    for column in df.columns:
        mean = df[column].mean()
        std = df[column].std()
        df = df[(df[column] >= (mean - 3 * std)) & (df[column] <= (mean + 3 * std))]

    #-------------------------------------------------VIF-------------------------------------------------
    columns_to_exclude = ['quality']
    # Select the columns excluding the quality label
    columns = [col for col in df.select_dtypes(exclude='object').columns if col not in columns_to_exclude]

    # Create a new dataframe with only the selected columns
    df_selected = df[columns]

    # Add a constant column to the dataframe (required for VIF calculation)
    df_selected = sm.add_constant(df_selected)

    print("--------VIF--------"+ "\n")
    vifToRemove = 0
    while True:
        # Calculate VIF values for remaining features
        vif = pd.DataFrame()
        vif["Variable"] = df_selected.columns
        vif["VIF"] = [variance_inflation_factor(df_selected.values, i) if np.var(df_selected.iloc[:, i]) != 0 else 0 for i in range(df_selected.shape[1])]

        # Exclude the constant column from the results
        vif = vif[1:]
        # Order the VIF values in ascending order
        vif.sort_values('VIF', ascending=False, inplace=True)
        # Check if all VIF values are below the threshold
        if all(vif["VIF"] < vif_threshold):
            print("Every VIF is below the threshold of "+str(vif_threshold)+"! \n")

            break
        # Find the feature with the highest VIF value
        highest_vif_feature = vif.iloc[vifToRemove]["Variable"]
        
        # Check correlation of the highest VIF feature with quality label
        correlation = df[highest_vif_feature].corr(df["quality"])
        print("Highest VIF Value, Feature: " + str(highest_vif_feature) + " with a value of " + str(vif.iloc[vifToRemove]["VIF"]))
       

        if abs(correlation) < correlation_threshold:
            # Remove the feature if correlation is below the threshold
            print("Dropping " + highest_vif_feature + " because of low correlation "+str(correlation)+" with quality"+ "\n")
            dropped_features.append(highest_vif_feature)
            df = df.drop(highest_vif_feature, axis=1)
            df_selected = df_selected.drop(highest_vif_feature, axis=1)  # Drop from df_selected as well
        else:
            print("Not Dropping " + highest_vif_feature + " because of high correlation with "+str(correlation)+"quality"+ "\n")
            # Move on to the next highest VIF feature
            vifToRemove = vifToRemove + 1
    print("--------VIF--------"+ "\n")
    
    #-------------------------------------------------Colinearity-------------------------------------------------
    # Calculate the correlation between columns
    corr_matrix = df.corr().abs()
    # Create a mask to select the upper triangle of the correlation matrix
    mask = np.triu(np.ones(corr_matrix.shape), k=1).astype(bool)

    # Apply the mask to get the upper triangle of the correlation matrix
    upper_triangle = corr_matrix.where(mask)
    
    # Find the columns with colinearity greater than correlation_threshold
    colinear_columns = upper_triangle[upper_triangle > colinearity_threshold].stack().index
     # Get the values of colinearity
    colinearity_values = upper_triangle[upper_triangle > colinearity_threshold].stack()

    # loop through the colinear_columns and select the one with lower correlation to quality and avoid duplicates in the list
    print("--------colinearity--------")
    for col1, col2 in colinear_columns:
        corr_col1 = df[col1].corr(df['quality'])
        corr_col2 = df[col2].corr(df['quality'])
        print("High Colinearity between " + col1 + " and "+col2 + " with a value of " + str(colinearity_values[col1, col2]))
        if abs(corr_col1) < abs(corr_col2):
                corr_col = df[col1].corr(df['quality'])
                if abs(corr_col) < correlation_threshold:
                    print("Dropping " + col1 + " because of low correlation "+str(corr_col1)+" with quality"+ "\n")
                    df = df.drop(col1, axis=1)
                    dropped_features.append(col1)
                else:
                    print("Not Dropping " + col1 + " because of high correlation with "+str(corr_col1)+" quality"+ "\n")

        else:
                corr_col = df[col2].corr(df['quality'])
                if abs(corr_col) < correlation_threshold:
                    print("Dropping " + col2 + " because of low correlation "+str(corr_col)+" with quality"+ "\n")
                    df = df.drop(col2, axis=1)
                    dropped_features.append(col2)
                else: 
                    print("Not Dropping " + col2 + " because of high correlation with "+str(corr_col)+" quality"+ "\n")
                     

    print("Every colinearity is below the threshold of "+str(colinearity_threshold)+"! \n")
    print("--------colinearity--------")
    
    # Save dropped features list to a JSON file
    with open('dropped_features.json', 'w') as f:
        json.dump(dropped_features, f)
   
    return df




def remove_dropped_features(df, json_file):
    with open(json_file, 'r') as f:
        dropped_features = json.load(f)

    # Remove the dropped features from the DataFrame
    df = df.drop(dropped_features, axis=1)
    print(df.columns)
    return df

feature_selection(df, 0.6, 0.1, 10.0)
remove_dropped_features(df, 'dropped_features.json')


feature_selection = FunctionTransformer(feature_selection)
pipeline.steps.append(('outlier_detection', feature_selection))

--------VIF--------

Highest VIF Value, Feature: magnesium with a value of inf
Dropping magnesium because of low correlation -0.03953627745933602 with quality

Highest VIF Value, Feature: density with a value of 46.46831295820154
Not Dropping density because of high correlation with -0.21508934604611996quality



  vif = 1. / (1. - r_squared_i)


Highest VIF Value, Feature: residual sugar with a value of 18.23742132540072
Dropping residual sugar because of low correlation -0.07596745567637178 with quality

Every VIF is below the threshold of 10.0! 

--------VIF--------

--------colinearity--------
High Colinearity between minerals and calcium with a value of 0.9229986876054563
Dropping calcium because of low correlation -0.021927721646049687 with quality

High Colinearity between free sulfur dioxide and total sulfur dioxide with a value of 0.6170895155741144
Dropping free sulfur dioxide because of low correlation 0.017850021913618824 with quality

High Colinearity between density and alcohol with a value of 0.8169900547685602
Not Dropping density because of high correlation with -0.21508934604611996 quality

Every colinearity is below the threshold of 0.6! 

--------colinearity--------
Index(['wine type', 'fixed acidity', 'volatile acidity', 'citric acid',
       'flavanoids', 'minerals', 'chlorides', 'total sulfur dioxide',
  