In [1]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 
from sklearn.model_selection import train_test_split, GridSearchCV, cross_validate
from sklearn.model_selection import cross_val_score 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, roc_auc_score
from sklearn import metrics
from sklearn import preprocessing
from sklearn.decomposition import PCA 
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_selection import RFE

In [2]:
df = pd.read_csv('winequality-white.csv', delimiter = ';')
df.head(1)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6


In [3]:
cut_bins = [1, 4, 6, 9]
df['quality'] = pd.cut(df['quality'], bins = cut_bins, labels = [0, 1, 2])
df['quality'].value_counts()

1    3655
2    1060
0     183
Name: quality, dtype: int64

In [4]:
df.head(1)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,1


In [5]:
# Indicate the target column
target = df['quality']

# Indicate the columns that will serve as features
features = df.drop('quality', axis = 1)

features.head(1)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8


In [6]:
# Split the dataset into training and test sets
x_train, x_test, y_train, y_test = train_test_split(features, target, test_size = 0.2, random_state = 42)

In [7]:
# Train and Evaluate model performance
def performance(estimators, x_train, x_test, y_train, y_test):

    def print_validation_performance(estimator, x_train, y_train, name = None, cv = 5):
        
        print(estimator.__class__)
        
        scores = cross_val_score(estimator, x_train, y_train, cv = cv, scoring = 'accuracy')

        print("Accuracy (validation){}".format(scores.mean()))
    
    def print_test_performance(estimator, x_train, x_test, y_train, y_test, name = None):
        
        estimator.fit(x_train, y_train)
        
        test_predict = estimator.predict(x_test)
        
        print("Accuracy (test):{}".format(metrics.accuracy_score(y_test, test_predict)))
               
        print(confusion_matrix(y_test, test_predict))
        
        print(classification_report(y_test, test_predict))            
    
    # Print validation and test performance for all classifiers in the list.
    for estimator in estimators:
        
        print_validation_performance(estimators[estimator], x_train, y_train, name = estimator)
        
        print_test_performance(estimators[estimator], x_train, x_test, y_train, y_test, name = estimator)
        
        print()
        

In [8]:
# Define 3 different estimators
estimators = {
    
    "KNN": KNeighborsClassifier(n_neighbors = 11, p = 1, weights = 'distance'),
    "Decision Tree": DecisionTreeClassifier(criterion = 'entropy', max_depth=7),
    "SVM": SVC(C = 1, gamma = 'auto', kernel = 'rbf')
}

In [9]:
#Scaling Data
def normalize_scaler(df):
    
    names = x_train.columns
    
    # Create the Scaler object   
    scaler = preprocessing.Normalizer()
    
    # Fit the data on the Scaler object    
    scaled_features = scaler.fit_transform(x_train)
    
    # After normalization, scaled_features is transformed into an array so we need to convert it
    scaled_features_df = pd.DataFrame(scaled_features, columns = names)
    
    return scaled_features_df

In [10]:
# Filter approach using Variance Threshold

# Compute the variance of each column
normalize_scaler(df).var().sort_values(ascending=False)

free sulfur dioxide     6.900638e-03
alcohol                 1.569927e-03
residual sugar          1.240375e-03
total sulfur dioxide    6.201797e-04
fixed acidity           5.774087e-04
pH                      1.049855e-04
density                 1.025616e-05
sulphates               2.764116e-06
citric acid             1.900910e-06
volatile acidity        1.787987e-06
chlorides               3.916266e-08
dtype: float64

In [11]:
#Case 1: All features
performance(estimators, x_train, x_test, y_train, y_test)

<class 'sklearn.neighbors._classification.KNeighborsClassifier'>
Accuracy (validation)0.8014279849870984
Accuracy (test):0.8030612244897959
[[  3  26   1]
 [  2 667  54]
 [  0 110 117]]
              precision    recall  f1-score   support

           0       0.60      0.10      0.17        30
           1       0.83      0.92      0.87       723
           2       0.68      0.52      0.59       227

    accuracy                           0.80       980
   macro avg       0.70      0.51      0.54       980
weighted avg       0.79      0.80      0.79       980


<class 'sklearn.tree._classes.DecisionTreeClassifier'>
Accuracy (validation)0.7748814085020983
Accuracy (test):0.7806122448979592
[[  7  23   0]
 [  6 650  67]
 [  0 119 108]]
              precision    recall  f1-score   support

           0       0.54      0.23      0.33        30
           1       0.82      0.90      0.86       723
           2       0.62      0.48      0.54       227

    accuracy                          

In [12]:
# Drop 4 columns with least variance

# Define columns with Least variance
col_drop = ['chlorides', 'volatile acidity', 'citric acid', 'sulphates']

# Use 7 features with the highest variances
# - Drop columns with least variances from x_train and x_test
x_train.drop(col_drop, axis = 1, inplace = True)
x_test.drop(col_drop, axis = 1, inplace = True)

In [13]:
#Case 2: 7 features

performance(estimators, x_train, x_test, y_train, y_test)

<class 'sklearn.neighbors._classification.KNeighborsClassifier'>
Accuracy (validation)0.7993855396564756
Accuracy (test):0.8020408163265306
[[  3  26   1]
 [  2 668  53]
 [  0 112 115]]
              precision    recall  f1-score   support

           0       0.60      0.10      0.17        30
           1       0.83      0.92      0.87       723
           2       0.68      0.51      0.58       227

    accuracy                           0.80       980
   macro avg       0.70      0.51      0.54       980
weighted avg       0.79      0.80      0.78       980


<class 'sklearn.tree._classes.DecisionTreeClassifier'>
Accuracy (validation)0.7595648604269293
Accuracy (test):0.7775510204081633
[[  1  29   0]
 [  4 671  48]
 [  2 135  90]]
              precision    recall  f1-score   support

           0       0.14      0.03      0.05        30
           1       0.80      0.93      0.86       723
           2       0.65      0.40      0.49       227

    accuracy                          

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [14]:
# Drop another 4 columns with least variance

# Define columns with Least variance
col_drop = ['density', 'pH', 'fixed acidity', 'total sulfur dioxide']

# Use 3 features with the highest variances
# - Drop columns with least variances from x_train and x_test
x_train.drop(col_drop, axis = 1, inplace = True)
x_test.drop(col_drop, axis = 1, inplace = True)

x_train.head(1)

Unnamed: 0,residual sugar,free sulfur dioxide,alcohol
4665,8.2,44.0,12.4


In [15]:
#Case 3: 3 features

performance(estimators, x_train, x_test, y_train, y_test)

<class 'sklearn.neighbors._classification.KNeighborsClassifier'>
Accuracy (validation)0.8011732087471003
Accuracy (test):0.8153061224489796
[[  5  25   0]
 [  2 675  46]
 [  0 108 119]]
              precision    recall  f1-score   support

           0       0.71      0.17      0.27        30
           1       0.84      0.93      0.88       723
           2       0.72      0.52      0.61       227

    accuracy                           0.82       980
   macro avg       0.76      0.54      0.59       980
weighted avg       0.81      0.82      0.80       980


<class 'sklearn.tree._classes.DecisionTreeClassifier'>
Accuracy (validation)0.7618666432090079
Accuracy (test):0.7663265306122449
[[  2  28   0]
 [  2 688  33]
 [  0 166  61]]
              precision    recall  f1-score   support

           0       0.50      0.07      0.12        30
           1       0.78      0.95      0.86       723
           2       0.65      0.27      0.38       227

    accuracy                          

In [16]:
# Filter approach using chi-square

# Create new copy of df
df_chi = df.copy()
df_chi.head(1)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,1


In [17]:
# Indicate the target column
target = df_chi['quality']

# Indicate the columns that will serve as features
features = df_chi.drop('quality', axis = 1)

features.head(1)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8


In [18]:
# Split the dataset into training and test sets
x_train, x_test, y_train, y_test = train_test_split(features, target, test_size = 0.2, random_state = 42)

In [19]:
#Scaling Data
normalize_scaler(df_chi)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,0.060535,0.001410,0.002985,0.067998,0.000232,0.364866,0.920458,0.008232,0.026038,0.003400,0.102826
1,0.031264,0.001241,0.002184,0.057566,0.000203,0.238205,0.967708,0.004947,0.015781,0.002581,0.047145
2,0.056364,0.003221,0.003321,0.074481,0.000372,0.251624,0.956171,0.009991,0.032711,0.004932,0.111721
3,0.064430,0.001774,0.003268,0.015874,0.000336,0.308144,0.943107,0.009274,0.029974,0.005042,0.100847
4,0.042970,0.001674,0.001451,0.105752,0.000296,0.200900,0.971019,0.005579,0.017858,0.002790,0.058038
...,...,...,...,...,...,...,...,...,...,...,...
3913,0.048857,0.001655,0.004098,0.051221,0.000370,0.220646,0.969267,0.007834,0.025374,0.003861,0.078014
3914,0.046130,0.000923,0.002109,0.059309,0.000257,0.355856,0.929181,0.006561,0.021220,0.002834,0.061945
3915,0.048959,0.001739,0.003350,0.020614,0.000277,0.180376,0.979185,0.006386,0.019455,0.003414,0.073439
3916,0.043354,0.001652,0.001996,0.094279,0.000241,0.364728,0.922143,0.006852,0.021815,0.002615,0.072946


In [20]:
# Create a selector
# Setting k = 7 means we want the top 7 features 
selector = SelectKBest(chi2, k = 7)

# Select top 7 features based on the training set
x_new = selector.fit_transform(x_train, y_train) 
selector.get_support(indices=True)

array([ 0,  1,  3,  4,  5,  6, 10])

In [21]:
# Drop 4 columns: array([ 2, 7, 8, 9])
col_drop = ['citric acid', 'density', 'pH', 'sulphates']

# Use 7 features with the top 7 features 
x_train.drop(col_drop, axis = 1, inplace = True)
x_test.drop(col_drop, axis = 1, inplace = True)

x_train.head(1)

Unnamed: 0,fixed acidity,volatile acidity,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,alcohol
4665,7.3,0.17,8.2,0.028,44.0,111.0,12.4


In [22]:
#Case 1: 7 features

performance(estimators, x_train, x_test, y_train, y_test)

<class 'sklearn.neighbors._classification.KNeighborsClassifier'>
Accuracy (validation)0.7996409674981104
Accuracy (test):0.7979591836734694
[[  3  26   1]
 [  2 666  55]
 [  0 114 113]]
              precision    recall  f1-score   support

           0       0.60      0.10      0.17        30
           1       0.83      0.92      0.87       723
           2       0.67      0.50      0.57       227

    accuracy                           0.80       980
   macro avg       0.70      0.51      0.54       980
weighted avg       0.78      0.80      0.78       980


<class 'sklearn.tree._classes.DecisionTreeClassifier'>
Accuracy (validation)0.7605904814032893
Accuracy (test):0.7683673469387755
[[  3  27   0]
 [  8 665  50]
 [  0 142  85]]
              precision    recall  f1-score   support

           0       0.27      0.10      0.15        30
           1       0.80      0.92      0.85       723
           2       0.63      0.37      0.47       227

    accuracy                          

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [23]:
# Create a selector
# Setting k = 3 means we want the top 3 features 
selector = SelectKBest(chi2, k = 3)

# Select top 3 features based on the training set
x_new = selector.fit_transform(x_train, y_train) 
selector.get_support(indices = True)

array([2, 4, 5])

In [24]:
# Drop another 4 columns: array([0, 1, 3, 6])
col_drop = ['fixed acidity', 'volatile acidity', 'chlorides', 'alcohol']

# Use 3 features with the top 3 features 
x_train.drop(col_drop, axis = 1, inplace = True)
x_test.drop(col_drop, axis = 1, inplace = True)

x_train.head(1)

Unnamed: 0,residual sugar,free sulfur dioxide,total sulfur dioxide
4665,8.2,44.0,111.0


In [25]:
#Case 2: 3 features

performance(estimators, x_train, x_test, y_train, y_test)

<class 'sklearn.neighbors._classification.KNeighborsClassifier'>
Accuracy (validation)0.7866258764042016
Accuracy (test):0.7908163265306123
[[  2  26   2]
 [  2 666  55]
 [  0 120 107]]
              precision    recall  f1-score   support

           0       0.50      0.07      0.12        30
           1       0.82      0.92      0.87       723
           2       0.65      0.47      0.55       227

    accuracy                           0.79       980
   macro avg       0.66      0.49      0.51       980
weighted avg       0.77      0.79      0.77       980


<class 'sklearn.tree._classes.DecisionTreeClassifier'>
Accuracy (validation)0.7409375244350614
Accuracy (test):0.7479591836734694
[[  5  25   0]
 [  1 718   4]
 [  2 215  10]]
              precision    recall  f1-score   support

           0       0.62      0.17      0.26        30
           1       0.75      0.99      0.85       723
           2       0.71      0.04      0.08       227

    accuracy                          

In [26]:
# Filter approach using Information Gain

# Create new copy of df
df_inf = df.copy()

df_inf.head(1)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,1


In [27]:
# Indicate the target column
target = df_inf['quality']

# Indicate the columns that will serve as features
features = df_inf.drop('quality', axis = 1)

features.head(1)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8


In [28]:
# Split the dataset into training and test sets
x_train, x_test, y_train, y_test = train_test_split(features, target, test_size = 0.2, random_state = 42)

In [29]:
#Scaling Data
normalize_scaler(df_inf)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,0.060535,0.001410,0.002985,0.067998,0.000232,0.364866,0.920458,0.008232,0.026038,0.003400,0.102826
1,0.031264,0.001241,0.002184,0.057566,0.000203,0.238205,0.967708,0.004947,0.015781,0.002581,0.047145
2,0.056364,0.003221,0.003321,0.074481,0.000372,0.251624,0.956171,0.009991,0.032711,0.004932,0.111721
3,0.064430,0.001774,0.003268,0.015874,0.000336,0.308144,0.943107,0.009274,0.029974,0.005042,0.100847
4,0.042970,0.001674,0.001451,0.105752,0.000296,0.200900,0.971019,0.005579,0.017858,0.002790,0.058038
...,...,...,...,...,...,...,...,...,...,...,...
3913,0.048857,0.001655,0.004098,0.051221,0.000370,0.220646,0.969267,0.007834,0.025374,0.003861,0.078014
3914,0.046130,0.000923,0.002109,0.059309,0.000257,0.355856,0.929181,0.006561,0.021220,0.002834,0.061945
3915,0.048959,0.001739,0.003350,0.020614,0.000277,0.180376,0.979185,0.006386,0.019455,0.003414,0.073439
3916,0.043354,0.001652,0.001996,0.094279,0.000241,0.364728,0.922143,0.006852,0.021815,0.002615,0.072946


In [30]:
# Create a selector
# Setting k = 7 means we want the top 7 features 
selector = SelectKBest(mutual_info_classif, k = 7)

# Select top 7 features based on the training set
x_new = selector.fit_transform(x_train, y_train) 
selector.get_support(indices = True)

array([ 2,  3,  4,  5,  6,  7, 10])

In [31]:
# Drop 4 bottom columns: array([ 0, 2, 8, 9])
col_drop = ['fixed acidity', 'citric acid', 'pH', 'sulphates']

# Use 7 features with the top 7 features 
x_train.drop(col_drop, axis = 1, inplace = True)
x_test.drop(col_drop, axis = 1, inplace = True)

x_train.head(1)

Unnamed: 0,volatile acidity,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,alcohol
4665,0.17,8.2,0.028,44.0,111.0,0.99272,12.4


In [32]:
#Case 1: 7 features

performance(estimators, x_train, x_test, y_train, y_test)

<class 'sklearn.neighbors._classification.KNeighborsClassifier'>
Accuracy (validation)0.8004088800271066
Accuracy (test):0.7989795918367347
[[  2  26   2]
 [  2 668  53]
 [  0 114 113]]
              precision    recall  f1-score   support

           0       0.50      0.07      0.12        30
           1       0.83      0.92      0.87       723
           2       0.67      0.50      0.57       227

    accuracy                           0.80       980
   macro avg       0.67      0.50      0.52       980
weighted avg       0.78      0.80      0.78       980


<class 'sklearn.tree._classes.DecisionTreeClassifier'>
Accuracy (validation)0.7567603669820419
Accuracy (test):0.7816326530612245
[[  5  25   0]
 [  8 675  40]
 [  0 141  86]]
              precision    recall  f1-score   support

           0       0.38      0.17      0.23        30
           1       0.80      0.93      0.86       723
           2       0.68      0.38      0.49       227

    accuracy                          

In [33]:
# Create a selector
# Setting k = 3 means we want the top 3 features 
selector = SelectKBest(chi2, k = 3)

# Select top 3 features based on the training set
x_new = selector.fit_transform(x_train, y_train) 
selector.get_support(indices = True)

array([1, 3, 4])

In [34]:
# Drop another 4 bottom columns: array([0, 2, 5, 6])
col_drop = ['volatile acidity', 'chlorides', 'density', 'alcohol']

# Use 3 features with the top 3 features 
x_train.drop(col_drop, axis = 1, inplace = True)
x_test.drop(col_drop, axis = 1, inplace = True)

x_train.head()

Unnamed: 0,residual sugar,free sulfur dioxide,total sulfur dioxide
4665,8.2,44.0,111.0
1943,11.6,48.0,195.0
3399,7.4,25.0,95.0
843,1.7,33.0,101.0
2580,18.95,36.0,174.0


In [35]:
#Case 2: 3 features

performance(estimators, x_train, x_test, y_train, y_test)

<class 'sklearn.neighbors._classification.KNeighborsClassifier'>
Accuracy (validation)0.7866258764042016
Accuracy (test):0.7908163265306123
[[  2  26   2]
 [  2 666  55]
 [  0 120 107]]
              precision    recall  f1-score   support

           0       0.50      0.07      0.12        30
           1       0.82      0.92      0.87       723
           2       0.65      0.47      0.55       227

    accuracy                           0.79       980
   macro avg       0.66      0.49      0.51       980
weighted avg       0.77      0.79      0.77       980


<class 'sklearn.tree._classes.DecisionTreeClassifier'>
Accuracy (validation)0.7409375244350614
Accuracy (test):0.7459183673469387
[[  4  26   0]
 [  2 717   4]
 [  2 215  10]]
              precision    recall  f1-score   support

           0       0.50      0.13      0.21        30
           1       0.75      0.99      0.85       723
           2       0.71      0.04      0.08       227

    accuracy                          

In [36]:
# Wapper approach using Recursive Feature Elimination

# Create new copy of df
df_rfe = df.copy()

df_rfe.head(1)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,1


In [37]:
# Indicate the target column
target = df_rfe['quality']

# Indicate the columns that will serve as features
features = df_rfe.drop('quality', axis = 1)

features.head(1)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8


In [38]:
# Split the dataset into training and test sets
x_train, x_test, y_train, y_test = train_test_split(features, target, test_size = 0.2, random_state = 42)

In [39]:
#Scaling Data
normalize_scaler(df_rfe)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,0.060535,0.001410,0.002985,0.067998,0.000232,0.364866,0.920458,0.008232,0.026038,0.003400,0.102826
1,0.031264,0.001241,0.002184,0.057566,0.000203,0.238205,0.967708,0.004947,0.015781,0.002581,0.047145
2,0.056364,0.003221,0.003321,0.074481,0.000372,0.251624,0.956171,0.009991,0.032711,0.004932,0.111721
3,0.064430,0.001774,0.003268,0.015874,0.000336,0.308144,0.943107,0.009274,0.029974,0.005042,0.100847
4,0.042970,0.001674,0.001451,0.105752,0.000296,0.200900,0.971019,0.005579,0.017858,0.002790,0.058038
...,...,...,...,...,...,...,...,...,...,...,...
3913,0.048857,0.001655,0.004098,0.051221,0.000370,0.220646,0.969267,0.007834,0.025374,0.003861,0.078014
3914,0.046130,0.000923,0.002109,0.059309,0.000257,0.355856,0.929181,0.006561,0.021220,0.002834,0.061945
3915,0.048959,0.001739,0.003350,0.020614,0.000277,0.180376,0.979185,0.006386,0.019455,0.003414,0.073439
3916,0.043354,0.001652,0.001996,0.094279,0.000241,0.364728,0.922143,0.006852,0.021815,0.002615,0.072946


In [40]:
# Create a SVM classifier with linear kernel
svmlinear = SVC(kernel = 'linear')

# Use RFE to rank features and return top 7 features
# Parameter step corresponds to the (integer) number of features to remove at each iteratio 
rfe = RFE(estimator = svmlinear, n_features_to_select = 7, step = 1)
rfe.fit(x_train, y_train)

print("Number of Features: ", rfe.n_features_)
print("Feature Ranking: ", rfe.ranking_)
print("Selected Features: ", rfe.support_)

Number of Features:  7
Feature Ranking:  [1 1 1 2 1 4 5 3 1 1 1]
Selected Features:  [ True  True  True False  True False False False  True  True  True]


In [41]:
# Drop 4 bottom columns: array([ 3, 5, 6, 7])
col_drop = ['residual sugar', 'free sulfur dioxide', 'total sulfur dioxide', 'density']

# Use 7 features with the top 7 features 
x_train.drop(col_drop, axis = 1, inplace = True)
x_test.drop(col_drop, axis = 1, inplace = True)

x_train.head(1)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,chlorides,pH,sulphates,alcohol
4665,7.3,0.17,0.36,0.028,3.14,0.41,12.4


In [42]:
#Case 1: 7 features

performance(estimators, x_train, x_test, y_train, y_test)

<class 'sklearn.neighbors._classification.KNeighborsClassifier'>
Accuracy (validation)0.8134216905152865
Accuracy (test):0.8408163265306122
[[  3  27   0]
 [  1 667  55]
 [  0  73 154]]
              precision    recall  f1-score   support

           0       0.75      0.10      0.18        30
           1       0.87      0.92      0.90       723
           2       0.74      0.68      0.71       227

    accuracy                           0.84       980
   macro avg       0.79      0.57      0.59       980
weighted avg       0.84      0.84      0.83       980


<class 'sklearn.tree._classes.DecisionTreeClassifier'>
Accuracy (validation)0.7656967576302551
Accuracy (test):0.7816326530612245
[[  5  24   1]
 [ 10 666  47]
 [  0 132  95]]
              precision    recall  f1-score   support

           0       0.33      0.17      0.22        30
           1       0.81      0.92      0.86       723
           2       0.66      0.42      0.51       227

    accuracy                          

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [43]:
# Use RFE to rank features and return top 3 features
# Parameter step corresponds to the (integer) number of features to remove at each iteratio 
rfe = RFE(estimator = svmlinear, n_features_to_select = 3, step = 1)
rfe.fit(x_train, y_train)

print("Number of Features: ", rfe.n_features_)
print("Feature Ranking: ", rfe.ranking_)
print("Selected Features: ", rfe.support_)

Number of Features:  3
Feature Ranking:  [5 1 4 1 3 1 2]
Selected Features:  [False  True False  True False  True False]


In [44]:
# Drop another 4 bottom columns: array([ 0, 2, 4, 6])

col_drop = ['fixed acidity', 'citric acid', 'pH', 'alcohol']

# Use 7 features with the top 3 features 
x_train.drop(col_drop, axis = 1, inplace = True)
x_test.drop(col_drop, axis = 1, inplace = True)

x_train.head(1)

Unnamed: 0,volatile acidity,chlorides,sulphates
4665,0.17,0.028,0.41


In [45]:
#Case 2: 3 features

performance(estimators, x_train, x_test, y_train, y_test)

<class 'sklearn.neighbors._classification.KNeighborsClassifier'>
Accuracy (validation)0.7861137175176585
Accuracy (test):0.786734693877551
[[  2  26   2]
 [  6 656  61]
 [  1 113 113]]
              precision    recall  f1-score   support

           0       0.22      0.07      0.10        30
           1       0.83      0.91      0.86       723
           2       0.64      0.50      0.56       227

    accuracy                           0.79       980
   macro avg       0.56      0.49      0.51       980
weighted avg       0.76      0.79      0.77       980


<class 'sklearn.tree._classes.DecisionTreeClassifier'>
Accuracy (validation)0.7340491177313837
Accuracy (test):0.736734693877551
[[  3  27   0]
 [  5 676  42]
 [  3 181  43]]
              precision    recall  f1-score   support

           0       0.27      0.10      0.15        30
           1       0.76      0.93      0.84       723
           2       0.51      0.19      0.28       227

    accuracy                           0

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [46]:
# Principal Component Analysis

# Create new copy of df
df_pca = df.copy()

df_pca.head(1)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,1


In [47]:
# Indicate the target column
target = df_pca['quality']

# Indicate the columns that will serve as features
features = df_pca.drop('quality', axis = 1)

features.head(1)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8


In [48]:
# Split the dataset into training and test sets
x_train, x_test, y_train, y_test = train_test_split(features, target, test_size = 0.2, random_state = 42)

In [49]:
#Scaling Data
normalize_scaler(df_pca)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,0.060535,0.001410,0.002985,0.067998,0.000232,0.364866,0.920458,0.008232,0.026038,0.003400,0.102826
1,0.031264,0.001241,0.002184,0.057566,0.000203,0.238205,0.967708,0.004947,0.015781,0.002581,0.047145
2,0.056364,0.003221,0.003321,0.074481,0.000372,0.251624,0.956171,0.009991,0.032711,0.004932,0.111721
3,0.064430,0.001774,0.003268,0.015874,0.000336,0.308144,0.943107,0.009274,0.029974,0.005042,0.100847
4,0.042970,0.001674,0.001451,0.105752,0.000296,0.200900,0.971019,0.005579,0.017858,0.002790,0.058038
...,...,...,...,...,...,...,...,...,...,...,...
3913,0.048857,0.001655,0.004098,0.051221,0.000370,0.220646,0.969267,0.007834,0.025374,0.003861,0.078014
3914,0.046130,0.000923,0.002109,0.059309,0.000257,0.355856,0.929181,0.006561,0.021220,0.002834,0.061945
3915,0.048959,0.001739,0.003350,0.020614,0.000277,0.180376,0.979185,0.006386,0.019455,0.003414,0.073439
3916,0.043354,0.001652,0.001996,0.094279,0.000241,0.364728,0.922143,0.006852,0.021815,0.002615,0.072946


In [50]:
# Specify the number of components = 7
pca = PCA(n_components = 7)

# Generate the principal components
pca.fit(x_train)

# Transform the training set into principal components
train_pca = pca.transform(x_train)

# Transform the test set into principal components
test_pca = pca.transform(x_test)

# Convert train set into a data frame to make it easier to view
col_pca = ['principal component 1', 'principal component 2', 'principal component 3', 'principal component 4',\
          'principal component 5', 'principal component 6', 'principal component 7']

principalDf = pd.DataFrame(data = train_pca, columns = col_pca)

principalDf.head()

Unnamed: 0,principal component 1,principal component 2,principal component 3,principal component 4,principal component 5,principal component 6,principal component 7
0,-23.679791,15.651154,2.504163,1.598024,0.828449,-0.007527,0.008663
1,58.58594,-2.252917,2.322404,0.045068,-0.695104,-0.038144,0.125526
2,-44.054223,1.456679,3.062976,0.448083,-1.170754,-0.015628,0.047601
3,-36.47091,7.503535,-3.072416,-0.495367,0.122794,0.042162,0.061736
4,35.575768,-8.255546,10.858227,1.184435,0.748087,0.127777,-0.107691


In [51]:
for num in range(1, principalDf.shape[1]+1):
    print("Number of principal components: {}".format(num))
    print("**********************************************")
    
    df_pca_train = pd.DataFrame(data = train_pca[:,:num], columns = col_pca[:num])
    df_pca_test = pd.DataFrame(data = test_pca[:,:num], columns = col_pca[:num])
  
    performance(estimators, df_pca_train, df_pca_test, y_train, y_test)

Number of principal components: 1
**********************************************
<class 'sklearn.neighbors._classification.KNeighborsClassifier'>
Accuracy (validation)0.7628912867829124
Accuracy (test):0.7765306122448979
[[  2  26   2]
 [  9 656  58]
 [  2 122 103]]
              precision    recall  f1-score   support

           0       0.15      0.07      0.09        30
           1       0.82      0.91      0.86       723
           2       0.63      0.45      0.53       227

    accuracy                           0.78       980
   macro avg       0.53      0.48      0.49       980
weighted avg       0.75      0.78      0.76       980


<class 'sklearn.tree._classes.DecisionTreeClassifier'>
Accuracy (validation)0.7409368728334245
Accuracy (test):0.7408163265306122
[[  0  30   0]
 [  5 717   1]
 [  2 216   9]]
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        30
           1       0.74      0.99      0.85       723
           2

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy (validation)0.7636520316939036
Accuracy (test):0.7428571428571429
[[  0  28   2]
 [  3 675  45]
 [  0 174  53]]
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        30
           1       0.77      0.93      0.84       723
           2       0.53      0.23      0.32       227

    accuracy                           0.74       980
   macro avg       0.43      0.39      0.39       980
weighted avg       0.69      0.74      0.70       980


Number of principal components: 3
**********************************************
<class 'sklearn.neighbors._classification.KNeighborsClassifier'>
Accuracy (validation)0.780242460969062
Accuracy (test):0.7887755102040817
[[  4  24   2]
 [  2 661  60]
 [  0 119 108]]
              precision    recall  f1-score   support

           0       0.67      0.13      0.22        30
           1       0.82      0.91      0.87       723
           2       0.64      0.48      0.54       227

    accuracy 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy (validation)0.7871337998801053
Accuracy (test):0.7979591836734694
[[  0  30   0]
 [  0 697  26]
 [  0 142  85]]
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        30
           1       0.80      0.96      0.88       723
           2       0.77      0.37      0.50       227

    accuracy                           0.80       980
   macro avg       0.52      0.45      0.46       980
weighted avg       0.77      0.80      0.76       980




  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
