In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.preprocessing import StandardScaler,PolynomialFeatures
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

import warnings
warnings.filterwarnings("ignore")
np.random.seed(42)

In [12]:
# Loading data in df and removing target variable quality and loading in y

df = pd.read_csv("http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv", sep = ';')
y = df.pop('quality')

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4898 entries, 0 to 4897
Data columns (total 11 columns):
fixed acidity           4898 non-null float64
volatile acidity        4898 non-null float64
citric acid             4898 non-null float64
residual sugar          4898 non-null float64
chlorides               4898 non-null float64
free sulfur dioxide     4898 non-null float64
total sulfur dioxide    4898 non-null float64
density                 4898 non-null float64
pH                      4898 non-null float64
sulphates               4898 non-null float64
alcohol                 4898 non-null float64
dtypes: float64(11)
memory usage: 421.0 KB


In [3]:
df.isnull().sum()

fixed acidity           0
volatile acidity        0
citric acid             0
residual sugar          0
chlorides               0
free sulfur dioxide     0
total sulfur dioxide    0
density                 0
pH                      0
sulphates               0
alcohol                 0
dtype: int64

In [6]:
df.columns

Index(['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
       'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
       'pH', 'sulphates', 'alcohol'],
      dtype='object')

In [5]:
#for i in df.columns:
#    df[i] = df[i].fillna(np.mean(df[i]))


### Splitting into Train and Test

In [13]:
train, test, y_train, y_test = train_test_split(df, y, test_size = 0.2, random_state=100)

### Logistic regression on the unscaled train data  

In [15]:
lr = LogisticRegression()
lr.fit(train, y_train)
y_pred = lr.predict(test)
print('Accuracy score of Logistic regression:', accuracy_score(y_test, y_pred))

Accuracy score of Logistic regression: 0.5255102040816326


In [21]:
from sklearn.metrics import confusion_matrix,classification_report

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[  0   0   3   3   0   0   0]
 [  0   0  16  10   0   0   0]
 [  0   0 151 129   0   0   0]
 [  0   0  91 355   6   0   0]
 [  0   0   6 163   9   0   0]
 [  0   0   2  28   6   0   0]
 [  0   0   0   2   0   0   0]]
             precision    recall  f1-score   support

          3       0.00      0.00      0.00         6
          4       0.00      0.00      0.00        26
          5       0.56      0.54      0.55       280
          6       0.51      0.79      0.62       452
          7       0.43      0.05      0.09       178
          8       0.00      0.00      0.00        36
          9       0.00      0.00      0.00         2

avg / total       0.48      0.53      0.46       980



## Function to parameterize the decision tree hyper-parameters 

### Run in loop for different values to identify the best value for hyperparameters and use that value in the final model 

In [29]:
def fit_predict(train, test, y_train, y_test, scaler, max_depth, 
                criterion = 'entropy', max_features = 1, min_samples_split = 4):
    train_scaled = scaler.fit_transform(train)
    test_scaled = scaler.fit_transform(test)        
    dt = DecisionTreeClassifier(criterion = criterion, max_depth=max_depth, 
                                random_state=42, max_features=max_features,
                               min_samples_split=min_samples_split)
    dt.fit(train_scaled, y_train)
    y_pred = dt.predict(test_scaled)
    print(accuracy_score(y_test, y_pred))

### Max depth tuning

In [17]:
for i in range(1, 20):
    print('Accuracy score using max_depth =', i, end = ': ')
    fit_predict(train, test, y_train, y_test, StandardScaler(), i)

Accuracy score using max_depth = 1: 0.46122448979591835
Accuracy score using max_depth = 2: 0.46122448979591835
Accuracy score using max_depth = 3: 0.4846938775510204
Accuracy score using max_depth = 4: 0.4642857142857143
Accuracy score using max_depth = 5: 0.5010204081632653
Accuracy score using max_depth = 6: 0.45
Accuracy score using max_depth = 7: 0.47346938775510206
Accuracy score using max_depth = 8: 0.4816326530612245
Accuracy score using max_depth = 9: 0.5051020408163265
Accuracy score using max_depth = 10: 0.5081632653061224
Accuracy score using max_depth = 11: 0.5112244897959184
Accuracy score using max_depth = 12: 0.5418367346938775
Accuracy score using max_depth = 13: 0.5377551020408163
Accuracy score using max_depth = 14: 0.5448979591836735
Accuracy score using max_depth = 15: 0.5489795918367347
Accuracy score using max_depth = 16: 0.5173469387755102
Accuracy score using max_depth = 17: 0.5244897959183673
Accuracy score using max_depth = 18: 0.5255102040816326
Accuracy sco

### Max features tuning

In [30]:
for i in np.arange(0.1, 1.0, 0.1):
    print('Accuracy score using max features =', i, end = ': ')
    fit_predict(train, test, y_train, y_test, StandardScaler(), max_depth = 18, max_features=i)

Accuracy score using max features = 0.1: 0.5775510204081633
Accuracy score using max features = 0.2: 0.6081632653061224
Accuracy score using max features = 0.30000000000000004: 0.6081632653061224
Accuracy score using max features = 0.4: 0.5959183673469388
Accuracy score using max features = 0.5: 0.6061224489795919
Accuracy score using max features = 0.6: 0.5806122448979592
Accuracy score using max features = 0.7000000000000001: 0.6030612244897959
Accuracy score using max features = 0.8: 0.5724489795918367
Accuracy score using max features = 0.9: 0.576530612244898


### Min samples split tuning

In [31]:
for i in range(2, 20):
    print('Accuracy score using min samples split =', i, end = ': ')
    fit_predict(train, test, y_train, y_test, StandardScaler(), 18, max_features=0.2, min_samples_split=i)

Accuracy score using min samples split = 2: 0.5908163265306122
Accuracy score using min samples split = 3: 0.5530612244897959
Accuracy score using min samples split = 4: 0.563265306122449
Accuracy score using min samples split = 5: 0.5642857142857143
Accuracy score using min samples split = 6: 0.5428571428571428
Accuracy score using min samples split = 7: 0.5112244897959184
Accuracy score using min samples split = 8: 0.5377551020408163
Accuracy score using min samples split = 9: 0.5510204081632653
Accuracy score using min samples split = 10: 0.5540816326530612
Accuracy score using min samples split = 11: 0.5377551020408163
Accuracy score using min samples split = 12: 0.5418367346938775
Accuracy score using min samples split = 13: 0.5591836734693878
Accuracy score using min samples split = 14: 0.5265306122448979
Accuracy score using min samples split = 15: 0.5244897959183673
Accuracy score using min samples split = 16: 0.5153061224489796
Accuracy score using min samples split = 17: 0.52

### Criterion tuning

In [38]:
for i in ['gini', 'entropy']:
    print('Accuracy score using criterion =', i, end = ': ')
    fit_predict(train, test, y_train, y_test, StandardScaler(), 18, 
                max_features=0.2, min_samples_split=2, criterion = i)

Accuracy score using criterion = gini: 0.5785714285714286
Accuracy score using criterion = entropy: 0.5908163265306122


## Creating Polynomial features of the train/test data with various degrees and run the model

In [40]:
def create_poly(train,test,degree):
    poly = PolynomialFeatures(degree=degree)
    train_poly = poly.fit_transform(train)
    test_poly = poly.fit_transform(test)
    return train_poly,test_poly

In [41]:
for degree in [1,2,3,4]:
    train_poly, test_poly = create_poly(train, test, degree)
    print('Polynomial degree',degree)
    fit_predict(train_poly, test_poly, y_train, y_test, StandardScaler(), 18, 
                max_features=0.3, min_samples_split=2, criterion = 'entropy')
    print(10*'-')
    
train_poly, test_poly = create_poly(train, test, 2) 

Polynomial degree 1
0.6193877551020408
----------
Polynomial degree 2
0.5591836734693878
----------
Polynomial degree 3
0.5428571428571428
----------
Polynomial degree 4
0.5387755102040817
----------


## Creating new features and run the model without/with polynomial features

In [42]:
def feat_eng(df):
    df['eng1'] = df['fixed acidity'] * df['pH']
    df['eng2'] = df['total sulfur dioxide'] / df['free sulfur dioxide']
    df['eng3'] = df['sulphates'] / df['chlorides']
    df['eng4'] = df['chlorides'] / df['sulphates']
    return df

train = feat_eng(train)
test = feat_eng(test)

print('Additional feature engineering:')

fit_predict(train, test, y_train, y_test, StandardScaler(), 18, 
                max_features=0.3, min_samples_split=2, criterion = 'entropy')

train_poly, test_poly = create_poly(train, test, 2)

fit_predict(train_poly, test_poly, y_train, y_test, StandardScaler(), 18, 
                max_features=0.3, min_samples_split=2, criterion = 'entropy')


Additional feature engineering:
0.5816326530612245
0.5520408163265306


### Compare the new accuracy with the Logistic regression 

In [48]:
original_score = 0.5255102040816326
best_score = 0.6193877551020408
improvement = np.abs(np.round(100*(original_score - best_score)/original_score,2))
print('overall improvement is {} %'.format(improvement))

overall improvement is 17.86 %
