In [1]:
# Importing the libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [17]:
# Importing the dataset
dataset = pd.read_csv("data/winequality-red.csv")

# Extract independent variables - all eleven columns
X = dataset.iloc[:, :11] # it's useful not to drop column names - makes outputs clearer

# Introduce new binary variable "good" equal 1 for wines with quality greater than or equal 7
dataset = dataset.assign(good = dataset["quality"] >= 7)
dataset["good"] = dataset["good"].astype(int)
print(dataset.head())

# Extract the dependent "good" variable
y = dataset.iloc[:, 12]

y.head()

   fixed acidity  volatile acidity  citric acid  residual sugar  chlorides  \
0            7.4              0.70         0.00             1.9      0.076   
1            7.8              0.88         0.00             2.6      0.098   
2            7.8              0.76         0.04             2.3      0.092   
3           11.2              0.28         0.56             1.9      0.075   
4            7.4              0.70         0.00             1.9      0.076   

   free sulfur dioxide  total sulfur dioxide  density    pH  sulphates  \
0                 11.0                  34.0   0.9978  3.51       0.56   
1                 25.0                  67.0   0.9968  3.20       0.68   
2                 15.0                  54.0   0.9970  3.26       0.65   
3                 17.0                  60.0   0.9980  3.16       0.58   
4                 11.0                  34.0   0.9978  3.51       0.56   

   alcohol  quality  good  
0      9.4        5     0  
1      9.8        5     0  
2 

0    0
1    0
2    0
3    0
4    0
Name: good, dtype: int32

In [3]:
# Split the model into test and validation?
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=0)



In [4]:
# Scale the data to a standard distribution
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(X)
scaler.transform(X_train)
scaler.transform(X_test)

array([[ 1.4250439 , -0.32301294,  0.81659759, ..., -0.91431164,
         0.60105502,  0.35389538],
       [-0.1261883 ,  1.63225386, -1.39147228, ...,  0.3167512 ,
        -0.75624575, -0.77251161],
       [ 0.44834214, -1.32857872,  0.30309297, ..., -0.33117661,
         1.07315963,  1.19870062],
       ...,
       [ 0.44834214, -1.04925489,  0.76524713, ..., -0.84951886,
        -0.6382196 ,  0.91709887],
       [ 0.44834214,  1.32499765, -1.18607043, ..., -0.13679827,
        -0.69723268, -0.67864436],
       [-0.06873526, -1.16098443,  0.76524713, ..., -0.26638383,
        -0.6382196 ,  1.76190411]])

In [5]:
# Training the classifier
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(random_state = 0)
classifier.fit(X_train, y_train)

y_pred = classifier.predict(X_test)
print(y_pred)

[0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 1 0 0 1
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1]


In [6]:
# Confusion matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
print(cm)

[[281   9]
 [ 18  12]]


In [7]:
# Print more results
from sklearn.metrics import classification_report
target_names = ["0", "1"]
print(classification_report(y_test, y_pred, target_names=target_names))

             precision    recall  f1-score   support

          0       0.94      0.97      0.95       290
          1       0.57      0.40      0.47        30

avg / total       0.91      0.92      0.91       320



In [9]:
# Try new model
import statsmodels.discrete.discrete_model as sm
logit = sm.Logit(y_train, X_train)
result = logit.fit()
result.summary()

Optimization terminated successfully.
         Current function value: 0.282290
         Iterations 8


0,1,2,3
Dep. Variable:,good,No. Observations:,1279.0
Model:,Logit,Df Residuals:,1268.0
Method:,MLE,Df Model:,10.0
Date:,"Tue, 04 Dec 2018",Pseudo R-squ.:,0.3215
Time:,23:58:31,Log-Likelihood:,-361.05
converged:,True,LL-Null:,-532.16
,,LLR p-value:,1.7809999999999998e-67

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
fixed acidity,0.1180,0.090,1.307,0.191,-0.059,0.295
volatile acidity,-3.8557,0.843,-4.571,0.000,-5.509,-2.203
citric acid,0.1561,0.907,0.172,0.863,-1.622,1.934
residual sugar,0.1552,0.066,2.342,0.019,0.025,0.285
chlorides,-9.3221,3.816,-2.443,0.015,-16.801,-1.843
free sulfur dioxide,0.0114,0.013,0.873,0.383,-0.014,0.037
total sulfur dioxide,-0.0134,0.005,-2.654,0.008,-0.023,-0.003
density,-13.5905,3.736,-3.637,0.000,-20.913,-6.267
pH,0.0849,0.933,0.091,0.928,-1.745,1.914


In [24]:
#Fixed acidity, citric acid content, free sulfur dioxide, and pH have high p values and don't help with predictions.
#Let's try dropping these columns.

simple_dataset = dataset.copy()
simple_dataset.drop(columns=["fixed acidity", "citric acid", "free sulfur dioxide", "pH"], inplace=True)

X = simple_dataset.copy().drop(columns = ["quality", "good"])
y = simple_dataset.good.copy()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=0)

X_train.head()

Unnamed: 0,volatile acidity,residual sugar,chlorides,total sulfur dioxide,density,sulphates,alcohol
642,0.54,2.3,0.071,40.0,0.9991,0.62,9.4
679,0.26,3.3,0.06,49.0,0.9972,0.54,9.6
473,0.35,2.1,0.062,14.0,0.9971,0.79,10.6
390,0.85,1.4,0.045,88.0,0.9924,0.82,12.9
1096,0.725,5.5,0.117,17.0,0.99655,0.49,10.8


In [26]:
# Train statsmodels on new data
logit = sm.Logit(y_train, X_train)
result = logit.fit()
result.summary()

Optimization terminated successfully.
         Current function value: 0.284355
         Iterations 8


0,1,2,3
Dep. Variable:,good,No. Observations:,1279.0
Model:,Logit,Df Residuals:,1272.0
Method:,MLE,Df Model:,6.0
Date:,"Wed, 05 Dec 2018",Pseudo R-squ.:,0.3166
Time:,00:23:17,Log-Likelihood:,-363.69
converged:,True,LL-Null:,-532.16
,,LLR p-value:,9.818e-70

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
volatile acidity,-4.3982,0.667,-6.591,0.000,-5.706,-3.090
residual sugar,0.1743,0.063,2.778,0.005,0.051,0.297
chlorides,-8.0333,3.364,-2.388,0.017,-14.626,-1.441
total sulfur dioxide,-0.0126,0.003,-3.600,0.000,-0.019,-0.006
density,-11.6291,1.235,-9.413,0.000,-14.051,-9.208
sulphates,3.3506,0.539,6.214,0.000,2.294,4.407
alcohol,0.9467,0.092,10.340,0.000,0.767,1.126


In [28]:
# Train scikit model using new data

classifier = LogisticRegression(random_state = 0)
classifier.fit(X_train, y_train)

y_pred = classifier.predict(X_test)

# Print confusion matrix
cm = confusion_matrix(y_test, y_pred)
print(cm)

print(classification_report(y_test, y_pred))

[[283   7]
 [ 20  10]]
             precision    recall  f1-score   support

          0       0.93      0.98      0.95       290
          1       0.59      0.33      0.43        30

avg / total       0.90      0.92      0.90       320

