In [1]:
# Importing the libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report



In [2]:
# Importing the dataset
dataset = pd.read_csv("data/winequality-red.csv")
validation_indexes = pd.read_csv("data/validation indexes.csv")

# Introduce new binary variable "good" equal 1 for wines with quality greater than or equal 7
dataset = dataset.assign(good = dataset["quality"] >= 6)
dataset["good"] = dataset["good"].astype(int)
print(dataset.head())

dataset.drop(labels=validation_indexes["index"], inplace=True)

# Extract independent variables - all eleven columns
X = dataset.iloc[:, :11] # it's useful not to drop column names - makes outputs clearer

# Extract the dependent "good" variable
y = dataset.iloc[:, 12]

# Split the model into test and validation?
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=0)

# Scale the data to a standard distribution
scaler = StandardScaler()
scaler.fit_transform(X)
scaler.transform(X_train)
scaler.transform(X_test)

   fixed acidity  volatile acidity  citric acid  residual sugar  chlorides  \
0            7.4              0.70         0.00             1.9      0.076   
1            7.8              0.88         0.00             2.6      0.098   
2            7.8              0.76         0.04             2.3      0.092   
3           11.2              0.28         0.56             1.9      0.075   
4            7.4              0.70         0.00             1.9      0.076   

   free sulfur dioxide  total sulfur dioxide  density    pH  sulphates  \
0                 11.0                  34.0   0.9978  3.51       0.56   
1                 25.0                  67.0   0.9968  3.20       0.68   
2                 15.0                  54.0   0.9970  3.26       0.65   
3                 17.0                  60.0   0.9980  3.16       0.58   
4                 11.0                  34.0   0.9978  3.51       0.56   

   alcohol  quality  good  
0      9.4        5     0  
1      9.8        5     0  
2 

array([[-1.95892834e-01, -5.56834399e-01,  4.62334643e-01, ...,
         2.06482073e-01, -1.15961429e+00, -9.48173158e-01],
       [-1.40052778e+00, -1.33715344e+00, -1.03370076e-01, ...,
         1.40757934e-01, -6.89067171e-01, -9.40831096e-02],
       [-9.98982797e-01,  4.46432942e-01, -1.33763492e+00, ...,
         2.50682696e+00,  4.28482234e-01,  9.57146790e-02],
       ...,
       [-2.38021275e-02,  1.39396321e+00, -8.74785603e-01, ...,
         1.12662003e+00, -7.47885561e-01, -3.78779792e-01],
       [-1.95892834e-01,  1.67264858e+00, -5.14672920e-04, ...,
        -1.22138624e-01, -1.04197751e+00, -5.68577581e-01],
       [-1.05634637e+00,  5.36345983e-04, -1.08049641e+00, ...,
         4.69378632e-01,  1.01666613e+00, -9.40831096e-02]])

In [4]:
# The default classifier is actually the best!

classifier = LogisticRegression()
classifier.fit(X_train, y_train)

y_pred = classifier.predict(X_test)

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[ 87  31]
 [ 29 113]]
             precision    recall  f1-score   support

          0       0.75      0.74      0.74       118
          1       0.78      0.80      0.79       142

avg / total       0.77      0.77      0.77       260

