In [1]:
# Importing the libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [25]:
# Importing the dataset
dataset = pd.read_csv("data/winequality-red.csv")
validation_indexes = pd.read_csv("data/validation indexes.csv")

# Introduce new binary variable "good" equal 1 for wines with quality greater than or equal 7
dataset = dataset.assign(good = dataset["quality"] >= 6)
dataset["good"] = dataset["good"].astype(int)
print(dataset.head())

# Drop all rows that belong to the validation set
dataset.drop(labels=validation_indexes["index"], inplace=True)

# Extract independent variables - all eleven columns
X = dataset.iloc[:, :11] # it's useful not to drop column names - makes outputs clearer

# Extract the dependent "good" variable
y = dataset.iloc[:, 12]

# Split the model into test and validation?
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=0)

# Scale the data to a standard distribution
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler().fit(X)
scaler.transform(X_train)
scaler.transform(X_test)

   fixed acidity  volatile acidity  citric acid  residual sugar  chlorides  \
0            7.4              0.70         0.00             1.9      0.076   
1            7.8              0.88         0.00             2.6      0.098   
2            7.8              0.76         0.04             2.3      0.092   
3           11.2              0.28         0.56             1.9      0.075   
4            7.4              0.70         0.00             1.9      0.076   

   free sulfur dioxide  total sulfur dioxide  density    pH  sulphates  \
0                 11.0                  34.0   0.9978  3.51       0.56   
1                 25.0                  67.0   0.9968  3.20       0.68   
2                 15.0                  54.0   0.9970  3.26       0.65   
3                 17.0                  60.0   0.9980  3.16       0.58   
4                 11.0                  34.0   0.9978  3.51       0.56   

   alcohol  quality  good  
0      9.4        5     0  
1      9.8        5     0  
2 

array([[-1.95892834e-01, -5.56834399e-01,  4.62334643e-01, ...,
         2.06482073e-01, -1.15961429e+00, -9.48173158e-01],
       [-1.40052778e+00, -1.33715344e+00, -1.03370076e-01, ...,
         1.40757934e-01, -6.89067171e-01, -9.40831096e-02],
       [-9.98982797e-01,  4.46432942e-01, -1.33763492e+00, ...,
         2.50682696e+00,  4.28482234e-01,  9.57146790e-02],
       ...,
       [-2.38021275e-02,  1.39396321e+00, -8.74785603e-01, ...,
         1.12662003e+00, -7.47885561e-01, -3.78779792e-01],
       [-1.95892834e-01,  1.67264858e+00, -5.14672920e-04, ...,
        -1.22138624e-01, -1.04197751e+00, -5.68577581e-01],
       [-1.05634637e+00,  5.36345983e-04, -1.08049641e+00, ...,
         4.69378632e-01,  1.01666613e+00, -9.40831096e-02]])

In [26]:
print("Training the classifier on a scikit Logistic Regression model with cross-validation.\n")

# Training the classifier
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(random_state = 0)
classifier.fit(X_train, y_train)

# Predict
y_pred = classifier.predict(X_test)

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

# Confusion matrix
print("Confusion matrix:\n")
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print("Coefficients:\n")
print(classifier.coef_)

Training the classifier on a scikit Logistic Regression model with cross-validation.

Confusion matrix:

[[ 87  31]
 [ 29 113]]
             precision    recall  f1-score   support

          0       0.75      0.74      0.74       118
          1       0.78      0.80      0.79       142

avg / total       0.77      0.77      0.77       260

Coefficients:

[[-0.02798375 -2.9722091  -0.32619856 -0.00300579 -1.52907568  0.02285837
  -0.01781595 -1.00410248 -1.51402804  1.46650502  0.84532336]]


In [4]:
# Try new model
import statsmodels.discrete.discrete_model as sm

print("Training the classifier on a statsmodels Logit model")
logit = sm.Logit(y_train, X_train)
result = logit.fit()

y_pred = [(int)(x >= 0.5) for x in result.predict(X_test).values]

# Confusion matrix
print("Confusion matrix:")
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

Training the classifier on a statsmodels Logit model
Optimization terminated successfully.
         Current function value: 0.508449
         Iterations 6
Confusion matrix:
[[180  64]
 [ 77 199]]
             precision    recall  f1-score   support

          0       0.70      0.74      0.72       244
          1       0.76      0.72      0.74       276

avg / total       0.73      0.73      0.73       520



In [5]:
#Fixed acidity, citric acid content, free sulfur dioxide, and pH have high p values and don't help with predictions.
#Let's try dropping these columns.

columns=["fixed acidity", "citric acid", "free sulfur dioxide", "pH"]

simple_dataset = dataset.copy()
simple_dataset.drop(columns=columns, inplace=True)

X_new = simple_dataset.copy().drop(columns = ["quality", "good"])
y_new = simple_dataset.good.copy()

X_train, X_test, y_train, y_test = train_test_split(X_new, y_new, test_size = 0.4, random_state=0)

# Scale the data to a standard distribution
scaler = StandardScaler()
scaler.fit(X_new)
scaler.transform(X_train)
scaler.transform(X_test)

array([[-5.56834399e-01, -1.72052416e-01, -2.73482704e-01, ...,
         4.36082586e-01, -1.15961429e+00, -9.48173158e-01],
       [-1.33715344e+00,  7.94975668e+00, -4.39956008e-01, ...,
         2.26848685e-01, -6.89067171e-01, -9.40831096e-02],
       [ 4.46432942e-01, -4.64730222e-01, -1.69436888e-01, ...,
         3.60972981e-01,  4.28482234e-01,  9.57146790e-02],
       ...,
       [ 1.39396321e+00,  4.74559374e-02, -1.48627725e-01, ...,
         8.11630613e-01, -7.47885561e-01, -3.78779792e-01],
       [ 1.67264858e+00, -3.91560770e-01, -1.69436888e-01, ...,
        -1.43334370e-01, -1.04197751e+00, -5.68577581e-01],
       [ 5.36345983e-04, -3.91560770e-01, -5.23192661e-01, ...,
        -1.01782478e+00,  1.01666613e+00, -9.40831096e-02]])

In [6]:
# Train scikit model using new data
print("Training a scikit model on simplified data.\n")
classifier = LogisticRegression(random_state = 0)
classifier.fit(X_train, y_train)

y_pred = classifier.predict(X_test)

# Print confusion matrix
print("Confusion matrix:\n")
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

Training a scikit model on simplified data.

Confusion matrix:

[[ 87  31]
 [ 30 112]]
             precision    recall  f1-score   support

          0       0.74      0.74      0.74       118
          1       0.78      0.79      0.79       142

avg / total       0.77      0.77      0.77       260



In [7]:
# Try new model
print("Training the classifier on a statsmodels model with simplified data.")
logit = sm.Logit(y_train, X_train)
result = logit.fit()

y_pred = [(int)(x >= 0.5) for x in result.predict(X_test).values ]

# Confusion matrix
print("Confusion matrix:\n")
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

Training the classifier on a statsmodels model with simplified data.
Optimization terminated successfully.
         Current function value: 0.523542
         Iterations 6
Confusion matrix:

[[ 87  31]
 [ 33 109]]
             precision    recall  f1-score   support

          0       0.72      0.74      0.73       118
          1       0.78      0.77      0.77       142

avg / total       0.75      0.75      0.75       260



In [27]:
import warnings
warnings.simplefilter("ignore")

# Let's try grid search on the original dataset!

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=0)

from sklearn.model_selection import GridSearchCV

scores = ['accuracy']

tuned_parameters = [{'C': [0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000, 100000, 1000000],
                     'class_weight': [None, 'balanced'],
                     'penalty': ['l1', 'l2'],
                     'solver': ['liblinear', 'saga']},
                    {'C': [0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000, 100000, 1000000],
                     'class_weight': [None, 'balanced'],
                     'penalty': ['l2'],
                     'solver': ['newton-cg', 'lbfgs', 'sag']}
                    ]

for score in scores:
    print("# Tuning hyper-parameters for %s" % score)
    print()
    
    clf = GridSearchCV(LogisticRegression(), tuned_parameters, cv=3, scoring=score)
    clf.fit(X_train, y_train)
    
    print("Best parameters set found on training set:")
    print(clf.best_params_)
    print()
    print("Grid scores on development set:")
    print()
    means = clf.cv_results_['mean_test_score']
    stds = clf.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, clf.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r"
              % (mean, std * 2, params))
    print()
    print("Detailed classification report:")
    print()
    y_true, y_pred = y_test, clf.predict(X_test)
    print(confusion_matrix(y_true, y_pred))
    print(classification_report(y_true, y_pred))
    print(clf.score(X_test, y_test))
    print()

# Tuning hyper-parameters for accuracy

Best parameters set found on training set:
{'C': 1, 'class_weight': None, 'penalty': 'l2', 'solver': 'newton-cg'}

Grid scores on development set:

0.474 (+/-0.001) for {'C': 1e-05, 'class_weight': None, 'penalty': 'l1', 'solver': 'liblinear'}
0.509 (+/-0.050) for {'C': 1e-05, 'class_weight': None, 'penalty': 'l1', 'solver': 'saga'}
0.559 (+/-0.026) for {'C': 1e-05, 'class_weight': None, 'penalty': 'l2', 'solver': 'liblinear'}
0.603 (+/-0.019) for {'C': 1e-05, 'class_weight': None, 'penalty': 'l2', 'solver': 'saga'}
0.474 (+/-0.001) for {'C': 1e-05, 'class_weight': 'balanced', 'penalty': 'l1', 'solver': 'liblinear'}
0.491 (+/-0.050) for {'C': 1e-05, 'class_weight': 'balanced', 'penalty': 'l1', 'solver': 'saga'}
0.516 (+/-0.035) for {'C': 1e-05, 'class_weight': 'balanced', 'penalty': 'l2', 'solver': 'liblinear'}
0.537 (+/-0.047) for {'C': 1e-05, 'class_weight': 'balanced', 'penalty': 'l2', 'solver': 'saga'}
0.474 (+/-0.001) for {'C': 0.0001, 'clas

In [34]:
# Let's get sure about the best classifier here.

classifier = LogisticRegression()
classifier.fit(X_train, y_train)

y_pred = classifier.predict(X_test)

print(confusion_matrix(y_true, y_pred))
print(classification_report(y_true, y_pred))

[[ 87  31]
 [ 29 113]]
             precision    recall  f1-score   support

          0       0.75      0.74      0.74       118
          1       0.78      0.80      0.79       142

avg / total       0.77      0.77      0.77       260



In [35]:
classifier = LogisticRegression(solver='newton-cg')
classifier.fit(X_train, y_train)

y_pred = classifier.predict(X_test)

print(confusion_matrix(y_true, y_pred))
print(classification_report(y_true, y_pred))

[[ 88  30]
 [ 32 110]]
             precision    recall  f1-score   support

          0       0.73      0.75      0.74       118
          1       0.79      0.77      0.78       142

avg / total       0.76      0.76      0.76       260

