In [1]:
# Importing the libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Importing the dataset
dataset = pd.read_csv("data/winequality-red.csv")

# Introduce new binary variable "good" equal 1 for wines with quality greater than or equal 7
dataset = dataset.assign(good = dataset["quality"] >= 7)
dataset["good"] = dataset["good"].astype(int)
print(dataset.head())

# Extract independent variables - all eleven columns
X = dataset.iloc[:, :11] # it's useful not to drop column names - makes outputs clearer

# Extract the dependent "good" variable
y = dataset.iloc[:, 12]

y.head()

   fixed acidity  volatile acidity  citric acid  residual sugar  chlorides  \
0            7.4              0.70         0.00             1.9      0.076   
1            7.8              0.88         0.00             2.6      0.098   
2            7.8              0.76         0.04             2.3      0.092   
3           11.2              0.28         0.56             1.9      0.075   
4            7.4              0.70         0.00             1.9      0.076   

   free sulfur dioxide  total sulfur dioxide  density    pH  sulphates  \
0                 11.0                  34.0   0.9978  3.51       0.56   
1                 25.0                  67.0   0.9968  3.20       0.68   
2                 15.0                  54.0   0.9970  3.26       0.65   
3                 17.0                  60.0   0.9980  3.16       0.58   
4                 11.0                  34.0   0.9978  3.51       0.56   

   alcohol  quality  good  
0      9.4        5     0  
1      9.8        5     0  
2 

0    0
1    0
2    0
3    0
4    0
Name: good, dtype: int32

In [3]:
# Split the model into test and validation?
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=0)

# Scale the data to a standard distribution
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit_transform(X)
scaler.transform(X_train)
scaler.transform(X_test)



array([[ 1.4250439 , -0.32301294,  0.81659759, ..., -0.91431164,
         0.60105502,  0.35389538],
       [-0.1261883 ,  1.63225386, -1.39147228, ...,  0.3167512 ,
        -0.75624575, -0.77251161],
       [ 0.44834214, -1.32857872,  0.30309297, ..., -0.33117661,
         1.07315963,  1.19870062],
       ...,
       [ 0.44834214, -1.04925489,  0.76524713, ..., -0.84951886,
        -0.6382196 ,  0.91709887],
       [ 0.44834214,  1.32499765, -1.18607043, ..., -0.13679827,
        -0.69723268, -0.67864436],
       [-0.06873526, -1.16098443,  0.76524713, ..., -0.26638383,
        -0.6382196 ,  1.76190411]])

In [17]:
# Training the classifier
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(random_state = 0)
classifier.fit(X_train, y_train)

y_pred = classifier.predict(X_test)
#print(y_pred)

# Confusion matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
print(cm)

# Print more results
from sklearn.metrics import classification_report
target_names = ["0", "1"]
print(classification_report(y_test, y_pred, target_names=target_names))
print(classifier.score(X_test, y_test))

# Try new model
import statsmodels.discrete.discrete_model as sm
logit = sm.Logit(y_train, X_train)
result = logit.fit()
result.summary()

# what does it do????
# Try cross-validation?? wtf
# from sklearn.model_selection import cross_val_score
# scores = cross_val_score(classifier, X, y, cv=10)
# print(scores)
# print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

[[283   7]
 [ 20  10]]
             precision    recall  f1-score   support

          0       0.93      0.98      0.95       290
          1       0.59      0.33      0.43        30

avg / total       0.90      0.92      0.90       320

0.915625
Optimization terminated successfully.
         Current function value: 0.284355
         Iterations 8


0,1,2,3
Dep. Variable:,good,No. Observations:,1279.0
Model:,Logit,Df Residuals:,1272.0
Method:,MLE,Df Model:,6.0
Date:,"Wed, 05 Dec 2018",Pseudo R-squ.:,0.3166
Time:,22:15:36,Log-Likelihood:,-363.69
converged:,True,LL-Null:,-532.16
,,LLR p-value:,9.818e-70

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
volatile acidity,-4.3982,0.667,-6.591,0.000,-5.706,-3.090
residual sugar,0.1743,0.063,2.778,0.005,0.051,0.297
chlorides,-8.0333,3.364,-2.388,0.017,-14.626,-1.441
total sulfur dioxide,-0.0126,0.003,-3.600,0.000,-0.019,-0.006
density,-11.6291,1.235,-9.413,0.000,-14.051,-9.208
sulphates,3.3506,0.539,6.214,0.000,2.294,4.407
alcohol,0.9467,0.092,10.340,0.000,0.767,1.126


In [5]:
#Fixed acidity, citric acid content, free sulfur dioxide, and pH have high p values and don't help with predictions.
#Let's try dropping these columns.

simple_dataset = dataset.copy()
simple_dataset.drop(columns=["fixed acidity", "citric acid", "free sulfur dioxide", "pH"], inplace=True)

X_new = simple_dataset.copy().drop(columns = ["quality", "good"])
y_new = simple_dataset.good.copy()

X_train, X_test, y_train, y_test = train_test_split(X_new, y_new, test_size = 0.2, random_state=0)

# Scale the data to a standard distribution
scaler = StandardScaler()
scaler.fit_transform(X_new)
scaler.transform(X_train)
scaler.transform(X_test)

array([[-0.32301294, -0.31132282,  1.7753969 , ...,  0.77027994,
         0.60105502,  0.35389538],
       [ 1.63225386,  1.10763304,  0.16011403, ...,  0.9504846 ,
        -0.75624575, -0.77251161],
       [-1.32857872, -0.34679672, -0.52000507, ..., -0.84096169,
         1.07315963,  1.19870062],
       ...,
       [-1.04925489, -0.5241662 , -0.62627368, ..., -1.49817867,
        -0.6382196 ,  0.91709887],
       [ 1.32499765, -0.66606179, -0.20119924, ...,  0.6642772 ,
        -0.69723268, -0.67864436],
       [-1.16098443, -0.16942723,  0.2238752 , ..., -0.89396306,
        -0.6382196 ,  1.76190411]])

In [18]:
# Train scikit model using new data
classifier = LogisticRegression(random_state = 0)
classifier.fit(X_train, y_train)

y_pred = classifier.predict(X_test)

# Print confusion matrix
cm = confusion_matrix(y_test, y_pred)
print(cm)

print(classification_report(y_test, y_pred))
print(classifier.score(X_test, y_test))

# Train statsmodels on new data
logit = sm.Logit(y_train, X_train)
result = logit.fit()
result.summary()

[[283   7]
 [ 20  10]]
             precision    recall  f1-score   support

          0       0.93      0.98      0.95       290
          1       0.59      0.33      0.43        30

avg / total       0.90      0.92      0.90       320

0.915625
Optimization terminated successfully.
         Current function value: 0.284355
         Iterations 8


0,1,2,3
Dep. Variable:,good,No. Observations:,1279.0
Model:,Logit,Df Residuals:,1272.0
Method:,MLE,Df Model:,6.0
Date:,"Wed, 05 Dec 2018",Pseudo R-squ.:,0.3166
Time:,22:24:49,Log-Likelihood:,-363.69
converged:,True,LL-Null:,-532.16
,,LLR p-value:,9.818e-70

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
volatile acidity,-4.3982,0.667,-6.591,0.000,-5.706,-3.090
residual sugar,0.1743,0.063,2.778,0.005,0.051,0.297
chlorides,-8.0333,3.364,-2.388,0.017,-14.626,-1.441
total sulfur dioxide,-0.0126,0.003,-3.600,0.000,-0.019,-0.006
density,-11.6291,1.235,-9.413,0.000,-14.051,-9.208
sulphates,3.3506,0.539,6.214,0.000,2.294,4.407
alcohol,0.9467,0.092,10.340,0.000,0.767,1.126


In [36]:
import warnings
warnings.simplefilter("ignore")

# Let's try grid search on the original dataset!
from sklearn.model_selection import GridSearchCV

scores = ['accuracy']

tuned_parameters = [{'tol': [1e-3, 1e-4, 1e-5],
                      'C': [0.01, 0.1, 1, 10, 100, 1000],
                      'class_weight': [None, 'balanced']}
                    ]

for score in scores:
    print("# Tuning hyper-parameters for %s" % score)
    print()
    
    clf = GridSearchCV(LogisticRegression(), tuned_parameters, cv=10, scoring=score)
    clf.fit(X_train, y_train)
    
    print("Best parameters set found on training set:")
    print()
    print(clf.best_params_)
    print()
    print("Grid scores on development set:")
    print()
    means = clf.cv_results_['mean_test_score']
    stds = clf.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, clf.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r"
              % (mean, std * 2, params))
    print()

    print("Detailed classification report:")
    print()
    print("The model is trained on the training set.")
    print("The scores are computed on the test set.")
    print()
    y_true, y_pred = y_test, clf.predict(X_test)
    print(confusion_matrix(y_true, y_pred))
    print(classification_report(y_true, y_pred))
    print(clf.score(X_test, y_test))
    print()

# Tuning hyper-parameters for accuracy

Best parameters set found on training set:

{'C': 10, 'class_weight': None, 'tol': 0.001}

Grid scores on development set:

0.854 (+/-0.006) for {'C': 0.01, 'class_weight': None, 'tol': 0.001}
0.854 (+/-0.006) for {'C': 0.01, 'class_weight': None, 'tol': 0.0001}
0.854 (+/-0.006) for {'C': 0.01, 'class_weight': None, 'tol': 1e-05}
0.557 (+/-0.096) for {'C': 0.01, 'class_weight': 'balanced', 'tol': 0.001}
0.557 (+/-0.096) for {'C': 0.01, 'class_weight': 'balanced', 'tol': 0.0001}
0.557 (+/-0.096) for {'C': 0.01, 'class_weight': 'balanced', 'tol': 1e-05}
0.854 (+/-0.006) for {'C': 0.1, 'class_weight': None, 'tol': 0.001}
0.854 (+/-0.006) for {'C': 0.1, 'class_weight': None, 'tol': 0.0001}
0.854 (+/-0.006) for {'C': 0.1, 'class_weight': None, 'tol': 1e-05}
0.756 (+/-0.042) for {'C': 0.1, 'class_weight': 'balanced', 'tol': 0.001}
0.756 (+/-0.042) for {'C': 0.1, 'class_weight': 'balanced', 'tol': 0.0001}
0.756 (+/-0.042) for {'C': 0.1, 'class_weight': 