In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()

%matplotlib inline

In [None]:
import warnings
# Ignore warnings of category FutureWarning (and only that category) from here on.
warnings.simplefilter(action='ignore', category=FutureWarning)

### Let's use the white wine dataset for logistic regression

The following dataset, commonly referred to as the Wine Quality Dataset, is widely used. A local copy of this dataset is provided in Moodle.
https://archive.ics.uci.edu/ml/datasets/wine+quality

In [None]:
# Read the white-wine CSV into a DataFrame and preview its last five rows.
wine = pd.read_csv(filepath_or_buffer='winequality-white.csv')
wine.tail(n=5)

In [None]:
# Bar plot of VAcidity grouped by the raw Quality score.
sns.barplot(data=wine, x='Quality', y='VAcidity');

Let's assume that Good wine has a Quality value of 7, 8 or 9, while any other value means Bad wine. Let's recode the Quality column to follow this arbitrary cut.


In [None]:
# How many rows fall under each raw Quality score.
sns.countplot(data=wine, x='Quality');

In [None]:
# Same counts as a table: occurrences of each distinct Quality value.
wine.Quality.value_counts()

In [None]:
# Recode the Quality score into two labels:
#   (2, 6.5]    -> "Bad"
#   (6.5, 10.5] -> "Good"
# The assignment overwrites the column it reads from, so the original cell had to be
# executed exactly once. The dtype guard makes a re-run harmless: once the column
# holds the categorical labels, the cut is skipped instead of being applied again.
if wine['Quality'].dtype.name != 'category':
    wine['Quality'] = pd.cut(wine['Quality'], bins=[2, 6.5, 10.5], labels=["Bad", "Good"])

wine.tail()

In [None]:
# Occurrences of each distinct value currently held in the Quality column.
wine.loc[:, 'Quality'].value_counts()

In [None]:
# Count plot of the Quality column. The column is passed by keyword (x=/data=), as in
# the earlier count plot: handing the Series over positionally is deprecated in
# seaborn 0.11, and in newer releases the first positional argument is `data`.
sns.countplot(x='Quality', data=wine);

In [None]:
from sklearn.linear_model import LogisticRegression
# Logistic regression with an intercept term.
# max_iter is raised from the library default: the features are fed in unscaled, which
# can keep the solver from converging within the default iteration budget.
# 5000 matches the budget given to the SGD classifier later in this notebook.
lr = LogisticRegression(fit_intercept=True, max_iter=5000)

In [None]:
# Target: the Quality column. Features: every other column.
y = wine['Quality']
X = wine.drop(columns=['Quality'])

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Split features and target into train and test parts.
# random_state pins the shuffle so that a Restart & Run All reproduces the same split,
# and therefore the same scores further down.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [None]:
# Fit the model on the training split, then show the learned coefficients and intercept.
lr.fit(X=X_train, y=y_train)
print(lr.coef_, lr.intercept_)

In [None]:
# Predicted labels for the held-out test features.
y_pred = lr.predict(X=X_test)

In [None]:
# Output of predict_proba for the same test features.
y_prob = lr.predict_proba(X=X_test)

In [None]:
# Report lr.score(...) for the training and test splits, formatted to two decimals.
train_score = lr.score(X_train, y_train)
test_score = lr.score(X_test, y_test)
print('Score of model in training group: {0:2.2f}'.format(train_score))
print('Score of model in test group: {0:2.2f}'.format(test_score))

In [None]:
# First five true labels of the test split.
print(y_test[:5])

In [None]:
# First five predicted labels, for comparison with the true labels.
print(y_pred[:5])

In [None]:
# First five rows of the predict_proba output.
print(y_prob[:5])

In [None]:
# Compare true and predicted labels element-wise, then count the
# agreements (hits) and the disagreements (misses).
is_hit = (y_test == y_pred)
hits = np.count_nonzero(is_hit)
misses = np.count_nonzero(~is_hit)
print(hits, misses)

# Fraction of test rows whose prediction matches the true label.
print("Accuracy is: {:3.2f}".format(hits / (hits + misses)))

In [None]:
from sklearn.model_selection import cross_val_score

In [None]:
# 5-fold cross-validation of the logistic regression over the whole dataset;
# report the average of the fold scores.
scores = cross_val_score(estimator=lr, X=X, y=y, cv=5)
print('Mean score is: {0:2.2f}'.format(np.mean(scores)))

### Logistic regression with Stochastic Gradient Descent

In [None]:
from sklearn.linear_model import SGDClassifier
# Logistic regression trained with stochastic gradient descent.
# - loss='log_loss' selects the logistic loss. The older spelling 'log' was deprecated
#   in scikit-learn 1.1 and removed in 1.3, where it raises an error.
# - random_state pins the shuffling done by SGD so the scores are reproducible across runs.
lrsgd = SGDClassifier(loss='log_loss', fit_intercept=True, max_iter=5000, random_state=42)

In [None]:
# Start again from the raw CSV and re-apply the Bad/Good recoding of Quality.
wine = pd.read_csv('winequality-white.csv')
quality_labels = pd.cut(wine['Quality'], bins=[0, 6.5, 10.5], labels=["Bad", "Good"])
wine['Quality'] = quality_labels

# Target: the Quality column. Features: every other column.
y = wine['Quality']
X = wine.drop(columns=['Quality'])

In [None]:
# 5-fold cross-validation of the SGD-trained classifier over the whole dataset;
# report the average of the fold scores.
scores = cross_val_score(estimator=lrsgd, X=X, y=y, cv=5)
print('Mean score is: {0:2.2f}'.format(np.mean(scores)))

## To do: Lab on your own

- Repeat this experiment excluding all instances of wine with quality 3 and quality 8. 