# Bus241 datasets: Wine quality data example
* Source: UCI Machine Learning Repository (red wine quality data, `winequality-red.csv`)
* Target: wine quality as scored by experts (0 - 10)
* Predictors: wine chemical composition
* The problem can be treated as either classification or regression


## Load tools

In [7]:
%matplotlib inline
# Import lots of tools
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

## Load and process data

In [4]:
# Read the red-wine table from disk.
wineall = pd.read_csv("winequality-red.csv")
# Target: the expert quality score.
y = wineall["quality"].to_numpy()
# Predictors: the first 11 columns, copied so X does not alias the full table array.
X = wineall.to_numpy()[:, :11].copy()
print(X.shape)
# Cell output: the column names of the data frame.
wineall.columns.tolist()

(1599, 11)


['fixedAcidity',
 'volatileAcidity',
 'citricAcid',
 'residualSugar',
 'chlorides',
 'freeSulfurDioxide',
 'totalSulfurDioxide',
 'density',
 'pH',
 'sulphates',
 'alcohol',
 'quality']

### Set up a binary classifier

In [16]:
# Binary target: True when the quality score is above 5.
ybin = np.greater(y, 5)
# Keep the label alongside the raw data for inspection.
wineall['goodqual'] = ybin
# Fraction of wines falling in the "good" class.
print(ybin.mean())

0.5347091932457786


In [17]:
# Preview the first five rows as a rendered table
wineall.head(n=5)

Unnamed: 0,fixedAcidity,volatileAcidity,citricAcid,residualSugar,chlorides,freeSulfurDioxide,totalSulfurDioxide,density,pH,sulphates,alcohol,quality,goodqual
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,False
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5,False
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5,False
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6,True
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,False


###  Naive Bayes example

In [11]:
# Try a naive Bayes classifier
from sklearn.naive_bayes import GaussianNB

# Hold out 20% of the wines for testing. The split is seeded so the scores
# printed by this cell are the same on every run of the notebook.
X_train, X_test, y_train, y_test = train_test_split(
    X, ybin, test_size=0.2, random_state=0
)

model = GaussianNB()
model.fit(X_train, y_train)

# For a classifier, score() is the mean accuracy on the given sample.
trainScore = model.score(X_train, y_train)
testScore = model.score(X_test, y_test)
print(trainScore)
print(testScore)

0.7443315089913995
0.703125


### Quick demo of scikit-learn automated simulation

In [19]:
# cross validation machinery
# These are some new scikit learn toys that will make your life easier (see below)
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import ShuffleSplit

# No explicit fit here: cross_val_score fits a fresh clone of the model on
# every split, so the cell does not depend on X_train/y_train left over from
# other cells.
model = GaussianNB()
# Set up train/test simulation and store in cvf:
# 25 random splits, 25% held out each time, seeded so results are reproducible.
cvf = ShuffleSplit(n_splits=25, test_size=0.25, random_state=0)
# Simulate scores: returns a vector with one score per split, note cv argument
scores = cross_val_score(model, X, ybin, cv=cvf)
print(np.mean(scores))
print(np.std(scores))

0.7271000000000001
0.017083325203250097


### Linear Discriminant

In [20]:
# Let's try a linear separation
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
model = LinearDiscriminantAnalysis()
# Set up train/test simulation and store in cvf:
# 25 random splits, 25% held out each time. The fixed seed makes the printed
# scores reproducible from run to run.
cvf = ShuffleSplit(n_splits=25, test_size=0.25, random_state=0)
# Simulate scores: returns a vector with one score per split, note cv argument
scores = cross_val_score(model, X, ybin, cv=cvf)
print(np.mean(scores))
print(np.std(scores))

0.7454999999999999
0.019339079605813707


### Linear regression
* This problem can be treated as either classification or regression
* Here the raw quality score is used as the regression target

In [21]:
# Linear regression
# Find R-squared by hand, averaged over repeated random train/test splits
from sklearn.linear_model import LinearRegression
model = LinearRegression()
nmc = 50  # number of Monte Carlo train/test replications
r2Train = np.zeros(nmc)
r2Test  = np.zeros(nmc)
for i in range(nmc):
    # Seed each split with its replication index: every replication gets a
    # different split, but the whole experiment is reproducible.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=i
    )
    # Baseline forecast: the training-sample mean, used for both samples so
    # the test R-squared never looks at test targets to build its benchmark.
    yGuess = np.mean(y_train)
    model.fit(X_train, y_train)
    yhatTrain = model.predict(X_train)
    yhatTest  = model.predict(X_test)
    # Mean squared error of the regression ...
    mseTrain = np.mean((y_train - yhatTrain)**2)
    mseTest  = np.mean((y_test  - yhatTest)**2)
    # ... and of the constant baseline forecast
    mseGuessTrain = np.mean((y_train - yGuess)**2)
    mseGuessTest  = np.mean((y_test  - yGuess)**2)
    # R-squared = 1 - MSE(model) / MSE(baseline)
    r2Train[i] = 1. - mseTrain/mseGuessTrain
    r2Test[i]  = 1. - mseTest/mseGuessTest
print(np.mean(r2Train))
print(np.mean(r2Test))

0.3629352987780537
0.34212545661296107
