In [91]:
# Import libraries and modules
import numpy as np
import pandas as pd
from cqcplot import *
 
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.pipeline import make_pipeline
from sklearn.externals import joblib 
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold,KFold, cross_val_score

from keras.wrappers.scikit_learn import KerasRegressor, KerasClassifier
from keras.models import Sequential
from keras.layers import Dense

In [3]:
%matplotlib inline

In [None]:
#set seed 
seed = 1
np.random.seed(seed)


#### In this notebook I will load the cleaned data and 
1) fit a liner model 
2) fit other models (framed as both classification and regression) 

trying to predict variable 'quality' the problem can be framed as a regression task or a classification task. If it is taken as a classification task then this can be multi class or can be reduced to a binary classification, by say assuming all wines above 'good' are good and the rest have insufficient quality. The potential benefit of this approach is is that classes become a bit more balanced. This will be undertaken. 

From previous notebook Approx.80% of wines in dataset are nothing special (i.e. decent or worse), this should be a proxy for model evaluation. A simple strategy would be to always guess 'bad' wine, this would be 80% correct i.e 80% accuracy. This is the baseline to beat.  

comment on evaluation metrics: problem has unbalanced classes so accuracy is not enough. In case of classification I will also use confusion matrix, precision and recall to evaluate performance.


### Load Data

In [8]:
 with open('./clean_wine_data.pickle', 'rb') as f:
                data = pickle.load(f)

In [9]:
data.shape # have 15 features and a target column (quality)

(6497, 16)

In [22]:
data.columns

Index(['Unamed', 'fixed_acidity', 'volatile_acidity', 'citric_acid',
       'residual_sugar', 'chlorides', 'free_sulfur_dioxide',
       'total_sulfur_dioxide', 'density', 'pH', 'sulphates', 'alcohol',
       'quality', 'technique', 'sweetness', 'wine_type'],
      dtype='object')

### prepare Data

#### make into binary classification

In [28]:
def isGoodWine(quality):
    if quality >= 3:
        return 1
    else:
        return 0

In [62]:
wines = data.copy()

In [63]:
wines['quality'] = wines['quality'].apply(isGoodWine)

In [64]:
wines.quality.value_counts()

0    5218
1    1279
Name: quality, dtype: int64

#### Split data into training and test sets 

In [35]:
# 80/20 split with shuffling (default is true)
y = wines['quality']
X = wines.drop('quality', axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                   train_size = 0.8,
                                                    random_state = seed, 
                                                    stratify = y)

#### Standardize The Data

In [36]:
from sklearn.preprocessing import StandardScaler
# Define the scaler 
scaler = StandardScaler().fit(X_train)
# Scale the train set
X_train = scaler.transform(X_train)
# Scale the test set
X_test = scaler.transform(X_test)

#### Simplest model : Logistic Regression 

In [123]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,roc_auc_score, confusion_matrix,classification_report
# instantiate a logistic regression model, and fit with X and y
LR_model = LogisticRegression()
LR_model = LR_model.fit(X_train, y_train)

In [118]:
# check the accuracy on the training set
print('Logistic Regression TRAIN accuracy: {}' .format(LR_model.score(X_train, y_train)))

Logistic Regression TRAIN accuracy: 0.8190212373037857


In [120]:
# evaluate 
predicted = LR_model.predict(X_test)
# accuracy evaluation metrics
print('TEST accuracy:{}' .format(accuracy_score(y_test, predicted)))
# Area Under the Receiver Operating Characteristic Curve
probs = LR_model.predict_proba(X_test)
print ('ROC area:{}'.format(roc_auc_score(y_test, probs[:, 1])))

TEST accuracy:0.8165024630541872
ROC area:0.818303800915454


In [126]:
print('confusion matrix:')
confusion_matrix(y_test, predicted)

confusion matrix:


array([[2567,   42],
       [ 554,   85]])

The accuracy is the same as when training and predicting on the same data.

From confusion matrix acan see that a relatively large proportion of the minority class lables (ie. good wine) was missclasified 

In [147]:
print('{} minority class labels precision'.format(42/(85)))

0.49411764705882355 minority class labels precision


#### Multiclass clasisfictaion with  Trees 
use simple decision tree, a Gradient-Boosting classifier, and a Random Forest Classifier

In [37]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support, confusion_matrix, make_scorer

FIT TREES:

In [38]:
simpleTree = DecisionTreeClassifier(max_depth=5)
simpleTree.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=5,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')

In [39]:
gbmTree = GradientBoostingClassifier(max_depth=5)
gbmTree.fit(X_train, y_train)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=5,
              max_features=None, max_leaf_nodes=None,
              min_impurity_split=1e-07, min_samples_leaf=1,
              min_samples_split=2, min_weight_fraction_leaf=0.0,
              n_estimators=100, presort='auto', random_state=None,
              subsample=1.0, verbose=0, warm_start=False)

In [40]:
rfTree = RandomForestClassifier(max_depth=5)
rfTree.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=5, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

#### Evaluate moedl preforemnce 
using Precision, Recall, F-Score, and Support measures for each classifier

- Precision is a measure of a classifier’s exactness. The higher the precision, the more accurate the classifier.
- Recall is a measure of a classifier’s completeness. The higher the recall, the more cases the classifier covers.
- Looking at the support metric, can compare class-wise composition of the test population with the population as a whole. 

In [41]:
simpleTreePerformance = precision_recall_fscore_support(y_test,simpleTree.predict(X_test))
gbmTreePerformance = precision_recall_fscore_support(y_test,gbmTree.predict(X_test))
rfTreePerformance = precision_recall_fscore_support(y_test,rfTree.predict(X_test))

In [44]:
print('Metrics for each class in simple, gradient boosted, and random forest tree classifiers:'+'\n')
for treeMethod in [simpleTreePerformance,gbmTreePerformance,rfTreePerformance]:
    print('Precision: ',treeMethod[0])
    print('Recall: ',treeMethod[1])
    print('Fscore: ',treeMethod[2])
    print('Support: ',treeMethod[3],'\n')

Metrics for each class in simple, gradient boosted, and random forest tree classifiers:

Precision:  [ 0.88052681  0.5443038 ]
Recall:  [ 0.89655172  0.50390625]
Fscore:  [ 0.88846701  0.52332657]
Support:  [1044  256] 

Precision:  [ 0.88251121  0.67567568]
Recall:  [ 0.94252874  0.48828125]
Fscore:  [ 0.91153312  0.56689342]
Support:  [1044  256] 

Precision:  [ 0.82651391  0.56410256]
Recall:  [ 0.96743295  0.171875  ]
Fscore:  [ 0.89143866  0.26347305]
Support:  [1044  256] 



Columns are the two clasess 'bad' and 'good' wine respectively. We are looking for values as close to 1 as possible. For classyfing the majority class all trees have performed better than the baseline 80% accuracy. 
Can see that the Gradient boosted tree achieved best results predicting the minority class with 67% precion and 49% recall.

looking at support can see test population class composition very similarly distributed to population:

In [48]:
print('class-composition: test set:{}, population:{}'.format((256/1044),(1279/5218)))

class-composition: test set:0.24521072796934865, population:0.2451130701418168


In [43]:
print('Confusion Matrix for simple, gradient boosted, and random forest tree classifiers:')
print('Simple Tree:\n',confusion_matrix(y_test, simpleTree.predict(X_test)),'\n')
print('Gradient Boosted:\n',confusion_matrix(y_test, gbmTree.predict(X_test)),'\n')
print('Random Forest:\n',confusion_matrix(y_test, rfTree.predict(X_test)))

Confusion Matrix for simple, gradient boosted, and random forest tree classifiers:
Simple Tree:
 [[936 108]
 [127 129]] 

Gradient Boosted:
 [[984  60]
 [131 125]] 

Random Forest:
 [[1010   34]
 [ 212   44]]


In [129]:
print('{} minority lables missclassifed by Gradient Boosted Tree'.format(60/(125+60))) 

0.32432432432432434

It can be seen that Gradient Boosted tree corectly classifies the largest number of instances (diagonal elments) it also corectly predicts the most cases in the minority class. It has lower numbers on the non-diagonal elements meaning less miss-classifctaion. Random Forest is teh worst and the simple tree sits in between. 
It should be noted that trees are not really much beter than Logistic Regression.

THUS the Gradient Boosted tree should be put forward as the candidate tree model 

Can also explore relative feature importance

In [50]:
print('Feature Importances for GBM tree\n')
for importance,feature in zip(gbmTree.feature_importances_,['fixed acidity', 'volatile acidity', 'citric acid', \
'residual sugar','chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol','technique', 'sweetness', 'wine_type']):
    print('{}: {}'.format(feature,importance))

Feature Importances for GBM tree

fixed acidity: 0.07157683953099729
volatile acidity: 0.050763385291246396
citric acid: 0.08913313361091399
residual sugar: 0.061938656546516556
chlorides: 0.07899814025647582
free sulfur dioxide: 0.07906189742702219
total sulfur dioxide: 0.06987227792111861
density: 0.08109758979817112
pH: 0.1093501202335202
sulphates: 0.07448449381648868
alcohol: 0.07897973792240626
technique: 0.14601545340352806
sweetness: 0.0006870647403854599
wine_type: 3.527037747250596e-06


can observe that pH, sulphates, fixed acidity had the most impact 
sweetness and wine_type had the least impact (possibly because they are derived from the other variables and thus dependent). Surprisingly technique has an impact, maybe due to the underlying generative process being related to the quality variable. 

#### Fit simple NN: MLP 
1) make data into arrays
2) make simple NN model
3) standardise 
4) implemnet KFold CV and fir model
5) evaluate

In [66]:
Y = wines.quality.values

In [68]:
wines.drop('quality',axis=1,inplace=True)

In [71]:
X = wines.values

In [80]:
skf = StratifiedKFold(n_splits=4)
skf.get_n_splits(X, Y)
print(skf)  

StratifiedKFold(n_splits=2, random_state=None, shuffle=False)


In [139]:
def baseline_model():
    model = Sequential()
    model.add(Dense(32, input_dim=15, kernel_initializer='normal', activation='relu'))
    model.add(Dense(64, kernel_initializer='normal', activation='relu'))
    model.add(Dense(1, kernel_initializer='normal'))
    model.compile(optimizer='adam', loss='binary_crossentropy',metrics=['accuracy'])
    return model

In [140]:
estimators = []
# Scale the data with `StandardScaler`
estimators.append(('standardize', StandardScaler()))
# run model as a classification problem 
estimators.append(('mlp', KerasClassifier(build_fn = baseline_model, epochs=20, batch_size=10, verbose=0)))
pipeline = Pipeline(estimators)
# apply KFold CV
kfold = KFold(n_splits= 3, random_state=seed)
results = cross_val_score(pipeline, X, Y, cv = kfold)

In [141]:
print("\nAccuracy: %.2f with std: %.2f" % (results.mean(), results.std()))


Accuracy: 0.82 with std: 0.02


Simple NN learns reasonably well achieving approx 82% accuracy in 20 epochs averaged over 3 runs. This is slightly above the baseline strategy (achieving 80% accuracy) however it is not significantly better due to the standard deviation of the result. Tree models generally outperform the NN. 
However, If hyperparameters were tuned and the NN model trained for longer this would doubtlessly improve further and could potentially beat the trees. 

## Final Conclussion: 

Problem was framed as a binary classification and evaluated using Logistic Regression, different Tree models and a simple Neural Network. <br>
The simple strategy of always guessing 'bad' wine would yield 80% accuracy. <br>
Logistic Regression improved on this slightly by reaching an accuracy of approx 82%. However the precision on the minority class label prediction was approx. 49%. Further improvements could come from dropping insignificant features and transforming other variables. This was not undertaken due to time constraints <br>
Trees performed slightly better, with Gradient boosted tree scoring the best among tree models. the GB_tree reached a precision of over 60% for the minority class and 88% for the majority class. Yielding it the best model thus far. <br>
A simple NN was also attempted however it reached an accuracy of 82% with a standard deviation of 2% effectively yielding it as bad as the baseline. However, further tuning of the NN could markedly improve results, again this could not be undertaken due to time constraints. 