# Exercise 1: Wine Classification!

In [None]:
# Load packages we need
import sys
import os

import numpy as np
import sklearn

import scipy.stats as stats
import scipy as sp

import pandas as pd

%matplotlib inline
from matplotlib import pyplot as plt
plt.rcParams.update({'font.size': 18})

# Report the interpreter and library versions in use (handy when results differ between machines)
versions = (
    ('Python', sys.version),
    ('Numpy', np.__version__),
    ('Scipy', sp.__version__),
    ('Pandas', pd.__version__),
    ('Scikit-learn', sklearn.__version__),
)
for package, version in versions:
    print('### {} version: {}'.format(package, version))
print('------------')


# load our packages / code
sys.path.insert(1, '../common/')
import utils
import plots

In [None]:
# global parameters to control behavior of the pre-processing, ML, analysis, etc.
# `seed` is reused further down (data split, SVM random_state) so that every run gives the same results
seed = 42

# seeds NumPy's global (legacy) random number generator
np.random.seed(seed) # deterministic seed for reproducibility

## Interesting stuff starts now

### Loading data

In [None]:
# Choose which wine dataset to work with (swap the comment marker to switch to red wine)
#wine_type = 'red'
wine_type = 'white'

# Load the CSV with pandas:
#  - fields are separated by ';', optionally surrounded by spaces (hence the regex separator,
#    which in turn requires the slower 'python' parsing engine)
#  - the first row holds the column names
#  - '?' entries are read as missing values (NaN)
csv_path = '../data/{}-wine-quality.csv'.format(wine_type)
df = pd.read_csv(
    csv_path,
    sep=' *; *',
    header=0,
    na_values='?',
    skipinitialspace=True,
    engine='python',
)

In [None]:
# Sanity check: the dataframe must have the (rows, columns) we expect for the chosen dataset
# (anything other than 'white' is treated as the red wine dataset)
df_expected_shape = (4898,12) if wine_type == 'white' else (1599,12)

assert df.shape == df_expected_shape, 'Unexpected shape of df!'

In [None]:
# Quick tip: use info() to get a glance at the size and attributes of the dataset
# (it prints the index range, the column names with their non-null counts and dtypes, and the memory usage)
df.info()

In [None]:
# Let's look at a few rows of our dataframe
# (head(10) returns the first 10 rows; as the last expression of the cell it gets displayed)
df.head(10)

In [None]:
# how many records do we have?
# (df.shape is a (number of rows, number of columns) tuple; the first entry is the record count)
df.shape

### Pre-processing data

In [None]:
## header right now: fixed acidity;volatile acidity;citric acid;residual sugar;chlorides;free sulfur dioxide;total sulfur dioxide;density;pH;sulphates;alcohol;quality
# keep the column names as a plain Python list (instead of a pandas Index) for easy positional lookup
col_names = list(df.columns)

#### all columns are numerical and the last one 'quality' is what we want to predict
#### Note: quality is a score between 0 (very bad) and 10 (excellent)

In [None]:
# Turn the whole dataframe (features + label) into a single float64 matrix
# (df.to_numpy(dtype='float64') would accomplish the same thing)
all_xy = np.asarray(df, dtype='float64')
assert all_xy.shape[1] == 12

# the label is the last column; every column in front of it is a feature
n_cols = all_xy.shape[1]
label_col_idx = n_cols - 1
features_col_idx = range(label_col_idx)

#### Let's separate features from labels

In [None]:
# Split the matrix column-wise into the feature matrix and the label vector
all_x = all_xy[:, features_col_idx]
# the labels were loaded as floats; cast them to int so they act as discrete class labels
all_y = all_xy[:, label_col_idx].astype(int)

### Train, Test, Validation Split

In [None]:
# Partition the data into train, test, and validation sets (shuffled, with a fixed seed)
# NOTE(review): prop_vec presumably holds the relative sizes of the three sets
# (14:3:3, i.e. 70% / 15% / 15%) -- confirm against utils.train_test_val_split
prop_vec = [14, 3, 3]
splits = utils.train_test_val_split(all_x, all_y, prop_vec, shuffle=True, seed=seed)
train_x, train_y, test_x, test_y, val_x, val_y = splits

In [None]:
# sanity check shapes
# (displays the shapes of all six arrays; each *_x should have as many rows as its *_y has entries)
train_x.shape, train_y.shape, test_x.shape, test_y.shape, val_x.shape, val_y.shape

### Stats & Looking at the data

In [None]:
# what does the distribution of labels look like?
# label_name is the header of the label column; it is also used to label the histogram plot below
label_name = col_names[label_col_idx]
# only the training labels are inspected here
utils.print_array_hist(train_y, label=label_name)

### Clearly, this is not a balanced dataset (we will see later on why this can matter)

In [None]:
# Visualize the distribution of the training labels with one histogram bar per quality score
# bin edges at -0.5, 0.5, ..., 10.5 so that every integer score 0..10 sits in the middle of its own bin
bins = np.arange(12) - 0.5

plt.hist(train_y, bins=bins, density=False, alpha=0.5, edgecolor='k', label=label_name)

plt.xlabel(label_name)
plt.ylabel('Frequency')
plt.xticks(np.arange(11))  # one tick per possible score
plt.show()

### Question: what do you think is a good baseline for predicting the quality exactly?

In [None]:
# Print basic statistics for every feature (one column of train_x at a time)
# train_x.T yields the columns; zip pairs each of them with its name and stops after the last feature
for feature_values, feature_name in zip(train_x.T, col_names):
    utils.print_array_basic_stats(feature_values, label=feature_name)
    print()

### Question: Do the features even help us predict the quality?

In [None]:
# Compare the distribution of a single feature between low- and high-quality wines

# pick the feature (column index) together with suitable histogram bins;
# uncomment one of the alternatives to explore another feature
#feature_idx = 0; bins = np.linspace(3, 12, 12)
#feature_idx = 3; bins = np.linspace(0, 70, 20)
feature_idx = 10
bins = np.linspace(7, 15, 12)

# boolean masks selecting the training wines of each group
lowq_idx = train_y == 4 # low quality wines
highq_idx = train_y == 8 # high quality wines

# density=True normalizes each histogram, so groups of different sizes remain comparable
for mask, legend_label in ((lowq_idx, 'Low quality'), (highq_idx, 'High quality')):
    plt.hist(train_x[mask, feature_idx], bins, density=True, alpha=0.5, edgecolor='k', label=legend_label)

plt.xlabel('{}'.format(col_names[feature_idx]))
plt.ylabel('Density')

plt.legend(loc='upper right')
plt.show()

### Can we look at the statistical information that features contain about the task in a systematic way?

In [None]:
# Hint: this may be in your assignment!

# append the label as the last column so it shows up in the correlation matrix next to the features
train_xy = np.column_stack((train_x, train_y))

# Pearson correlation between every pair of columns
# (rowvar=False: columns are the variables, rows are the observations)
pairwise_corr = np.corrcoef(train_xy, rowvar=False)

plots.heatmap(pairwise_corr, col_names, col_names, rot=90, fsz=(14, 14))

### [Left as exercise]: use pandas' `scatter_matrix` to look at pairwise scatter plots of the features and the label, and compare them with the correlations above. (This is a good exercise!)

In [None]:
## Ref: https://pandas.pydata.org/docs/reference/api/pandas.plotting.scatter_matrix.html

### Should we scale features?

In [None]:
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier

from sklearn.preprocessing import StandardScaler

# Standardize the features; the scaling statistics are learned from the training set only! Why?
scaler = StandardScaler(copy=True).fit(train_x)

# apply the very same (training-set) transformation to all three splits
train_x_scaled, test_x_scaled, val_x_scaled = (
    scaler.transform(split_x) for split_x in (train_x, test_x, val_x)
)

#note:  we don't scale y. Q: why not?

## Let's train a model

#### SVM classifier

In [None]:
from sklearn.base import clone
import time

# Step 1: instantiate the model and set hyperparameters
## refer to: https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html
svm = SVC(kernel='linear', random_state=seed)
svm_scaled = clone(svm)  # unfitted copy with the same hyperparameters, to be trained on scaled features

# Step 2: train the model (we use the training set)
# Durations are measured with time.perf_counter(): it is a monotonic, high-resolution clock meant for
# timing intervals, whereas time.time() is wall-clock time and can jump when the system clock is adjusted.
st = time.perf_counter()
svm.fit(train_x, train_y)
et1 = time.perf_counter()
_ = svm_scaled.fit(train_x_scaled, train_y)
et2 = time.perf_counter()

print('[Training Time] unscaled: {:.1f} seconds, scaled: {:.1f} seconds'.format(et1 - st, et2 - et1))

#### KNN classifier

In [None]:
# Step 1: instantiate the model and set hyperparameters
## refer to: https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html
knn = KNeighborsClassifier(n_neighbors=1)
knn_scaled = clone(knn)  # unfitted copy with the same hyperparameters, to be trained on scaled features

# Step 2: train the model (we use the training set)
# Durations are measured with time.perf_counter(): it is a monotonic, high-resolution clock meant for
# timing intervals, whereas time.time() is wall-clock time and can jump when the system clock is adjusted.
st = time.perf_counter()
knn.fit(train_x, train_y)
et1 = time.perf_counter()
_ = knn_scaled.fit(train_x_scaled, train_y)
et2 = time.perf_counter()

print('[Training Time] unscaled: {:.2f} seconds, scaled: {:.2f} seconds'.format(et1 - st, et2 - et1))

### Let's evaluate our models!

In [None]:
# Evaluate the models
# define a function to calculate accuracy
def model_accuracy(model, x, true_y):
    """Return the fraction of samples in ``x`` that ``model`` labels correctly.

    Args:
        model: fitted estimator exposing ``predict(x)``.
        x: feature matrix, passed unchanged to ``model.predict``.
        true_y: ground-truth labels, one per sample (array-like; flattened to 1-D).

    Returns:
        The accuracy as a float in [0, 1] (``nan`` when there are no samples).

    Raises:
        ValueError: if the number of predictions differs from the number of labels.
    """
    # Flatten both sides first: comparing an (n,) prediction vector with an (n, 1)
    # label column would silently broadcast to an (n, n) matrix and inflate the count.
    pred = np.ravel(model.predict(x))
    truth = np.ravel(true_y)
    if pred.shape != truth.shape:
        raise ValueError(
            'Got {} predictions for {} labels'.format(pred.shape[0], truth.shape[0])
        )
    return np.sum(pred == truth) / truth.shape[0]

def evaluate_model(name, model, train_x, train_y, val_x, val_y):
    """Print the training and validation accuracy of a fitted model, as percentages.

    Args:
        name: label shown in brackets at the start of the printed line.
        model: fitted estimator, scored through ``model_accuracy``.
        train_x, train_y: training features and labels.
        val_x, val_y: validation features and labels.
    """
    train_pct, val_pct = (
        100 * model_accuracy(model, features, labels)
        for features, labels in ((train_x, train_y), (val_x, val_y))
    )
    print('[{}] Training accuracy: {:.2f}%, Validation accuracy: {:.2f}%'.format(name, train_pct, val_pct))

In [None]:
# Score every model on the same feature representation (raw or scaled) it was trained on
raw_data = (train_x, train_y, val_x, val_y)
scaled_data = (train_x_scaled, train_y, val_x_scaled, val_y)

for model_name, fitted_model, model_data in (
    ('SVM', svm, raw_data),
    ('SVM (w/ scaled features)', svm_scaled, scaled_data),
    ('KNN', knn, raw_data),
    ('KNN (w/ scaled features)', knn_scaled, scaled_data),
):
    evaluate_model(model_name, fitted_model, *model_data)

### Q: Does scaling features make a difference?

### Q: Wait. What's going on with 100% accuracy?

### Q: Is a model with around 50% accuracy a good model? Why or why not?

In [None]:
# What does the label distribution look like?
# (the same training-label histogram that was printed earlier, shown again to motivate the baselines)
utils.print_array_hist(train_y, label='Label distribution')

In [None]:
### Baselines?
# baseline: uniformly random guess over the 0..10 quality scale (11 possible scores)
# --> baseline accuracy: ~9%

# better baseline: predict the mode (the most frequent label in the training set)
# stats.mode(...)[0] is a length-1 array on older SciPy releases but a scalar on newer ones;
# normalize it to a plain Python int so the value (and the printout) is the same on both.
mode = int(np.ravel(stats.mode(train_y)[0])[0])
print('Mode: {}'.format(mode))

In [None]:
# Constant predictions: every sample is assigned the most frequent training label
baseline_pred_y_train = np.full(train_y.shape, mode, dtype=int)
baseline_pred_y_val = np.full(val_y.shape, mode, dtype=int)

In [None]:
## How good is the baseline?
def pred_accuracy(true_y, pred_y):
    """Return the fraction of predictions in ``pred_y`` that equal the labels in ``true_y``.

    Args:
        true_y: ground-truth labels, one per sample (array-like; flattened to 1-D).
        pred_y: predicted labels, one per sample (array-like; flattened to 1-D).

    Returns:
        The accuracy as a float in [0, 1] (``nan`` when there are no samples).

    Raises:
        ValueError: if the number of predictions differs from the number of labels.
    """
    # Flatten both sides first: comparing an (n,) vector with an (n, 1) column would
    # silently broadcast to an (n, n) matrix and inflate the count of matches.
    truth = np.ravel(true_y)
    pred = np.ravel(pred_y)
    if pred.shape != truth.shape:
        raise ValueError(
            'Got {} predictions for {} labels'.format(pred.shape[0], truth.shape[0])
        )
    return np.sum(pred == truth) / truth.shape[0]

def evaluate_baseline(name, train_y, pred_y_train, val_y, pred_y_val):
    """Print the training and validation accuracy of precomputed predictions, as percentages.

    Args:
        name: label shown in brackets at the start of the printed line.
        train_y, pred_y_train: true and predicted labels for the training set.
        val_y, pred_y_val: true and predicted labels for the validation set.
    """
    train_pct, val_pct = (
        100 * pred_accuracy(truth, guess)
        for truth, guess in ((train_y, pred_y_train), (val_y, pred_y_val))
    )
    print('[{}] Training accuracy: {:.2f}%, Validation accuracy: {:.2f}%'.format(name, train_pct, val_pct))

In [None]:
# Score the constant "always predict the mode" predictions on the training and validation labels
evaluate_baseline('Baseline (mode)', train_y, baseline_pred_y_train, val_y, baseline_pred_y_val)

In [None]:
# different way to do the same thing with sklearn
# ref: https://scikit-learn.org/stable/modules/generated/sklearn.dummy.DummyClassifier.html
from sklearn.dummy import DummyClassifier 

# "most_frequent" makes the dummy classifier always predict the most common training label;
# the features are passed only to satisfy the estimator interface
mode_clf = DummyClassifier(strategy="most_frequent").fit(train_x, train_y)

baseline_pred_y_train, baseline_pred_y_val = (
    mode_clf.predict(split_x) for split_x in (train_x, val_x)
)

evaluate_baseline('Baseline (mode - sklearn dummy classifier)', train_y, baseline_pred_y_train, val_y, baseline_pred_y_val)