# Imbalanced classes experiment
## January 9th, 2017
### Dr Jose M Albornoz

An experiment to determine a good class imbalance compensation strategy. Three options are explored in this document using the wine quality dataset found in the UCI website:

1. Training and test sets are obtained from raw imbalanced data
2. Training and test sets are obtained from resampled data
3. Training set is obtained from raw data which is then resampled; test set comes from raw data

# Import necessary modules

In [1]:
import pandas as pd
import re
import matplotlib.pyplot as plt
from matplotlib import style
style.use('fivethirtyeight')
%matplotlib inline

import numpy as np
import seaborn as sbs
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, recall_score, precision_score
from sklearn.cross_validation import cross_val_score
from sklearn.utils import resample
from sklearn.learning_curve import validation_curve
from sklearn.model_selection import train_test_split

from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from mlxtend.classifier import StackingClassifier

import seaborn as sns

RANDOM_STATE = 19



# 1 Load data

In [2]:
wine_df_white = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv', 
                      sep=';')

In [3]:
wine_df_red = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv', 
                      sep=';')

In [4]:
wine_df = pd.concat([wine_df_white, wine_df_red])
wine_df = wine_df_white

In [5]:
# shuffle data set
wine_df = wine_df.sample(frac=1, random_state=RANDOM_STATE)
wine_df = wine_df.sample(frac=1, random_state=RANDOM_STATE)
wine_df = wine_df.sample(frac=1, random_state=RANDOM_STATE)

In [6]:
wine_df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
583,7.4,0.24,0.22,10.7,0.042,26.0,81.0,0.9954,2.86,0.36,9.7,6
3759,7.8,0.19,0.32,7.4,0.015,47.0,124.0,0.99278,2.99,0.39,11.0,6
4204,6.9,0.26,0.38,10.5,0.044,33.0,139.0,0.99517,3.06,0.5,10.3,6
4214,7.0,0.44,0.24,12.1,0.056,68.0,210.0,0.99718,3.05,0.5,9.5,5
4856,7.1,0.23,0.39,13.7,0.058,26.0,172.0,0.99755,2.9,0.46,9.0,6


In [7]:
wine_df.shape

(4898, 12)

In [8]:
wine_df['quality'].unique()

array([6, 5, 7, 4, 8, 3, 9])

# 2 Create unbalanced target variable

In [9]:
def f(row):
    if row['quality'] < 7:
        val = 0
    else:
        val = 1
    return val

In [10]:
wine_df['score'] = wine_df.apply(f, axis=1)

In [11]:
np.bincount(wine_df['score'])

array([3838, 1060])

In [12]:
np.bincount(wine_df['score'])*100/len(wine_df)

array([78, 21])

# 3 Training and tests sets: raw dataset

In [13]:
df_train_raw, df_test_raw = train_test_split(wine_df, test_size=0.2)

In [14]:
df_train_raw.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,score
479,6.3,0.22,0.33,1.7,0.041,67.0,164.0,0.9928,3.32,0.56,10.4,6,0
4256,6.0,0.33,0.26,5.1,0.051,16.0,119.0,0.99416,3.15,0.41,9.2,5,0
3130,6.5,0.33,0.38,2.5,0.047,30.0,148.0,0.98964,3.17,0.43,12.7,6,0
3496,6.0,0.24,0.28,3.95,0.038,61.0,134.0,0.99146,3.3,0.54,11.3,7,1
3189,6.6,0.19,0.33,1.8,0.035,42.0,148.0,0.99196,3.15,0.36,10.2,5,0


In [15]:
df_train_raw.shape

(3918, 13)

In [16]:
df_test_raw.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,score
3211,8.2,0.3,0.44,12.4,0.043,52.0,154.0,0.99452,3.04,0.33,12.0,6,0
802,6.8,0.39,0.35,11.6,0.044,57.0,220.0,0.99775,3.07,0.53,9.3,5,0
3243,6.6,0.34,0.24,3.3,0.034,29.0,99.0,0.99031,3.1,0.4,12.3,7,1
134,6.8,0.27,0.22,8.1,0.034,55.0,203.0,0.9961,3.19,0.52,8.9,5,0
1257,6.4,0.17,0.27,6.7,0.036,88.0,223.0,0.9948,3.28,0.35,10.2,6,0


In [17]:
df_test_raw.shape

(980, 13)

## 3.1 Training set

In [18]:
y_train1 = df_train_raw['score'].values

In [19]:
X_train1 = df_train_raw.values

In [57]:
X_train1.shape

(3918, 13)

## 3.2 Test set

In [20]:
y_test1 = df_test_raw['score'].values

In [21]:
df_test1 = df_test_raw.drop(['score'], axis = 1)

In [23]:
X_test1 = df_test_raw.values

In [58]:
X_test1.shape

(980, 13)

# 4 Training and test sets: compensate class imbalance in the whole dataset

## 4.1 Upsampling the minority class on the whole dataset

In [24]:
class_counts = np.bincount(wine_df['score'])

In [25]:
class_counts

array([3838, 1060])

In [26]:
np.bincount(wine_df['score'])*100/len(wine_df['score'])

array([78, 21])

In [27]:
df_majority = wine_df[wine_df['score'] == 0]

In [28]:
df_minority = wine_df[wine_df['score'] == 1]

In [29]:
df_minority_upsampled = resample(df_minority, replace=True, n_samples=class_counts[0], random_state=801)

In [30]:
wine_df_resampled = pd.concat([df_majority, df_minority_upsampled])

In [31]:
np.bincount(wine_df_resampled['score'])

array([3838, 3838])

In [32]:
# shuffle resampled training set
wine_df_resampled = wine_df_resampled.sample(frac=1, random_state=RANDOM_STATE)
wine_df_resampled = wine_df_resampled.sample(frac=1, random_state=RANDOM_STATE)
wine_df_resampled = wine_df_resampled.sample(frac=1, random_state=RANDOM_STATE)

## 4.2 Build training and test sets from the whole resampled dataset

In [33]:
df_train2, df_test2 = train_test_split(wine_df_resampled, test_size=0.2)

### 4.2.1 Train set

In [34]:
y_train2 = df_train2['score'].values

In [35]:
X_train2 = df_train2.values

In [59]:
X_train2.shape

(6140, 13)

### 4.2.2 Test set

In [36]:
y_test2 = df_test2['score'].values

In [37]:
df_test2 = df_test2.drop(['score'], axis = 1)

In [38]:
X_test2 = df_test2.values

In [60]:
X_test2.shape

(1536, 12)

# 5 Training and test sets: compensate class imbalance in the training set only

## 5.1 Upsampling training set only

In [39]:
class_counts = np.bincount(df_train_raw['score'])

In [40]:
class_counts

array([3064,  854])

In [41]:
np.bincount(df_train_raw['score'])*100/len(df_train_raw['score'])

array([78, 21])

In [42]:
df_majority = df_train_raw[df_train_raw['score'] == 0]

In [43]:
df_minority = df_train_raw[df_train_raw['score'] == 1]

In [44]:
df_minority_upsampled = resample(df_minority, replace=True, n_samples=class_counts[0], random_state=801)

In [45]:
df_train3 = pd.concat([df_majority, df_minority_upsampled])

In [46]:
np.bincount(df_train3['score'])

array([3064, 3064])

In [47]:
# shuffle resampled training set
df_train3 = df_train3.sample(frac=1, random_state=RANDOM_STATE)
df_train3 = df_train3.sample(frac=1, random_state=RANDOM_STATE)
df_train3 = df_train3.sample(frac=1, random_state=RANDOM_STATE)

## 5.2. Train set

In [48]:
y_train3 = df_train3['score'].values

In [49]:
X_train3 = df_train3.values

## 5.3 Test set

In [50]:
y_test3 = y_test1

In [51]:
X_test3 = X_test1

# 6 Data normalisation

In [52]:
scl = StandardScaler()    

## 5.1 Normalisation - raw dataset

In [53]:
scl.fit(X_train1)
X_train1_norm = scl.transform(X_train1)

In [54]:
X_test1_norm = scl.transform(X_test1)

## 5.2 Normalisation - full resampled dataset

In [55]:
scl.fit(X_train2)
X_train2_norm = scl.transform(X_train2)

In [56]:
X_test2_norm = scl.transform(X_test2)

ValueError: operands could not be broadcast together with shapes (1536,12) (13,) (1536,12) 

## 5.3 Normalisation: only training set resampled 

In [None]:
scl.fit(X_train3)
X_train3_norm = scl.transform(X_train3)

In [None]:
X_test3_norm = scl.transform(X_test3)

# 6 Model Definition

## 6.1 Estimator definition

In [None]:
clf1 = RandomForestClassifier(random_state=RANDOM_STATE, n_jobs=-1) 

In [None]:
clf2 = LogisticRegression(tol=0.1, random_state=RANDOM_STATE, n_jobs=-1)

## 6.2 Search space definition

In [None]:
param_grid1 = {'n_estimators': [50, 100, 150, 200, 250, 300, 350, 400, 450, 500], 
               'max_features': ['auto', 'sqrt', 'log2']}

In [None]:
param_grid2 = grid_values = {'penalty': ['l1','l2'], 
                             'C': [0.001, 0.01, 0.1, 1, 10, 20, 30, 40, 50, 60, 70, 80]}

## 6.3 Grid search definition

In [None]:
gs1 = GridSearchCV(estimator=clf1, param_grid=param_grid1, scoring='roc_auc', cv=5, n_jobs=-1)

In [None]:
gs2 = GridSearchCV(estimator=clf2, param_grid=param_grid2, scoring='roc_auc', cv=5, n_jobs=-1)

# 7 Model selection, raw dataset

## 7.1 Grid search

In [None]:
gs1 = gs1.fit(X_train1_norm, y_train1)

In [None]:
gs2 = gs2.fit(X_train1_norm, y_train1)

In [None]:
gs1.best_score_

In [None]:
gs2.best_score_

In [None]:
gs1.best_params_

In [None]:
gs2.best_params_

## 7.2 Best estimators

In [None]:
clf1_b = gs1.best_estimator_

In [None]:
clf2_b = gs2.best_estimator_

## 7.3 Performance on test set

In [None]:
y_pred1 = clf1_b.predict(X_test1_norm)

In [None]:
y_pred2 = clf2_b.predict(X_test1_norm)

In [None]:
accuracy_score(y_test1, y_pred1)

In [None]:
accuracy_score(y_test1, y_pred2)

## 7.4 Confusion Matrix

In [None]:
np.bincount(y_test1)

In [None]:
confusion1 = confusion_matrix(y_test1, y_pred1)
confusion1

In [None]:
fig, ax = plt.subplots(figsize=(8,6))
ax = sns.heatmap(confusion1, annot=True, fmt="d", linewidths=.5)
sns.plt.xlabel('Predicted', fontsize=20)
sns.plt.ylabel('Actual', fontsize=20)
sns.plt.title('Random Forest', fontsize = 25)

In [None]:
confusion2 = confusion_matrix(y_test1, y_pred2)
confusion2

In [None]:
fig, ax = plt.subplots(figsize=(8,6))
ax = sns.heatmap(confusion2, annot=True, fmt="d", linewidths=.5)
sns.plt.xlabel('Predicted', fontsize=20)
sns.plt.ylabel('Actual', fontsize=20)
sns.plt.title('Logistic regression', fontsize = 25)

## 7.5 Accuracy

In [None]:
# Overall, how often is the classifier correct?
# random forest
accuracy_score(y_test1, y_pred1)

In [None]:
# Overall, how often is the classifier correct?
# logistic regression
accuracy_score(y_test1, y_pred2)

## 7.6 Classification error

In [None]:
# Overall, how often is the classifier incorrect?
# random forest
1 - accuracy_score(y_test1, y_pred1)

In [None]:
# Overall, how often is the classifier incorrect?
# logistic regression
1 - accuracy_score(y_test1, y_pred2)

## 7.7 Recall

In [None]:
# When the actual value is positive, how often is the prediction correct?
# random forest
recall_score(y_test1, y_pred1)

In [None]:
# When the actual value is positive, how often is the prediction correct?
# logistic regression
recall_score(y_test1, y_pred2)

## 7.8 Precision

In [None]:
# When a positive value is predicted, how often is the prediction correct?
# random forest
precision_score(y_test1, y_pred1)

In [None]:
# When a positive value is predicted, how often is the prediction correct?
# logistic regression
precision_score(y_test1, y_pred2)

# 8 Model selection, whole resampled dataset

## 8.1 Grid search

In [None]:
gs1 = gs1.fit(X_train2_norm, y_train2)

In [None]:
gs2 = gs2.fit(X_train2_norm, y_train2)

In [None]:
gs1.best_score_

In [None]:
gs2.best_score_

In [None]:
gs1.best_params_

In [None]:
gs2.best_params_

## 8.2 Best estimators

In [None]:
clf1_b = gs1.best_estimator_

In [None]:
clf2_b = gs2.best_estimator_

## 8.3 Performance on test set

In [None]:
y_pred1 = clf1_b.predict(X_test2_norm)

In [None]:
y_pred2 = clf2_b.predict(X_test2_norm)

In [None]:
accuracy_score(y_test2, y_pred1)

In [None]:
accuracy_score(y_test2, y_pred2)

## 8.4 Confusion Matrix

In [None]:
np.bincount(y_test2)

In [None]:
confusion1 = confusion_matrix(y_test2, y_pred1)
confusion1

In [None]:
fig, ax = plt.subplots(figsize=(8,6))
ax = sns.heatmap(confusion1, annot=True, fmt="d", linewidths=.5)
sns.plt.xlabel('Predicted', fontsize=20)
sns.plt.ylabel('Actual', fontsize=20)
sns.plt.title('Random Forest', fontsize = 25)

In [None]:
confusion2 = confusion_matrix(y_test2, y_pred2)
confusion2

In [None]:
fig, ax = plt.subplots(figsize=(8,6))
ax = sns.heatmap(confusion2, annot=True, fmt="d", linewidths=.5)
sns.plt.xlabel('Predicted', fontsize=20)
sns.plt.ylabel('Actual', fontsize=20)
sns.plt.title('Logistic regression', fontsize = 25)

## 8.5 Accuracy

In [None]:
# Overall, how often is the classifier correct?
# random forest
accuracy_score(y_test2, y_pred1)

In [None]:
# Overall, how often is the classifier correct?
# logistic regression
accuracy_score(y_test2, y_pred2)

## 8.6 Classification error

In [None]:
# Overall, how often is the classifier incorrect?
# random forest
1 - accuracy_score(y_test2, y_pred1)

In [None]:
# Overall, how often is the classifier incorrect?
# logistic regression
1 - accuracy_score(y_test2, y_pred2)

## 8.7 Recall

In [None]:
# When the actual value is positive, how often is the prediction correct?
# random forest
recall_score(y_test2, y_pred1)

In [None]:
# When the actual value is positive, how often is the prediction correct?
# logistic regression
recall_score(y_test2, y_pred2)

## 8.8 Precision

In [None]:
# When a positive value is predicted, how often is the prediction correct?
# random forest
precision_score(y_test2, y_pred1)

In [None]:
# When a positive value is predicted, how often is the prediction correct?
# logistic regression
precision_score(y_test2, y_pred2)

# 9 Model selection, compensate class imbalance in the training set only

## 9.1 Grid search

In [None]:
gs1 = gs1.fit(X_train3_norm, y_train3)

In [None]:
gs2 = gs2.fit(X_train3_norm, y_train3)

In [None]:
gs1.best_score_

In [None]:
gs2.best_score_

In [None]:
gs1.best_params_

In [None]:
gs2.best_params_

## 9.2 Best estimators

In [None]:
clf1_b = gs1.best_estimator_

In [None]:
clf2_b = gs2.best_estimator_

## 9.3 Performance on test set

In [None]:
y_pred1 = clf1_b.predict(X_test3_norm)

In [None]:
y_pred2 = clf2_b.predict(X_test3_norm)

In [None]:
accuracy_score(y_test3, y_pred1)

In [None]:
accuracy_score(y_test3, y_pred2)

## 9.4 Confusion Matrix

In [None]:
confusion1 = confusion_matrix(y_test3, y_pred1)
confusion1

In [None]:
fig, ax = plt.subplots(figsize=(8,6))
ax = sns.heatmap(confusion1, annot=True, fmt="d", linewidths=.5)
sns.plt.xlabel('Predicted', fontsize=20)
sns.plt.ylabel('Actual', fontsize=20)
sns.plt.title('Random Forest', fontsize = 25)

In [None]:
confusion2 = confusion_matrix(y_test3, y_pred2)
confusion2

In [None]:
fig, ax = plt.subplots(figsize=(8,6))
ax = sns.heatmap(confusion2, annot=True, fmt="d", linewidths=.5)
sns.plt.xlabel('Predicted', fontsize=20)
sns.plt.ylabel('Actual', fontsize=20)
sns.plt.title('Logistic regression', fontsize = 25)

## 9.5 Accuracy

In [None]:
# Overall, how often is the classifier correct?
# random forest
accuracy_score(y_test3, y_pred1)

In [None]:
# Overall, how often is the classifier correct?
# logistic regression
accuracy_score(y_test3, y_pred2)

## 9.6 Classification error

In [None]:
# Overall, how often is the classifier incorrect?
# random forest
1 - accuracy_score(y_test3, y_pred1)

In [None]:
# Overall, how often is the classifier incorrect?
# logistic regression
1 - accuracy_score(y_test3, y_pred2)

## 9.7 Recall

In [None]:
# When the actual value is positive, how often is the prediction correct?
# random forest
recall_score(y_test3, y_pred1)

In [None]:
# When the actual value is positive, how often is the prediction correct?
# logistic regression
recall_score(y_test3, y_pred2)

## 9.8 Precision

In [None]:
# When a positive value is predicted, how often is the prediction correct?
# random forest
precision_score(y_test3, y_pred1)

In [None]:
# When a positive value is predicted, how often is the prediction correct?
# logistic regression
precision_score(y_test3, y_pred2)