In [None]:
"""
Testing out 3 methods to handle imbalanced classes

Metrics: Precision/Recall

Oversampling and SMOTE have similar results. Recall reaches 0.72 using all variables. (With 0.49 precision).
ADASYN did slightly worse. Recall is 0.68 using all variables. (With 0.49 precision).


"""

In [1]:

# Summary of results (Precision/recall)
result_list = [{'method':'Before','con':'0.64/0.33','all':'0.59/0.22'},\
              {'method':'Oversampling','con':'0.45/0.57','all':'0.49/0.72'},\
              {'method':'SMOTE','con':'0.45/0.57','all':'0.49/0.72'},\
              {'method':'ADASYN','con':'0.44/0.58','all':'0.49/0.68'}]
pd.DataFrame(result_list)

NameError: name 'pd' is not defined

In [None]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.datasets import make_classification
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler

from collections import Counter

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import ADASYN

In [None]:
df_train = pd.read_pickle('../dump/df_train.csv')
df_test = pd.read_pickle('../dump/df_test.csv')

In [None]:
# Continuous variables only
features = ['city_development_index', 'training_hours', 'experience', 'last_new_job']

# Since we're using more than one feature, let's scale our features
scaler = StandardScaler()
X_train_con = scaler.fit_transform(df_train[features])
y_train = df_train['target']
X_test_con = scaler.transform(df_test[features])
y_test = df_test['target']

In [None]:
# All variables (continuous + categorical)
df_train = pd.read_pickle('../dump/df_train.csv')
df_test = pd.read_pickle('../dump/df_test.csv')


X_train = pd.read_pickle('../dump/X_train_processed')
y_train = df_train['target']
X_test = pd.read_pickle('../dump/X_test_processed')
y_test = df_test['target']

### Before treating imbalance classes

In [None]:
# before oversampling (continuous variables only)
lm = LogisticRegression()
lm.fit(X_train_con, y_train)
preds = lm.predict(X_test_con)


pd.DataFrame(confusion_matrix(y_test, preds),\
             columns=['Predict-not changing (0)','Predict-change (1)'],\
            index=['Not changing (0)','Changing (1)'])

In [None]:
print('Classification report (Before,con):\n\n',classification_report(y_test,preds))

In [None]:
# before oversampling (all variables)
lm = LogisticRegression()
lm.fit(X_train, y_train)
preds = lm.predict(X_test)


pd.DataFrame(confusion_matrix(y_test, preds),\
             columns=['Predict-not changing (0)','Predict-change (1)'],\
            index=['Not changing (0)','Changing (1)'])

In [None]:
print('Classification report (Before,all):\n\n',classification_report(y_test,preds))

### 1. Oversampling

In [None]:
# Add some random oversampling of the minority classes
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=0)
X_resampled_con, y_resampled_con = ros.fit_sample(X_train_con,y_train)

In [None]:
# Add some random oversampling of the minority classes
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=0)
X_resampled, y_resampled = ros.fit_sample(X_train,y_train)

In [None]:
Counter(y_resampled)

In [None]:
# after oversampling (continuous variables only)
lm = LogisticRegression()
lm.fit(X_resampled_con, y_resampled_con)
preds = lm.predict(X_test_con)


pd.DataFrame(confusion_matrix(y_test, preds),\
             columns=['Predict-not changing (0)','Predict-change (1)'],\
            index=['Not changing (0)','Changing (1)'])

In [None]:
print('Classification report (Oversampling,con):\n\n',classification_report(y_test,preds))

In [None]:
# after oversampling (all variables)
lm = LogisticRegression()
lm.fit(X_resampled, y_resampled)
preds = lm.predict(X_test)


pd.DataFrame(confusion_matrix(y_test, preds),\
             columns=['Predict-not changing (0)','Predict-change (1)'],\
            index=['Not changing (0)','Changing (1)'])

In [None]:
print('Classification report (Oversampling,all):\n\n',classification_report(y_test,preds))

In [None]:
# much better (for recall)!

### 2. SMOTE (Synthetic Minority Oversampling TEchnique)
Note that SMOTE does not differentiate between points near the decision boundary and points far away from it.

In [None]:
from imblearn.over_sampling import SMOTE
X_smoted_con, y_smoted_con = SMOTE(random_state=42).fit_sample(X_train_con,y_train)

In [None]:
from imblearn.over_sampling import SMOTE
X_smoted, y_smoted = SMOTE(random_state=42).fit_sample(X_train,y_train)

In [None]:
Counter(y_smoted)

In [None]:
# after SMOTE
lm = LogisticRegression()
lm.fit(X_smoted_con, y_smoted_con)
preds = lm.predict(X_test_con)


pd.DataFrame(confusion_matrix(y_test, preds),\
             columns=['Predict-not changing (0)','Predict-change (1)'],\
            index=['Not changing (0)','Changing (1)'])

In [None]:
print('Classification report (SMOTE,con):\n\n',classification_report(y_test,preds))

In [None]:
# after SMOTE (all variables)
lm = LogisticRegression()
lm.fit(X_smoted, y_smoted)
preds = lm.predict(X_test)


pd.DataFrame(confusion_matrix(y_test, preds),\
             columns=['Predict-not changing (0)','Predict-change (1)'],\
            index=['Not changing (0)','Changing (1)'])

In [None]:
print('Classification report (SMOTE,all):\n\n',classification_report(y_test,preds))

In [None]:
# similar to just oversampling

### 3. ADASYN (ADAptive SYNthetic oversampling)
Generate new points in the regions where the class imbalance is the greatest (where the classifier is most likely to predict the majority class).

In [None]:
from imblearn.over_sampling import ADASYN
X_adasyn_con, y_adasyn_con = ADASYN(random_state=42).fit_sample(X_train_con,y_train)

In [None]:
from imblearn.over_sampling import ADASYN
X_adasyn, y_adasyn = ADASYN(random_state=42).fit_sample(X_train,y_train)

In [None]:
Counter(y_adasyn)

In [None]:
# after ADASYN
lm = LogisticRegression()
lm.fit(X_adasyn_con, y_adasyn_con)
preds = lm.predict(X_test_con)


pd.DataFrame(confusion_matrix(y_test, preds),\
             columns=['Predict-not changing (0)','Predict-change (1)'],\
            index=['Not changing (0)','Changing (1)'])

In [None]:
print('Classification report (ADASYN,con):\n\n',classification_report(y_test,preds))

In [None]:
# after ADASYN (all variables)
lm = LogisticRegression()
lm.fit(X_adasyn, y_adasyn)
preds = lm.predict(X_test)


pd.DataFrame(confusion_matrix(y_test, preds),\
             columns=['Predict-not changing (0)','Predict-change (1)'],\
            index=['Not changing (0)','Changing (1)'])

In [None]:
print('Classification report (ADASYN,all):\n\n',classification_report(y_test,preds))

In [None]:
# Recall of (1) improved but also caught a lot more false positive (0)

In [None]:
# Using all variables (continuous + categorical) improved recall and did not increase as many false positives!