In [None]:
"""
Testing three methods for handling imbalanced classes: random oversampling, SMOTE and ADASYN.
Random oversampling and SMOTE give similar results.
ADASYN gives slightly better recall but also produces more false positives.

"""

In [15]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.datasets import make_classification
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler

from collections import Counter

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import ADASYN

In [10]:
# Load the pre-split train/test frames.
# NOTE(review): both files carry a .csv extension but are read with
# read_pickle -- presumably they are pickled DataFrames saved under a
# misleading name; confirm. Only unpickle files from a trusted source.
df_train, df_test = (
    pd.read_pickle(path)
    for path in ('../dump/df_train.csv', '../dump/df_test.csv')
)

In [17]:
features = ['city_development_index', 'training_hours', 'experience', 'last_new_job']

# Several features are used together, so standardise them.
# The scaler is fit on the training frame only and then applied to both
# frames, so no statistics from the test set are used for scaling.
scaler = StandardScaler().fit(df_train[features])
X_train = scaler.transform(df_train[features])
X_test = scaler.transform(df_test[features])

# Labels are taken from the 'target' column of each frame.
y_train, y_test = df_train['target'], df_test['target']

### 1. Oversampling

In [12]:
# Randomly oversample the minority class of the training data.
# RandomOverSampler is already imported in the imports cell at the top.
# `fit_resample` replaces the deprecated `fit_sample` alias, which newer
# imbalanced-learn releases no longer provide.
# Only the training split is resampled; the test split is left untouched.
ros = RandomOverSampler(random_state=0)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

In [13]:
# Show how many samples each class has after random oversampling
Counter(y_resampled)

Counter({1: 14381, 0: 14381})

In [21]:
# Baseline: logistic regression fit on the original (not resampled) training data
lm = LogisticRegression().fit(X_train, y_train)
preds = lm.predict(X_test)

# Confusion matrix on the test set: rows are labelled as the actual
# classes, columns as the predicted classes.
pd.DataFrame(
    data=confusion_matrix(y_test, preds),
    index=['Not changing (0)', 'Changing (1)'],
    columns=['Predict-not changing (0)', 'Predict-change (1)'],
)

Unnamed: 0,Predict-not changing (0),Predict-change (1)
Not changing (0),1445,108
Changing (1),386,190


In [22]:
# Same model, now fit on the randomly oversampled training data
lm = LogisticRegression().fit(X_resampled, y_resampled)
preds = lm.predict(X_test)

# Confusion matrix on the (unresampled) test set: rows are labelled as
# the actual classes, columns as the predicted classes.
pd.DataFrame(
    data=confusion_matrix(y_test, preds),
    index=['Not changing (0)', 'Changing (1)'],
    columns=['Predict-not changing (0)', 'Predict-change (1)'],
)

Unnamed: 0,Predict-not changing (0),Predict-change (1)
Not changing (0),1159,394
Changing (1),248,328


In [None]:
# Much better recall for class 1 (328 vs. 190 true positives), at the cost of more false positives (394 vs. 108).

### 2. SMOTE (Synthetic Minority Oversampling TEchnique)
Note that SMOTE does not differentiate between points near the decision boundary and points far away from it.

In [24]:
# Oversample the minority class of the training data with SMOTE.
# SMOTE is already imported in the imports cell at the top.
# `fit_resample` replaces the deprecated `fit_sample` alias, which newer
# imbalanced-learn releases no longer provide.
X_smoted, y_smoted = SMOTE(random_state=42).fit_resample(X_train, y_train)

In [25]:
# Show how many samples each class has after SMOTE
Counter(y_smoted)

Counter({1: 14381, 0: 14381})

In [27]:
# Same model, now fit on the SMOTE-resampled training data
lm = LogisticRegression().fit(X_smoted, y_smoted)
preds = lm.predict(X_test)

# Confusion matrix on the (unresampled) test set: rows are labelled as
# the actual classes, columns as the predicted classes.
pd.DataFrame(
    data=confusion_matrix(y_test, preds),
    index=['Not changing (0)', 'Changing (1)'],
    columns=['Predict-not changing (0)', 'Predict-change (1)'],
)

Unnamed: 0,Predict-not changing (0),Predict-change (1)
Not changing (0),1160,393
Changing (1),248,328


In [None]:
# Nearly identical to the result from plain random oversampling.

### 3. ADASYN (ADAptive SYNthetic oversampling)
ADASYN generates new minority-class points in the regions where the class imbalance is the greatest (where the classifier is most likely to predict the majority class).

In [26]:
# Oversample the minority class of the training data with ADASYN.
# ADASYN is already imported in the imports cell at the top.
# `fit_resample` replaces the deprecated `fit_sample` alias, which newer
# imbalanced-learn releases no longer provide.
X_adasyn, y_adasyn = ADASYN(random_state=42).fit_resample(X_train, y_train)

In [28]:
# Show how many samples each class has after ADASYN.
# The recorded output shows the two counts are close but not exactly equal.
Counter(y_adasyn)

Counter({1: 14479, 0: 14381})

In [29]:
# Same model, now fit on the ADASYN-resampled training data
lm = LogisticRegression().fit(X_adasyn, y_adasyn)
preds = lm.predict(X_test)

# Confusion matrix on the (unresampled) test set: rows are labelled as
# the actual classes, columns as the predicted classes.
pd.DataFrame(
    data=confusion_matrix(y_test, preds),
    index=['Not changing (0)', 'Changing (1)'],
    columns=['Predict-not changing (0)', 'Predict-change (1)'],
)

Unnamed: 0,Predict-not changing (0),Predict-change (1)
Not changing (0),1126,427
Changing (1),241,335


In [None]:
# Recall for class 1 improved slightly (335 vs. 328 true positives), but false positives for class 0 also increased (427 vs. 393).