In [None]:
"""
Testing out 3 methods to handle imbalanced classes
Oversampling and SMOTE have similar results.
ADASYN did slightly better for recall but also increased false-positives.

"""

In [1]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.datasets import make_classification
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler

from collections import Counter

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import ADASYN

In [2]:
df_train = pd.read_pickle('../dump/df_train.csv')
df_test = pd.read_pickle('../dump/df_test.csv')

In [3]:
# Continuous variables only
features = ['city_development_index', 'training_hours', 'experience', 'last_new_job']

# Since we're using more than one feature, let's scale our features
scaler = StandardScaler()
X_train_con = scaler.fit_transform(df_train[features])
y_train = df_train['target']
X_test_con = scaler.transform(df_test[features])
y_test = df_test['target']

In [4]:
# All variables (continuous + categorical)
df_train = pd.read_pickle('../dump/df_train.csv')
df_test = pd.read_pickle('../dump/df_test.csv')


X_train = pd.read_pickle('../dump/X_train_processed')
y_train = df_train['target']
X_test = pd.read_pickle('../dump/X_test_processed')
y_test = df_test['target']

### 1. Oversampling

In [5]:
# Add some random oversampling of the minority classes
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=0)
X_resampled_con, y_resampled_con = ros.fit_sample(X_train_con,y_train)

In [6]:
# Add some random oversampling of the minority classes
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=0)
X_resampled, y_resampled = ros.fit_sample(X_train,y_train)

In [7]:
Counter(y_resampled)

Counter({1: 14381, 0: 14381})

In [8]:
# before oversampling (continuous variables only)
lm = LogisticRegression()
lm.fit(X_train_con, y_train)
preds = lm.predict(X_test_con)


pd.DataFrame(confusion_matrix(y_test, preds),\
             columns=['Predict-not changing (0)','Predict-change (1)'],\
            index=['Not changing (0)','Changing (1)'])

Unnamed: 0,Predict-not changing (0),Predict-change (1)
Not changing (0),1445,108
Changing (1),386,190


In [9]:
# before oversampling (all variables)
lm = LogisticRegression()
lm.fit(X_train, y_train)
preds = lm.predict(X_test)


pd.DataFrame(confusion_matrix(y_test, preds),\
             columns=['Predict-not changing (0)','Predict-change (1)'],\
            index=['Not changing (0)','Changing (1)'])

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Unnamed: 0,Predict-not changing (0),Predict-change (1)
Not changing (0),1466,87
Changing (1),450,126


In [10]:
# after oversampling (continuous variables only)
lm = LogisticRegression()
lm.fit(X_resampled_con, y_resampled_con)
preds = lm.predict(X_test_con)


pd.DataFrame(confusion_matrix(y_test, preds),\
             columns=['Predict-not changing (0)','Predict-change (1)'],\
            index=['Not changing (0)','Changing (1)'])

Unnamed: 0,Predict-not changing (0),Predict-change (1)
Not changing (0),1159,394
Changing (1),248,328


In [11]:
# after oversampling (all variables)
lm = LogisticRegression()
lm.fit(X_resampled, y_resampled)
preds = lm.predict(X_test)


pd.DataFrame(confusion_matrix(y_test, preds),\
             columns=['Predict-not changing (0)','Predict-change (1)'],\
            index=['Not changing (0)','Changing (1)'])

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Unnamed: 0,Predict-not changing (0),Predict-change (1)
Not changing (0),1129,424
Changing (1),164,412


In [12]:
# much better (for recall)!

### 2. SMOTE (Synthetic Minority Oversampling TEchnique)
Note that SMOTE does not differentiate between points near the decision boundary and points far away from it.

In [13]:
from imblearn.over_sampling import SMOTE
X_smoted_con, y_smoted_con = SMOTE(random_state=42).fit_sample(X_train_con,y_train)

In [14]:
from imblearn.over_sampling import SMOTE
X_smoted, y_smoted = SMOTE(random_state=42).fit_sample(X_train,y_train)

In [15]:
Counter(y_smoted)

Counter({1: 14381, 0: 14381})

In [16]:
# after SMOTE
lm = LogisticRegression()
lm.fit(X_smoted_con, y_smoted_con)
preds = lm.predict(X_test_con)


pd.DataFrame(confusion_matrix(y_test, preds),\
             columns=['Predict-not changing (0)','Predict-change (1)'],\
            index=['Not changing (0)','Changing (1)'])

Unnamed: 0,Predict-not changing (0),Predict-change (1)
Not changing (0),1160,393
Changing (1),248,328


In [17]:
# after SMOTE (all variables)
lm = LogisticRegression()
lm.fit(X_smoted, y_smoted)
preds = lm.predict(X_test)


pd.DataFrame(confusion_matrix(y_test, preds),\
             columns=['Predict-not changing (0)','Predict-change (1)'],\
            index=['Not changing (0)','Changing (1)'])

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Unnamed: 0,Predict-not changing (0),Predict-change (1)
Not changing (0),1129,424
Changing (1),162,414


In [18]:
# similar to just oversampling

### 3. ADASYN (ADAptive SYNthetic oversampling)
Generate new points in the regions where the class imbalance is the greatest (where the classifier is most likely to predict the majority class).

In [19]:
from imblearn.over_sampling import ADASYN
X_adasyn_con, y_adasyn_con = ADASYN(random_state=42).fit_sample(X_train_con,y_train)

In [20]:
from imblearn.over_sampling import ADASYN
X_adasyn, y_adasyn = ADASYN(random_state=42).fit_sample(X_train,y_train)

In [21]:
Counter(y_adasyn)

Counter({1: 13802, 0: 14381})

In [22]:
# after ADASYN
lm = LogisticRegression()
lm.fit(X_adasyn_con, y_adasyn_con)
preds = lm.predict(X_test_con)


pd.DataFrame(confusion_matrix(y_test, preds),\
             columns=['Predict-not changing (0)','Predict-change (1)'],\
            index=['Not changing (0)','Changing (1)'])

Unnamed: 0,Predict-not changing (0),Predict-change (1)
Not changing (0),1126,427
Changing (1),241,335


In [23]:
# after ADASYN (all variables)
lm = LogisticRegression()
lm.fit(X_adasyn, y_adasyn)
preds = lm.predict(X_test)


pd.DataFrame(confusion_matrix(y_test, preds),\
             columns=['Predict-not changing (0)','Predict-change (1)'],\
            index=['Not changing (0)','Changing (1)'])

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Unnamed: 0,Predict-not changing (0),Predict-change (1)
Not changing (0),1148,405
Changing (1),182,394


In [24]:
# Recall of (1) improved but also caught a lot more false positive (0)

In [25]:
# Using all variables (continuous + categorical) improved recall and did not increase as many false positives!