In [None]:
"""
Testing out 3 methods to handle imbalanced classes (with Logistic regression model)

Metrics: Precision/Recall

Oversampling and SMOTE have similar results. Recall reaches 0.70-0.71 using all variables. (With 0.50 precision).
ADASYN did slightly better! Recall is 0.74 using all variables. (With 0.49 precision).

Output to pickle 'X_adasyn','y_adasyn'

"""

In [1]:
import pandas as pd
# Summary of results (Precision/recall)
result_list = [{'method':'Before','con':'0.64/0.33','all':'0.58/0.25'},\
              {'method':'Oversampling','con':'0.45/0.57','all':'0.50/0.71'},\
              {'method':'SMOTE','con':'0.45/0.57','all':'0.50/0.70'},\
              {'method':'ADASYN','con':'0.44/0.58','all':'0.49/0.74'}]
pd.DataFrame(result_list)

Unnamed: 0,method,con,all
0,Before,0.64/0.33,0.58/0.25
1,Oversampling,0.45/0.57,0.50/0.71
2,SMOTE,0.45/0.57,0.50/0.70
3,ADASYN,0.44/0.58,0.49/0.74


In [2]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.datasets import make_classification
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler

from collections import Counter

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import ADASYN

In [3]:
df_train = pd.read_pickle('../dump/df_train.csv')
df_test = pd.read_pickle('../dump/df_test.csv')

In [4]:
# Continuous variables only
features = ['city_development_index', 'training_hours', 'experience', 'last_new_job']

# Since we're using more than one feature, let's scale our features
scaler = StandardScaler()
X_train_con = scaler.fit_transform(df_train[features])
y_train = df_train['target']
X_test_con = scaler.transform(df_test[features])
y_test = df_test['target']

In [5]:
# All variables (continuous + categorical)
df_train = pd.read_pickle('../dump/df_train.csv')
df_test = pd.read_pickle('../dump/df_test.csv')


X_train = pd.read_pickle('../dump/X_train_processed')
y_train = df_train['target']
X_test = pd.read_pickle('../dump/X_test_processed')
y_test = df_test['target']

### Before treating imbalance classes

In [6]:
# before oversampling (continuous variables only)
lm = LogisticRegression()
lm.fit(X_train_con, y_train)
preds = lm.predict(X_test_con)


pd.DataFrame(confusion_matrix(y_test, preds),\
             columns=['Predict-not changing (0)','Predict-change (1)'],\
            index=['Not changing (0)','Changing (1)'])

Unnamed: 0,Predict-not changing (0),Predict-change (1)
Not changing (0),1445,108
Changing (1),386,190


In [7]:
print('Classification report (Before,con):\n\n',classification_report(y_test,preds))

Classification report (Before,con):

               precision    recall  f1-score   support

           0       0.79      0.93      0.85      1553
           1       0.64      0.33      0.43       576

    accuracy                           0.77      2129
   macro avg       0.71      0.63      0.64      2129
weighted avg       0.75      0.77      0.74      2129



In [8]:
# before oversampling (all variables)
lm = LogisticRegression()
lm.fit(X_train, y_train)
preds = lm.predict(X_test)


pd.DataFrame(confusion_matrix(y_test, preds),\
             columns=['Predict-not changing (0)','Predict-change (1)'],\
            index=['Not changing (0)','Changing (1)'])

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Unnamed: 0,Predict-not changing (0),Predict-change (1)
Not changing (0),1447,106
Changing (1),430,146


In [9]:
print('Classification report (Before,all):\n\n',classification_report(y_test,preds))

Classification report (Before,all):

               precision    recall  f1-score   support

           0       0.77      0.93      0.84      1553
           1       0.58      0.25      0.35       576

    accuracy                           0.75      2129
   macro avg       0.68      0.59      0.60      2129
weighted avg       0.72      0.75      0.71      2129



### 1. Oversampling

In [10]:
# Add some random oversampling of the minority classes
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=0)
X_resampled_con, y_resampled_con = ros.fit_sample(X_train_con,y_train)

In [11]:
# Add some random oversampling of the minority classes
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=0)
X_resampled, y_resampled = ros.fit_sample(X_train,y_train)

In [12]:
Counter(y_resampled)

Counter({1: 14381, 0: 14381})

In [13]:
# after oversampling (continuous variables only)
lm = LogisticRegression()
lm.fit(X_resampled_con, y_resampled_con)
preds = lm.predict(X_test_con)


pd.DataFrame(confusion_matrix(y_test, preds),\
             columns=['Predict-not changing (0)','Predict-change (1)'],\
            index=['Not changing (0)','Changing (1)'])

Unnamed: 0,Predict-not changing (0),Predict-change (1)
Not changing (0),1159,394
Changing (1),248,328


In [14]:
print('Classification report (Oversampling,con):\n\n',classification_report(y_test,preds))

Classification report (Oversampling,con):

               precision    recall  f1-score   support

           0       0.82      0.75      0.78      1553
           1       0.45      0.57      0.51       576

    accuracy                           0.70      2129
   macro avg       0.64      0.66      0.64      2129
weighted avg       0.72      0.70      0.71      2129



In [15]:
# after oversampling (all variables)
lm = LogisticRegression()
lm.fit(X_resampled, y_resampled)
preds = lm.predict(X_test)


pd.DataFrame(confusion_matrix(y_test, preds),\
             columns=['Predict-not changing (0)','Predict-change (1)'],\
            index=['Not changing (0)','Changing (1)'])

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Unnamed: 0,Predict-not changing (0),Predict-change (1)
Not changing (0),1146,407
Changing (1),167,409


In [16]:
print('Classification report (Oversampling,all):\n\n',classification_report(y_test,preds))

Classification report (Oversampling,all):

               precision    recall  f1-score   support

           0       0.87      0.74      0.80      1553
           1       0.50      0.71      0.59       576

    accuracy                           0.73      2129
   macro avg       0.69      0.72      0.69      2129
weighted avg       0.77      0.73      0.74      2129



In [17]:
# much better (for recall)!

### 2. SMOTE (Synthetic Minority Oversampling TEchnique)
Note that SMOTE does not differentiate between points near the decision boundary and points far away from it.

In [18]:
from imblearn.over_sampling import SMOTE
X_smoted_con, y_smoted_con = SMOTE(random_state=42).fit_sample(X_train_con,y_train)

In [19]:
from imblearn.over_sampling import SMOTE
X_smoted, y_smoted = SMOTE(random_state=42).fit_sample(X_train,y_train)

In [20]:
Counter(y_smoted)

Counter({1: 14381, 0: 14381})

In [21]:
# after SMOTE
lm = LogisticRegression()
lm.fit(X_smoted_con, y_smoted_con)
preds = lm.predict(X_test_con)


pd.DataFrame(confusion_matrix(y_test, preds),\
             columns=['Predict-not changing (0)','Predict-change (1)'],\
            index=['Not changing (0)','Changing (1)'])

Unnamed: 0,Predict-not changing (0),Predict-change (1)
Not changing (0),1160,393
Changing (1),248,328


In [22]:
print('Classification report (SMOTE,con):\n\n',classification_report(y_test,preds))

Classification report (SMOTE,con):

               precision    recall  f1-score   support

           0       0.82      0.75      0.78      1553
           1       0.45      0.57      0.51       576

    accuracy                           0.70      2129
   macro avg       0.64      0.66      0.64      2129
weighted avg       0.72      0.70      0.71      2129



In [23]:
# after SMOTE (all variables)
lm = LogisticRegression()
lm.fit(X_smoted, y_smoted)
preds = lm.predict(X_test)


pd.DataFrame(confusion_matrix(y_test, preds),\
             columns=['Predict-not changing (0)','Predict-change (1)'],\
            index=['Not changing (0)','Changing (1)'])

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Unnamed: 0,Predict-not changing (0),Predict-change (1)
Not changing (0),1145,408
Changing (1),170,406


In [24]:
print('Classification report (SMOTE,all):\n\n',classification_report(y_test,preds))

Classification report (SMOTE,all):

               precision    recall  f1-score   support

           0       0.87      0.74      0.80      1553
           1       0.50      0.70      0.58       576

    accuracy                           0.73      2129
   macro avg       0.68      0.72      0.69      2129
weighted avg       0.77      0.73      0.74      2129



In [25]:
# similar to just oversampling

### 3. ADASYN (ADAptive SYNthetic oversampling)
Generate new points in the regions where the class imbalance is the greatest (where the classifier is most likely to predict the majority class).

In [26]:
from imblearn.over_sampling import ADASYN
X_adasyn_con, y_adasyn_con = ADASYN(random_state=42).fit_sample(X_train_con,y_train)

In [27]:
from imblearn.over_sampling import ADASYN
X_adasyn, y_adasyn = ADASYN(random_state=42).fit_sample(X_train,y_train)

In [28]:
Counter(y_adasyn)

Counter({1: 14455, 0: 14381})

In [29]:
# after ADASYN
lm = LogisticRegression()
lm.fit(X_adasyn_con, y_adasyn_con)
preds = lm.predict(X_test_con)


pd.DataFrame(confusion_matrix(y_test, preds),\
             columns=['Predict-not changing (0)','Predict-change (1)'],\
            index=['Not changing (0)','Changing (1)'])

Unnamed: 0,Predict-not changing (0),Predict-change (1)
Not changing (0),1126,427
Changing (1),241,335


In [30]:
print('Classification report (ADASYN,con):\n\n',classification_report(y_test,preds))

Classification report (ADASYN,con):

               precision    recall  f1-score   support

           0       0.82      0.73      0.77      1553
           1       0.44      0.58      0.50       576

    accuracy                           0.69      2129
   macro avg       0.63      0.65      0.64      2129
weighted avg       0.72      0.69      0.70      2129



In [31]:
# after ADASYN (all variables)
lm = LogisticRegression()
lm.fit(X_adasyn, y_adasyn)
preds = lm.predict(X_test)


pd.DataFrame(confusion_matrix(y_test, preds),\
             columns=['Predict-not changing (0)','Predict-change (1)'],\
            index=['Not changing (0)','Changing (1)'])

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Unnamed: 0,Predict-not changing (0),Predict-change (1)
Not changing (0),1100,453
Changing (1),149,427


In [32]:
print('Classification report (ADASYN,all):\n\n',classification_report(y_test,preds))

Classification report (ADASYN,all):

               precision    recall  f1-score   support

           0       0.88      0.71      0.79      1553
           1       0.49      0.74      0.59       576

    accuracy                           0.72      2129
   macro avg       0.68      0.72      0.69      2129
weighted avg       0.77      0.72      0.73      2129



In [33]:
# Recall of (1) improved but also caught a lot more false positive (0)

In [34]:
# Using all variables (continuous + categorical) improved recall and did not increase as many false positives!

### Pickle

In [36]:
X_adasyn.to_pickle('../dump/X_adasyn')
y_adasyn.to_pickle('../dump/y_adasyn')