In [1]:
import pandas as pd
import pipeline as p
import assess_clf_models as acm
%load_ext autoreload
%autoreload 2

import matplotlib.pyplot as plt
import seaborn as sns

import numpy as np

  _nan_object_mask = _nan_object_array != _nan_object_array


In [2]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier

# from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV, #GridSearchCV

from sklearn.pipeline import Pipeline

from imblearn.under_sampling import RandomUnderSampler, TomekLinks, EditedNearestNeighbours, NeighbourhoodCleaningRule
from imblearn.under_sampling import NearMiss
from imblearn.under_sampling import CondensedNearestNeighbour, OneSidedSelection, InstanceHardnessThreshold
from imblearn.over_sampling import SMOTE, ADASYN, SVMSMOTE
from imblearn.under_sampling import ClusterCentroids, NearMiss, TomekLinks
from imblearn.combine import SMOTETomek, SMOTEENN
from imblearn.ensemble import BalancedBaggingClassifier, RUSBoostClassifier, EasyEnsembleClassifier
from imblearn.combine import SMOTEENN, SMOTETomek

from collections import defaultdict

import warnings
from sklearn.exceptions import DataConversionWarning
warnings.filterwarnings(action='ignore', category=DataConversionWarning)

In [4]:
seed = 19

#### Load & Prep data

In [5]:
X_train = p.open_pkl('Data/X_train_unprocessed.pkl')
y_train = p.open_pkl('Data/y_train_unprocessed.pkl')

In [8]:
preprocessor = p.open_pkl('Data/preprocess_wo_sub.pkl')

**Focus on Models**:
- Logistic Regression 
- Gradient Boost
- AdaBoost

### Base performance of models:

In [11]:
log_reg = LogisticRegression(solver='liblinear')

In [12]:
baseline_auc = acm.assess_preproc_model_auc(preprocessor, log_reg, X_train, y_train)
baseline_auc

0.73330146243373451

In [13]:
grad_boost = GradientBoostingClassifier()
grad_boost_auc = acm.assess_preproc_model_auc(preprocessor, grad_boost, X_train, y_train)
grad_boost_auc

0.75407568057647445

In [14]:
ada = AdaBoostClassifier(base_estimator=DecisionTreeClassifier())
ada_auc = acm.assess_preproc_model_auc(preprocessor, ada, X_train, y_train)
ada_auc

0.57114547512503366

In [15]:
rand_for = RandomForestClassifier(n_estimators=10)
rf_auc = acm.assess_preproc_model_auc(preprocessor, rand_for, X_train, y_train)

0.66276915828054239

In [25]:
rf_auc = 0.66276915828054239

In [16]:
nb = GaussianNB()
nb_auc = acm.assess_preproc_model_auc(preprocessor, nb, X_train, y_train)

0.7144980627741957

In [26]:
nb_auc = 0.7144980627741957

ComplementNB, MultinomialNB did not perform any better

In [55]:
base_scores = [["lr", baseline_auc], ['gb', grad_boost_auc], ['ada', ada_auc], ['rf',rf_auc],['nb',nb_auc]]

## Addressing Imbalance

### Weighting

In [21]:
def assess_model_auc_no_strat(preprocessor, model, X, y, n=5):
    pipe =  Pipeline(steps=[('preprocessor', preprocessor), ('classifier', model)])
#     cv = StratifiedKFold(n_splits=n, random_state=seed)
    scores = cross_validate(model, X, y, cv=n, scoring='roc_auc', n_jobs=-1,
                            return_train_score=False)
    return np.mean(scores['test_score'])

In [23]:
# Note: sag & saga solvers did not converge
log_reg2 = LogisticRegression(class_weight='balanced', solver='liblinear')
lr2_auc = assess_model_auc_no_strat(preprocessor, log_reg2, X_train, y_train)
lr2_auc

0.73474849145635857

### Over-sampling

In [27]:
resamp_list = []

In [38]:
model_lib = {'lr':LogisticRegression(solver='liblinear'),
                 'gb': GradientBoostingClassifier(),
#                  'nb': GaussianNB()} # nb wasn't playing nice with the  pipeline & re-sampling. 
             'rf': RandomForestClassifier()}

**Smote**

In [41]:
# Synthetic Minority Oversampling TEchnique
res = SMOTE(random_state=seed)
resamp_list.append(["Smote", acm.assess_all_models_with_resamp(preprocessor, model_lib, res, X_train, y_train)])



**ADASYN**

In [42]:
# ADAptive SYNthetic
res = ADASYN(random_state=seed)
resamp_list.append(["ADASYN", acm.assess_all_models_with_resamp(preprocessor, model_lib, res, X_train, y_train)])

### Under Sampling

**Random Under Sampling**

In [43]:
# self-explanatory
res = RandomUnderSampler(random_state=seed)
resamp_list.append(["Random Under", acm.assess_all_models_with_resamp(preprocessor, model_lib, res, X_train, y_train)])

**Editied Nearest Neighbors**

In [44]:
# applies a nearest-neighbors algorithm and removing samples which do not agree “enough” with their neighboorhood 
res = EditedNearestNeighbours(random_state=seed)
resamp_list.append(["ENN", acm.assess_all_models_with_resamp(preprocessor, model_lib, res, X_train, y_train)])

ClusterCentroids was very slow.

**NearMiss**

*applies heuristic rules based on nearest neighbors. keeps majority points that are closest to...*

NearMiss1: the N-closest minority points

In [45]:
res = NearMiss(random_state=seed, version=1)
resamp_list.append(["Near Miss 1", acm.assess_all_models_with_resamp(preprocessor, model_lib, res, X_train, y_train)])

NearMiss2 was slow

NearMiss3: the majority of minority points. Least sesnsitive to noise

In [46]:
res = NearMiss(random_state=seed, version=3)
resamp_list.append(["Near Miss 3", acm.assess_all_models_with_resamp(preprocessor, model_lib, res, X_train, y_train)])

**Cleaning**

Condensed Nearest Neighbor was slow

Neighborhood Cleaning Rule

In [47]:
res = NeighbourhoodCleaningRule(random_state=seed)
resamp_list.append(["NCC", acm.assess_all_models_with_resamp(preprocessor, model_lib, res, X_train, y_train)])

One Sided Selection

In [48]:
res = OneSidedSelection(random_state=seed)
resamp_list.append(["One Sided Cleaning", acm.assess_all_models_with_resamp(preprocessor, model_lib, res, X_train, y_train)])

**Tomek Links**

In [49]:
res = TomekLinks(random_state=seed, n_jobs=-1)
resamp_list.append(["Tomek", acm.assess_all_models_with_resamp(preprocessor, model_lib, res, X_train, y_train)])

### Combination Over/Under Sampling

In [50]:
from imblearn.combine import SMOTEENN, SMOTETomek

SMOTE + EEN

In [51]:
res = SMOTEENN(random_state=seed)
resamp_list.append(["SMOTE + ENN", acm.assess_all_models_with_resamp(preprocessor, model_lib, res, X_train, y_train)])

SMOTE + Tomek

In [52]:
res = SMOTETomek(random_state=seed)
resamp_list.append(["SMOTE + Tomek", acm.assess_all_models_with_resamp(preprocessor, model_lib, res, X_train, y_train)])

### Compare:

In [56]:
base_scores

[['lr', 0.73330146243373451],
 ['gb', 0.75407568057647445],
 ['ada', 0.57114547512503366],
 ['rf', 0.6627691582805424],
 ['nb', 0.7144980627741957]]

In [53]:
resamp_list

[['Smote',
  defaultdict(list,
              {'lr': 0.73269128361652858,
               'gb': 0.73907751653368658,
               'rf': 0.57182193507513579})],
 ['ADASYN',
  defaultdict(list,
              {'lr': 0.73217064014652899,
               'gb': 0.73470929863486545,
               'rf': 0.55400473594798605})],
 ['Random Under',
  defaultdict(list,
              {'lr': 0.73496864118046057,
               'gb': 0.75389938893481412,
               'rf': 0.67355597078574747})],
 ['ENN',
  defaultdict(list,
              {'lr': 0.73467654458601717,
               'gb': 0.75242067245220523,
               'rf': 0.70550866475275009})],
 ['Near Miss 1',
  defaultdict(list,
              {'lr': 0.68378030743443141,
               'gb': 0.65979990962608182,
               'rf': 0.63034909966596475})],
 ['Near Miss 3',
  defaultdict(list,
              {'lr': 0.5931810037376708,
               'gb': 0.54998075985713291,
               'rf': 0.55660323612216511})],
 ['NCC',
  defaultdict(

### Ensemble of samplers

Combines:
- EasyEnsemble: iteratively applies random under-sampling
- ensemble of Classifiers (default = DecisionTree)
- balances subsets before training

In [58]:
bbc = BalancedBaggingClassifier(base_estimator=DecisionTreeClassifier(), 
                                random_state=seed, n_jobs=-1, replacement=True)

acm.assess_preproc_model_auc(preprocessor, bbc, X_train, y_train)

0.67951965607208575

In [59]:
bbc2 = BalancedBaggingClassifier(base_estimator=GradientBoostingClassifier(), 
                                 random_state=seed, n_jobs=-1, replacement=True)

acm.assess_preproc_model_auc(preprocessor, bbc2, X_train, y_train)

0.75446790745776027

In [61]:
bbc3 = BalancedBaggingClassifier(base_estimator=GaussianNB(), 
                                 random_state=seed, n_jobs=-1, replacement=True)

acm.assess_preproc_model_auc(preprocessor, bbc3, X_train, y_train)

0.71484724339663985

## Conclusions

The baseline models performed as well as models incorporating over/under sampling