# Imbalanced-learn for

In [1]:
from sklearn.metrics import classification_report,confusion_matrix,log_loss,auc,plot_confusion_matrix
from sklearn.preprocessing import (
    MinMaxScaler, label_binarize, OneHotEncoder, LabelEncoder)
from sklearn.model_selection import GridSearchCV,train_test_split
from sklearn.ensemble import GradientBoostingClassifier,RandomForestClassifier,AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import cross_val_score
from itertools import cycle, product
import datetime
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from itertools import cycle
import seaborn as sns
import warnings
import random
from tqdm import tqdm_notebook
warnings.filterwarnings('ignore')
%matplotlib inline

## Raw Data

In [2]:
dataset=pd.read_excel('../Case.xlsx',sheet_name='initial')
data=dataset.values

#array=[]
#for i in data:
#    if i[0]==8:
#        array.append(list(i))
#    data=np.array(array)
    
X =data[:,1:18]
Y =data[:,19]
X_train,X_test,y_train,y_test = train_test_split(X, Y, test_size=0.2, random_state=0)

dict_resample = {}
dict_model = {}

In [3]:
scv = StratifiedKFold(n_splits=10, random_state=0,shuffle=True)

## Over-sampling

### RandomOverSample

In [4]:
# 简单的复制样本
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=0)
X_resampled ,y_resampled= ros.fit_resample(X_train,y_train)
from collections import Counter
print(sorted(Counter(y_resampled).items()))
dict_resample['RandomOverSampler'] = (X_resampled ,y_resampled)

[(0.0, 1800), (1.0, 1800)]


### SMOTE

In [5]:
from imblearn.over_sampling import SMOTE, ADASYN
X_resampled, y_resampled = SMOTE().fit_resample(X_train, y_train)
print(sorted(Counter(y_resampled).items()))
dict_resample['SMOTE'] = (X_resampled ,y_resampled)

[(0.0, 1800), (1.0, 1800)]


### ADASYN

In [6]:
from imblearn.over_sampling import SMOTE, ADASYN
X_resampled, y_resampled = ADASYN().fit_resample(X_train, y_train)
print(sorted(Counter(y_resampled).items()))
dict_resample['ADASYN'] = (X_resampled ,y_resampled)

[(0.0, 1800), (1.0, 1828)]


### BorderlineSMOTE

In [7]:
from imblearn.over_sampling import BorderlineSMOTE
X_resampled, y_resampled = BorderlineSMOTE(kind='borderline-1').fit_resample(X_train, y_train)
print(sorted(Counter(y_resampled).items()))
dict_resample['BorderlineSMOTE-1'] = (X_resampled ,y_resampled)

[(0.0, 1800), (1.0, 1800)]


In [8]:
from imblearn.over_sampling import BorderlineSMOTE
X_resampled, y_resampled = BorderlineSMOTE(kind='borderline-2').fit_resample(X_train, y_train)
print(sorted(Counter(y_resampled).items()))
dict_resample['BorderlineSMOTE-2'] = (X_resampled ,y_resampled)

[(0.0, 1800), (1.0, 1799)]


## Over- and under-sampling

### SMOTEENN

In [9]:
from imblearn.combine import SMOTEENN
smote_enn = SMOTEENN(random_state=0)
X_resampled, y_resampled = smote_enn.fit_resample(X_train, y_train)
print(sorted(Counter(y_resampled).items()))
dict_resample['SMOTEENN'] = (X_resampled ,y_resampled)

[(0.0, 1120), (1.0, 1607)]


### SMOTETomek

In [10]:
from imblearn.combine import SMOTETomek
smote_tomek = SMOTETomek(random_state=0)
X_resampled, y_resampled = smote_tomek.fit_resample(X_train, y_train)
print(sorted(Counter(y_resampled).items()))
dict_resample['SMOTETomek'] = (X_resampled ,y_resampled)

[(0.0, 1766), (1.0, 1766)]


## Bagging

使用训练数据的不同随机子集来训练每个 Base Model，最后每个 Base Model 权重相同，分类问题进行投票，回归问题平均。

随机森林就用到了Bagging，并且具有天然的并行性。

### Bagging

In [11]:
print(sorted(Counter(y_train).items()))
dict_model['Bagging'] = (X_train ,y_train)

[(0.0, 1800), (1.0, 152)]


### BalancedBagging

In [12]:
print(sorted(Counter(y_train).items()))
dict_model['BalancedBagging'] = (X_train ,y_train)

[(0.0, 1800), (1.0, 152)]


### BalancedRandomForest

In [13]:
print(sorted(Counter(y_train).items()))
dict_model['BalancedRandomForest'] = (X_train ,y_train)

[(0.0, 1800), (1.0, 152)]


## Boosting

Boosting是一种迭代的方法，每一次训练会更关心上一次被分错的样本，比如改变被错分的样本的权重的Adaboost方法。还有许多都是基于这种思想，比如Gradient Boosting等。

### RUSBoost

In [14]:
print(sorted(Counter(y_train).items()))
dict_model['RUSBoost'] = (X_train ,y_train)

[(0.0, 1800), (1.0, 152)]


### EasyEnsemble

In [15]:
print(sorted(Counter(y_train).items()))
dict_model['EasyEnsemble'] = (X_train ,y_train)

[(0.0, 1800), (1.0, 152)]


## 评估

In [16]:
from sklearn.svm import LinearSVC
from imblearn.metrics import classification_report_imbalanced
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
################### train #####################################
scores={}
cm = []
for name, (X, y) in tqdm_notebook(list(dict_model.items())):
    if name in ['Bagging']:
        from sklearn.ensemble import BaggingClassifier
        from sklearn.tree import DecisionTreeClassifier
        clf = BaggingClassifier(base_estimator=DecisionTreeClassifier(),
                                random_state=0)
    elif name in ['BalancedBagging']:
        from imblearn.ensemble import BalancedBaggingClassifier
        clf = BalancedBaggingClassifier(base_estimator=DecisionTreeClassifier(),
                                        sampling_strategy='auto',
                                        replacement=False,
                                        random_state=0)
    elif name in ['BalancedRandomForest']:
        from imblearn.ensemble import BalancedRandomForestClassifier
        clf = BalancedRandomForestClassifier(n_estimators=100, random_state=0)
    elif name in ['RUSBoost']:
        from imblearn.ensemble import RUSBoostClassifier
        clf = RUSBoostClassifier(random_state=0)
    elif name in ['EasyEnsemble']:
        from imblearn.ensemble import EasyEnsembleClassifier
        clf = EasyEnsembleClassifier(random_state=0)

    score = cross_val_score(clf, X, y, scoring='accuracy', cv=scv, n_jobs=-1)
    scores[name]=np.mean(score)
    #print('{:}: {:.3f}'.format(name,np.mean(scores)))
    clf.fit(X, y)
    y_test_pred = clf.predict(X_test)
    y_test_score = clf.predict_proba(X_test)  # valid score

    # 混淆矩阵
    cnf_matrix = confusion_matrix(y_test, y_test_pred)
    cm.append((name, cnf_matrix))
    # ROC
    #roc_auc = roc_auc_score(
    #    y_test, y_test_score[:,1])

  0%|          | 0/5 [00:00<?, ?it/s]

In [17]:
for name, (X, y) in tqdm_notebook(list(dict_resample.items())):
    RANDOM_STATE = 2019
    clf = RandomForestClassifier(n_estimators=161,
                                     max_depth=49,
                                     max_features="sqrt",
                                     random_state=RANDOM_STATE)

    score = cross_val_score(clf, X, y, scoring='accuracy', cv=scv, n_jobs=-1)
    scores[name]=np.mean(score)
    #print('{:}: {:.3f}'.format(name,np.mean(scores)))
    clf.fit(X, y)
    y_test_pred = clf.predict(X_test)
    y_test_score = clf.predict_proba(X_test)  # valid score

    cnf_matrix = confusion_matrix(y_test, y_test_pred)
    cm.append(('RF'+name, cnf_matrix))
    # ROC
    #roc_auc = roc_auc_score(
    #    y_test, y_test_score[:,1])

  0%|          | 0/7 [00:00<?, ?it/s]

In [35]:
for name, (X, y) in tqdm_notebook(list(dict_resample.items())):
    clf = GradientBoostingClassifier(n_estimators=161,
                                     max_depth=49,
                                     max_features="sqrt", 
                                     random_state=RANDOM_STATE)


    #score = cross_val_score(clf, X, y, scoring='accuracy', cv=scv, n_jobs=-1)
    #scores[name]=np.mean(score)
    #print('{:}: {:.3f}'.format(name,np.mean(scores)))
    
    param_grid = {'max_depth':range(3,14,2), 'min_samples_split':range(100,801,200)}
    grid_search = GridSearchCV(clf, param_grid, scoring='roc_auc',cv=5)
    
    grid_search.fit(X, y)
    #y_test_pred = clf.predict(X_test)
    #y_test_score = clf.predict_proba(X_test)  # valid score
    
    print("Test set score:{:.2f}".format(grid_search.score(X_test,y_test))) 
    print("Best parameters:{}".format(grid_search.best_params_)) 
    print("Best score on train set:{:.2f}".format(grid_search.best_score_))
    
    #cnf_matrix = confusion_matrix(y_test, y_test_pred)
    #cm.append(('GB'+name, cnf_matrix))
    # ROC
    #roc_auc = roc_auc_score(
    #    y_test, y_test_score[:,1])

  0%|          | 0/7 [00:00<?, ?it/s]

Test set score:0.74
Best parameters:{'max_depth': 11, 'min_samples_split': 100}
Best score on train set:1.00
Test set score:0.76
Best parameters:{'max_depth': 13, 'min_samples_split': 100}
Best score on train set:0.98
Test set score:0.75
Best parameters:{'max_depth': 13, 'min_samples_split': 100}
Best score on train set:0.96


KeyboardInterrupt: 

In [22]:
for name, (X, y) in tqdm_notebook(list(dict_resample.items())):
    clf = AdaBoostClassifier(n_estimators=161,
                             learning_rate=0.5,
                                     random_state=RANDOM_STATE)

    score = cross_val_score(clf, X, y, scoring='accuracy', cv=scv, n_jobs=-1)
    scores['AdaBoost'+name]=np.mean(score)
    #print('{:}: {:.3f}'.format(name,np.mean(scores)))
    clf.fit(X, y)
    y_test_pred = clf.predict(X_test)
    y_test_score = clf.predict_proba(X_test)  # valid score

    cnf_matrix = confusion_matrix(y_test, y_test_pred)
    cm.append((name, cnf_matrix))
    # ROC
    #roc_auc = roc_auc_score(
    #    y_test, y_test_score[:,1])

  0%|          | 0/7 [00:00<?, ?it/s]

In [31]:
for name,c in cm:
    print("{:}: {:.3f}".format(name,c[1][1]/(c[1][1]+c[0][1]+c[1][0])))

Bagging: 0.000
BalancedBagging: 0.153
BalancedRandomForest: 0.162
RUSBoost: 0.123
EasyEnsemble: 0.168
RFRandomOverSampler: 0.022
RFSMOTE: 0.102
RFADASYN: 0.125
RFBorderlineSMOTE-1: 0.108
RFBorderlineSMOTE-2: 0.137
RFSMOTEENN: 0.171
RFSMOTETomek: 0.102
GBRandomOverSampler: 0.000
GBSMOTE: 0.071
GBADASYN: 0.123
GBBorderlineSMOTE-1: 0.118
GBBorderlineSMOTE-2: 0.081
GBSMOTEENN: 0.155
GBSMOTETomek: 0.093
RandomOverSampler: 0.141
SMOTE: 0.145
ADASYN: 0.140
BorderlineSMOTE-1: 0.164
BorderlineSMOTE-2: 0.143
SMOTEENN: 0.160
SMOTETomek: 0.144


In [None]:
ytest = pd.Series(y_test) 
ytest.value_counts()