# Imbalanced-learn for

In [1]:
from sklearn.metrics import classification_report,confusion_matrix,log_loss,auc,plot_confusion_matrix
from sklearn.preprocessing import (
    MinMaxScaler, label_binarize, OneHotEncoder, LabelEncoder)
from sklearn.model_selection import GridSearchCV,train_test_split
from sklearn.ensemble import GradientBoostingClassifier,RandomForestClassifier,AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import cross_val_score
from itertools import cycle, product
import datetime
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from itertools import cycle
import seaborn as sns
import warnings
import random
from tqdm import tqdm_notebook
warnings.filterwarnings('ignore')
%matplotlib inline

## Raw Data

In [2]:
dataset=pd.read_excel('../Case.xlsx',sheet_name='initial')
data=dataset.values

X =data[:,1:18]
Y =data[:,20]
seed = random.randint(1,1000)
X_train,X_test,y_train,y_test = train_test_split(X, Y, test_size=0.2, random_state=seed)

dict_resample={}
dict_model={}

label_dict = {'Normal': 0,
              'rain': 1,
             'thunder': 2,
             'wind': 3}
labels = [key for i in sorted(label_dict.values()) for key,val in label_dict.items() if val==i]
labels_number = sorted(label_dict.values()) # [0, 1, 2, 3]

## Over-sampling

### RandomOverSample

In [3]:
# 简单的复制样本
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=0)
X_resampled ,y_resampled= ros.fit_resample(X_train,y_train)
from collections import Counter
print(sorted(Counter(y_resampled).items()))
dict_resample['RandomOverSampler'] = (X_resampled ,y_resampled)

[(0.0, 1802), (1.0, 1802), (2.0, 1802), (3.0, 1802)]


### SMOTE

In [4]:
from imblearn.over_sampling import SMOTE, ADASYN
X_resampled, y_resampled = SMOTE().fit_resample(X_train, y_train)
print(sorted(Counter(y_resampled).items()))
dict_resample['SMOTE'] = (X_resampled ,y_resampled)

[(0.0, 1802), (1.0, 1802), (2.0, 1802), (3.0, 1802)]


### ADASYN

In [5]:
from imblearn.over_sampling import SMOTE, ADASYN
X_resampled, y_resampled = ADASYN().fit_resample(X_train, y_train)
print(sorted(Counter(y_resampled).items()))
dict_resample['ADASYN'] = (X_resampled ,y_resampled)

[(0.0, 1802), (1.0, 1800), (2.0, 1816), (3.0, 1791)]


### BorderlineSMOTE

In [6]:
from imblearn.over_sampling import BorderlineSMOTE
X_resampled, y_resampled = BorderlineSMOTE(kind='borderline-1').fit_resample(X_train, y_train)
print(sorted(Counter(y_resampled).items()))
dict_resample['BorderlineSMOTE-1'] = (X_resampled ,y_resampled)

[(0.0, 1802), (1.0, 8), (2.0, 1802), (3.0, 1802)]


In [7]:
from imblearn.over_sampling import BorderlineSMOTE
X_resampled, y_resampled = BorderlineSMOTE(kind='borderline-2').fit_resample(X_train, y_train)
print(sorted(Counter(y_resampled).items()))
dict_resample['BorderlineSMOTE-2'] = (X_resampled ,y_resampled)

[(0.0, 1802), (1.0, 8), (2.0, 1802), (3.0, 1802)]


## Over- and under-sampling

### SMOTEENN

In [8]:
from imblearn.combine import SMOTEENN
smote_enn = SMOTEENN(random_state=0)
X_resampled, y_resampled = smote_enn.fit_resample(X_train, y_train)
print(sorted(Counter(y_resampled).items()))
dict_resample['SMOTEENN'] = (X_resampled ,y_resampled)

[(0.0, 986), (1.0, 1720), (2.0, 1644), (3.0, 1559)]


### SMOTETomek

In [9]:
from imblearn.combine import SMOTETomek
smote_tomek = SMOTETomek(random_state=0)
X_resampled, y_resampled = smote_tomek.fit_resample(X_train, y_train)
print(sorted(Counter(y_resampled).items()))
dict_resample['SMOTETomek'] = (X_resampled ,y_resampled)

[(0.0, 1769), (1.0, 1800), (2.0, 1789), (3.0, 1774)]


## Bagging

使用训练数据的不同随机子集来训练每个 Base Model，最后每个 Base Model 权重相同，分类问题进行投票，回归问题平均。

随机森林就用到了Bagging，并且具有天然的并行性。

### Bagging

In [10]:
print(sorted(Counter(y_train).items()))
dict_model['Bagging'] = (X_train ,y_train)

[(0.0, 1802), (1.0, 8), (2.0, 47), (3.0, 95)]


### BalancedBagging

In [11]:
print(sorted(Counter(y_train).items()))
dict_model['BalancedBagging'] = (X_train ,y_train)

[(0.0, 1802), (1.0, 8), (2.0, 47), (3.0, 95)]


### BalancedRandomForest

In [12]:
print(sorted(Counter(y_train).items()))
dict_model['BalancedRandomForest'] = (X_train ,y_train)

[(0.0, 1802), (1.0, 8), (2.0, 47), (3.0, 95)]


## Boosting

Boosting是一种迭代的方法，每一次训练会更关心上一次被分错的样本，比如改变被错分的样本的权重的Adaboost方法。还有许多都是基于这种思想，比如Gradient Boosting等。

### RUSBoost

In [13]:
print(sorted(Counter(y_train).items()))
dict_model['RUSBoost'] = (X_train ,y_train)

[(0.0, 1802), (1.0, 8), (2.0, 47), (3.0, 95)]


### EasyEnsemble

In [14]:
print(sorted(Counter(y_train).items()))
dict_model['EasyEnsemble'] = (X_train ,y_train)

[(0.0, 1802), (1.0, 8), (2.0, 47), (3.0, 95)]


## 评估

In [15]:
from sklearn.svm import LinearSVC
from imblearn.metrics import classification_report_imbalanced
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
################### train #####################################
scores={}
cm = []
for name, (X, y) in tqdm_notebook(list(dict_model.items())):
    if name in ['Bagging']:
        from sklearn.ensemble import BaggingClassifier
        from sklearn.tree import DecisionTreeClassifier
        clf = BaggingClassifier(base_estimator=DecisionTreeClassifier(),
                                random_state=0)
    elif name in ['BalancedBagging']:
        from imblearn.ensemble import BalancedBaggingClassifier
        clf = BalancedBaggingClassifier(base_estimator=DecisionTreeClassifier(),
                                        sampling_strategy='auto',
                                        replacement=False,
                                        random_state=0)
    elif name in ['BalancedRandomForest']:
        from imblearn.ensemble import BalancedRandomForestClassifier
        clf = BalancedRandomForestClassifier(n_estimators=100, random_state=0)
    elif name in ['RUSBoost']:
        from imblearn.ensemble import RUSBoostClassifier
        clf = RUSBoostClassifier(random_state=0)
    elif name in ['EasyEnsemble']:
        from imblearn.ensemble import EasyEnsembleClassifier
        clf = EasyEnsembleClassifier(random_state=0)

    clf.fit(X, y)
    y_test_pred = clf.predict(X_test)
    y_test_score = clf.predict_proba(X_test)  # valid score

    # 混淆矩阵
    cnf_matrix = confusion_matrix(y_test, y_test_pred)
    cm.append((name, cnf_matrix))

  0%|          | 0/5 [00:00<?, ?it/s]

In [16]:
for name, (X, y) in tqdm_notebook(list(dict_resample.items())):
    RANDOM_STATE = 2019
    clf = RandomForestClassifier(n_estimators=161,
                                     max_depth=49,
                                     max_features="sqrt",
                                     random_state=RANDOM_STATE)
    
    clf.fit(X, y)
    y_test_pred = clf.predict(X_test)
    y_test_score = clf.predict_proba(X_test)  # valid score
    
    cnf_matrix = confusion_matrix(y_test, y_test_pred)
    cm.append(('RF'+name, cnf_matrix))

  0%|          | 0/7 [00:00<?, ?it/s]

In [17]:
from sklearn.linear_model import LogisticRegression
for name, (X, y) in tqdm_notebook(list(dict_resample.items())):
    clf = LogisticRegression(penalty="l2",solver="liblinear",C=0.8,max_iter=1000)
       
    clf.fit(X, y)
    y_test_pred = clf.predict(X_test)
    y_test_score = clf.predict_proba(X_test)  # valid score
    
    cnf_matrix = confusion_matrix(y_test, y_test_pred)
    cm.append(('LR'+name, cnf_matrix))

  0%|          | 0/7 [00:00<?, ?it/s]

In [18]:
for name, (X, y) in tqdm_notebook(list(dict_resample.items())):
    clf = AdaBoostClassifier(n_estimators=161,
                             learning_rate=0.5,
                                     random_state=RANDOM_STATE)
    
    clf.fit(X, y)
    y_test_pred = clf.predict(X_test)
    y_test_score = clf.predict_proba(X_test)  # valid score
    
    cnf_matrix = confusion_matrix(y_test, y_test_pred)
    cm.append(('AdaBoost'+name, cnf_matrix))

  0%|          | 0/7 [00:00<?, ?it/s]

In [19]:
for name,c in cm:
    print('{:.3f}'.format(c[1][1]/(c[1][1]+c[1][0]+c[1][2]+c[1][3]+c[0][1]+c[2][1]+c[3][1])))

0.000
0.016
0.000
0.000
0.013
0.000
0.000
0.000
0.000
0.000
0.000
0.000
0.017
0.018
0.037
0.000
0.000
0.015
0.017
0.000
0.036
0.000
0.000
0.000
0.000
0.000


In [20]:
#thunder
for name,c in cm:
    print('{:.3f}'.format(c[2][2]/(c[2][2]+c[2][0]+c[2][1]+c[2][3]+c[0][2]+c[1][2]+c[3][2])))

0.000
0.087
0.071
0.082
0.074
0.000
0.057
0.083
0.045
0.094
0.062
0.053
0.076
0.078
0.084
0.079
0.069
0.090
0.091
0.041
0.065
0.065
0.036
0.063
0.062
0.064


In [21]:
#wind
for name,c in cm:
    print('{:.3f}'.format(c[3][3]/(c[3][3]+c[3][0]+c[3][1]+c[3][2]+c[0][3]+c[1][3]+c[2][3])))

0.000
0.066
0.070
0.058
0.086
0.000
0.100
0.119
0.038
0.092
0.104
0.111
0.069
0.090
0.071
0.094
0.128
0.073
0.080
0.065
0.079
0.065
0.076
0.099
0.061
0.085


In [22]:
for name,c in cm:
    print("{:}".format(name))

Bagging
BalancedBagging
BalancedRandomForest
RUSBoost
EasyEnsemble
RFRandomOverSampler
RFSMOTE
RFADASYN
RFBorderlineSMOTE-1
RFBorderlineSMOTE-2
RFSMOTEENN
RFSMOTETomek
LRRandomOverSampler
LRSMOTE
LRADASYN
LRBorderlineSMOTE-1
LRBorderlineSMOTE-2
LRSMOTEENN
LRSMOTETomek
AdaBoostRandomOverSampler
AdaBoostSMOTE
AdaBoostADASYN
AdaBoostBorderlineSMOTE-1
AdaBoostBorderlineSMOTE-2
AdaBoostSMOTEENN
AdaBoostSMOTETomek


In [23]:
Y = pd.Series(Y) 
Y.value_counts()

0.0    2247
3.0     118
2.0      62
1.0      13
dtype: int64

In [24]:
y_test = pd.Series(y_test) 
y_test.value_counts()

0.0    445
3.0     23
2.0     15
1.0      5
dtype: int64