# Imbalanced-learn for imbalanced multi-class classification

In [1]:
from sklearn.metrics import classification_report,confusion_matrix,log_loss,auc,plot_confusion_matrix
from sklearn.preprocessing import (
    MinMaxScaler, label_binarize, OneHotEncoder, LabelEncoder)
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import StratifiedKFold, GridSearchCV,train_test_split
from sklearn.ensemble import GradientBoostingClassifier,RandomForestClassifier,AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from itertools import cycle, product
import datetime
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from itertools import cycle
import seaborn as sns
import warnings
from tqdm import tqdm_notebook
import random
# Silence every warning for the rest of the session. Note that this also hides
# deprecation and zero-division warnings raised by the cells below.
warnings.filterwarnings('ignore')
# Render matplotlib figures inline in the cell output.
%matplotlib inline

## Raw Data

In [2]:
# Load the 'initial' sheet and keep it as a plain ndarray of rows.
dataset = pd.read_excel('../Case.xlsx', sheet_name='initial')
array_0 = dataset.values

# Keep only the rows whose first column equals 1. The ndarray is built once,
# after filtering, instead of being rebuilt on every loop iteration.
array = [list(row) for row in array_0 if row[0] == 1]
if not array:
    raise ValueError("sheet 'initial' has no rows whose first column equals 1")
data = np.array(array)

# Columns 1..17 are the features, column 20 is the class label.
X = data[:, 1:18]
Y = data[:, 20]

# The split seed is random by default, as before, but it is printed so that a
# run can be repeated. Set SEED to an integer to pin the split.
SEED = None
seed = random.randint(1, 1000) if SEED is None else SEED
print('train/test split seed:', seed)
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=seed)

# Registry of training sets keyed by method name; the cells below each add an
# entry and the evaluation cell iterates over all of them.
train_set_dict = {'RawData': (X_train, y_train)}

label_dict = {'Normal': 0,
              'rain': 1,
              'thunder': 2,
              'wind': 3}
# Class names ordered by their numeric code, and the sorted codes themselves.
labels = sorted(label_dict, key=label_dict.get)
labels_number = sorted(label_dict.values())  # [0, 1, 2, 3]

## Over-sampling

### RandomOverSampler

In [3]:
from collections import Counter

from imblearn.over_sampling import RandomOverSampler

# Simply duplicates existing samples.
ros = RandomOverSampler(random_state=0)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

# Register the resampled training set, then show the per-class counts.
train_set_dict['RandomOverSampler'] = (X_resampled, y_resampled)
print(sorted(Counter(y_resampled).items()))

[(0.0, 179), (2.0, 179), (3.0, 179)]


### BorderlineSMOTE

In [4]:
from imblearn.over_sampling import BorderlineSMOTE

# Over-sample the training split with the 'borderline-1' variant. The sampler
# is seeded, like the other seeded samplers in this notebook, so the synthetic
# samples are identical on every run.
X_resampled, y_resampled = BorderlineSMOTE(
    kind='borderline-1', k_neighbors=3, random_state=0
).fit_resample(X_train, y_train)

# Show the per-class counts and register the resampled training set.
print(sorted(Counter(y_resampled).items()))
train_set_dict['BorderlineSMOTE-1'] = (X_resampled, y_resampled)

[(0.0, 179), (2.0, 2), (3.0, 179)]


In [5]:
from imblearn.over_sampling import BorderlineSMOTE

# Over-sample the training split with the 'borderline-2' variant. The sampler
# is seeded, like the other seeded samplers in this notebook, so the synthetic
# samples are identical on every run.
X_resampled, y_resampled = BorderlineSMOTE(
    kind='borderline-2', k_neighbors=3, random_state=0
).fit_resample(X_train, y_train)

# Show the per-class counts and register the resampled training set.
print(sorted(Counter(y_resampled).items()))
train_set_dict['BorderlineSMOTE-2'] = (X_resampled, y_resampled)

[(0.0, 179), (2.0, 2), (3.0, 179)]


## Under-sampling

### ClusterCentroids

In [6]:
from imblearn.under_sampling import ClusterCentroids

# Under-sample the training split with a seeded ClusterCentroids sampler.
cc = ClusterCentroids(random_state=0)
X_resampled, y_resampled = cc.fit_resample(X_train, y_train)

# Register the resampled training set, then show the per-class counts.
train_set_dict['ClusterCentroids'] = (X_resampled, y_resampled)
print(sorted(Counter(y_resampled).items()))

[(0.0, 2), (2.0, 2), (3.0, 2)]


### RandomUnderSampler

In [7]:
from imblearn.under_sampling import RandomUnderSampler

# Under-sample the training split with a seeded RandomUnderSampler.
rus = RandomUnderSampler(random_state=0)
X_resampled, y_resampled = rus.fit_resample(X_train, y_train)

# Register the resampled training set, then show the per-class counts.
train_set_dict['RandomUnderSampler'] = (X_resampled, y_resampled)
print(sorted(Counter(y_resampled).items()))

[(0.0, 2), (2.0, 2), (3.0, 2)]


### EditedNearestNeighbours

In [8]:
from imblearn.under_sampling import EditedNearestNeighbours

# Under-sample the training split with EditedNearestNeighbours, using the
# sampler's default settings.
enn = EditedNearestNeighbours()
X_resampled, y_resampled = enn.fit_resample(X_train, y_train)

# Register the resampled training set, then show the per-class counts.
train_set_dict['EditedNearestNeighbours'] = (X_resampled, y_resampled)
print(sorted(Counter(y_resampled).items()))

[(0.0, 148), (2.0, 2)]


### RepeatedEditedNearestNeighbours

In [9]:
from imblearn.under_sampling import RepeatedEditedNearestNeighbours

# Under-sample the training split with RepeatedEditedNearestNeighbours, using
# the sampler's default settings.
renn = RepeatedEditedNearestNeighbours()
X_resampled, y_resampled = renn.fit_resample(X_train, y_train)

# Register the resampled training set, then show the per-class counts.
train_set_dict['RepeatedEditedNearestNeighbours'] = (X_resampled, y_resampled)
print(sorted(Counter(y_resampled).items()))

[(0.0, 148), (2.0, 2)]


### AllKNN

In [10]:
from imblearn.under_sampling import AllKNN

# Under-sample the training split with AllKNN, using the sampler's default
# settings.
allknn = AllKNN()
X_resampled, y_resampled = allknn.fit_resample(X_train, y_train)

# Register the resampled training set, then show the per-class counts.
train_set_dict['AllKNN'] = (X_resampled, y_resampled)
print(sorted(Counter(y_resampled).items()))

[(0.0, 171), (2.0, 2)]


### CondensedNearestNeighbour

In [11]:
from imblearn.under_sampling import CondensedNearestNeighbour

# Under-sample the training split with a seeded CondensedNearestNeighbour.
# Note: this step is slow.
cnn = CondensedNearestNeighbour(random_state=0)
X_resampled, y_resampled = cnn.fit_resample(X_train, y_train)

# Register the resampled training set, then show the per-class counts.
train_set_dict['CondensedNearestNeighbour'] = (X_resampled, y_resampled)
print(sorted(Counter(y_resampled).items()))

[(0.0, 14), (2.0, 2), (3.0, 4)]


### OneSidedSelection

In [12]:
from imblearn.under_sampling import OneSidedSelection

# Under-sample the training split with a seeded OneSidedSelection sampler.
oss = OneSidedSelection(random_state=0)
X_resampled, y_resampled = oss.fit_resample(X_train, y_train)

# Register the resampled training set, then show the per-class counts.
train_set_dict['OneSidedSelection'] = (X_resampled, y_resampled)
print(sorted(Counter(y_resampled).items()))

[(0.0, 42), (2.0, 2), (3.0, 2)]


### NeighbourhoodCleaningRule

In [13]:
from imblearn.under_sampling import NeighbourhoodCleaningRule

# Under-sample the training split with NeighbourhoodCleaningRule, using the
# sampler's default settings.
ncr = NeighbourhoodCleaningRule()
X_resampled, y_resampled = ncr.fit_resample(X_train, y_train)

# Register the resampled training set, then show the per-class counts.
train_set_dict['NeighbourhoodCleaningRule'] = (X_resampled, y_resampled)
print(sorted(Counter(y_resampled).items()))

[(0.0, 173), (2.0, 2)]


## Bagging

使用训练数据的不同随机子集来训练每个 Base Model，最后每个 Base Model 权重相同，分类问题进行投票，回归问题平均。

随机森林就用到了Bagging，并且具有天然的并行性。

### Bagging

In [14]:
# The untouched training split is registered under 'Bagging'; the evaluation
# cell chooses the classifier from this name.
train_set_dict['Bagging'] = (X_train, y_train)
print(sorted(Counter(y_train).items()))

[(0.0, 179), (2.0, 2), (3.0, 6)]


### BalancedBagging

In [15]:
# The untouched training split is registered under 'BalancedBagging'; the
# evaluation cell chooses the classifier from this name.
train_set_dict['BalancedBagging'] = (X_train, y_train)
print(sorted(Counter(y_train).items()))

[(0.0, 179), (2.0, 2), (3.0, 6)]


### BalancedRandomForest

In [16]:
# The untouched training split is registered under 'BalancedRandomForest'; the
# evaluation cell chooses the classifier from this name.
train_set_dict['BalancedRandomForest'] = (X_train, y_train)
print(sorted(Counter(y_train).items()))

[(0.0, 179), (2.0, 2), (3.0, 6)]


## Boosting

Boosting是一种迭代的方法，每一次训练会更关心上一次被分错的样本，比如改变被错分的样本的权重的Adaboost方法。还有许多都是基于这种思想，比如Gradient Boosting等。

### RUSBoost

In [17]:
# The untouched training split is registered under 'RUSBoost'; the evaluation
# cell chooses the classifier from this name.
train_set_dict['RUSBoost'] = (X_train, y_train)
print(sorted(Counter(y_train).items()))

[(0.0, 179), (2.0, 2), (3.0, 6)]


### EasyEnsemble

In [18]:
# The untouched training split is registered under 'EasyEnsemble'; the
# evaluation cell chooses the classifier from this name.
train_set_dict['EasyEnsemble'] = (X_train, y_train)
print(sorted(Counter(y_train).items()))

[(0.0, 179), (2.0, 2), (3.0, 6)]


## 评估

In [24]:
from sklearn.svm import LinearSVC
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.tree import DecisionTreeClassifier
from imblearn.ensemble import (
    BalancedBaggingClassifier,
    BalancedRandomForestClassifier,
    EasyEnsembleClassifier,
    RUSBoostClassifier,
)
from imblearn.metrics import classification_report_imbalanced

# Seed of the default random forest used for every resampled training set.
RANDOM_STATE = 2019


def _make_classifier(name):
    """Return a fresh, unfitted classifier for the training set called ``name``.

    The five ensemble entries ('Bagging', 'BalancedBagging',
    'BalancedRandomForest', 'RUSBoost', 'EasyEnsemble') each get their own
    ensemble classifier. Every other name gets the same random forest.
    """
    # `base_estimator` is kept on purpose: the notebook imports
    # `plot_confusion_matrix`, so it targets scikit-learn releases older than
    # the rename of this argument to `estimator`.
    if name == 'Bagging':
        return BaggingClassifier(base_estimator=DecisionTreeClassifier(),
                                 random_state=0)
    if name == 'BalancedBagging':
        return BalancedBaggingClassifier(base_estimator=DecisionTreeClassifier(),
                                         sampling_strategy='auto',
                                         replacement=False,
                                         random_state=0)
    if name == 'BalancedRandomForest':
        return BalancedRandomForestClassifier(n_estimators=100, random_state=0)
    if name == 'RUSBoost':
        return RUSBoostClassifier(random_state=0)
    if name == 'EasyEnsemble':
        return EasyEnsembleClassifier(random_state=0)
    # Default model.
    return RandomForestClassifier(n_estimators=161,
                                  max_depth=49,
                                  max_features="sqrt",
                                  random_state=RANDOM_STATE)


################### train #####################################
cm = []               # (name, confusion matrix) per training set
clf_report_list = []  # imbalanced classification report per training set
# NOTE: the loop variables X and y rebind the notebook-level names on purpose,
# exactly as the original cell did.
for name, (X, y) in tqdm_notebook(list(train_set_dict.items())):
    clf = _make_classifier(name)
    clf.fit(X, y)
    y_test_pred = clf.predict(X_test)
    y_test_score = clf.predict_proba(X_test)  # valid score

    ######################### test-set evaluation ########################
    # Classification report. `labels` is passed together with `target_names`
    # so that each name is tied to its own class code; without it the names
    # are assigned positionally to whichever classes happen to be present.
    clf_report = classification_report_imbalanced(
        y_test, y_test_pred, digits=4,
        labels=labels_number, target_names=labels)
    clf_report_list.append(clf_report)

    # Confusion matrix over the full label set, so that every matrix has the
    # same shape and row/column order.
    cnf_matrix = confusion_matrix(y_test, y_test_pred, labels=labels_number)
    cm.append((name, cnf_matrix))

  0%|          | 0/17 [00:00<?, ?it/s]

<Figure size 864x432 with 0 Axes>