In [3]:
import tensorflow as tf
import numpy as np
import sklearn.feature_selection as fs

TRAIN_DATA_FILE = "train_data.csv"
TEST_DATA_FILE = "test_data.csv"


def _load_labeled_csv(filename):
    """Load one CSV file with the header-aware TensorFlow loader.

    The label is read from column 0 as an integer; the remaining columns are
    read as float32 features.
    """
    # NOTE(review): tf.contrib only ships with TensorFlow 1.x - confirm the
    # pinned TensorFlow version before running this on a newer install.
    return tf.contrib.learn.datasets.base.load_csv_with_header(
        filename=filename,
        # Builtin int is what the np.int alias pointed to; the alias was
        # deprecated in NumPy 1.20 and removed in later releases.
        target_dtype=int,
        features_dtype=np.float32,
        target_column=0,
    )


def read_split_train_test_data():
    """Read the training and test CSV files with features and labels separated.

    Files are taken from TRAIN_DATA_FILE and TEST_DATA_FILE.

    Returns:
        A tuple ``(X, y, X_test, y_test)`` holding the training features,
        training labels, test features and test labels, in that order.
    """
    training_set = _load_labeled_csv(TRAIN_DATA_FILE)
    test_set = _load_labeled_csv(TEST_DATA_FILE)
    return training_set.data, training_set.target, test_set.data, test_set.target


def read_train_test_data():
    """Load the train and test sets without separating labels from data.

    Returns:
        A tuple ``(x_train, x_test)``; each is the feature matrix with the
        label appended as an extra trailing column.
    """
    def _append_label(features, labels):
        # Turn the label vector into a column so it can sit next to the features.
        label_column = labels.reshape(features.shape[0], -1)
        return np.concatenate((features, label_column), axis=1)

    train_x, train_y, test_x, test_y = read_split_train_test_data()
    return _append_label(train_x, train_y), _append_label(test_x, test_y)


def split_data_label(X):
    """Separate a combined matrix into data and labels.

    The label is taken from the last column; every column before it is data.

    Returns:
        A tuple ``(data, label)`` where ``data`` is ``X`` without its last
        column and ``label`` is that last column as a 1-D slice.
    """
    label = X[:, -1]
    data = X[:, :-1]
    return data, label


# Load both datasets once for the cells below; each matrix holds the features
# with the label appended as the last column.
d_train, d_test = read_train_test_data()


In [1]:
from imblearn.over_sampling import SMOTE, ADASYN
from collections import Counter

from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

In [11]:
# Split the combined matrices back into features and labels (label = last column).
x_test, y_test = split_data_label(d_test)
x_train, y_train = split_data_label(d_train)

# Oversampling

# Class distribution before resampling
print(sorted(Counter(y_train).items()))

# Resample with SMOTE. The seed is fixed so that re-running the cell produces
# the same synthetic samples and therefore the same score.
sampler = SMOTE(random_state=42)
# imbalanced-learn 0.4 renamed fit_sample to fit_resample and later removed the
# old name; use whichever one this installation provides.
resample = getattr(sampler, "fit_resample", None) or sampler.fit_sample
X_resampled, y_resampled = resample(x_train, y_train)
print(sorted(Counter(y_resampled).items()))

cls = GaussianNB()

# Train on the balanced data, evaluate on the untouched test set.
y_pred = cls.fit(X_resampled, y_resampled).predict(x_test)
acc = metrics.accuracy_score(y_test, y_pred)
print(acc)

[(0.0, 205), (1.0, 7431), (2.0, 53), (3.0, 1441), (4.0, 446), (5.0, 969)]


[(0.0, 7431), (1.0, 7431), (2.0, 7431), (3.0, 7431), (4.0, 7431), (5.0, 7431)]
0.716666666667


In [12]:
# Try undersampling

from imblearn.under_sampling import ClusterCentroids

# Fixed seed so that re-running the cell gives the same reduced training set.
sampler = ClusterCentroids(random_state=42)
# imbalanced-learn 0.4 renamed fit_sample to fit_resample and later removed the
# old name; use whichever one this installation provides.
resample = getattr(sampler, "fit_resample", None) or sampler.fit_sample
X_resampled, y_resampled = resample(x_train, y_train)
print(sorted(Counter(y_resampled).items()))

cls = GaussianNB()

# Train on the undersampled data, evaluate on the untouched test set.
y_pred = cls.fit(X_resampled, y_resampled).predict(x_test)
acc = metrics.accuracy_score(y_test, y_pred)
print(acc)

[(0.0, 53), (1.0, 53), (2.0, 53), (3.0, 53), (4.0, 53), (5.0, 53)]
0.62


In [13]:
# Combine undersampling and oversampling (SMOTE + Edited Nearest Neighbours)
from imblearn.combine import SMOTEENN

# Fixed seed so that re-running the cell gives the same resampled training set.
sampler = SMOTEENN(random_state=42)
# imbalanced-learn 0.4 renamed fit_sample to fit_resample and later removed the
# old name; use whichever one this installation provides.
resample = getattr(sampler, "fit_resample", None) or sampler.fit_sample
X_resampled, y_resampled = resample(x_train, y_train)
print(sorted(Counter(y_resampled).items()))

cls = GaussianNB()

# Train on the resampled data, evaluate on the untouched test set.
y_pred = cls.fit(X_resampled, y_resampled).predict(x_test)
acc = metrics.accuracy_score(y_test, y_pred)
print(acc)

[(0.0, 7429), (1.0, 7200), (2.0, 7431), (3.0, 7391), (4.0, 7428), (5.0, 7414)]
0.716666666667


In [38]:
# Combine undersampling and oversampling (SMOTE + Tomek links)
from imblearn.combine import SMOTETomek

# Fixed seed so that re-running the cell gives the same resampled training set.
sampler = SMOTETomek(random_state=42)
# imbalanced-learn 0.4 renamed fit_sample to fit_resample and later removed the
# old name; use whichever one this installation provides.
resample = getattr(sampler, "fit_resample", None) or sampler.fit_sample
X_resampled, y_resampled = resample(x_train, y_train)
print(sorted(Counter(y_resampled).items()))

cls = GaussianNB()

# Train on the resampled data, evaluate on the untouched test set.
y_pred = cls.fit(X_resampled, y_resampled).predict(x_test)
acc = metrics.accuracy_score(y_test, y_pred)
print(acc)
print(metrics.confusion_matrix(y_test, y_pred))


[(0.0, 7431), (1.0, 7431), (2.0, 7431), (3.0, 7431), (4.0, 7431), (5.0, 7431)]
0.713333333333
[[37  2  0  2  2  3]
 [ 0 58  6  6  8  0]
 [ 0 13 32  2  0  0]
 [ 0  7  3 39  3  1]
 [ 1  8  2  2 20  3]
 [ 5  1  0  0  6 28]]


In [65]:
# Ensemble
from imblearn.ensemble import BalancedBaggingClassifier

# The base learner is passed positionally because its keyword was renamed
# (base_estimator in older imbalanced-learn releases, estimator in newer ones),
# while it has remained the first parameter.
# Both the forest and the bagging wrapper are seeded so that re-running the
# cell reproduces the same score and confusion matrix.
bbc = BalancedBaggingClassifier(
    RandomForestClassifier(random_state=42),
    random_state=42,
)

# Trains directly on the original training data: no separate resampling step
# is applied here, unlike the previous cells.
y_pred = bbc.fit(x_train, y_train).predict(x_test)
acc = metrics.accuracy_score(y_test, y_pred)
print(acc)
print(metrics.confusion_matrix(y_test, y_pred))


0.726666666667
[[38  2  0  3  1  2]
 [ 0 50  6  5 17  0]
 [ 1  8 33  2  3  0]
 [ 0  2  4 42  4  1]
 [ 0  5  2  8 21  0]
 [ 1  1  0  0  4 34]]
