In [1]:
import tensorflow as tf
import numpy as np
import sklearn.feature_selection as fs

TRAIN_DATA_FILE = "train_data.csv"
TEST_DATA_FILE = "test_data.csv"


def read_split_train_test_data():
    """Load the training and test CSV files as separate features and labels.

    Both ``TRAIN_DATA_FILE`` and ``TEST_DATA_FILE`` are read with
    ``tf.contrib.learn.datasets.base.load_csv_with_header``; column 0 is taken
    as the integer target and the remaining columns as float32 features.

    Returns:
        tuple: ``(X, y, X_test, y_test)`` -- the ``data`` and ``target`` of the
        training set followed by the ``data`` and ``target`` of the test set.
    """
    # NOTE(review): tf.contrib only ships with TensorFlow 1.x -- confirm the
    # pinned TensorFlow version before upgrading the environment.
    load_csv = tf.contrib.learn.datasets.base.load_csv_with_header

    def _load(path):
        # One place for the loader options so train and test cannot drift apart.
        return load_csv(
            filename=path,
            # Builtin ``int`` instead of ``np.int``: the latter was only an
            # alias of ``int`` and has been removed from recent NumPy releases.
            target_dtype=int,
            features_dtype=np.float32,
            target_column=0,
        )

    training_set = _load(TRAIN_DATA_FILE)
    test_set = _load(TEST_DATA_FILE)
    return training_set.data, training_set.target, test_set.data, test_set.target


def read_train_test_data():
    """Return the train and test sets with labels kept alongside the features.

    The labels from ``read_split_train_test_data`` are reshaped into a column
    and appended to the feature matrix, so the label ends up as the last
    column of each returned array.

    Returns:
        tuple: ``(x_train, x_test)``.
    """
    features, labels, features_t, labels_t = read_split_train_test_data()

    # Turn the label vectors into column matrices with one row per sample.
    label_col = labels.reshape(features.shape[0], -1)
    label_col_t = labels_t.reshape(features_t.shape[0], -1)

    x_train = np.concatenate((features, label_col), axis=1)
    x_test = np.concatenate((features_t, label_col_t), axis=1)
    return x_train, x_test


def split_data_label(X):
    """Split a combined matrix into features and labels.

    The label is taken from the last column of ``X``; every other column is
    treated as a feature.

    Args:
        X: 2-D array whose last column holds the label.

    Returns:
        tuple: ``(data, label)`` -- all columns but the last, and the last column.
    """
    features = X[:, :-1]
    labels = X[:, -1]
    return features, labels


# Load both datasets once for the cells below; each row holds the features
# followed by the label in the last column.
d_train, d_test = read_train_test_data()


In [62]:
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from imblearn.ensemble import BalancedBaggingClassifier
from imblearn.combine import SMOTEENN
from imblearn.over_sampling import SMOTE
from collections import Counter

In [68]:
# Five-class experiment: drop every sample labelled 1 (forest).
RANDOM_STATE = 0  # fixed seed so the stochastic models below are reproducible

# Boolean masks keep the arrays 2-D even when no row matches and avoid a
# Python-level loop over the rows.
train_subset = d_train[d_train[:, -1] != 1]
test_subset = d_test[d_test[:, -1] != 1]
x_train, y_train = split_data_label(train_subset)
x_test, y_test = split_data_label(test_subset)

# Baseline: Gaussian naive Bayes trained on the data as-is.
cls = GaussianNB()

y_pred = cls.fit(x_train, y_train).predict(x_test)
acc = metrics.accuracy_score(y_test, y_pred)
print(acc)
print(metrics.confusion_matrix(y_test, y_pred))

# Ensemble: balanced bagging with random forests as the base estimator.
# The base estimator is passed positionally because its keyword was renamed
# (`base_estimator` -> `estimator`) across imbalanced-learn releases.
bbc = BalancedBaggingClassifier(
    RandomForestClassifier(random_state=RANDOM_STATE),
    random_state=RANDOM_STATE,
)

y_pred = bbc.fit(x_train, y_train).predict(x_test)
acc = metrics.accuracy_score(y_test, y_pred)
print(acc)
print(metrics.confusion_matrix(y_test, y_pred))

# Over-sampling with SMOTE, then Gaussian naive Bayes on the resampled data.
# NOTE: the original note here said "combine under- and over-sampling", but
# only SMOTE (over-sampling) is applied; SMOTEENN is imported but unused.
X_resampled, y_resampled = SMOTE(random_state=RANDOM_STATE).fit_resample(x_train, y_train)
print(sorted(Counter(y_resampled).items()))

cls = GaussianNB()

y_pred = cls.fit(X_resampled, y_resampled).predict(x_test)
acc = metrics.accuracy_score(y_test, y_pred)
print(acc)
print(metrics.confusion_matrix(y_test, y_pred))


0.779279279279
[[37  0  2  4  3]
 [ 0 32 12  3  0]
 [ 0  2 46  3  2]
 [ 1  1  7 24  3]
 [ 0  0  2  4 34]]
0.810810810811
[[38  0  2  4  2]
 [ 0 39  5  3  0]
 [ 0  4 44  4  1]
 [ 1  2  8 25  0]
 [ 0  0  3  3 34]]


[(0.0, 1441), (2.0, 1441), (3.0, 1441), (4.0, 1441), (5.0, 1441)]
0.765765765766
[[37  0  3  3  3]
 [ 0 37  7  3  0]
 [ 0  3 43  6  1]
 [ 0  2  6 25  3]
 [ 5  0  1  6 28]]


In [71]:
# Four-class experiment: drop samples labelled 1 (forest) or 4 (grassland).
RANDOM_STATE = 0  # fixed seed so the stochastic models below are reproducible

# A single boolean mask replaces the two row-by-row filtering passes and keeps
# the arrays 2-D even when no row matches.
train_labels = d_train[:, -1]
test_labels = d_test[:, -1]
train_subset = d_train[(train_labels != 1) & (train_labels != 4)]
test_subset = d_test[(test_labels != 1) & (test_labels != 4)]
x_train, y_train = split_data_label(train_subset)
x_test, y_test = split_data_label(test_subset)

# Baseline: Gaussian naive Bayes trained on the data as-is.
cls = GaussianNB()

y_pred = cls.fit(x_train, y_train).predict(x_test)
acc = metrics.accuracy_score(y_test, y_pred)
print(acc)
print(metrics.confusion_matrix(y_test, y_pred))

# Ensemble: balanced bagging with random forests as the base estimator.
# The base estimator is passed positionally because its keyword was renamed
# (`base_estimator` -> `estimator`) across imbalanced-learn releases.
bbc = BalancedBaggingClassifier(
    RandomForestClassifier(random_state=RANDOM_STATE),
    random_state=RANDOM_STATE,
)

y_pred = bbc.fit(x_train, y_train).predict(x_test)
acc = metrics.accuracy_score(y_test, y_pred)
print(acc)
print(metrics.confusion_matrix(y_test, y_pred))

# Over-sampling with SMOTE, then Gaussian naive Bayes on the resampled data.
# NOTE: the original note here said "combine under- and over-sampling", but
# only SMOTE (over-sampling) is applied; SMOTEENN is imported but unused.
X_resampled, y_resampled = SMOTE(random_state=RANDOM_STATE).fit_resample(x_train, y_train)
print(sorted(Counter(y_resampled).items()))

cls = GaussianNB()

y_pred = cls.fit(X_resampled, y_resampled).predict(x_test)
acc = metrics.accuracy_score(y_test, y_pred)
print(acc)
print(metrics.confusion_matrix(y_test, y_pred))


0.827956989247
[[37  0  6  3]
 [ 0 33 14  0]
 [ 0  2 49  2]
 [ 0  0  5 35]]
0.865591397849
[[38  0  4  4]
 [ 1 41  5  0]
 [ 0  3 47  3]
 [ 1  0  4 35]]


[(0.0, 1441), (2.0, 1441), (3.0, 1441), (5.0, 1441)]
0.811827956989
[[37  0  6  3]
 [ 0 36 11  0]
 [ 0  3 48  2]
 [ 5  0  5 30]]


In [76]:
# Two-class experiment: keep only samples labelled 1 (forest) or 4 (grassland).
RANDOM_STATE = 0  # fixed seed so the stochastic models below are reproducible

# Boolean masks keep the arrays 2-D even when no row matches and avoid a
# Python-level loop over the rows.
train_labels = d_train[:, -1]
test_labels = d_test[:, -1]
train_subset = d_train[(train_labels == 1) | (train_labels == 4)]
test_subset = d_test[(test_labels == 1) | (test_labels == 4)]
x_train, y_train = split_data_label(train_subset)
x_test, y_test = split_data_label(test_subset)

# Baseline: Gaussian naive Bayes trained on the data as-is.
cls = GaussianNB()

y_pred = cls.fit(x_train, y_train).predict(x_test)
acc = metrics.accuracy_score(y_test, y_pred)
print(acc)
print(metrics.confusion_matrix(y_test, y_pred))

# Ensemble: balanced bagging with random forests as the base estimator.
# The base estimator is passed positionally because its keyword was renamed
# (`base_estimator` -> `estimator`) across imbalanced-learn releases.
bbc = BalancedBaggingClassifier(
    RandomForestClassifier(random_state=RANDOM_STATE),
    random_state=RANDOM_STATE,
)

y_pred = bbc.fit(x_train, y_train).predict(x_test)
acc = metrics.accuracy_score(y_test, y_pred)
print(acc)
print(metrics.confusion_matrix(y_test, y_pred))

# Over-sampling with SMOTE, then Gaussian naive Bayes on the resampled data.
# NOTE: the original note here said "combine under- and over-sampling", but
# only SMOTE (over-sampling) is applied; SMOTEENN is imported but unused.
X_resampled, y_resampled = SMOTE(random_state=RANDOM_STATE).fit_resample(x_train, y_train)
print(sorted(Counter(y_resampled).items()))

cls = GaussianNB()

y_pred = cls.fit(X_resampled, y_resampled).predict(x_test)
acc = metrics.accuracy_score(y_test, y_pred)
print(acc)
print(metrics.confusion_matrix(y_test, y_pred))


0.798245614035
[[69  9]
 [14 22]]


0.736842105263
[[56 22]
 [ 8 28]]
[(1.0, 7431), (4.0, 7431)]
0.780701754386
[[65 13]
 [12 24]]
