In [None]:
# 导入基本的库
import numpy as np
import pandas as pd

In [None]:
# Load the feature names and their types, then add an entry for the label column.
with open('data/features.txt', 'r') as fr:
    feature_rows = [raw.strip().split(': ') for raw in fr]
# The label column is treated as a symbolic (categorical) field.
feature_rows.append(['label', 'symbolic.'])
features = np.array(feature_rows)
# Names of all fields whose declared type is 'symbolic.'.
is_symbolic = features[:, 1] == 'symbolic.'
symbolic_features = features[is_symbolic][:, 0]
features.shape, symbolic_features

In [None]:
# Read the data file (it has no header row; column names come from the first
# column of `features`) and show the first few samples.
column_names = features[:, 0]
data_df = pd.read_csv('data/kddcup.data_10_percent', names=column_names, header=None)
data_df.head()

In [None]:
# Memory is limited, so keep only the first 100,000 rows (dropped in place).
surplus_rows = data_df.index[100000:]
data_df.drop(index=surplus_rows, inplace=True)

In [None]:
# Check the size of the data as (rows, columns).
data_df.shape

In [None]:
# There are too many columns to view at once, so inspect them in batches to
# get a feel for the data. First batch: columns 0-13.
data_df.iloc[:, :14].head()

In [None]:
# Second batch: columns 14-27.
data_df.iloc[:, 14:28].head()

In [None]:
# Final batch: columns 28 onward.
data_df.iloc[:, 28:].head()

In [None]:
# Inspect the label distribution: number of rows per distinct `label` value.
data_df.groupby(['label']).size()

In [None]:
# The label distribution is extremely imbalanced, so merge the labels into
# coarser attack categories using the "label: category" pairs in labels.txt.
with open('data/labels.txt', 'r') as fr:
    labels = np.array([raw.strip().split(': ') for raw in fr])
# Lookup table: raw label -> attack category.
label_types = {row[0]: row[1] for row in labels}
# Strict lookup: a label missing from labels.txt raises KeyError.
data_df['label_type'] = data_df['label'].map(lambda name: label_types[name])
data_df.groupby('label_type').size()

In [None]:
# Binary target: every label other than 'normal.' counts as an attack.
is_attack = data_df['label'] != 'normal.'
data_df['label_attack'] = np.where(is_attack, 'attack', 'normal')
data_df.groupby('label_attack').size()

In [None]:
# Which columns are categorical (dtype object)? Do they match the symbolic
# features listed in features.txt?
data_df.columns[data_df.dtypes == 'object'], symbolic_features

In [None]:
# A few categorical features were read by pandas as numeric columns;
# convert them to object dtype so they are handled as categories.
to_object = ['land', 'logged_in', 'is_host_login', 'is_guest_login']
data_df[to_object] = data_df[to_object].astype('object')
# Partition the columns into categorical (object) and numeric with one mask.
is_object_column = data_df.dtypes == 'object'
object_features = data_df.columns[is_object_column]
numberic_features = data_df.columns[~is_object_column]
symbolic_features, object_features, numberic_features

In [None]:
# Summarize the categorical columns; the `unique` counts indicate how many
# columns the one-hot encoding will produce.
data_df[object_features].describe()

In [None]:
# One-hot encode the categorical features. label, label_type and label_attack
# are targets rather than features, so they are excluded -- by name, not by
# position, so the result does not depend on the column order of data_df.
label_columns = ['label', 'label_type', 'label_attack']
categorical_features = [col for col in object_features if col not in label_columns]
# Pin the dummy dtype to uint8: newer pandas defaults to bool dummies, and
# mixing bool with numeric columns makes `.values` fall back to an object array.
object_features_one_hot = pd.get_dummies(data_df[categorical_features], dtype=np.uint8)
# Arrange the columns as labels + one-hot features + numeric features; this is
# the data set used for training and testing.
warm_df = data_df[label_columns].join(object_features_one_hot).join(data_df[numberic_features])
warm_df.head()

In [None]:
# Use the attack category (label_type) as the decision-tree target and split
# the data set into training and test data.
from sklearn.model_selection import train_test_split

# The first three columns of warm_df are labels; everything after is a feature.
feature_names = np.array(warm_df.columns[3:].tolist())

# A fixed seed keeps the split, and every score derived from it, reproducible
# across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    warm_df[feature_names].values,
    warm_df['label_type'].values,
    test_size=0.2,
    random_state=42,
)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

# Candidate values for each hyper-parameter.
params_min_samples_split = [3, 5, 7, 9]
params_min_samples_leaf = [1, 2, 4]
params_max_depth = np.arange(10, 51, 5)

# Build the decision tree and tune it with a 5-fold cross-validated grid search,
# scored by weighted F1.
parameters = {
    'min_samples_split': params_min_samples_split,
    'min_samples_leaf': params_min_samples_leaf,
    'max_depth': params_max_depth,
}
# Seed the tree: scikit-learn permutes the features at every split, so without
# a fixed random_state tied splits (and hence the selected parameters) can
# differ between runs on identical data.
dt_clf = DecisionTreeClassifier(criterion='entropy', random_state=42)
model = GridSearchCV(dt_clf, parameters, cv=5, scoring='f1_weighted', n_jobs=4)
model.fit(X_train, y_train)

# Print the best parameter combination found.
print(model.best_params_)

In [None]:
# Pull the best parameters out of the grid search so the model can be retrained with them.
# (The original cell kept commented-out literals min_samples_split=3,
# min_samples_leaf=1, max_depth=20 -- presumably the result of an earlier run.)
best_params = model.best_params_
best_min_samples_split, best_min_samples_leaf, best_max_depth = (
    best_params[key]
    for key in ('min_samples_split', 'min_samples_leaf', 'max_depth')
)

In [None]:
from sklearn.metrics import classification_report

# Retrain a tree on the full training set with the selected parameters.
# random_state is fixed so the fitted tree (and the report below) is
# reproducible; scikit-learn otherwise breaks ties between splits randomly.
dt_clf = DecisionTreeClassifier(criterion='entropy',
                                min_samples_split=best_min_samples_split,
                                min_samples_leaf=best_min_samples_leaf,
                                max_depth=best_max_depth,
                                random_state=42)
dt_clf.fit(X_train, y_train)

# Evaluate on the held-out test set.
predictions = dt_clf.predict(X_test)
print(classification_report(y_test, predictions))

In [None]:
# With the same parameters, how does the tree perform when the target is only
# whether a sample is an attack (label_attack)?

# Both the split and the tree are seeded so this experiment is reproducible.
X_train_1, X_test_1, y_train_1, y_test_1 = train_test_split(
    warm_df[feature_names].values,
    warm_df['label_attack'].values,
    test_size=0.2,
    random_state=42,
)
dt_clf_1 = DecisionTreeClassifier(criterion='entropy',
                                  min_samples_split=best_min_samples_split,
                                  min_samples_leaf=best_min_samples_leaf,
                                  max_depth=best_max_depth,
                                  random_state=42)
dt_clf_1.fit(X_train_1, y_train_1)
predictions = dt_clf_1.predict(X_test_1)
print(classification_report(y_test_1, predictions))

In [None]:
# With the same parameters, how does the tree perform when the target is the
# full set of raw labels (label)?

# Both the split and the tree are seeded so this experiment is reproducible.
X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(
    warm_df[feature_names].values,
    warm_df['label'].values,
    test_size=0.2,
    random_state=42,
)
dt_clf_2 = DecisionTreeClassifier(criterion='entropy',
                                  min_samples_split=best_min_samples_split,
                                  min_samples_leaf=best_min_samples_leaf,
                                  max_depth=best_max_depth,
                                  random_state=42)
dt_clf_2.fit(X_train_2, y_train_2)
predictions = dt_clf_2.predict(X_test_2)
print(classification_report(y_test_2, predictions))

In [None]:
from IPython.display import Image
from sklearn import tree
import pydotplus

# Render the label_type decision tree.
# export_graphviz pairs class_names with the classes by position, in ascending
# class order -- the order of dt_clf.classes_. Using the order in which labels
# first appear in the data frame would attach wrong class names to the nodes.
dot_data = tree.export_graphviz(dt_clf, out_file=None,
                                feature_names=[str(name) for name in feature_names],
                                class_names=[str(name) for name in dt_clf.classes_],
                                filled=True, rounded=True,
                                special_characters=True)
graph = pydotplus.graph_from_dot_data(dot_data)
# Save a PDF copy and display the tree inline as a PNG.
graph.write_pdf("kdd99.label_type.pdf")
Image(graph.create_png())