In [None]:
# 导入基本的库
import numpy as np
import pandas as pd

In [None]:
import time

class Benchmark():
    """Context manager that prints the time spent inside a ``with`` block.

    Args:
        prefix: Optional label printed in front of the timing. When it is
            falsy (``None`` or ``''``) only ``time: ... sec`` is printed.

    Attributes:
        start: ``time.perf_counter()`` reading taken on entry (``None`` before
            the block is entered).
        elapsed: Seconds spent in the block, set on exit (``None`` until then).

    Example:
        with Benchmark('fit') as bench:
            ...
        # prints "fit time: 0.1234 sec"; bench.elapsed holds the float value
    """
    def __init__(self, prefix=None):
        self.prefix = prefix + ' ' if prefix else ''
        self.start = None
        self.elapsed = None

    def __enter__(self):
        # perf_counter is monotonic, so the measurement cannot be distorted
        # by system clock adjustments the way time.time() can.
        self.start = time.perf_counter()
        # Return self so that ``with Benchmark() as bench`` binds the instance.
        return self

    def __exit__(self, *args):
        # Returns None, so an exception raised inside the block still propagates.
        self.elapsed = time.perf_counter() - self.start
        print('%stime: %.4f sec' % (self.prefix, self.elapsed))

In [None]:
def show_data(line, columns=None):
    """Print one feature vector as a full list of ``feature name -> value`` pairs.

    Args:
        line: 1-D sequence of feature values (e.g. one row of ``X_test``).
        columns: Labels for the values in ``line``, one per value. When omitted,
            the columns of the notebook-level ``warm_df`` from position 3
            onward are used, so ``warm_df`` must exist by the time of the call.
    """
    if columns is None:
        columns = warm_df.iloc[:, 3:].columns
    tmp_df = pd.DataFrame([line])
    tmp_df.columns = columns
    # Lift the row limit only while printing; option_context restores the
    # previous 'display.max_rows' value instead of changing it for the whole
    # notebook session.
    with pd.option_context('display.max_rows', None):
        print(tmp_df.head().stack())

In [None]:
# Read the "name: type" pair of every feature, then append the label column
# as one more symbolic feature.
with open('data/features.txt', 'r') as fr:
    features = np.array([row.strip().split(': ') for row in fr])
features = np.concatenate([features.ravel(), ['label', 'symbolic.']]).reshape(-1, 2)
# Names (column 0) of the rows whose type (column 1) is symbolic.
symbolic_features = features[features[:, 1] == 'symbolic.', 0]
(features.shape, symbolic_features)

In [None]:
# Read the data file (it has no header row), naming the columns after the
# first column of `features`, then preview the first few samples.
data_df = pd.read_csv(
    'data/kddcup.data_10_percent',
    names=features[:, 0],
    header=None,
)
data_df.head(5)

In [None]:
# Not enough memory for everything: keep only the first 100,000 rows.
data_df.drop(index=data_df.index[100000:], inplace=True)

In [None]:
# The label distribution is extremely imbalanced, so merge the labels and
# work per attack category instead.
with open('data/labels.txt', 'r') as fr:
    labels = np.array([entry.strip().split(': ') for entry in fr])
# Map each raw label (first field) to its category (second field).
label_types = {row[0]: row[1] for row in labels}
data_df['label_type'] = [label_types[name] for name in data_df['label']]
data_df.groupby('label_type').size()

In [None]:
# Flag every record as normal traffic or an attack: anything whose label is
# not exactly 'normal.' counts as an attack.
data_df['label_attack'] = [
    'normal' if name == 'normal.' else 'attack'
    for name in data_df['label']
]
data_df.groupby('label_attack').size()

In [None]:
# A few categorical features were parsed as numbers by pandas; cast them to
# object dtype so they are treated as categories.
to_object = ['land', 'logged_in', 'is_host_login', 'is_guest_login']
data_df[to_object] = data_df[to_object].astype(object)
# Split the column names by dtype: object columns vs. everything else.
object_features = data_df.columns[data_df.dtypes.eq('object')]
numberic_features = data_df.columns[data_df.dtypes.ne('object')]

In [None]:
# One-hot encode the categorical features. `label`, `label_type` and
# `label_attack` are targets, not inputs, so they are excluded by name rather
# than by their position among the object columns.
label_columns = ['label', 'label_type', 'label_attack']
categorical_features = [name for name in object_features if name not in label_columns]
# Ask for an integer dtype explicitly: pandas >= 2.0 produces bool dummies by
# default, and mixing bool with numeric columns can turn `.values` of the
# joined frame into an object array.
object_features_one_hot = pd.get_dummies(data_df[categorical_features], dtype=np.uint8)
# Arrange the columns as labels + one-hot features + numeric features; later
# cells rely on the three label columns coming first.
warm_df = data_df[label_columns].join(object_features_one_hot).join(data_df[numberic_features])
warm_df.head()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# Fixed seed so that the split and the fitted tree are identical on every
# Restart & Run All; without it each run yields different results.
RANDOM_STATE = 42

# Every column after the three leading label columns is a model input.
feature_names = np.array(warm_df.columns[3:].tolist())

# Hold out 20% of the rows for testing; the target is the attack category.
X_train, X_test, y_train, y_test = train_test_split(
    warm_df[feature_names].values,
    warm_df['label_type'].values,
    test_size=0.2,
    random_state=RANDOM_STATE,
)


dt_clf = DecisionTreeClassifier(criterion='entropy',
                                min_samples_split=3,
                                min_samples_leaf=1,
                                max_depth=20,
                                random_state=RANDOM_STATE)
dt_clf.fit(X_train, y_train)

In [None]:
# Inspect the raw feature vector of one held-out sample.
X_test[1, :]

In [None]:
# Predict the class of a single test sample; reshape(1, -1) turns the row
# into the one-sample 2-D input that predict receives.
dt_clf.predict(X_test[20].reshape(1, -1))

In [None]:
# Show the one-element list of rows that was passed to predict above.
list(X_test[20:21])

In [None]:
# Time batch prediction for two batch sizes. Slicing clamps at len(X_train),
# so each timing is labelled with the number of rows actually predicted
# rather than the requested size.
for requested_rows in (50000, 100000):
    batch = X_train[:requested_rows]
    with Benchmark("predict %d rows" % len(batch)):
        dt_clf.predict(batch)



In [None]:
# Print every feature of the test sample at index 20 next to its column name.
show_data(line=X_test[20])