In [None]:
import utils
from net.resnet import *
import time
import pickle
from datetime import datetime

import torch
from torch import nn
from torch.utils.data import DataLoader
from torch.nn import functional as F

import sklearn
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder

In [None]:
# Location of the UNSW-NB15 CSV files (also reused below as the output
# directory for checkpoints and figures).
path = './UNSW-NB15/'
df_train = pd.read_csv(path + 'UNSW_NB15_training-set.csv')
df_test = pd.read_csv(path + 'UNSW_NB15_testing-set.csv')

# Stack the training rows on top of the test rows so both splits get encoded
# together. Each file's own row labels are kept (no re-indexing). The 'id' and
# 'label' columns are discarded.
df = pd.concat([df_train, df_test]).drop(columns=['id', 'label'])

# Separate the numeric columns from the non-numeric ones.
number_col = df.select_dtypes(include=['number']).columns
# Every non-numeric column except the target 'attack_cat' is a categorical feature.
cat_col = df.columns.difference(number_col).drop('attack_cat')
df_cat = df[cat_col].copy()

# One-hot encode the categorical features.
one_hot_data = pd.get_dummies(df_cat, columns=cat_col)

# Append the one-hot columns, then remove the raw categorical columns they replace.
one_hot_df = pd.concat([df, one_hot_data], axis=1).drop(columns=cat_col)

# utils.normalization receives a copy of the frame plus the numeric column
# labels; presumably it rescales exactly those columns (see utils to confirm).
normalized_df = utils.normalization(one_hot_df.copy(), number_col)

# Encode each attack category name as an integer class id.
labels = df[['attack_cat']]
label_encoder = LabelEncoder()
enc_label = labels.apply(label_encoder.fit_transform)

# Overwrite the textual target column with its integer encoding.
normalized_df['attack_cat'] = enc_label

# Number of distinct classes; used to size the classifier.
label_num = len(label_encoder.classes_)

#print(enc_label[enc_label['attack_cat'] == 0])

data = normalized_df

# Features are every column except the encoded target.
X = data.drop(columns=['attack_cat'])
y = data['attack_cat']

# The first len(df_train) rows came from the training CSV and the remainder
# from the test CSV, so the split is done by position.
n_train = df_train.shape[0]
X_train, X_test = X.iloc[:n_train], X.iloc[n_train:]
y_train, y_test = y.iloc[:n_train], y.iloc[n_train:]

# Wrap each split with the project's dataset helper.
train_data = utils.LoadData(X_train, y_train)
test_data = utils.LoadData(X_test, y_test)

# Seed the global RNGs before the model is built so that runs are as
# repeatable as the libraries allow: torch's generator drives whatever random
# initialisation/shuffling the model and training helper perform through it,
# and NumPy's global generator drives pandas sampling later in the notebook.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

batch_size = 256

# Use the first CUDA device when one is available, otherwise the CPU.
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

# ResNet is not defined in this notebook; presumably it comes from the
# `from net.resnet import *` star import. It is constructed with the class count.
resnet_model = ResNet(label_num)

epochs = 40
lr = 1e-4
# NOTE(review): momentum is not passed to the Adam optimizer below, so it has
# no effect in this cell; kept in case another cell or helper reads it.
momentum = 0.9
optimizer = torch.optim.Adam(resnet_model.parameters(), lr=lr)
loss_fn = nn.CrossEntropyLoss()

# Move the parameters to the chosen device (last expression: the cell output
# is the model's repr).
resnet_model.to(device=device)

In [None]:
# Train (or restore) the model on the full, unselected feature set.
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S_')
# Set old_model_name to a checkpoint file name under `path` to skip training
# and reuse previously saved weights; leave it empty to train from scratch.
old_model_name = ''
new_model_name = timestamp + 'resnet_model.pth'

if old_model_name:
    # map_location remaps the stored tensors onto the current device, so a
    # checkpoint written on a GPU machine can still be restored on a CPU-only one.
    # NOTE: torch.load unpickles the file - only load checkpoints you trust.
    state_dict = torch.load(path + old_model_name, map_location=device)
    resnet_model.load_state_dict(state_dict)
else:
    # Bound to `iters` rather than `iter` so the builtin iter() is not
    # shadowed for the rest of the kernel session.
    losses, iters = utils.train(train_data, batch_size, device, resnet_model, optimizer, loss_fn, epochs)
    torch.save(resnet_model.state_dict(), path + new_model_name)

    utils.loss_value_plot(losses, iters)
    # NOTE(review): if utils.loss_value_plot calls plt.show() itself, the
    # current figure may already be closed here and an empty image would be
    # written - confirm against utils.
    plt.savefig(path + timestamp + 'resnet_loss.png')

#utils.confusion_matrix_per_class_acc(test_data, batch_size, device, resnet_model, label_encoder.classes_)

In [None]:
# Evaluate the current model on test_data; the value returned by utils.test is
# displayed as the cell output (other cells unpack it as accuracy and loss).
utils.test(test_data, batch_size, device, resnet_model, loss_fn)

In [None]:
# Permutation feature importance, using the model trained on all features:
# shuffle one feature column of the test set at a time and record how much the
# test loss rises compared with the unshuffled baseline.
_acc, loss_ori = utils.test(test_data, batch_size, device, resnet_model, loss_fn)

# Dedicated seeded generator so the importance ranking is reproducible.
perm_rng = np.random.default_rng(0)

# Work on a private copy. X_test itself must stay untouched: it is reused
# later to build the selected-feature test set, and aliasing it here would
# leave every column shuffled (and make the shuffles accumulate across
# iterations instead of measuring one feature at a time).
X_test_perm = X_test.copy()

losses = []
for i in range(X_test_perm.shape[1]):
    original_values = X_test_perm.iloc[:, i].to_numpy(copy=True)
    # Assign a plain NumPy array: assigning a Series would be re-aligned by
    # index label, which only shuffles correctly for a 0..n-1 index.
    X_test_perm.iloc[:, i] = perm_rng.permutation(original_values)

    # Separate name so the notebook-level test_data is not overwritten with a
    # permuted dataset.
    permuted_data = utils.LoadData(X_test_perm, y_test)
    _acc, loss_i = utils.test(permuted_data, batch_size, device, resnet_model, loss_fn)
    losses.append((loss_i - loss_ori, i))
    print((loss_i - loss_ori, i))

    # Put the column back so the next iteration permutes exactly one feature.
    X_test_perm.iloc[:, i] = original_values

# Largest loss increase first, i.e. most important feature first.
losses.sort(key=lambda x: x[0], reverse=True)
print(losses)

# Persist the ranked (loss increase, column position) list to disk.
with open('losses.pkl', 'wb') as file:
    pickle.dump(losses, file)

In [None]:
# Load the persisted feature-importance ranking: a list of
# (loss increase, column position) tuples, sorted most important first.
# NOTE: pickle.load can execute arbitrary code - only open a losses.pkl that
# this notebook wrote itself.
with open('losses.pkl', 'rb') as file:
    losses = pickle.load(file)

# Bar chart of the importances in ranked order.
# NOTE(review): 'Microsoft YaHei' is needed for the CJK labels below but is
# typically only installed on Windows; on other systems the labels may not
# render - confirm the target environment.
plt.rcParams['font.family'] = 'Microsoft YaHei'
plt.bar(range(len(losses)), [x[0] for x in losses])
plt.title('Permutation feature importance 排序图')
plt.xlabel('特征')
plt.ylabel('重要性')
plt.xticks(rotation=60)

plt.show()

# Keep the column positions of the k most important features. Slicing (rather
# than indexing 0..k-1) simply keeps every feature when fewer than k exist,
# instead of raising IndexError.
k = 50
k_index = [feature_pos for _delta, feature_pos in losses[:k]]

# Datasets restricted to the selected features; .iloc with a list of
# positions already returns a new DataFrame.
selected_X_train = X_train.iloc[:, k_index]
selected_X_test = X_test.iloc[:, k_index]

In [None]:
# Train a new model using only the selected features.
epochs = 40
train_data = utils.LoadData(selected_X_train, y_train)
test_data = utils.LoadData(selected_X_test, y_test)

# Start from a freshly constructed network and a fresh optimizer. Reusing the
# model that was already trained on the full feature set (together with Adam's
# accumulated state) would continue training the old model rather than train a
# new one, which makes the two results incomparable.
# The notebook-level names are rebound on purpose so that the evaluation cells
# that follow operate on the newly trained model.
resnet_model = ResNet(label_num).to(device)
optimizer = torch.optim.Adam(resnet_model.parameters(), lr=lr)

# `iters` rather than `iter`, so the builtin iter() is not shadowed.
losses, iters = utils.train(train_data, batch_size, device, resnet_model, optimizer, loss_fn, epochs)
acc, loss_i = utils.test(test_data, batch_size, device, resnet_model, loss_fn)

In [None]:
# Re-run the evaluation on the current test_data so the result of utils.test
# is displayed as this cell's output.
utils.test(test_data, batch_size, device, resnet_model, loss_fn)

In [None]:
# Per-class evaluation of the current model on test_data; the class names are
# taken from the fitted label encoder.
# NOTE(review): going by the helper's name this presumably produces a confusion
# matrix and per-class accuracy - confirm in utils.
utils.confusion_matrix_per_class_acc(test_data, batch_size, device, resnet_model, label_encoder.classes_)