In [None]:
# Mount Google Drive into the Colab runtime at /content/drive so the project
# files stored on Drive can be read by the cells below.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Make the project folder on Drive the working directory; the relative paths
# used later in this notebook (./train.csv, ./valid.csv, ...) resolve against it.
%cd '/content/drive/MyDrive/security'

/content/drive/MyDrive/security


In [None]:
from Utils import open_pickle
import pandas as pd
import numpy as np

In [None]:
# Locations of the training and validation CSVs, relative to the working directory.
train_path = r"./train.csv"
valid_path = r"./valid.csv"

# Read both files in one pass over the path pair.
train_df, valid_df = (pd.read_csv(csv_path) for csv_path in (train_path, valid_path))

In [None]:
# One indicator column per flag letter: 1 when the letter occurs in the row's
# "Flags" string, 0 otherwise (presumably the TCP flag initials — TODO confirm).
table = ['C', 'E', 'U', 'A', 'P', 'R', 'S', 'F']
for flag in table:
    for frame in (train_df, valid_df):
        has_flag = frame["Flags"].str.contains(flag)
        # NOTE(review): with an object-dtype "Flags" column, a missing value makes
        # str.contains return NaN, which np.where treats as truthy, so such a row
        # gets 1 in every flag column. If that is unintended, pass na=False here
        # AND in every other cell that encodes flags, so features stay consistent.
        frame[flag] = np.where(has_flag, np.int64(1), np.int64(0))

In [None]:
def add_protocol_indicators(frame):
    """Add 'TCP', 'UDP' and '0' indicator columns to ``frame`` in place.

    Each new column holds 1 on the rows whose "Protocol" value equals the
    column name and 0 on every other row. The columns are appended in the
    order TCP, UDP, 0.

    Args:
        frame: DataFrame with a "Protocol" column of strings.

    Returns:
        None. ``frame`` is modified in place (returning nothing also keeps the
        notebook from echoing the whole frame as cell output).
    """
    for protocol in ('TCP', 'UDP', '0'):
        frame[protocol] = (frame["Protocol"] == protocol).astype(np.int64)


# Encode the Protocol column for the training and the validation frame with
# the same helper instead of two copy-pasted code paths.
add_protocol_indicators(train_df)
add_protocol_indicators(valid_df)

In [None]:
# Encode the string labels as integers: benign -> 0, attack -> 1.
# The column is assigned back instead of calling
# `df["Label"].replace(..., inplace=True)`: that form mutates an intermediate
# Series and no longer updates the frame under pandas Copy-on-Write.
# Values that are not in the mapping (e.g. already-encoded 0/1 when the cell is
# re-run) are passed through unchanged.
label_codes = {'Benign_IP': 0, 'Attack_IP': 1}
for frame in (train_df, valid_df):
    frame["Label"] = frame["Label"].map(lambda label: label_codes.get(label, label))

# Pool both files, shuffle, and carve out a fresh split.
# The shuffle is seeded so the split, and every metric computed from it, is
# reproducible across runs (the original shuffle was unseeded).
shuffle_seed = 123
shuffled_df = pd.concat([train_df, valid_df], ignore_index=True)
shuffled_df = shuffled_df.sample(frac=1, random_state=shuffle_seed).reset_index(drop=True)

# First 10000 shuffled rows form the validation set, the remainder the training set.
# valid_df therefore keeps index labels 0..9999.
valid_df = shuffled_df.iloc[:10000]
train_df = shuffled_df.iloc[10000:]

train_y = train_df["Label"].values.tolist()
valid_y = valid_df["Label"].values.tolist()

# Columns that are identifiers, raw categorical text, or the target; everything
# else is used as a model feature.
non_feature_cols = ["Src IP", "Dst IP", "Protocol", "Timestamp", "Flags", "Label"]
train_list = train_df.drop(columns=non_feature_cols).values.tolist()
valid_list = valid_df.drop(columns=non_feature_cols).values.tolist()

In [None]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Standardise the features with statistics learned from the training rows only.
# The validation rows are transformed with that same fitted scaler, so both sets
# share one feature scale. (The original cell also fitted a second scaler on the
# validation rows that was never used; it has been removed.)
scaler = StandardScaler()

scaler.fit(train_list)
train_X = scaler.transform(train_list)
valid_X = scaler.transform(valid_list)

In [None]:
# Cast the scaled feature matrices and the label lists to float32 ndarrays.
train_X, valid_X = (features.astype(np.float32) for features in (train_X, valid_X))
train_y, valid_y = (np.array(labels, dtype=np.float32) for labels in (train_y, valid_y))

In [None]:
from xgboost import XGBClassifier

# Hyperparameters of the binary flow classifier, collected in one place so they
# can be inspected or logged before the estimator is built.
# NOTE(review): XGBoost 2.0+ deprecates tree_method='gpu_hist' and gpu_id in
# favour of tree_method='hist' with device='cuda' — confirm the installed
# version before changing these.
xgb_params = {
    "learning_rate": 0.1,
    "n_estimators": 1500,
    "max_depth": 10,
    "min_child_weight": 1,
    "gamma": 0,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "objective": 'binary:logistic',
    "nthread": -1,
    "scale_pos_weight": 1,
    "seed": 123,
    "tree_method": 'gpu_hist',
    "gpu_id": 0,
}

model = XGBClassifier(**xgb_params)

In [None]:
# Train on the scaled training rows; the value returned by fit is kept under the
# name `xgboost`, which the prediction cells below use.
xgboost = model.fit(X=train_X, y=train_y)

In [None]:
# Per-flow class probabilities for the validation rows; column 1 is the
# probability of the positive class (label 1).
valid_proba = xgboost.predict_proba(valid_X)
pred_y = valid_proba[:, 1]

In [None]:
# Integer ground-truth labels and probabilities rounded to the nearest class.
round_predict = pred_y.round()
round_valid_y = valid_y.astype(int)

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Flow-level evaluation on the validation split.
print("confusion_matrix\n", confusion_matrix(round_valid_y, round_predict))

# (printed label, scoring function) pairs; the tabs in the labels align the output.
flow_metrics = (
    ("accuracy\t", accuracy_score),
    ("precision\t", precision_score),
    ("recall\t\t", recall_score),
    ("F-1\t\t", f1_score),
)
for metric_label, metric_fn in flow_metrics:
    print(f"{metric_label}: {metric_fn(round_valid_y, round_predict): .5f}")

confusion_matrix
 [[5740   53]
 [  27 4180]]
accuracy	:  0.99200
precision	:  0.98748
recall		:  0.99358
F-1		:  0.99052


In [None]:
# Set of benign external IPs.
# NOTE(review): open_pickle comes from the local Utils module (not shown here);
# presumably it unpickles the file — only load pickles from a trusted source.
outer_benignIP_set = open_pickle("./outer_benign_IP.pkl")
# Set of malicious external IPs.
outer_malIP_set = open_pickle("./outer_mal_IP.pkl")

In [None]:
# Convert the two IP sets into one dictionary for Problem #3:
# benign IPs map to 0, malicious IPs map to 1. The malicious entries are
# applied second, so an IP present in both sets ends up labelled 1.
labelIP_dict = dict.fromkeys(outer_benignIP_set, 0)
labelIP_dict.update(dict.fromkeys(outer_malIP_set, 1))

In [None]:
from collections import defaultdict

# A flow whose predicted probability is at or above this threshold marks its
# labelled endpoint IPs as malicious.
theta = 0.9

# Endpoint IPs as plain arrays so rows are walked by position; this does not
# depend on valid_df's index labels being 0..N-1.
src_ips = valid_df['Src IP'].to_numpy()
dst_ips = valid_df['Dst IP'].to_numpy()

# pred_y is reassigned by the test cell further down; fail loudly if this cell
# is re-run with predictions that do not belong to valid_df.
if len(pred_y) != len(valid_df):
    raise ValueError(
        f"pred_y has {len(pred_y)} entries but valid_df has {len(valid_df)} rows"
    )

# IP -> predicted label (0 benign, 1 malicious). A plain dict replaces the
# original factory-less defaultdict(), which behaved like a dict anyway.
predictIP_dict = {}

# Every labelled external IP seen in the validation flows starts as benign
# (source addresses first, then destination addresses).
for ip_column in (src_ips, dst_ips):
    for ip in ip_column:
        if ip in labelIP_dict:
            predictIP_dict[ip] = 0

# A single flow at or above the threshold flips its labelled endpoints to malicious.
for src_ip, dst_ip, pred in zip(src_ips, dst_ips, pred_y):
    if pred >= theta:
        for ip in (src_ip, dst_ip):
            if ip in labelIP_dict:
                predictIP_dict[ip] = 1

In [None]:
# Evaluate IP-level performance by comparing the predicted IP dictionary with
# the ground-truth IP dictionary, over the IPs that received a prediction.
evaluated_ips = list(predictIP_dict)
pred_IP = [predictIP_dict[ip] for ip in evaluated_ips]
label_IP = [labelIP_dict[ip] for ip in evaluated_ips]

print("confusion_matrix\n", confusion_matrix(label_IP, pred_IP))

# (printed label, scoring function) pairs; the tabs in the labels align the output.
ip_metrics = (
    ("accuracy\t", accuracy_score),
    ("precision\t", precision_score),
    ("recall\t\t", recall_score),
    ("F-1\t\t", f1_score),
)
for metric_label, metric_fn in ip_metrics:
    print(f"{metric_label}: {metric_fn(label_IP, pred_IP): .5f}")

confusion_matrix
 [[363   6]
 [  2 292]]
accuracy	:  0.98793
precision	:  0.97987
recall		:  0.99320
F-1		:  0.98649


### Test

In [None]:
# External IPs for which a prediction has to be produced.
outer_ip_set = open_pickle("./outer_ip_set.pkl")

test_path = r"./project2_test.csv"
test_df = pd.read_csv(test_path)

# --- Feature engineering: must mirror the encoding applied to the training data ---

# One indicator column per flag letter: 1 when the letter occurs in "Flags".
table = ['C', 'E', 'U', 'A', 'P', 'R', 'S', 'F']
for flag in table:
    # NaN handling is intentionally left identical to the training cell so the
    # train and test features are encoded the same way.
    test_df[flag] = np.where(test_df["Flags"].str.contains(flag), np.int64(1), np.int64(0))

# One indicator column per protocol value.
for protocol in ('TCP', 'UDP', '0'):
    test_df[protocol] = (test_df["Protocol"] == protocol).astype(np.int64)

test_list = test_df.drop(columns = ["Src IP", "Dst IP", "Protocol", "Timestamp", "Flags"]).values.tolist()

# --- Scaling and flow-level prediction ---

# Reuse the scaler fitted on the training data so the test features are on the
# scale the model was trained with. (The original cell also fitted a separate
# scaler on the test rows that was never used; it has been removed.)
test_X = scaler.transform(test_list).astype(np.float32)

# Probability of the positive class (label 1) for every test flow.
pred_y = xgboost.predict_proba(test_X)[:,1]

# --- Aggregate flow predictions to one label per external IP ---

from collections import defaultdict

# A flow whose predicted probability is at or above this threshold marks its
# external endpoint IPs as malicious.
theta = 0.9

# Endpoint IPs as plain arrays so rows are walked by position rather than
# through per-row label lookups on the Series.
src_ips = test_df['Src IP'].to_numpy()
dst_ips = test_df['Dst IP'].to_numpy()

if len(pred_y) != len(test_df):
    raise ValueError(
        f"pred_y has {len(pred_y)} entries but test_df has {len(test_df)} rows"
    )

# IP -> predicted label (0 benign, 1 malicious). A plain dict replaces the
# original factory-less defaultdict(), which behaved like a dict anyway.
predictIP_dict = {}

# Every external IP seen in the test flows starts as benign
# (source addresses first, then destination addresses).
for ip_column in (src_ips, dst_ips):
    for ip in ip_column:
        if ip in outer_ip_set:
            predictIP_dict[ip] = 0

# A single flow at or above the threshold flips its external endpoints to malicious.
for src_ip, dst_ip, pred in zip(src_ips, dst_ips, pred_y):
    if pred >= theta:
        for ip in (src_ip, dst_ip):
            if ip in outer_ip_set:
                predictIP_dict[ip] = 1

In [None]:
# One row per predicted IP with its 0/1 label. Passing `columns=` at construction
# keeps the two headers even when predictIP_dict is empty; assigning
# `res.columns` afterwards raised a length-mismatch error in that case.
res = pd.DataFrame(list(predictIP_dict.items()), columns=['IP', 'Prediction'])
res.to_csv("result.csv", index = False)

# Hand the CSV to the browser for download (Colab only).
from google.colab import files
files.download("result.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>