In [171]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder


date_cols = ['Stime', 'Ltime']
fields1 = ['ct_flw_http_mthd', 'is_ftp_login', 'ct_ftp_cmd', 'Stime', 'Ltime']
fields2 = ['is_ftp_login', 'ct_ftp_cmd', 'Stime', 'Ltime']
fields3 = ['ct_ftp_cmd', 'Stime', 'Ltime']
cols = [
    'srcip',
    'sport',
    'dstip',
    'dsport',
    'proto',
    'state',
    'dur',
    'sbytes',
    'dbytes',
    'sttl',
    'dttl',
    'sloss',
    'dloss',
    'service',
    'Sload',
    'Dload',
    'Spkts',
    'Dpkts',
    'swin',
    'dwin',
    'stcpb',
    'dtcpb',
    'smeansz',
    'dmeansz',
    
    'trans_depth',
    'res_bdy_len',
    'Sjit',
    'Djit',
    'Stime',
    'Ltime',
    'Sintpkt',
    'Dintpkt',
    'tcprtt',
    'synack',
    'ackdat',
    'is_sm_ips_ports',
    'ct_state_ttl',
    
#     'ct_flw_http_mthd',
#     'is_ftp_login',
#     'ct_ftp_cmd',
    
    'ct_srv_src',
    'ct_srv_dst',
    
#     'ct_dst_ltm',
#     'ct_src_ltm',
    
    'ct_src_dport_ltm',
    'ct_dst_sport_ltm',
    'ct_dst_src_ltm',
    'attack_cat',
    'Label',
]
fields5 = ['attack_cat', 'Stime', 'Ltime']


def strip_int(text):
    return int(text.strip(' '))

def ip_to_int(ip):
    z = ''
    parts = ip.split('.')
    for prt in range(3):
        if len(parts[prt]) == 3:
            z = parts[prt] + z
        elif len(parts[prt]) == 2:
            z = '0' + parts[prt] + z
        else:
            z = '00' + parts[prt] + z
    return int(z)

def encode_attack_cat(cat):
    mapping = {
        'Generic': 1,
        'Fuzzers': 2,
        'Exploits': 3,
        'DoS': 4,
        'Reconnaissance': 5,
        'Backdoor': 6,
        'Backdoors': 6,
        'Analysis': 7,
        'Shellcode': 8,
        'Worms': 9,
    }
    if (not cat):
        return 0
    return mapping[cat.strip(' ')]

def decode_attack_cat(encoded_cat):
    mapping = {
        0: '',
        1: 'Generic',
        2: 'Fuzzers',
        3: 'Exploits',
        4: 'DoS',
        5: 'Reconnaissance',
        6: 'Backdoors',
        7: 'Analysis',
        8: 'Shellcode',
        9: 'Worms',
    }
    return mapping[encoded_cat]

def encode_state(state):
    mapping = {
        'CON': 0,
        'FIN': 1,
        'INT': 2,
        'REQ': 3,
        'URH': 4,
        'RST': 5,
        'ECR': 6,
        'ECO': 7,
        'CLO': 8,
        'PAR': 9,
        'ACC': 10,
        'URN': 11,
        'MAS': 12,
    }
    return mapping[state]

def sanitize_port(port):
    return int(port.replace('-', 0))

converters = {
    'srcip': ip_to_int,
    'dstip': ip_to_int,
}

# Read csv
df = pd.read_csv(
    '../data/UNSW-NB15-BALANCED-TRAIN.csv',
    parse_dates=date_cols,
    usecols=cols,
    skipinitialspace=True,
    converters=converters
)

df['sport'] = df['sport'].replace('-', 0)
df['dsport'] = df['dsport'].replace('-', 0)
df['sport'] = df['sport'].apply(lambda x: int(str(x), 16))
df['dsport'] = df['dsport'].apply(lambda x: int(str(x), 16))
df['service'] = df['service'].replace('-', '')


le = LabelEncoder()
df['proto'] = le.fit_transform(df['proto'])
df['state'] = le.fit_transform(df['state'])
df['service'] = le.fit_transform(df['service'])
df['attack_cat'] = le.fit_transform(df['attack_cat'])

df.head()

  df = pd.read_csv(


Unnamed: 0,srcip,sport,dstip,dsport,proto,state,dur,sbytes,dbytes,sttl,...,ackdat,is_sm_ips_ports,ct_state_ttl,ct_srv_src,ct_srv_dst,ct_src_dport_ltm,ct_dst_sport_ltm,ct_dst_src_ltm,attack_cat,Label
0,166059,349328,126171149,83,119,2,0.00105,146,178,31,...,0.0,0,0,3,4,1,1,1,13,0
1,166059,157526,126171149,33,113,5,4.264797,2934,3738,31,...,0.000134,0,0,5,5,5,5,8,13,0
2,176045175,4163,126171149,83,119,6,3e-06,114,0,254,...,0.0,0,2,17,17,14,14,17,7,1
3,176045175,4163,126171149,83,119,6,3e-06,114,0,254,...,0.0,0,2,25,25,25,25,25,7,1
4,166059,214130,126171149,66601,113,5,0.052957,3718,43086,31,...,0.00013,0,0,5,8,1,1,2,13,0


In [162]:
df.describe()

Unnamed: 0,srcip,sport,dstip,dsport,proto,state,dur,sbytes,dbytes,sttl,...,ackdat,is_sm_ips_ports,ct_state_ttl,ct_srv_src,ct_srv_dst,ct_src_dport_ltm,ct_dst_sport_ltm,ct_dst_src_ltm,attack_cat,Label
count,449796.0,449796.0,449796.0,449796.0,449796.0,449796.0,449796.0,449796.0,449796.0,449796.0,...,449796.0,449796.0,449796.0,449796.0,449796.0,449796.0,449796.0,449796.0,449796.0,449796.0
mean,96469510.0,166478.437687,128439900.0,42955.83,114.453261,5.085528,0.694259,5092.681,22776.25,138.566168,...,0.006775,0.000972,0.931353,13.858387,13.683721,8.416918,6.496405,12.401215,9.646331,0.5
std,85709290.0,139797.213314,12316860.0,1140547.0,12.58373,1.293619,13.697272,115707.7,144940.4,109.317462,...,0.026773,0.031155,0.98252,13.818018,13.880075,11.03397,7.860205,14.575122,3.540252,0.500001
min,166059.0,0.0,224.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0
25%,166059.0,4163.0,126171100.0,83.0,113.0,5.0,8e-06,114.0,0.0,31.0,...,0.0,0.0,0.0,3.0,3.0,1.0,1.0,1.0,7.0,0.0
50%,176045200.0,161571.0,126171100.0,83.0,119.0,5.0,0.001058,264.0,178.0,62.0,...,0.0,0.0,1.0,7.0,7.0,2.0,1.0,4.0,12.5,0.5
75%,176045200.0,291897.0,126171100.0,5923.0,119.0,6.0,0.098976,2054.0,3276.0,254.0,...,0.000136,0.0,2.0,24.0,24.0,16.0,13.0,24.0,13.0,1.0
max,241168200.0,415029.0,241168200.0,538989300.0,132.0,12.0,8760.777344,14355770.0,14657530.0,255.0,...,3.551121,1.0,6.0,67.0,67.0,67.0,60.0,67.0,13.0,1.0


In [163]:
df.values

array([[166059, 349328, 126171149, ..., 1, 13, 0],
       [166059, 157526, 126171149, ..., 8, 13, 0],
       [176045175, 4163, 126171149, ..., 17, 7, 1],
       ...,
       [166059, 201106, 126171149, ..., 1, 13, 0],
       [176045175, 4163, 126171149, ..., 41, 7, 1],
       [166059, 354344, 126171149, ..., 1, 13, 0]], dtype=object)

In [218]:
# Correlation / Co-variance
threshold = 0.95

# Absolute value correlation matrix
corr_matrix = X_train.corr().abs()
corr_matrix.head()

  corr_matrix = X_train.corr().abs()


Unnamed: 0,srcip,sport,dstip,dsport,proto,state,dur,sbytes,dbytes,sttl,...,tcprtt,synack,ackdat,is_sm_ips_ports,ct_state_ttl,ct_srv_src,ct_srv_dst,ct_src_dport_ltm,ct_dst_sport_ltm,ct_dst_src_ltm
srcip,1.0,0.222034,0.069884,0.026011,0.016012,0.629746,0.027422,0.004483,0.143386,0.906766,...,0.226278,0.199044,0.229962,0.028536,0.880653,0.531331,0.532702,0.538267,0.581248,0.595763
sport,0.222034,1.0,0.020551,0.005123,0.142196,0.186677,0.039229,0.008771,0.037608,0.22186,...,0.091341,0.08355,0.089232,0.036666,0.238567,0.147156,0.14967,0.163015,0.205147,0.164443
dstip,0.069884,0.020551,1.0,0.006534,0.075958,0.116439,0.091786,0.009526,0.028709,0.124582,...,0.046315,0.041724,0.045963,0.118612,0.165665,0.23941,0.239318,0.236287,0.250846,0.24376
dsport,0.026011,0.005123,0.006534,1.0,0.02017,0.006027,0.00366,0.000549,0.000934,0.023362,...,0.004117,0.003585,0.004225,0.000958,0.021802,0.015872,0.016526,0.020258,0.021071,0.021173
proto,0.016012,0.142196,0.075958,0.02017,1.0,0.073926,0.210411,0.006479,0.017822,0.016613,...,0.029046,0.026169,0.028823,0.265169,0.021316,0.231091,0.231724,0.212455,0.216525,0.219977


In [219]:
# Select upper triangle of correlation matrix
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
upper.head()

Unnamed: 0,srcip,sport,dstip,dsport,proto,state,dur,sbytes,dbytes,sttl,...,tcprtt,synack,ackdat,is_sm_ips_ports,ct_state_ttl,ct_srv_src,ct_srv_dst,ct_src_dport_ltm,ct_dst_sport_ltm,ct_dst_src_ltm
srcip,,0.222034,0.069884,0.026011,0.016012,0.629746,0.027422,0.004483,0.143386,0.906766,...,0.226278,0.199044,0.229962,0.028536,0.880653,0.531331,0.532702,0.538267,0.581248,0.595763
sport,,,0.020551,0.005123,0.142196,0.186677,0.039229,0.008771,0.037608,0.22186,...,0.091341,0.08355,0.089232,0.036666,0.238567,0.147156,0.14967,0.163015,0.205147,0.164443
dstip,,,,0.006534,0.075958,0.116439,0.091786,0.009526,0.028709,0.124582,...,0.046315,0.041724,0.045963,0.118612,0.165665,0.23941,0.239318,0.236287,0.250846,0.24376
dsport,,,,,0.02017,0.006027,0.00366,0.000549,0.000934,0.023362,...,0.004117,0.003585,0.004225,0.000958,0.021802,0.015872,0.016526,0.020258,0.021071,0.021173
proto,,,,,,0.073926,0.210411,0.006479,0.017822,0.016613,...,0.029046,0.026169,0.028823,0.265169,0.021316,0.231091,0.231724,0.212455,0.216525,0.219977


In [220]:
# Drop duplicate columns by finding their correlations between each other
to_drop_raw = []

# Column for column in upper.columns if any upper[column] > threshold
for column in upper.columns:
    for index, correlation in enumerate(upper[column]):
        if (correlation > threshold):
            to_drop_raw.append(upper.columns[index])
            
to_drop = list(set(to_drop_raw))
to_drop

['dbytes', 'swin', 'ct_srv_src', 'tcprtt', 'dloss', 'ct_srv_dst', 'sbytes']

In [229]:
# Drop label columns and duplicate columns
label_cols = ['attack_cat', 'Label']
drop_cols = label_cols + to_drop
drop_cols
X = df.drop(columns=['attack_cat', 'Label'])
y = df['Label']

X.columns

Index(['srcip', 'sport', 'dstip', 'dsport', 'proto', 'state', 'dur', 'sbytes',
       'dbytes', 'sttl', 'dttl', 'sloss', 'dloss', 'service', 'Sload', 'Dload',
       'Spkts', 'Dpkts', 'swin', 'dwin', 'stcpb', 'dtcpb', 'smeansz',
       'dmeansz', 'trans_depth', 'res_bdy_len', 'Sjit', 'Djit', 'Stime',
       'Ltime', 'Sintpkt', 'Dintpkt', 'tcprtt', 'synack', 'ackdat',
       'is_sm_ips_ports', 'ct_state_ttl', 'ct_srv_src', 'ct_srv_dst',
       'ct_src_dport_ltm', 'ct_dst_sport_ltm', 'ct_dst_src_ltm'],
      dtype='object')

In [230]:
# X_train_validation includes training set and validation set
# X_test includes test set
# y_train_validation includes the labels for both the training set and validation set
# y_test includes the labels for the test set
X_train_validation, X_test, y_train_validation, y_test = train_test_split(X, y, test_size=0.2)

X_train_validation.head()

Unnamed: 0,srcip,sport,dstip,dsport,proto,state,dur,sbytes,dbytes,sttl,...,tcprtt,synack,ackdat,is_sm_ips_ports,ct_state_ttl,ct_srv_src,ct_srv_dst,ct_src_dport_ltm,ct_dst_sport_ltm,ct_dst_src_ltm
18282,176045175,0,126171149,0,18,6,3e-06,200,0,254,...,0.0,0.0,0.0,0,2,4,4,2,2,4
104009,166059,139809,126171149,83,119,2,0.00107,146,178,31,...,0.0,0.0,0.0,0,0,5,3,2,1,2
270916,176045175,4163,126171149,83,119,6,9e-06,114,0,254,...,0.0,0.0,0.0,0,2,35,35,22,22,35
240680,166059,272696,126171149,34,113,5,2.275536,12648,13410,31,...,0.001233,0.00112,0.000113,0,0,1,2,1,1,4
111902,166059,30056,126171149,128,113,5,1.004723,1580,10168,31,...,0.000652,0.000504,0.000148,0,0,2,4,1,1,1


In [231]:
# X_train is the training set
# X_validation is the validation set
# y_train is the labels for the training set
# y_validation is the labels for the validation set
X_train, X_validation, y_train, y_validation = train_test_split(X_train_validation, y_train_validation, test_size=0.2)

X_train.head()

Unnamed: 0,srcip,sport,dstip,dsport,proto,state,dur,sbytes,dbytes,sttl,...,tcprtt,synack,ackdat,is_sm_ips_ports,ct_state_ttl,ct_srv_src,ct_srv_dst,ct_src_dport_ltm,ct_dst_sport_ltm,ct_dst_src_ltm
253630,126171149,4163,176045175,83,119,6,8e-06,264,0,60,...,0.0,0.0,0.0,0,0,15,15,4,3,15
67511,166059,235106,126171149,37,113,5,0.089825,37322,3380,31,...,0.005086,0.004916,0.00017,0,0,1,1,1,1,1
413748,166059,333190,126171149,26753,113,5,0.015433,1540,1644,31,...,0.000562,0.000432,0.00013,0,0,11,10,4,1,5
442183,176045175,0,126171149,0,120,6,3e-06,200,0,254,...,0.0,0.0,0.0,0,2,3,3,2,2,4
352584,126171149,291897,176045175,83,119,6,7e-06,264,0,60,...,0.0,0.0,0.0,0,0,43,43,19,19,43


In [232]:
# Train
# model = DecisionTreeClassifier()
# model.fit(X_train, y_train)

# KNN - K-Nearest Neighbours Classifier




In [233]:
# Validate model
predictions = model.predict(X_validation)

score = accuracy_score(y_validation, predictions)
score

0.9906625166740773