In [171]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder


date_cols = ['Stime', 'Ltime']
fields1 = ['ct_flw_http_mthd', 'is_ftp_login', 'ct_ftp_cmd', 'Stime', 'Ltime']
fields2 = ['is_ftp_login', 'ct_ftp_cmd', 'Stime', 'Ltime']
fields3 = ['ct_ftp_cmd', 'Stime', 'Ltime']
cols = [
    'srcip',
    'sport',
    'dstip',
    'dsport',
    'proto',
    'state',
    'dur',
    'sbytes',
    'dbytes',
    'sttl',
    'dttl',
    'sloss',
    'dloss',
    'service',
    'Sload',
    'Dload',
    'Spkts',
    'Dpkts',
    'swin',
    'dwin',
    'stcpb',
    'dtcpb',
    'smeansz',
    'dmeansz',
    
    'trans_depth',
    'res_bdy_len',
    'Sjit',
    'Djit',
    'Stime',
    'Ltime',
    'Sintpkt',
    'Dintpkt',
    'tcprtt',
    'synack',
    'ackdat',
    'is_sm_ips_ports',
    'ct_state_ttl',
    
#     'ct_flw_http_mthd',
#     'is_ftp_login',
#     'ct_ftp_cmd',
    
    'ct_srv_src',
    'ct_srv_dst',
    
#     'ct_dst_ltm',
#     'ct_src_ltm',
    
    'ct_src_dport_ltm',
    'ct_dst_sport_ltm',
    'ct_dst_src_ltm',
    'attack_cat',
    'Label',
]
fields5 = ['attack_cat', 'Stime', 'Ltime']


def strip_int(text):
    return int(text.strip(' '))

def ip_to_int(ip):
    z = ''
    parts = ip.split('.')
    for prt in range(3):
        if len(parts[prt]) == 3:
            z = parts[prt] + z
        elif len(parts[prt]) == 2:
            z = '0' + parts[prt] + z
        else:
            z = '00' + parts[prt] + z
    return int(z)

def encode_attack_cat(cat):
    mapping = {
        'Generic': 1,
        'Fuzzers': 2,
        'Exploits': 3,
        'DoS': 4,
        'Reconnaissance': 5,
        'Backdoor': 6,
        'Backdoors': 6,
        'Analysis': 7,
        'Shellcode': 8,
        'Worms': 9,
    }
    if (not cat):
        return 0
    return mapping[cat.strip(' ')]

def decode_attack_cat(encoded_cat):
    mapping = {
        0: '',
        1: 'Generic',
        2: 'Fuzzers',
        3: 'Exploits',
        4: 'DoS',
        5: 'Reconnaissance',
        6: 'Backdoors',
        7: 'Analysis',
        8: 'Shellcode',
        9: 'Worms',
    }
    return mapping[encoded_cat]

def encode_state(state):
    mapping = {
        'CON': 0,
        'FIN': 1,
        'INT': 2,
        'REQ': 3,
        'URH': 4,
        'RST': 5,
        'ECR': 6,
        'ECO': 7,
        'CLO': 8,
        'PAR': 9,
        'ACC': 10,
        'URN': 11,
        'MAS': 12,
    }
    return mapping[state]

def sanitize_port(port):
    return int(port.replace('-', 0))

converters = {
    'srcip': ip_to_int,
    'dstip': ip_to_int,
}

# Read csv
df = pd.read_csv(
    '../data/UNSW-NB15-BALANCED-TRAIN.csv',
    parse_dates=date_cols,
    usecols=cols,
    skipinitialspace=True,
    converters=converters
)

df['sport'] = df['sport'].replace('-', 0)
df['dsport'] = df['dsport'].replace('-', 0)
df['sport'] = df['sport'].apply(lambda x: int(str(x), 16))
df['dsport'] = df['dsport'].apply(lambda x: int(str(x), 16))
df['service'] = df['service'].replace('-', '')


le = LabelEncoder()
df['proto'] = le.fit_transform(df['proto'])
df['state'] = le.fit_transform(df['state'])
df['service'] = le.fit_transform(df['service'])
df['attack_cat'] = le.fit_transform(df['attack_cat'])

df.head()

  df = pd.read_csv(


Unnamed: 0,srcip,sport,dstip,dsport,proto,state,dur,sbytes,dbytes,sttl,...,ackdat,is_sm_ips_ports,ct_state_ttl,ct_srv_src,ct_srv_dst,ct_src_dport_ltm,ct_dst_sport_ltm,ct_dst_src_ltm,attack_cat,Label
0,166059,349328,126171149,83,119,2,0.00105,146,178,31,...,0.0,0,0,3,4,1,1,1,13,0
1,166059,157526,126171149,33,113,5,4.264797,2934,3738,31,...,0.000134,0,0,5,5,5,5,8,13,0
2,176045175,4163,126171149,83,119,6,3e-06,114,0,254,...,0.0,0,2,17,17,14,14,17,7,1
3,176045175,4163,126171149,83,119,6,3e-06,114,0,254,...,0.0,0,2,25,25,25,25,25,7,1
4,166059,214130,126171149,66601,113,5,0.052957,3718,43086,31,...,0.00013,0,0,5,8,1,1,2,13,0


In [162]:
df.describe()

Unnamed: 0,srcip,sport,dstip,dsport,proto,state,dur,sbytes,dbytes,sttl,...,ackdat,is_sm_ips_ports,ct_state_ttl,ct_srv_src,ct_srv_dst,ct_src_dport_ltm,ct_dst_sport_ltm,ct_dst_src_ltm,attack_cat,Label
count,449796.0,449796.0,449796.0,449796.0,449796.0,449796.0,449796.0,449796.0,449796.0,449796.0,...,449796.0,449796.0,449796.0,449796.0,449796.0,449796.0,449796.0,449796.0,449796.0,449796.0
mean,96469510.0,166478.437687,128439900.0,42955.83,114.453261,5.085528,0.694259,5092.681,22776.25,138.566168,...,0.006775,0.000972,0.931353,13.858387,13.683721,8.416918,6.496405,12.401215,9.646331,0.5
std,85709290.0,139797.213314,12316860.0,1140547.0,12.58373,1.293619,13.697272,115707.7,144940.4,109.317462,...,0.026773,0.031155,0.98252,13.818018,13.880075,11.03397,7.860205,14.575122,3.540252,0.500001
min,166059.0,0.0,224.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0
25%,166059.0,4163.0,126171100.0,83.0,113.0,5.0,8e-06,114.0,0.0,31.0,...,0.0,0.0,0.0,3.0,3.0,1.0,1.0,1.0,7.0,0.0
50%,176045200.0,161571.0,126171100.0,83.0,119.0,5.0,0.001058,264.0,178.0,62.0,...,0.0,0.0,1.0,7.0,7.0,2.0,1.0,4.0,12.5,0.5
75%,176045200.0,291897.0,126171100.0,5923.0,119.0,6.0,0.098976,2054.0,3276.0,254.0,...,0.000136,0.0,2.0,24.0,24.0,16.0,13.0,24.0,13.0,1.0
max,241168200.0,415029.0,241168200.0,538989300.0,132.0,12.0,8760.777344,14355770.0,14657530.0,255.0,...,3.551121,1.0,6.0,67.0,67.0,67.0,60.0,67.0,13.0,1.0


In [163]:
df.values

array([[166059, 349328, 126171149, ..., 1, 13, 0],
       [166059, 157526, 126171149, ..., 8, 13, 0],
       [176045175, 4163, 126171149, ..., 17, 7, 1],
       ...,
       [166059, 201106, 126171149, ..., 1, 13, 0],
       [176045175, 4163, 126171149, ..., 41, 7, 1],
       [166059, 354344, 126171149, ..., 1, 13, 0]], dtype=object)

In [164]:
X = df.drop(columns=['attack_cat', 'Label'])
y = df['Label']

X

Unnamed: 0,srcip,sport,dstip,dsport,proto,state,dur,sbytes,dbytes,sttl,...,tcprtt,synack,ackdat,is_sm_ips_ports,ct_state_ttl,ct_srv_src,ct_srv_dst,ct_src_dport_ltm,ct_dst_sport_ltm,ct_dst_src_ltm
0,166059,349328,126171149,83,119,2,0.001050,146,178,31,...,0.000000,0.000000,0.000000,0,0,3,4,1,1,1
1,166059,157526,126171149,33,113,5,4.264797,2934,3738,31,...,0.001011,0.000877,0.000134,0,0,5,5,5,5,8
2,176045175,4163,126171149,83,119,6,0.000003,114,0,254,...,0.000000,0.000000,0.000000,0,2,17,17,14,14,17
3,176045175,4163,126171149,83,119,6,0.000003,114,0,254,...,0.000000,0.000000,0.000000,0,2,25,25,25,25,25
4,166059,214130,126171149,66601,113,5,0.052957,3718,43086,31,...,0.000650,0.000520,0.000130,0,0,5,8,1,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
449791,176045175,75571,126171149,33,113,5,0.977167,1072,1656,62,...,0.051045,0.006791,0.044254,0,1,1,1,1,1,2
449792,176045175,4163,126171149,83,119,6,0.000004,114,0,254,...,0.000000,0.000000,0.000000,0,2,16,16,7,3,7
449793,166059,201106,126171149,161686,113,5,0.019671,3182,35916,31,...,0.000765,0.000609,0.000156,0,0,2,8,1,1,1
449794,176045175,4163,126171149,83,119,6,0.000010,114,0,254,...,0.000000,0.000000,0.000000,0,2,41,41,25,14,41


In [165]:
# X_train_validation includes training set and validation set
# X_test includes test set
# y_train_validation includes the labels for both the training set and validation set
# y_test includes the labels for the test set
X_train_validation, X_test, y_train_validation, y_test = train_test_split(X, y, test_size=0.2)

X_train_validation.head()

Unnamed: 0,srcip,sport,dstip,dsport,proto,state,dur,sbytes,dbytes,sttl,...,tcprtt,synack,ackdat,is_sm_ips_ports,ct_state_ttl,ct_srv_src,ct_srv_dst,ct_src_dport_ltm,ct_dst_sport_ltm,ct_dst_src_ltm
354382,176045175,336256,126171149,128,113,5,0.289019,846,1340,62,...,0.056965,0.010531,0.046434,0,1,1,1,1,1,1
350354,166059,17446,126171149,83,119,2,0.001043,146,178,31,...,0.0,0.0,0.0,0,0,22,11,1,1,1
387583,166059,198713,126171149,128,113,5,1.277831,1684,10168,31,...,0.000726,0.000562,0.000164,0,0,1,3,1,1,2
28925,166059,96114,126171149,82073,113,5,0.009857,2334,16528,31,...,0.000759,0.000618,0.000141,0,0,2,14,1,1,1
329623,176045175,0,126171149,0,84,6,9e-06,200,0,254,...,0.0,0.0,0.0,0,2,5,5,3,3,5


In [166]:
# X_train is the training set
# X_validation is the validation set
# y_train is the labels for the training set
# y_validation is the labels for the validation set
X_train, X_validation, y_train, y_validation = train_test_split(X_train_validation, y_train_validation, test_size=0.2)

X_train.head()

Unnamed: 0,srcip,sport,dstip,dsport,proto,state,dur,sbytes,dbytes,sttl,...,tcprtt,synack,ackdat,is_sm_ips_ports,ct_state_ttl,ct_srv_src,ct_srv_dst,ct_src_dport_ltm,ct_dst_sport_ltm,ct_dst_src_ltm
364547,166059,407428,126171149,34,113,5,0.020219,9400,12298,31,...,0.00069,0.000568,0.000122,0,0,3,1,1,1,1
221876,166059,345139,126171149,26753,113,5,0.013464,1540,1644,31,...,0.000569,0.000453,0.000116,0,0,5,6,3,1,5
216009,176045175,4163,126171149,83,119,6,4e-06,114,0,254,...,0.0,0.0,0.0,0,2,30,30,30,12,30
237501,166059,90627,126171149,128,113,5,1.160631,1684,10168,31,...,0.000662,0.000539,0.000123,0,0,3,1,1,1,1
158340,166059,149350,126171149,366600,113,5,0.017945,2750,26406,31,...,0.000732,0.000599,0.000133,0,0,10,4,1,1,1


In [167]:
# Train
model = DecisionTreeClassifier()
model.fit(X_train, y_train)




In [168]:
# Validate model
predictions = model.predict(X_validation)

score = accuracy_score(y_validation, predictions)
score

0.991315584704313

In [177]:

# Correlation / Co-variance
threshold = 0.95

# Absolute value correlation matrix
corr_matrix = X_train.corr().abs()
corr_matrix


  corr_matrix = X_train.corr().abs()


Unnamed: 0,srcip,sport,dstip,dsport,proto,state,dur,sbytes,dbytes,sttl,...,tcprtt,synack,ackdat,is_sm_ips_ports,ct_state_ttl,ct_srv_src,ct_srv_dst,ct_src_dport_ltm,ct_dst_sport_ltm,ct_dst_src_ltm
srcip,1.0,0.222034,0.069884,0.026011,0.016012,0.629746,0.027422,0.004483,0.143386,0.906766,...,0.226278,0.199044,0.229962,0.028536,0.880653,0.531331,0.532702,0.538267,0.581248,0.595763
sport,0.222034,1.0,0.020551,0.005123,0.142196,0.186677,0.039229,0.008771,0.037608,0.22186,...,0.091341,0.08355,0.089232,0.036666,0.238567,0.147156,0.14967,0.163015,0.205147,0.164443
dstip,0.069884,0.020551,1.0,0.006534,0.075958,0.116439,0.091786,0.009526,0.028709,0.124582,...,0.046315,0.041724,0.045963,0.118612,0.165665,0.23941,0.239318,0.236287,0.250846,0.24376
dsport,0.026011,0.005123,0.006534,1.0,0.02017,0.006027,0.00366,0.000549,0.000934,0.023362,...,0.004117,0.003585,0.004225,0.000958,0.021802,0.015872,0.016526,0.020258,0.021071,0.021173
proto,0.016012,0.142196,0.075958,0.02017,1.0,0.073926,0.210411,0.006479,0.017822,0.016613,...,0.029046,0.026169,0.028823,0.265169,0.021316,0.231091,0.231724,0.212455,0.216525,0.219977
state,0.629746,0.186677,0.116439,0.006027,0.073926,1.0,0.049233,0.021237,0.021296,0.590616,...,0.04117,0.039174,0.038517,0.021793,0.61576,0.473985,0.472861,0.450885,0.485651,0.49852
dur,0.027422,0.039229,0.091786,0.00366,0.210411,0.049233,1.0,0.194652,0.252027,0.014109,...,0.091476,0.084577,0.088347,0.019774,0.058395,0.118126,0.117724,0.095266,0.097282,0.106387
sbytes,0.004483,0.008771,0.009526,0.000549,0.006479,0.021237,0.194652,1.0,0.019065,0.013857,...,0.043578,0.038344,0.044275,0.00135,0.01011,0.034359,0.034409,0.027886,0.029277,0.031371
dbytes,0.143386,0.037608,0.028709,0.000934,0.017822,0.021296,0.252027,0.019065,1.0,0.147804,...,0.000896,0.002269,0.000753,0.004823,0.125421,0.085491,0.085609,0.089746,0.098237,0.102504
sttl,0.906766,0.22186,0.124582,0.023362,0.016613,0.590616,0.014109,0.013857,0.147804,1.0,...,0.106804,0.096743,0.105405,0.039054,0.922927,0.525279,0.525915,0.514941,0.553947,0.57671


In [176]:
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
upper.head()

Unnamed: 0,srcip,sport,dstip,dsport,proto,state,dur,sbytes,dbytes,sttl,...,tcprtt,synack,ackdat,is_sm_ips_ports,ct_state_ttl,ct_srv_src,ct_srv_dst,ct_src_dport_ltm,ct_dst_sport_ltm,ct_dst_src_ltm
srcip,,0.222034,0.069884,0.026011,0.016012,0.629746,0.027422,0.004483,0.143386,0.906766,...,0.226278,0.199044,0.229962,0.028536,0.880653,0.531331,0.532702,0.538267,0.581248,0.595763
sport,,,0.020551,0.005123,0.142196,0.186677,0.039229,0.008771,0.037608,0.22186,...,0.091341,0.08355,0.089232,0.036666,0.238567,0.147156,0.14967,0.163015,0.205147,0.164443
dstip,,,,0.006534,0.075958,0.116439,0.091786,0.009526,0.028709,0.124582,...,0.046315,0.041724,0.045963,0.118612,0.165665,0.23941,0.239318,0.236287,0.250846,0.24376
dsport,,,,,0.02017,0.006027,0.00366,0.000549,0.000934,0.023362,...,0.004117,0.003585,0.004225,0.000958,0.021802,0.015872,0.016526,0.020258,0.021071,0.021173
proto,,,,,,0.073926,0.210411,0.006479,0.017822,0.016613,...,0.029046,0.026169,0.028823,0.265169,0.021316,0.231091,0.231724,0.212455,0.216525,0.219977


In [206]:
to_drop = []

for column in upper.columns:
    print('==========', column)
#     print(upper[column])
    for index, correlation in enumerate(upper[column]):
#         print(upper[column][index])
        if (correlation > threshold):
            print(upper.columns[index])
            to_drop.push(upper.columns[index])
            
to_drop
    

srcip              NaN
sport              NaN
dstip              NaN
dsport             NaN
proto              NaN
state              NaN
dur                NaN
sbytes             NaN
dbytes             NaN
sttl               NaN
dttl               NaN
sloss              NaN
dloss              NaN
service            NaN
Sload              NaN
Dload              NaN
Spkts              NaN
Dpkts              NaN
swin               NaN
dwin               NaN
stcpb              NaN
dtcpb              NaN
smeansz            NaN
dmeansz            NaN
trans_depth        NaN
res_bdy_len        NaN
Sjit               NaN
Djit               NaN
Sintpkt            NaN
Dintpkt            NaN
tcprtt             NaN
synack             NaN
ackdat             NaN
is_sm_ips_ports    NaN
ct_state_ttl       NaN
ct_srv_src         NaN
ct_srv_dst         NaN
ct_src_dport_ltm   NaN
ct_dst_sport_ltm   NaN
ct_dst_src_ltm     NaN
Name: srcip, dtype: float64
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan


AttributeError: 'list' object has no attribute 'push'