In [153]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder


date_cols = ['Stime', 'Ltime']
fields1 = ['ct_flw_http_mthd', 'is_ftp_login', 'ct_ftp_cmd', 'Stime', 'Ltime']
fields2 = ['is_ftp_login', 'ct_ftp_cmd', 'Stime', 'Ltime']
fields3 = ['ct_ftp_cmd', 'Stime', 'Ltime']
cols = [
    'srcip',
    'sport',
    'dstip',
    'dsport',
    'proto',
    'state',
    'dur',
    'sbytes',
    'dbytes',
    'sttl',
    'dttl',
    'sloss',
    'dloss',
    'service',
    'Sload',
    'Dload',
    'Spkts',
    'Dpkts',
    'swin',
    'dwin',
    'stcpb',
    'dtcpb',
    'smeansz',
    'dmeansz',
    
    'trans_depth',
    'res_bdy_len',
    'Sjit',
    'Djit',
    'Stime',
    'Ltime',
    'Sintpkt',
    'Dintpkt',
    'tcprtt',
    'synack',
    'ackdat',
    'is_sm_ips_ports',
    'ct_state_ttl',
    
#     'ct_flw_http_mthd',
#     'is_ftp_login',
#     'ct_ftp_cmd',
    
    'ct_srv_src',
    'ct_srv_dst',
    
#     'ct_dst_ltm',
#     'ct_src_ltm',
    
    'ct_src_dport_ltm',
    'ct_dst_sport_ltm',
    'ct_dst_src_ltm',
    'attack_cat',
    'Label',
]
fields5 = ['attack_cat', 'Stime', 'Ltime']


def strip_int(text):
    return int(text.strip(' '))

def ip_to_int(ip):
    z = ''
    parts = ip.split('.')
    for prt in range(3):
        if len(parts[prt]) == 3:
            z = parts[prt] + z
        elif len(parts[prt]) == 2:
            z = '0' + parts[prt] + z
        else:
            z = '00' + parts[prt] + z
    return int(z)

def encode_attack_cat(cat):
    mapping = {
        'Generic': 1,
        'Fuzzers': 2,
        'Exploits': 3,
        'DoS': 4,
        'Reconnaissance': 5,
        'Backdoor': 6,
        'Backdoors': 6,
        'Analysis': 7,
        'Shellcode': 8,
        'Worms': 9,
    }
    if (not cat):
        return 0
    return mapping[cat.strip(' ')]

def decode_attack_cat(encoded_cat):
    mapping = {
        0: '',
        1: 'Generic',
        2: 'Fuzzers',
        3: 'Exploits',
        4: 'DoS',
        5: 'Reconnaissance',
        6: 'Backdoors',
        7: 'Analysis',
        8: 'Shellcode',
        9: 'Worms',
    }
    return mapping[encoded_cat]

def encode_state(state):
    mapping = {
        'CON': 0,
        'FIN': 1,
        'INT': 2,
        'REQ': 3,
        'URH': 4,
        'RST': 5,
        'ECR': 6,
        'ECO': 7,
        'CLO': 8,
        'PAR': 9,
        'ACC': 10,
        'URN': 11,
        'MAS': 12,
    }
    return mapping[state]

def sanitize_port(port):
    return int(port.replace('-', 0))

converters = {
    'srcip': ip_to_int,
    'dstip': ip_to_int,
}

# Read csv
df = pd.read_csv(
    '../data/UNSW-NB15-BALANCED-TRAIN.csv',
    parse_dates=date_cols,
    usecols=cols,
    skipinitialspace=True,
    converters=converters
)

df['sport'] = df['sport'].replace('-', 0)
df['dsport'] = df['dsport'].replace('-', 0)
df['sport'] = df['sport'].apply(lambda x: int(str(x), 16))
df['dsport'] = df['dsport'].apply(lambda x: int(str(x), 16))


le = LabelEncoder()
df['proto'] = le.fit_transform(df['proto'])
df['state'] = le.fit_transform(df['state'])
df['service'] = le.fit_transform(df['service'])
df['attack_cat'] = le.fit_transform(df['attack_cat'])

df.head()

  df = pd.read_csv(


Unnamed: 0,srcip,sport,dstip,dsport,proto,state,dur,sbytes,dbytes,sttl,...,ackdat,is_sm_ips_ports,ct_state_ttl,ct_srv_src,ct_srv_dst,ct_src_dport_ltm,ct_dst_sport_ltm,ct_dst_src_ltm,attack_cat,Label
0,166059,349328,126171149,83,119,2,0.00105,146,178,31,...,0.0,0,0,3,4,1,1,1,13,0
1,166059,157526,126171149,33,113,5,4.264797,2934,3738,31,...,0.000134,0,0,5,5,5,5,8,13,0
2,176045175,4163,126171149,83,119,6,3e-06,114,0,254,...,0.0,0,2,17,17,14,14,17,7,1
3,176045175,4163,126171149,83,119,6,3e-06,114,0,254,...,0.0,0,2,25,25,25,25,25,7,1
4,166059,214130,126171149,66601,113,5,0.052957,3718,43086,31,...,0.00013,0,0,5,8,1,1,2,13,0


In [154]:
df.describe()

Unnamed: 0,srcip,sport,dstip,dsport,proto,state,dur,sbytes,dbytes,sttl,...,ackdat,is_sm_ips_ports,ct_state_ttl,ct_srv_src,ct_srv_dst,ct_src_dport_ltm,ct_dst_sport_ltm,ct_dst_src_ltm,attack_cat,Label
count,449796.0,449796.0,449796.0,449796.0,449796.0,449796.0,449796.0,449796.0,449796.0,449796.0,...,449796.0,449796.0,449796.0,449796.0,449796.0,449796.0,449796.0,449796.0,449796.0,449796.0
mean,96469510.0,166478.437687,128439900.0,42955.83,114.453261,5.085528,0.694259,5092.681,22776.25,138.566168,...,0.006775,0.000972,0.931353,13.858387,13.683721,8.416918,6.496405,12.401215,9.646331,0.5
std,85709290.0,139797.213314,12316860.0,1140547.0,12.58373,1.293619,13.697272,115707.7,144940.4,109.317462,...,0.026773,0.031155,0.98252,13.818018,13.880075,11.03397,7.860205,14.575122,3.540252,0.500001
min,166059.0,0.0,224.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0
25%,166059.0,4163.0,126171100.0,83.0,113.0,5.0,8e-06,114.0,0.0,31.0,...,0.0,0.0,0.0,3.0,3.0,1.0,1.0,1.0,7.0,0.0
50%,176045200.0,161571.0,126171100.0,83.0,119.0,5.0,0.001058,264.0,178.0,62.0,...,0.0,0.0,1.0,7.0,7.0,2.0,1.0,4.0,12.5,0.5
75%,176045200.0,291897.0,126171100.0,5923.0,119.0,6.0,0.098976,2054.0,3276.0,254.0,...,0.000136,0.0,2.0,24.0,24.0,16.0,13.0,24.0,13.0,1.0
max,241168200.0,415029.0,241168200.0,538989300.0,132.0,12.0,8760.777344,14355770.0,14657530.0,255.0,...,3.551121,1.0,6.0,67.0,67.0,67.0,60.0,67.0,13.0,1.0


In [155]:
df.values

array([[166059, 349328, 126171149, ..., 1, 13, 0],
       [166059, 157526, 126171149, ..., 8, 13, 0],
       [176045175, 4163, 126171149, ..., 17, 7, 1],
       ...,
       [166059, 201106, 126171149, ..., 1, 13, 0],
       [176045175, 4163, 126171149, ..., 41, 7, 1],
       [166059, 354344, 126171149, ..., 1, 13, 0]], dtype=object)

In [156]:
X = df.drop(columns=['attack_cat', 'Label'])
y = df['Label']

X

Unnamed: 0,srcip,sport,dstip,dsport,proto,state,dur,sbytes,dbytes,sttl,...,tcprtt,synack,ackdat,is_sm_ips_ports,ct_state_ttl,ct_srv_src,ct_srv_dst,ct_src_dport_ltm,ct_dst_sport_ltm,ct_dst_src_ltm
0,166059,349328,126171149,83,119,2,0.001050,146,178,31,...,0.000000,0.000000,0.000000,0,0,3,4,1,1,1
1,166059,157526,126171149,33,113,5,4.264797,2934,3738,31,...,0.001011,0.000877,0.000134,0,0,5,5,5,5,8
2,176045175,4163,126171149,83,119,6,0.000003,114,0,254,...,0.000000,0.000000,0.000000,0,2,17,17,14,14,17
3,176045175,4163,126171149,83,119,6,0.000003,114,0,254,...,0.000000,0.000000,0.000000,0,2,25,25,25,25,25
4,166059,214130,126171149,66601,113,5,0.052957,3718,43086,31,...,0.000650,0.000520,0.000130,0,0,5,8,1,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
449791,176045175,75571,126171149,33,113,5,0.977167,1072,1656,62,...,0.051045,0.006791,0.044254,0,1,1,1,1,1,2
449792,176045175,4163,126171149,83,119,6,0.000004,114,0,254,...,0.000000,0.000000,0.000000,0,2,16,16,7,3,7
449793,166059,201106,126171149,161686,113,5,0.019671,3182,35916,31,...,0.000765,0.000609,0.000156,0,0,2,8,1,1,1
449794,176045175,4163,126171149,83,119,6,0.000010,114,0,254,...,0.000000,0.000000,0.000000,0,2,41,41,25,14,41


In [157]:
# X_train_validation includes training set and validation set
# X_test includes test set
# y_train_validation includes the labels for both the training set and validation set
# y_test includes the labels for the test set
X_train_validation, X_test, y_train_validation, y_test = train_test_split(X, y, test_size=0.2)

X_train_validation.head()

Unnamed: 0,srcip,sport,dstip,dsport,proto,state,dur,sbytes,dbytes,sttl,...,tcprtt,synack,ackdat,is_sm_ips_ports,ct_state_ttl,ct_srv_src,ct_srv_dst,ct_src_dport_ltm,ct_dst_sport_ltm,ct_dst_src_ltm
299039,166059,402536,126171149,406612,113,5,0.018998,2974,33374,31,...,0.000972,0.000812,0.00016,0,0,7,11,1,1,1
312088,176045175,0,126171149,0,120,6,0.0,200,0,254,...,0.0,0.0,0.0,0,2,7,7,3,3,9
217066,166059,165513,126171149,333429,113,5,0.013924,2542,21006,31,...,0.000747,0.000537,0.00021,0,0,9,11,1,1,2
302453,176045175,291897,126171149,83,119,6,3e-06,114,0,254,...,0.0,0.0,0.0,0,2,39,39,33,16,39
439121,166059,337480,126171149,83,119,2,0.001132,146,178,31,...,0.0,0.0,0.0,0,0,3,2,2,1,1


In [158]:
# X_train is the training set
# X_validation is the validation set
# y_train is the labels for the training set
# y_validation is the labels for the validation set
X_train, X_validation, y_train, y_validation = train_test_split(X_train_validation, y_train_validation, test_size=0.2)

X_train.head()

Unnamed: 0,srcip,sport,dstip,dsport,proto,state,dur,sbytes,dbytes,sttl,...,tcprtt,synack,ackdat,is_sm_ips_ports,ct_state_ttl,ct_srv_src,ct_srv_dst,ct_src_dport_ltm,ct_dst_sport_ltm,ct_dst_src_ltm
69604,176045175,4163,126171149,83,119,6,9e-06,114,0,254,...,0.0,0.0,0.0,0,2,43,44,43,25,43
173178,176045175,365672,126171149,128,113,5,0.199419,852,268,254,...,0.061473,0.006031,0.055442,0,1,1,1,1,1,1
225824,166059,267008,126171149,230705,113,5,0.008729,2230,14994,31,...,0.000648,0.000529,0.000119,0,0,4,7,1,1,1
340437,176045175,4163,126171149,83,119,6,9e-06,114,0,254,...,0.0,0.0,0.0,0,2,26,26,17,17,26
149754,166059,37143,126171149,26753,113,5,0.048132,2856,64718,31,...,0.000632,0.000493,0.000139,0,0,9,10,6,1,9


In [159]:
# Train
model = DecisionTreeClassifier()
model.fit(X_train, y_train)




In [160]:
# Validate model
predictions = model.predict(X_validation)

score = accuracy_score(y_validation, predictions)
score

0.9907597821253891