In [49]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder


date_cols = ['Stime', 'Ltime']
fields1 = ['ct_flw_http_mthd', 'is_ftp_login', 'ct_ftp_cmd', 'Stime', 'Ltime']
fields2 = ['is_ftp_login', 'ct_ftp_cmd', 'Stime', 'Ltime']
fields3 = ['ct_ftp_cmd', 'Stime', 'Ltime']
cols = [
    'srcip',
    'sport',
    'dstip',
    'dsport',
    'proto',
    'state',
    'dur',
    'sbytes',
    'dbytes',
    'sttl',
    'dttl',
    'sloss',
    'dloss',
    'service',
    'Sload',
    'Dload',
    'Spkts',
    'Dpkts',
    'swin',
    'dwin',
    'stcpb',
    'dtcpb',
    'smeansz',
    'dmeansz',
    'trans_depth',
    'res_bdy_len',
    'Sjit',
    'Djit',
    'Stime',
    'Ltime',
    'Sintpkt',
    'Dintpkt',
    'tcprtt',
    'synack',
    'ackdat',
    'is_sm_ips_ports',
    'ct_state_ttl',
#     'ct_flw_http_mthd',
#     'is_ftp_login',
#     'ct_ftp_cmd',
    'ct_srv_src',
    'ct_srv_dst',
#     'ct_dst_ltm',
#     'ct_src_ltm',
    'ct_src_dport_ltm',
    'ct_dst_sport_ltm',
    'ct_dst_src_ltm',
    'attack_cat',
    'Label',
]
fields5 = ['attack_cat', 'Stime', 'Ltime']


def strip_int(text):
    return int(text.strip(' '))

def ip_to_int(ip):
    z = ''
    parts = ip.split('.')
    for prt in range(3):
        if len(parts[prt]) == 3:
            z = parts[prt] + z
        elif len(parts[prt]) == 2:
            z = '0' + parts[prt] + z
        else:
            z = '00' + parts[prt] + z
    return int(z)

def encode_attack_cat(cat):
    mapping = {
        'Generic': 1,
        'Fuzzers': 2,
        'Exploits': 3,
        'DoS': 4,
        'Reconnaissance': 5,
        'Backdoor': 6,
        'Backdoors': 6,
        'Analysis': 7,
        'Shellcode': 8,
        'Worms': 9,
    }
    if (not cat):
        return 0
    return mapping[cat.strip(' ')]

def decode_attack_cat(encoded_cat):
    mapping = {
        0: '',
        1: 'Generic',
        2: 'Fuzzers',
        3: 'Exploits',
        4: 'DoS',
        5: 'Reconnaissance',
        6: 'Backdoors',
        7: 'Analysis',
        8: 'Shellcode',
        9: 'Worms',
    }
    return mapping[encoded_cat]

def encode_state(state):
    mapping = {
        'CON': 0,
        'FIN': 1,
        'INT': 2,
        'REQ': 3,
        'URH': 4,
        'RST': 5,
        'ECR': 6,
        'ECO': 7,
        'CLO': 8,
        'PAR': 9,
        'ACC': 10,
        'URN': 11,
        'MAS': 12,
    }
    return mapping[state]

converters = {
    'srcip': ip_to_int,
    'dstip': ip_to_int,
#     'state': encode_state,
#     'attack_cat': encode_attack_cat,
}

# Read csv
df = pd.read_csv(
    '../data/UNSW-NB15-BALANCED-TRAIN.csv',
    parse_dates=date_cols,
    usecols=cols,
    skipinitialspace=True,
    converters=converters
)

le = LabelEncoder()
df['proto'] = le.fit_transform(df['proto'])
df['state'] = le.fit_transform(df['state'])
df['service'] = le.fit_transform(df['service'])
df['attack_cat'] = le.fit_transform(df['attack_cat'])

df.head()

  df = pd.read_csv(


Unnamed: 0,srcip,sport,dstip,dsport,proto,state,dur,sbytes,dbytes,sttl,...,ackdat,is_sm_ips_ports,ct_state_ttl,ct_srv_src,ct_srv_dst,ct_src_dport_ltm,ct_dst_sport_ltm,ct_dst_src_ltm,attack_cat,Label
0,166059,55490,126171149,53,119,2,0.00105,146,178,31,...,0.0,0,0,3,4,1,1,1,13,0
1,166059,26756,126171149,21,113,5,4.264797,2934,3738,31,...,0.000134,0,0,5,5,5,5,8,13,0
2,176045175,1043,126171149,53,119,6,3e-06,114,0,254,...,0.0,0,2,17,17,14,14,17,7,1
3,176045175,1043,126171149,53,119,6,3e-06,114,0,254,...,0.0,0,2,25,25,25,25,25,7,1
4,166059,34472,126171149,10429,113,5,0.052957,3718,43086,31,...,0.00013,0,0,5,8,1,1,2,13,0


In [50]:
df.state.unique()

array([ 2,  5,  6,  9, 11, 10,  4,  3,  1,  8,  0, 12,  7])

In [51]:
df.describe()

Unnamed: 0,srcip,dstip,proto,state,dur,sbytes,dbytes,sttl,dttl,sloss,...,ackdat,is_sm_ips_ports,ct_state_ttl,ct_srv_src,ct_srv_dst,ct_src_dport_ltm,ct_dst_sport_ltm,ct_dst_src_ltm,attack_cat,Label
count,449796.0,449796.0,449796.0,449796.0,449796.0,449796.0,449796.0,449796.0,449796.0,449796.0,...,449796.0,449796.0,449796.0,449796.0,449796.0,449796.0,449796.0,449796.0,449796.0,449796.0
mean,96469510.0,128439900.0,114.453261,5.085528,0.694259,5092.681,22776.25,138.566168,37.336844,4.107438,...,0.006775,0.000972,0.931353,13.858387,13.683721,8.416918,6.496405,12.401215,9.646331,0.5
std,85709290.0,12316860.0,12.58373,1.293619,13.697272,115707.7,144940.4,109.317462,71.775605,44.01516,...,0.026773,0.031155,0.98252,13.818018,13.880075,11.03397,7.860205,14.575122,3.540252,0.500001
min,166059.0,224.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0
25%,166059.0,126171100.0,113.0,5.0,8e-06,114.0,0.0,31.0,0.0,0.0,...,0.0,0.0,0.0,3.0,3.0,1.0,1.0,1.0,7.0,0.0
50%,176045200.0,126171100.0,119.0,5.0,0.001058,264.0,178.0,62.0,29.0,0.0,...,0.0,0.0,1.0,7.0,7.0,2.0,1.0,4.0,12.5,0.5
75%,176045200.0,126171100.0,119.0,6.0,0.098976,2054.0,3276.0,254.0,29.0,4.0,...,0.000136,0.0,2.0,24.0,24.0,16.0,13.0,24.0,13.0,1.0
max,241168200.0,241168200.0,132.0,12.0,8760.777344,14355770.0,14657530.0,255.0,254.0,5319.0,...,3.551121,1.0,6.0,67.0,67.0,67.0,60.0,67.0,13.0,1.0


In [52]:
df.values

array([[166059, 55490, 126171149, ..., 1, 13, 0],
       [166059, 26756, 126171149, ..., 8, 13, 0],
       [176045175, 1043, 126171149, ..., 17, 7, 1],
       ...,
       [166059, 31192, 126171149, ..., 1, 13, 0],
       [176045175, 1043, 126171149, ..., 41, 7, 1],
       [166059, 56828, 126171149, ..., 1, 13, 0]], dtype=object)

In [53]:
X = df.drop(columns=['attack_cat', 'Label'])
y = df['Label']

X

Unnamed: 0,srcip,sport,dstip,dsport,proto,state,dur,sbytes,dbytes,sttl,...,tcprtt,synack,ackdat,is_sm_ips_ports,ct_state_ttl,ct_srv_src,ct_srv_dst,ct_src_dport_ltm,ct_dst_sport_ltm,ct_dst_src_ltm
0,166059,55490,126171149,53,119,2,0.001050,146,178,31,...,0.000000,0.000000,0.000000,0,0,3,4,1,1,1
1,166059,26756,126171149,21,113,5,4.264797,2934,3738,31,...,0.001011,0.000877,0.000134,0,0,5,5,5,5,8
2,176045175,1043,126171149,53,119,6,0.000003,114,0,254,...,0.000000,0.000000,0.000000,0,2,17,17,14,14,17
3,176045175,1043,126171149,53,119,6,0.000003,114,0,254,...,0.000000,0.000000,0.000000,0,2,25,25,25,25,25
4,166059,34472,126171149,10429,113,5,0.052957,3718,43086,31,...,0.000650,0.000520,0.000130,0,0,5,8,1,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
449791,176045175,12733,126171149,21,113,5,0.977167,1072,1656,62,...,0.051045,0.006791,0.044254,0,1,1,1,1,1,2
449792,176045175,1043,126171149,53,119,6,0.000004,114,0,254,...,0.000000,0.000000,0.000000,0,2,16,16,7,3,7
449793,166059,31192,126171149,27796,113,5,0.019671,3182,35916,31,...,0.000765,0.000609,0.000156,0,0,2,8,1,1,1
449794,176045175,1043,126171149,53,119,6,0.000010,114,0,254,...,0.000000,0.000000,0.000000,0,2,41,41,25,14,41


In [54]:
# X_train_validation includes training set and validation set
# X_test includes test set
# y_train_validation includes the labels for both the training set and validation set
# y_test includes the labels for the test set
X_train_validation, X_test, y_train_validation, y_test = train_test_split(X, y, test_size=0.2)

X_train_validation.head()

Unnamed: 0,srcip,sport,dstip,dsport,proto,state,dur,sbytes,dbytes,sttl,...,tcprtt,synack,ackdat,is_sm_ips_ports,ct_state_ttl,ct_srv_src,ct_srv_dst,ct_src_dport_ltm,ct_dst_sport_ltm,ct_dst_src_ltm
118415,166059,27630,126171149,55320,113,5,0.026317,2438,16334,31,...,0.000702,0.000566,0.000136,0,0,12,9,1,1,1
40132,176045175,1043,126171149,53,119,6,5e-06,114,0,254,...,0.0,0.0,0.0,0,2,28,28,12,12,28
171468,126171149,1043,176045175,53,119,6,1e-06,264,0,60,...,0.0,0.0,0.0,0,0,25,25,16,7,25
264454,176045175,0,126171149,0,120,6,9e-06,200,0,254,...,0.0,0.0,0.0,0,2,7,7,3,3,12
432262,166059,45573,126171149,49299,119,2,0.001782,528,304,31,...,0.0,0.0,0.0,0,0,13,5,1,1,3


In [55]:
# X_train is the training set
# X_validation is the validation set
# y_train is the labels for the training set
# y_validation is the labels for the validation set
X_train, X_validation, y_train, y_validation = train_test_split(X_train_validation, y_train_validation, test_size=0.2)

X_train.head()

Unnamed: 0,srcip,sport,dstip,dsport,proto,state,dur,sbytes,dbytes,sttl,...,tcprtt,synack,ackdat,is_sm_ips_ports,ct_state_ttl,ct_srv_src,ct_srv_dst,ct_src_dport_ltm,ct_dst_sport_ltm,ct_dst_src_ltm
144064,176045175,59664,126171149,25,113,5,0.877401,22443,1798,62,...,0.065868,0.009716,0.056152,0,1,1,1,1,1,1
351925,166059,48143,126171149,46715,113,5,0.004791,4040,2456,31,...,0.000685,0.000562,0.000123,0,0,6,4,1,1,3
70573,166059,17871,126171149,53,119,2,0.001021,146,178,31,...,0.0,0.0,0.0,0,0,2,1,1,1,2
20187,176045175,47439,126171149,53,119,6,9e-06,114,0,254,...,0.0,0.0,0.0,0,2,22,22,11,4,22
110542,176045175,1043,126171149,53,119,6,1e-06,114,0,254,...,0.0,0.0,0.0,0,2,13,13,13,13,13


In [56]:
# Train
model = DecisionTreeClassifier()
model.fit(X_train, y_train)




ValueError: could not convert string to float: '0x000c'

In [None]:
# Validate model
predictions = model.predict(X_validation)

score = accuracy_score(y_validation, predictions)
score