In [1]:
import pandas as pd
import numpy as np
import statistics
import torch
from torch.utils.data import Dataset
from torch import nn
import random
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

# TODO: figure out where I want to do data normalization, decide what features to keep or not
# idea is we can normalize everything first in the raw_da               taset, then individually convert to tensors and use get_dummies for categorical

RAW_DATA_PATH = "./data/UNSW-NB15_?.csv" # replace ? with 1,2,3,4
FEATURE_PATH = "./data/NUSW-NB15_features.csv"

In [22]:
# Read datasets (see README.md for source)

features = pd.read_csv(FEATURE_PATH, encoding='cp1252').drop(columns=['No.'])
column_names = list(features['Name'])
column_names[column_names.index('ct_src_ ltm')] = 'ct_src_ltm'

column_types = {}

# for some reason there is some weirdness columns like ports being in hexadecimal sometimes?
for ind, row in features.iterrows():
    typ = row['Type '].lower()
    if typ == 'nominal':
        column_types[row['Name']] = str
    elif typ == 'integer' or typ == 'timestamp' or typ == 'binary':
        column_types[row['Name']] = np.int64
    elif typ == 'float':
        column_types[row['Name']] = np.float64

dfs = []
for i in range(1,5):
    temp = pd.read_csv(RAW_DATA_PATH.replace('?', str(i)), names=column_names)

    dfs.append(temp)

raw_data = pd.concat(dfs)

# sport and dport are string instead of int because there are some weird entries 
# exclude rows with '-' ports and convert hex ports to int 
exclude_ind = []

for ind, d in raw_data.iterrows():
    if d['sport'] == '-' or d['dsport'] == '-':
        exclude_ind.append(ind)

raw_data = raw_data.drop(exclude_ind)

# convert the hex values to ints
def convert_to_int(v): 
    if type(v) == str and v.startswith('0x'):
        return int(v, 16)
    return v

raw_data['sport'] = raw_data['sport'].apply(convert_to_int).astype(np.int64)
raw_data['dsport'] = raw_data['dsport'].apply(convert_to_int).astype(np.int64)

  temp = pd.read_csv(RAW_DATA_PATH.replace('?', str(i)), names=column_names)
  temp = pd.read_csv(RAW_DATA_PATH.replace('?', str(i)), names=column_names)


In [27]:
# Normalize and filter
# For some reason there is one dsport > 65535 (like 1)
raw_data = raw_data[raw_data['dsport'] <= 65535]
raw_data['ct_ftp_cmd'] = raw_data['ct_ftp_cmd'].apply(lambda x: 0 if x == ' ' else int(x)).astype(np.int64)

# ok now lets normalize certain columns
# NOTE: stime and ltime columns werent normalized because we wont be needing them for the model, but we still need them to compute the sequences
# TODO: look into maybe applying log transforms to some of these
normalize_these = ['sport', 'dsport', 'dur', 'sbytes', 'dbytes', 'sttl', 'dttl', 'sloss', 'dloss', 'Sload', 'Dload', 'Spkts',
                   'Dpkts', 'swin', 'dwin', 'stcpb', 'dtcpb', 'smeansz', 'dmeansz', 'trans_depth', 'res_bdy_len',
                  'Sjit', 'Djit', 'Sintpkt', 'Dintpkt', 'tcprtt', 'synack', 'ackdat', 'ct_state_ttl', 'ct_ftp_cmd',
                  'ct_srv_src', 'ct_srv_dst', 'ct_dst_ltm', 'ct_src_ltm', 'ct_src_dport_ltm', 'ct_dst_sport_ltm', 'ct_dst_src_ltm']
raw_data_normalized = raw_data.copy()
raw_data_normalized[normalize_these] = scaler.fit_transform(raw_data[normalize_these])

# deal with NaN situation in column ct_flw_http_mthd
raw_data_normalized['ct_flw_http_mthd_is_nan'] = raw_data_normalized['ct_flw_http_mthd'].isna()
raw_data_normalized['ct_flw_http_mthd'] = raw_data_normalized['ct_flw_http_mthd'].apply(lambda x: 0 if pd.isna(x) else x)

# deal with is_ftp_login having NaN and 2/4 values (type is supposedly binary)
raw_data_normalized['is_ftp_login_is_ambiguous'] = ((raw_data_normalized['is_ftp_login'].isna()) | (raw_data_normalized['is_ftp_login'] == 2))
raw_data_normalized['is_ftp_login'] = raw_data_normalized['is_ftp_login'].apply(lambda x: 0 if pd.isna(x) or x == 2 or x == 4 else x)

raw_data_normalized

Unnamed: 0,srcip,sport,dstip,dsport,proto,state,dur,sbytes,dbytes,sttl,...,ct_srv_dst,ct_dst_ltm,ct_src_ltm,ct_src_dport_ltm,ct_dst_sport_ltm,ct_dst_src_ltm,attack_cat,Label,ct_flw_http_mthd_is_nan,is_ftp_login_is_ambiguous
0,59.166.0.0,0.021210,149.171.126.6,0.000809,udp,CON,1.200687e-07,0.000009,0.000011,0.121569,...,0.090909,0.000000,0.030303,0.000000,0.000000,0.000000,,0,False,False
1,59.166.0.0,0.513634,149.171.126.9,0.015625,udp,CON,4.112267e-06,0.000037,0.000021,0.121569,...,0.045455,0.015152,0.030303,0.000000,0.000000,0.015152,,0,False,False
2,59.166.0.6,0.022339,149.171.126.7,0.000809,udp,CON,1.273525e-07,0.000010,0.000012,0.121569,...,0.106061,0.000000,0.015152,0.015152,0.000000,0.000000,,0,False,False
3,59.166.0.5,0.054826,149.171.126.5,0.000809,udp,CON,1.375953e-07,0.000009,0.000011,0.121569,...,0.121212,0.000000,0.000000,0.000000,0.000000,0.000000,,0,False,False
4,59.166.0.3,0.757824,149.171.126.0,0.000809,udp,CON,1.330429e-07,0.000010,0.000012,0.121569,...,0.121212,0.000000,0.000000,0.000000,0.000000,0.000000,,0,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
440039,59.166.0.5,0.504982,149.171.126.7,0.662745,tcp,FIN,9.936224e-06,0.000022,0.000125,0.121569,...,0.015152,0.030303,0.030303,0.000000,0.000000,0.030303,,0,True,True
440040,59.166.0.7,0.318120,149.171.126.4,0.000320,tcp,CON,4.154695e-05,0.000032,0.000024,0.121569,...,0.015152,0.015152,0.015152,0.015152,0.016949,0.015152,,0,True,True
440041,59.166.0.3,0.328237,149.171.126.9,0.000320,tcp,CON,7.209987e-04,0.000126,0.000142,0.121569,...,0.015152,0.045455,0.015152,0.015152,0.016949,0.015152,,0,True,True
440042,59.166.0.9,0.540673,149.171.126.0,0.001221,tcp,CON,2.504865e-04,0.000244,0.011329,0.121569,...,0.000000,0.015152,0.045455,0.015152,0.016949,0.015152,,0,False,True


In [3]:
# Now we want to break out data into sequences
TIME_WINDOW = 60 * 2 

raw_data = raw_data.sort_values(by=['Stime'])

groups = raw_data.groupby(by=['srcip', 'dstip']) # should be same order as in original df

In [4]:
all_seq = {}
for name, g in groups:
    sequences = []
    window_start_time = 0
    current_seq = []
    for ind, data in g.iterrows():
        if window_start_time == 0:
            window_start_time = data['Stime']
        
        current_seq.append(data)
    
        if (data['Stime']-window_start_time) >= TIME_WINDOW:
            window_start_time = 0
            sequences.append(current_seq)
            current_seq = []

    all_seq[name] = sequences
    #print(f'{name} has {len(sequences)} sequences')

In [5]:
# Analysis and filtering of sequence data
MIN_SEQ_COUNT = 30 # want to exclude IP pairs that have less than 30 sequences, since these may not provide enough information to be relevant for training

raw_sequences = []
attack_indices = []
for k, v in all_seq.items():
    seq_count = len(v)
    
    lengths = [len(d) for d in v]
    if len(lengths) > MIN_SEQ_COUNT:
        #print(f'{k}: {len(v)} total sequences, {min(lengths)} min length, {max(lengths)} max length, {statistics.mean(lengths):.2f} average length')

        for s in v:
            raw_sequences.append(s)
            
            for con in s:
                if con['Label'] == 1:
                    attack_indices.append(len(raw_sequences)-1)
                    break

print(f'We have {len(raw_sequences)} sequences in our dataset')
print(f'Out of these, {len(attack_indices)} contain attacks')

We have 79616 sequences in our dataset
Out of these, 9859 contain attacks


In [None]:
# Utilities
def raw_sequence_2_tensor(seq):
    pass

In [None]:
# Dataset class
class SequenceDataset(Dataset):
    def __init__(self, sequences, attack_inds):
        self.raw_sequences = sequences
        self.raw_labels = []

        for index, s in enumerate(self.raw_sequences):
            self.raw_labels.append(1 if index in attack_inds else 0)

    def __len__(self):
        return len(self.raw_sequences)

    def __getitem__(self, index):
        return self.raw_sequences[index], self.raw_labels[index]


ds = SequenceDataset(raw_sequences, attack_indices)
raw_seq_2_tensor(ds[0][0])

In [None]:
# DEBUG CODE

for j in dfs:
    zero_cnt = 0
    for i in j['Stime'].to_numpy():
        if i == 0: zero_cnt += 1

zero_cnt

###
for i in range(len(column_names)):
    print(f'{column_names[i]} - {dfs[0][column_names[i]].dtype} {dfs[1][column_names[i]].dtype} {dfs[2][column_names[i]].dtype} {dfs[3][column_names[i]].dtype}')


###
for c in column_names:
    print(f'Column {c} has type {raw_data[c].dtype}')

### sequence generation
g = groups.get_group(('59.166.0.0', '149.171.126.6'))
start_times = g['Stime']

sequences = []
window_start_time = 0
current_seq = []
for ind, data in g.iterrows():
    if window_start_time == 0:
        window_start_time = data['Stime']
    
    #print(f'{ind} is {data['Stime']}')

    current_seq.append(data)

    if (data['Stime']-window_start_time) >= TIME_WINDOW:
        window_start_time = 0
        sequences.append(current_seq)
        current_seq = []

print(len(sequences))

In [21]:
column_names.index('ct_src_ ltm')

43