# Input function

In [5]:
import re
import pandas as pd
import time
import datetime
import math
from __future__ import division
import numpy as np
import pickle
import itertools
import ipaddress

# load snort log

In [6]:
snort_df = pd.read_pickle('snort_alert_df.pkl')
snort_df.head()

Unnamed: 0,time,behavior,sour_ip,dest_ip
0,1331901000.0,Web Application Attack,192.168.202.79:50465,192.168.229.251:80
1,1331901000.0,Web Application Attack,192.168.202.79:50467,192.168.229.251:80
2,1331901000.0,Web Application Attack,192.168.202.79:50469,192.168.229.251:80
3,1331901000.0,Web Application Attack,192.168.202.79:50471,192.168.229.251:80
4,1331901000.0,Unsuccessful User Privilege Gain,192.168.229.153:445,192.168.202.79:55173


# load Bro-IDS connection log

In [7]:
conn_df = pd.read_pickle('bro_conn_df.pkl')
conn_df.head()

Unnamed: 0,ts,uid,id.orig_h,id.orig_p,id.resp_h,id.resp_p,proto,service,duration,orig_bytes,...,local_orig,missed_bytes,history,orig_pkts,orig_ip_bytes,resp_pkts,resp_ip_bytes,tunnel_parents,threat,sample
0,1331901000.0,CCUIP21wTjqkj8ZqX5,192.168.202.79,50463,192.168.229.251,80,tcp,-,-,-,...,-,0,Fa,1,52,1,52,(empty),,
1,1331901000.0,Csssjd3tX0yOTPDpng,192.168.202.79,46117,192.168.229.254,443,tcp,-,-,-,...,-,0,dDafFr,3,382,9,994,(empty),,
2,1331901000.0,CHEt7z3AzG4gyCNgci,192.168.202.79,50465,192.168.229.251,80,tcp,http,0.010000,166,...,-,0,ShADfFa,4,382,3,382,(empty),,
3,1331901000.0,CKnDAp2ohlvN6rpiXl,192.168.202.79,50467,192.168.229.251,80,tcp,http,0.010000,166,...,-,0,ShADfFa,4,382,3,382,(empty),,
4,1331901000.0,CGUBcoXKxBE8gTNl,192.168.202.79,46119,192.168.229.254,443,tcp,ssl,0.020000,544,...,-,0,ShADadfFr,8,968,13,1744,(empty),,


# Choose which attacks you want

In [8]:
target_behavior_list = ['Potential Corporate Privacy Violation',
                        'Generic Protocol Command Decode',
                        'Detection of a Network Scan',
                        'Attempted Denial of Service']

filter_snort_df = snort_df[snort_df['behavior'].isin(target_behavior_list)]
filter_snort_df['behavior'].value_counts()

Potential Corporate Privacy Violation    148862
Generic Protocol Command Decode           28040
Detection of a Network Scan                1814
Attempted Denial of Service                 469
Name: behavior, dtype: int64

# Snort IP data preprocessing

In [9]:
def valid_ip(address):
    try:
        ipaddress.ip_address(address)
        return True
    except:
        return False
    
filter_snort_df['sour_ip'] = filter_snort_df['sour_ip'].apply(lambda x: x if valid_ip(x) else ':'.join(x.split(':')[:-1]))
filter_snort_df['dest_ip'] = filter_snort_df['dest_ip'].apply(lambda x: x if valid_ip(x) else ':'.join(x.split(':')[:-1]))
filter_snort_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


Unnamed: 0,time,behavior,sour_ip,dest_ip
9378,1331901000.0,Generic Protocol Command Decode,192.168.202.79,192.168.229.153
9379,1331901000.0,Generic Protocol Command Decode,192.168.202.79,192.168.229.153
9380,1331901000.0,Generic Protocol Command Decode,192.168.202.79,192.168.229.153
9381,1331901000.0,Generic Protocol Command Decode,192.168.202.79,192.168.229.153
9384,1331901000.0,Generic Protocol Command Decode,192.168.202.79,192.168.229.153


In [10]:
sour_ip_value_counts = filter_snort_df['sour_ip'].value_counts()
dest_ip_value_counts = filter_snort_df['dest_ip'].value_counts()

merge_value_counts = sour_ip_value_counts.add(dest_ip_value_counts, fill_value=0)
filter_merge_value_counts = merge_value_counts[merge_value_counts > 200].sort_values(axis=0, ascending=False)
ip_list = list(filter_merge_value_counts.axes[0])
len(ip_list)

58

# Print out the current number of IP

In [11]:
target_ip_list = ip_list[-10:]
target_ip_list

['192.168.22.102',
 '192.168.24.101',
 '192.168.25.25',
 '192.168.21.101',
 '192.168.229.153',
 '192.168.207.4',
 '192.168.21.152',
 '192.168.229.252',
 '192.168.202.71',
 '192.168.21.102']

# Bro IP data preprocessing

In [12]:
filter_conn_df = conn_df[conn_df['id.orig_h'].isin(target_ip_list) | 
                         conn_df['id.resp_h'].isin(target_ip_list)]
filter_conn_df.shape

(1442389, 22)

In [13]:
mix_filter_conn_df = filter_conn_df.copy()
count, _ = mix_filter_conn_df.shape

for behavior in target_behavior_list:
    mix_filter_conn_df[behavior] = [0] * count

mix_filter_conn_df.head()

Unnamed: 0,ts,uid,id.orig_h,id.orig_p,id.resp_h,id.resp_p,proto,service,duration,orig_bytes,...,orig_ip_bytes,resp_pkts,resp_ip_bytes,tunnel_parents,threat,sample,Potential Corporate Privacy Violation,Generic Protocol Command Decode,Detection of a Network Scan,Attempted Denial of Service
707,1331901000.0,CTgyMB2GL0FCCKgv04,192.168.202.71,137,192.168.202.255,137,udp,dns,2.010000,650,...,1014,0,0,(empty),,,0,0,0,0
748,1331901000.0,ClW5Ax1HMMLQKa41Ac,192.168.202.79,55173,192.168.229.153,445,tcp,-,55.200000,6135377,...,8402145,43683,3975036,(empty),,,0,0,0,0
752,1331901000.0,CetF5Pzr1hMNEqq7k,192.168.202.79,46827,192.168.229.252,636,tcp,-,-,-,...,112,2,100,(empty),,,0,0,0,0
754,1331901000.0,C420bf3sV7wgQSDOLd,192.168.202.79,46832,192.168.229.252,636,tcp,-,0.010000,132,...,296,2,100,(empty),,,0,0,0,0
756,1331901000.0,CoNOSc2XPv9SVSz259,192.168.202.79,34494,192.168.229.153,5357,tcp,http,0.030000,166,...,382,3,677,(empty),,,0,0,0,0


# Bro orig_Port, resp_Port, one hot encoding

In [14]:
adjusted_mix_filter_conn_df = mix_filter_conn_df.copy()
adjusted_mix_filter_conn_df['id.orig_p'] = pd.cut(adjusted_mix_filter_conn_df['id.orig_p'], 
                                                  bins=[-1, 1023, 49151, np.inf], 
                                                  labels=False, right=True)
adjusted_mix_filter_conn_df['id.resp_p'] = pd.cut(adjusted_mix_filter_conn_df['id.resp_p'],
                                                  bins=[-1, 1023, 49151, np.inf], 
                                                  labels=False, right=True)
adjusted_mix_filter_conn_df.head()

Unnamed: 0,ts,uid,id.orig_h,id.orig_p,id.resp_h,id.resp_p,proto,service,duration,orig_bytes,...,orig_ip_bytes,resp_pkts,resp_ip_bytes,tunnel_parents,threat,sample,Potential Corporate Privacy Violation,Generic Protocol Command Decode,Detection of a Network Scan,Attempted Denial of Service
707,1331901000.0,CTgyMB2GL0FCCKgv04,192.168.202.71,0,192.168.202.255,0,udp,dns,2.010000,650,...,1014,0,0,(empty),,,0,0,0,0
748,1331901000.0,ClW5Ax1HMMLQKa41Ac,192.168.202.79,2,192.168.229.153,0,tcp,-,55.200000,6135377,...,8402145,43683,3975036,(empty),,,0,0,0,0
752,1331901000.0,CetF5Pzr1hMNEqq7k,192.168.202.79,1,192.168.229.252,0,tcp,-,-,-,...,112,2,100,(empty),,,0,0,0,0
754,1331901000.0,C420bf3sV7wgQSDOLd,192.168.202.79,1,192.168.229.252,0,tcp,-,0.010000,132,...,296,2,100,(empty),,,0,0,0,0
756,1331901000.0,CoNOSc2XPv9SVSz259,192.168.202.79,1,192.168.229.153,1,tcp,http,0.030000,166,...,382,3,677,(empty),,,0,0,0,0


### The function contains one hot and data pre-processing, normalize

In [15]:
def one_hot(df, cols):
    for each in cols:
        dummies = pd.get_dummies(df[each], prefix=each, drop_first=False)
        df = pd.concat([df, dummies], axis=1)
    df = df.drop(cols, axis=1)
    
    return df

def add_no_value(df, cols):
    for col in cols:
        df['no_' + col] = df[col].astype(str) == '-'
        df['no_' + col] = df['no_' + col].astype(int)
    return df

def normalize(df, cols):
    temp_df = df[cols]
    temp_df[cols] = temp_df[cols].astype(str)
    temp_df.replace({'-': '0'}, inplace=True)
    
    # transfer type
    for col in cols:
        temp_df[col] = temp_df[col].astype(float)
        
    temp_df = (temp_df - temp_df.min()) / (temp_df.max() - temp_df.min())
    
    for col in cols:
        df[col] = temp_df[col]
    
    return df

### Select the field you want to process before the data

In [16]:
adjusted_mix_filter_conn_df = add_no_value(adjusted_mix_filter_conn_df, 
                                           cols=['duration', 'orig_bytes', 'resp_bytes'])
adjusted_mix_filter_conn_df = normalize(adjusted_mix_filter_conn_df, 
                                        cols=['duration', 'orig_bytes', 'resp_bytes', 
                                              'orig_pkts', 'orig_ip_bytes',
                                              'resp_pkts', 'resp_ip_bytes'])
adjusted_mix_filter_conn_df = one_hot(adjusted_mix_filter_conn_df, 
                                      cols=['id.orig_p', 'id.resp_p','proto'])

adjusted_mix_filter_conn_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[k1] = value[k2]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  limit=limit, regex=regex)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0,ts,uid,id.orig_h,id.resp_h,service,duration,orig_bytes,resp_bytes,conn_state,local_orig,...,no_resp_bytes,id.orig_p_0,id.orig_p_1,id.orig_p_2,id.resp_p_0,id.resp_p_1,id.resp_p_2,proto_icmp,proto_tcp,proto_udp
707,1331901000.0,CTgyMB2GL0FCCKgv04,192.168.202.71,192.168.202.255,dns,0.001282,1.832717e-07,0.0,S0,-,...,0,1,0,0,1,0,0,0,0,1
748,1331901000.0,ClW5Ax1HMMLQKa41Ac,192.168.202.79,192.168.229.153,-,0.035217,0.001729909,0.001302153,RSTR,-,...,0,0,0,1,1,0,0,0,1,0
752,1331901000.0,CetF5Pzr1hMNEqq7k,192.168.202.79,192.168.229.252,-,0.0,0.0,0.0,RSTR,-,...,1,0,1,0,1,0,0,0,1,0
754,1331901000.0,C420bf3sV7wgQSDOLd,192.168.202.79,192.168.229.252,-,6e-06,3.721825e-08,0.0,RSTR,-,...,0,0,1,0,1,0,0,0,1,0
756,1331901000.0,CoNOSc2XPv9SVSz259,192.168.202.79,192.168.229.153,http,1.9e-05,4.680477e-08,3.91773e-07,SF,-,...,0,0,1,0,0,1,0,0,1,0


In [17]:
list(adjusted_mix_filter_conn_df)

['ts',
 'uid',
 'id.orig_h',
 'id.resp_h',
 'service',
 'duration',
 'orig_bytes',
 'resp_bytes',
 'conn_state',
 'local_orig',
 'missed_bytes',
 'history',
 'orig_pkts',
 'orig_ip_bytes',
 'resp_pkts',
 'resp_ip_bytes',
 'tunnel_parents',
 'threat',
 'sample',
 'Potential Corporate Privacy Violation',
 'Generic Protocol Command Decode',
 'Detection of a Network Scan',
 'Attempted Denial of Service',
 'no_duration',
 'no_orig_bytes',
 'no_resp_bytes',
 'id.orig_p_0',
 'id.orig_p_1',
 'id.orig_p_2',
 'id.resp_p_0',
 'id.resp_p_1',
 'id.resp_p_2',
 'proto_icmp',
 'proto_tcp',
 'proto_udp']

# target_ip_list choose the IP you want

In [18]:
sub_adjusted_mix_filter_conn_df_list = []
sub_filter_conn_df_list = []

for target_ip in target_ip_list:
    sub_adjusted_mix_filter_conn_df = adjusted_mix_filter_conn_df[adjusted_mix_filter_conn_df['id.orig_h'].isin([target_ip]) | 
                                                                  adjusted_mix_filter_conn_df['id.resp_h'].isin([target_ip])]
    sub_adjusted_mix_filter_conn_df_list.append(sub_adjusted_mix_filter_conn_df)
    
    # 可以改 ==
    sub_filter_conn_df = filter_snort_df[filter_snort_df['sour_ip'].str.contains(target_ip) | 
                                         filter_snort_df['dest_ip'].str.contains(target_ip)]
    sub_filter_conn_df_list.append(sub_filter_conn_df)
    print(target_ip, sub_adjusted_mix_filter_conn_df.shape, sub_filter_conn_df.shape)

192.168.22.102 (149334, 35) (257, 4)
192.168.24.101 (173854, 35) (244, 4)
192.168.25.25 (152404, 35) (1432, 4)
192.168.21.101 (145859, 35) (236, 4)
192.168.229.153 (246930, 35) (220, 4)
192.168.207.4 (113148, 35) (212, 4)
192.168.21.152 (59417, 35) (210, 4)
192.168.229.252 (260471, 35) (209, 4)
192.168.202.71 (5412, 35) (206, 4)
192.168.21.102 (150646, 35) (204, 4)


### Predicting abnormal behavior within a few minutes

In [None]:
window_time = 3600.0

for i in range(len(target_ip_list)):
    target_ip = target_ip_list[i]
    sub_adjusted_mix_filter_conn_df = sub_adjusted_mix_filter_conn_df_list[i]
    sub_filter_conn_df = sub_filter_conn_df_list[i]
    
    #
    #last_time = sub_filter_conn_df.tail(1)['time'].values[0]
    print(target_ip, sub_adjusted_mix_filter_conn_df.shape)
    
    for index, row in sub_adjusted_mix_filter_conn_df.iterrows():
        #if row['ts'] >= last_time:
        #    continue
        if row['duration'] != '-':
            start_ts = row['ts'] + float(row['duration'])
        else:
            start_ts = row['ts']
            
        end_ts = start_ts + window_time
        orig_h = row['id.orig_h']
        resp_h = row['id.resp_h']
        
        temp_snort_df = sub_filter_conn_df[sub_filter_conn_df['time'] > start_ts]
        temp_snort_df = temp_snort_df[temp_snort_df['time'] <= end_ts]
        
        count, _ = temp_snort_df.shape
        if count != 0:
            for behavior in list(temp_snort_df['behavior'].unique()):
                sub_adjusted_mix_filter_conn_df.loc[index, behavior] = 1

192.168.22.102 (149334, 35)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [None]:
adjusted_mix_filter_conn_df = pd.concat(sub_adjusted_mix_filter_conn_df_list)

In [None]:
attack_count = 4
status_list = list(set(itertools.permutations( [0, 1] * attack_count, attack_count)))
status_list

In [None]:
for status in status_list:
    temp_df = adjusted_mix_filter_conn_df.copy()
    for index in range(len(status)):
        attack_status = status[index]
        target_behavior = target_behavior_list[index]
        temp_df = temp_df[temp_df[target_behavior] == attack_status]
    count, _ = temp_df[target_behavior_list].shape
    print(status, count)

# Sampling

In [None]:
N = 20000
sample_df_list = []

for target_behavior in target_behavior_list:
    print(target_behavior)
    temp_df = adjusted_mix_filter_conn_df[adjusted_mix_filter_conn_df[target_behavior] == 1]
    sample_df = temp_df.sample(N)
    sample_df_list.append(sample_df)
    
sample_df = pd.concat(sample_df_list)
print(len(sample_df))
sample_df.head()

# See how many flows you want to refer to

In [None]:
window_size = 10
data_label_df_list = []

for index, row in sample_df.iterrows():
    for sub_adjusted_mix_filter_conn_df in sub_adjusted_mix_filter_conn_df_list:
        if index in sub_adjusted_mix_filter_conn_df.index:
            data_label_df = sub_adjusted_mix_filter_conn_df.loc[:index].tail(window_size)
            count, _ = data_label_df.shape
            if count != window_size:
                # 如果 < window_size，就不training 及 testing
                print(count)
            else:
                data_label_df_list.append(data_label_df)
                
            break

print(len(data_label_df_list))

# Remove unwanted fields

In [None]:
drop_cols = ['ts', 'uid', 'id.orig_h', 'id.resp_h', 'local_orig', 
             'missed_bytes', 'history', 'tunnel_parents', 'threat', 'sample','service', 'conn_state']

data_list = []
label_list = []

for data_label_df in data_label_df_list:
    data_label_df = data_label_df.drop(drop_cols, axis=1)

    label_df = data_label_df[target_behavior_list]
    label = label_df.iloc[-1].tolist()
        
    data_cols = data_label_df.columns.difference(target_behavior_list)
    data_df = data_label_df[data_cols]
    data = data_df.values.tolist()
    
    data_list.append(data)
    label_list.append(label)

# Store data for training

In [None]:
with open('data_list_netflow_10_30', 'wb') as fp:
    pickle.dump(data_list, fp)

with open('label_list_netflow_10_30', 'wb') as fp:
    pickle.dump(label_list, fp)