In [1]:
import pandas as pd
import numpy as np

# check data
df = pd.read_csv('traffic1/anomaly2.csv')
df.head()

Unnamed: 0,as_src,as_dst,ip_dst,port_dst,ip_proto,packets,bytes,flows,timestamp_start_residual,timestamp_start
0,36040,50550,109.207.99.116,43081,6,8192,12468224,1,29725,2020-06-25 10:41:38
1,6507,51960,91.222.24.76,50672,17,8192,4120576,1,30836,2020-06-25 10:41:38
2,36351,51960,185.78.73.86,54888,6,8192,3235840,1,31095,2020-06-25 10:41:38
3,0,51960,91.222.24.124,49357,6,8192,12402688,1,31171,2020-06-25 10:41:38
4,40676,51960,91.222.24.65,50895,17,8192,9846784,1,31189,2020-06-25 10:41:38


# 1. get dataset for training

In [6]:
%%time

# ip address to int value (4 1B dec)
def get_ip_fea(ip):
    value = 0
    for i in range(len(ip.split('.'))):
        item = int( ip.split('.')[i] )
        value += item*(256**(3-i))
    return value

# time stamp to int value (total seconds)
# remove year-month-day since all 2020/6/25
def get_time_fea(tt):
    tt = tt.split(' ')[-1]
    value = 0
    for i in range(len(tt.split(':'))):
        item = int( tt.split(':')[i] )
        value += item*(60**(2-i))
    return value

def get_dataset():
    x_data=[]
    y_data=[]
    count_anomaly = 0
    df = pd.read_csv('traffic1/anomaly1.csv')
    for index, row in df.iterrows():
        fea = [int(row['as_src']), int(row['as_dst']), get_ip_fea(row['ip_dst']), int(row['port_dst']), int(row['ip_proto']), 
               int(row['packets']), int(row['bytes']), int(row['flows']), int(row['timestamp_start_residual']), get_time_fea(row['timestamp_start']) ]
        x_data.append(fea)
        if '185.78.74.30' in row['ip_dst']:
            y_data.append(1)
            count_anomaly+=1
        else:
            y_data.append(0)
    
    df = pd.read_csv('traffic1/anomaly2.csv')
    for index, row in df.iterrows():
        fea = [int(row['as_src']), int(row['as_dst']), get_ip_fea(row['ip_dst']), int(row['port_dst']), int(row['ip_proto']), 
               int(row['packets']), int(row['bytes']), int(row['flows']), int(row['timestamp_start_residual']), get_time_fea(row['timestamp_start']) ]
        x_data.append(fea)
        if '185.78.74.30' in row['ip_dst']:
            y_data.append(1)
            count_anomaly+=1
        else:
            y_data.append(0)

    df = pd.read_csv('traffic1/anomaly3.csv')
    for index, row in df.iterrows():
        fea = [int(row['as_src']), int(row['as_dst']), get_ip_fea(row['ip_dst']), int(row['port_dst']), int(row['ip_proto']), 
               int(row['packets']), int(row['bytes']), int(row['flows']), int(row['timestamp_start_residual']), get_time_fea(row['timestamp_start']) ]
        x_data.append(fea)
        if '185.78.74.30' in row['ip_dst']:
            y_data.append(1)
            count_anomaly+=1
        else:
            y_data.append(0)
            
    df = pd.read_csv('traffic1/anomaly-mostly.csv')
    for index, row in df.iterrows():
        fea = [int(row['as_src']), int(row['as_dst']), get_ip_fea(row['ip_dst']), int(row['port_dst']), int(row['ip_proto']), 
               int(row['packets']), int(row['bytes']), int(row['flows']), int(row['timestamp_start_residual']), get_time_fea(row['timestamp_start']) ]
        x_data.append(fea)
        y_data.append(1)
        count_anomaly+=1
            
    print('count_anomaly=',count_anomaly)
    return x_data, y_data

x_data, y_data = get_dataset()

count_anomaly= 2338
CPU times: user 1min 4s, sys: 40 ms, total: 1min 4s
Wall time: 1min 4s


In [5]:
x_data, y_data = np.array(x_data), np.array(y_data)
print(x_data.shape, y_data.shape)

(206925, 10) (206925,)


# 2. StandardScaler

In [7]:
%%time
from sklearn.preprocessing import StandardScaler

scalar = StandardScaler()
x_data = scalar.fit_transform(x_data)# normalization

CPU times: user 1.25 s, sys: 84 ms, total: 1.33 s
Wall time: 2.8 s


# 3. RandomForestClassifier

In [9]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier()
model.fit(x_data,y_data)
model.score(x_data,y_data)

0.9888321816670012

In [10]:
model.feature_importances_

array([0.04113733, 0.00321185, 0.02030687, 0.10276845, 0.01661686,
       0.        , 0.04071235, 0.        , 0.13608083, 0.63916546])

In [13]:
feature_list = ['as_src','as_dst','ip_dst','port_dst','ip_proto','packets','bytes','flows','timestamp_start_residual','timestamp_start']
for i in range(len(feature_list)):
    print( '{0}\t\t{1}'.format(feature_list[i], model.feature_importances_[i]))

as_src		0.04113732575164137
as_dst		0.003211848472232502
ip_dst		0.020306874739227227
port_dst		0.10276844950380493
ip_proto		0.016616861587658897
packets		0.0
bytes		0.040712347695042526
flows		0.0
timestamp_start_residual		0.1360808313738974
timestamp_start		0.6391654608764951


# 4. what if remove timestamp_start	

In [14]:
x_data2 = np.delete(x_data, np.s_[-1:], axis=1)# remove last column
print('x_data.shape={0}, x_data2.shape={1}'.format(x_data.shape,x_data2.shape))

x_data.shape=(209262, 10), x_data2.shape=(209262, 9)


In [15]:
from sklearn.ensemble import RandomForestClassifier

model2 = RandomForestClassifier()
model2.fit(x_data2,y_data)
model2.score(x_data2,y_data)

0.9888321816670012

In [16]:
feature_list = ['as_src','as_dst','ip_dst','port_dst','ip_proto','packets','bytes','flows','timestamp_start_residual']
for i in range(len(feature_list)):
    print( '{0}\t\t{1}'.format(feature_list[i], model2.feature_importances_[i]))

as_src		0.06702220762098197
as_dst		0.002856756073643305
ip_dst		0.035099158745460345
port_dst		0.2847206243880571
ip_proto		0.0441421364871197
packets		0.0
bytes		0.08028659710022826
flows		0.0
timestamp_start_residual		0.4858725195845092


In [17]:
# read  anomaly data to see
df1 = pd.read_csv('traffic1/anomaly-mostly.csv')
df1.head()

Unnamed: 0,as_src,as_dst,ip_dst,port_dst,ip_proto,packets,bytes,flows,timestamp_start_residual,timestamp_start
0,16276,51960,91.222.24.137,51989,6,8192,12402688,1,4293,2020-06-25 09:32:52
1,32934,51960,91.222.27.201,50400,6,8192,11649024,1,4340,2020-06-25 09:32:52
2,15133,51960,193.9.123.223,57334,6,8192,12402688,1,4604,2020-06-25 09:32:52
3,2906,51960,91.222.24.102,26417,6,8192,12304384,1,4700,2020-06-25 09:32:52
4,2906,51960,193.9.123.95,51357,6,8192,12402688,1,4733,2020-06-25 09:32:52


In [19]:
# read  no-anomaly data to see
df2 = pd.read_csv('traffic1/no-anomaly.csv')
df2.head()

Unnamed: 0,as_src,as_dst,ip_dst,port_dst,ip_proto,packets,bytes,flows,timestamp_start_residual,timestamp_start
0,16276,51960,91.222.25.49,57424,6,8192,630784,1,1585,2020-06-25 10:38:07
1,0,51960,193.9.123.80,48813,17,8192,11468800,1,8250,2020-06-25 10:38:07
2,2906,51960,91.222.24.209,60053,6,8192,12402688,1,85735,2020-06-25 10:38:07
3,36040,50550,109.207.96.150,58094,6,8192,12402688,1,86533,2020-06-25 10:38:07
4,2906,51960,193.9.123.226,51060,6,8192,12402688,1,86759,2020-06-25 10:38:07
