In [1]:
from keras.models import load_model
import pandas as pd
import numpy as np
# from data_prep import generate_features, prepare_samples, read_pcap_to_df
import logging
import time
import pyshark
import math
from sklearn.preprocessing import MinMaxScaler

In [2]:
def highest_layer(packet):
    layer = packet.layers[-1].layer_name.upper()
    if layer.startswith('DATA'):
        layer = packet.layers[-2].layer_name.upper()
    return layer


def get_ports(packet):
    if hasattr(packet, 'tcp'):
        return(packet.tcp.srcport, packet.tcp.dstport)
    if hasattr(packet, 'udp'):
        return(packet.udp.srcport, packet.udp.dstport)
    return 


def read_pcap_to_df(file_path):
    capture = pyshark.FileCapture(file_path)
    capture_summaries = pyshark.FileCapture(file_path, only_summaries=True)

    protocol_list = []
    timestamp_list = []
    src_dst_list = []
    src_port_list = []
    dst_port_list = [] 
    length_list = []
    http_method_list = []
    http_response_list = []

    for packet in capture:
        protocol = highest_layer(packet)
        ports = get_ports(packet)
        if hasattr(packet, 'http'):
            try:
                method = packet.http.request_method
            except Exception as e:
                print(e)
                method = ''
            try:
                response_code = packet.http.response_code
            except Exception as e:
                print(e)
                response_code = ''
        else:
            method = ''
            response_code = ''

        http_method_list.append(method)
        http_response_list.append(response_code) 
        protocol_list.append(protocol) 
        timestamp_list.append(packet.sniff_time)
        if ports is not None:
            src_port_list.append(ports[0])
            dst_port_list.append(ports[1])
        else:
            src_port_list.append(None)
            dst_port_list.append(None)

    timestamp_list.pop(0)
    protocol_list.pop(0)
    src_port_list.pop(0)
    dst_port_list.pop(0)
    http_method_list.pop(0)
    http_response_list.pop(0)

    for summary in capture_summaries:
        src_dst_list.append(summary.source + ' -> ' + summary.destination)
        length_list.append(summary.length)

    capture.close()
    capture_summaries.close()

    length_list = list(map(int, length_list))
    timeseries_data = {
        'Date': timestamp_list,
        'Protocol': protocol_list,
        'Pair': src_dst_list,
        'Src_port': src_port_list,
        'Dst_port': dst_port_list,
        'Length': length_list,
        'HTTP_method': http_method_list,
        'HTTP_response': http_response_list
    }

    # assemble the dataframe 
    dataframe = pd.DataFrame(timeseries_data, columns=['Date','Protocol', 'Pair','Src_port', 'Dst_port', 'Length', 'HTTP_method', 'HTTP_response'])
    return dataframe

In [3]:
def fill_up_list(l):
    print(l)
    if isinstance(l, float):
        if math.isnan(l):
            l = []
    if not isinstance(l, list):     
        l = l.tolist()
    n = 5
    if len(l) == n:
        return l
    diff = n - len(l)
    while diff > 0:
        l.append('')
        diff -= 1
    return l


In [4]:
def split_protocols(dataframe):
    dataframe['Top_5_internal_protocols_list'] = dataframe['Top_5_internal_protocols'].values.tolist()
    dataframe['Top_5_protocols_list'] = dataframe['Top_5_protocols'].values.tolist()

    dataframe['Top_5_internal_protocols_list'] = dataframe['Top_5_internal_protocols_list'].apply(fill_up_list)
    dataframe['Top_5_protocols_list'] = dataframe['Top_5_protocols_list'].apply(fill_up_list)
    print(dataframe.head(10))

    df2 = pd.DataFrame(dataframe['Top_5_internal_protocols_list'].to_list(), columns = ['i_protocol_1', 'i_protocol_2', 'i_protocol_3', 
                                                                                'i_protocol_4', 'i_protocol_5'])
    df1 = pd.DataFrame(dataframe['Top_5_protocols_list'].to_list(), columns = ['protocol_1', 'protocol_2', 'protocol_3',  
                                                                                   'protocol_4', 'protocol_5'])
    
    df1 = pd.get_dummies(df1, prefix=['protocol_1', 'protocol_2', 'protocol_3', 'protocol_4', 'protocol_5'])
    df2 = pd.get_dummies(df2, prefix=['i_protocol_1', 'i_protocol_2', 'i_protocol_3', 'i_protocol_4', 'i_protocol_5'])
    
    # reset indices to aviod getting nan values    
    df2.reset_index(drop=True, inplace=True)
    df1.reset_index(drop=True, inplace=True)
    dataframe.reset_index(drop=True, inplace=True)
    
    dataframe_joined = dataframe.join(df2)
    dataframe_joined = dataframe_joined.join(df1)
    dataframe_joined = dataframe_joined.drop(columns=['Top_5_internal_protocols_list', 'Top_5_protocols_list',
                                                      'Top_5_internal_protocols', 'Top_5_protocols'])

    print(dataframe_joined.head(10))
    return dataframe_joined

In [5]:
def generate_features(dataframe, frequency):
    src_dst_splitted = dataframe['Pair'].str.split(" -> ", n = 1, expand = True)
  
    # making separate first name column from new data frame
    dataframe['Src_address']= src_dst_splitted[0]
    # making separate last name column from new data frame
    dataframe['Dst_address']= src_dst_splitted[1]
    
    # resample to certain frequency chunks for internal to external pair count ratio 
    df_all_grouped = dataframe.groupby(['Pair', 'Protocol', pd.Grouper(freq=frequency, key='Date')])['Length'].count()

    df_all_grouped = pd.DataFrame(df_all_grouped).reset_index()
    df_internal_grouped = df_all_grouped.loc[(df_all_grouped['Pair'].str.startswith('172.20.1.') & 
        df_all_grouped['Pair'].str.contains('-> 172.20.1.')) | df_all_grouped['Protocol'].str.startswith('ARP')]

    df_all_grouped_pairs = df_all_grouped.groupby('Date')['Pair'].unique().reset_index()

    df_all_grouped_pairs['All_pairs_count'] = df_all_grouped_pairs['Pair'].str.len()
    df_all_grouped_pairs = df_all_grouped_pairs.drop(columns=['Pair'])

    df_internal_grouped_pairs = df_internal_grouped.groupby('Date')['Pair'].unique().reset_index()

    df_internal_grouped_pairs['Internal_pairs_count'] = df_internal_grouped_pairs['Pair'].str.len()
    df_internal_grouped_pairs = df_internal_grouped_pairs.drop(columns=['Pair'])

    merged_pairs = pd.merge(df_all_grouped_pairs, df_internal_grouped_pairs, on='Date', how='outer')
    merged_pairs ['Pairs_ratio'] = merged_pairs['Internal_pairs_count'] / merged_pairs['All_pairs_count']

    # resample to certain frequency chunks for internal to external packet count ratio and packet length median
    df_all_packet_count = dataframe.groupby([pd.Grouper(freq=frequency, key='Date')])['Length'].count().reset_index()
    df_all_packet_count = df_all_packet_count.rename(columns={'Length':'All_packets_count'})

    df_all_packet_mean = dataframe.groupby([pd.Grouper(freq=frequency, key='Date')])['Length'].mean().reset_index()
    df_all_packet_mean = df_all_packet_mean.rename(columns={'Length':'All_packets_mean'})

    df_all_packet_sum = dataframe.groupby([pd.Grouper(freq=frequency, key='Date')])['Length'].sum().reset_index()
    df_all_packet_sum = df_all_packet_sum.rename(columns={'Length':'All_packets_sum'})

    df_internal = dataframe.loc[(dataframe['Pair'].str.startswith('172.20.1.') & dataframe['Pair'].str.contains('-> 172.20.1.')) 
                                | dataframe['Protocol'].str.startswith('ARP')]

    df_internal_packet_count =  df_internal.groupby([pd.Grouper(freq=frequency, key='Date')])['Length'].count().reset_index()
    df_internal_packet_count = df_internal_packet_count.rename(columns={'Length':'Internal_packets_count'})

    df_internal_packet_mean =  df_internal.groupby([pd.Grouper(freq=frequency, key='Date')])['Length'].mean().reset_index()
    df_internal_packet_mean = df_internal_packet_mean.rename(columns={'Length':'Internal_packets_mean'})

    df_internal_packet_sum = df_internal.groupby([pd.Grouper(freq=frequency, key='Date')])['Length'].sum().reset_index()
    df_internal_packet_sum = df_internal_packet_sum.rename(columns={'Length':'Internal_packets_sum'})
 
    df_all_port_dest_count= dataframe.groupby([pd.Grouper(freq=frequency, key='Date')])['Dst_port'].unique().reset_index()
    df_all_port_dest_count['All_dst_ports_count'] = df_all_port_dest_count['Dst_port'].str.len()
    df_all_port_dest_count = df_all_port_dest_count.drop(columns=['Dst_port'])

    df_all_port_src_count= dataframe.groupby([pd.Grouper(freq=frequency, key='Date')])['Src_port'].unique().reset_index()
    df_all_port_src_count['All_src_ports_count'] = df_all_port_src_count['Src_port'].str.len()
    df_all_port_src_count = df_all_port_src_count.drop(columns=['Src_port'])

    df_all_ports = pd.merge(df_all_port_src_count, df_all_port_dest_count, on='Date', how='outer')
    df_all_ports['Ports_ratio'] = df_all_ports['All_dst_ports_count'] / df_all_ports['All_src_ports_count']

    # count source addresses
    df_all_add_src_count= dataframe.groupby([pd.Grouper(freq=frequency, key='Date')])['Src_address'].unique().reset_index()
    df_all_add_src_count['All_src_add_count'] = df_all_add_src_count['Src_address'].str.len()
    df_all_add_src_count = df_all_add_src_count.drop(columns=['Src_address'])

    # destination addresses 
    df_all_add_dest_count= dataframe.groupby([pd.Grouper(freq=frequency, key='Date')])['Dst_address'].unique().reset_index()
    df_all_add_dest_count['All_dst_add_count'] = df_all_add_dest_count['Dst_address'].str.len()
    df_all_add_dest_count = df_all_add_dest_count.drop(columns=['Dst_address'])

    # TCP internal packets
    df_internal_tcp = df_internal.loc[(df_internal['Protocol'] == 'TCP')]
    df_internal_tcp_count = df_internal_tcp.groupby([pd.Grouper(freq=frequency, key='Date')])['Length'].count().reset_index()
    df_internal_tcp_count = df_internal_tcp_count.rename(columns={'Length':'Internal_TCP_count'})

    # ARP packets
    df_arp = dataframe.loc[(dataframe['Protocol'] == 'ARP')]
    df_arp_count = df_arp.groupby([pd.Grouper(freq=frequency, key='Date')])['Length'].count().reset_index()
    df_arp_count = df_arp_count.rename(columns={'Length':'ARP_count'})

    # SSH internal packets
    df_ssh = dataframe.loc[(dataframe['Protocol'] == 'SSH')]
    df_ssh_count = df_ssh.groupby([pd.Grouper(freq=frequency, key='Date')])['Length'].count().reset_index()
    df_ssh_count = df_ssh_count.rename(columns={'Length':'SSH_count'})

    df_all_small_packets = dataframe.loc[(dataframe['Length'] < 100)]
    df_all_small_packets_count = df_all_small_packets.groupby([pd.Grouper(freq=frequency, key='Date')])['Length'].count().reset_index()
    df_all_small_packets_count = df_all_small_packets_count.rename(columns={'Length':'All_small_packet_count'})

    # packet count per internal source address
    df_packet_count_per_src_add = df_internal.groupby(['Src_address', pd.Grouper(freq=frequency, key='Date')])['Length'].count().reset_index()
    df_packet_avg_packet_per_src_add = df_packet_count_per_src_add.groupby([pd.Grouper(freq=frequency, key='Date')])['Length'].mean().reset_index()
    df_packet_avg_packet_per_src_add = df_packet_avg_packet_per_src_add.rename(columns={'Length':'Avg_packet_count_per_src_add'})

    # packet count per internal destination address
    df_packet_count_per_dst_add = df_internal.groupby(['Dst_address', pd.Grouper(freq=frequency, key='Date')])['Length'].count().reset_index()
    df_packet_avg_packet_per_dst_add = df_packet_count_per_dst_add.groupby([pd.Grouper(freq=frequency, key='Date')])['Length'].mean().reset_index()
    df_packet_avg_packet_per_dst_add = df_packet_avg_packet_per_dst_add.rename(columns={'Length':'Avg_packet_count_per_dst_add'})

    # packet count per source port
    df_packet_count_per_src_port = df_internal.groupby(['Src_port', pd.Grouper(freq=frequency, 
                                                                            key='Date')])['Length'].count().reset_index()
    df_packet_avg_packet_per_src_port = df_packet_count_per_src_port.groupby([pd.Grouper(freq=frequency, 
                                                                                key='Date')])['Length'].mean().reset_index()
    df_packet_avg_packet_per_src_port = df_packet_avg_packet_per_src_port.rename(columns={'Length':'Avg_packet_count_per_src_port'})

    # packet count per destination port
    df_packet_count_per_dst_port = df_internal.groupby(['Dst_port', pd.Grouper(freq=frequency, key='Date')])['Length'].count().reset_index()
    df_packet_avg_packet_per_dst_port = df_packet_count_per_dst_port.groupby([pd.Grouper(freq=frequency, key='Date')])['Length'].mean().reset_index()
    df_packet_avg_packet_per_dst_port = df_packet_avg_packet_per_dst_port.rename(columns={'Length':'Avg_packet_count_per_dst_port'})

    # packet count per internal pair
    df_packet_count_per_pair = df_internal.groupby(['Pair', pd.Grouper(freq=frequency, key='Date') ])['Length'].count().reset_index()
    df_avg_packet_count_per_pair = df_packet_count_per_pair.groupby([pd.Grouper(freq=frequency, key='Date')])['Length'].mean().reset_index()
    df_avg_packet_count_per_pair = df_avg_packet_count_per_pair.rename(columns={'Length':'Avg_packet_count_per_pair'})


    dataframe[['HTTP_response','HTTP_method' ]] = dataframe[['HTTP_response','HTTP_method' ]].fillna('')

    # count 5xx codes
    df_http_codes_5xx = dataframe.loc[(dataframe['HTTP_response'].str.match('^5\d{2}\.0$')== True)]
    df_http_5xx_count = df_http_codes_5xx.groupby([pd.Grouper(freq=frequency, key='Date')])['Length'].count().reset_index()
    df_http_5xx_count = df_http_5xx_count.rename(columns={'Length':'HTTP_5xx_count'})

    # count 4xx codes
    df_http_codes_4xx = dataframe.loc[(dataframe['HTTP_response'].str.match('^4\d{2}\.0$')== True)]
    df_http_4xx_count = df_http_codes_4xx.groupby([pd.Grouper(freq=frequency, key='Date')])['Length'].count().reset_index()
    df_http_4xx_count = df_http_4xx_count.rename(columns={'Length':'HTTP_4xx_count'})

    # count other than 'GET', 'POST' methods
    df_other_http_requests = dataframe.loc[((dataframe['HTTP_method'] != 'GET') & (dataframe['HTTP_method'] != 'POST') &  
                                            (dataframe['HTTP_method'] != ''))]
    df_other_http_requests_count = df_other_http_requests.groupby([pd.Grouper(freq=frequency, key='Date')])['Length'].count().reset_index()
    df_other_http_requests_count = df_other_http_requests_count.rename(columns={'Length':'Other_HTTP_count'})

    means_merged = pd.merge(df_all_packet_mean, df_internal_packet_mean, on='Date', how='outer')
    means_merged = pd.merge(means_merged, df_packet_avg_packet_per_src_add, on='Date', how='outer')
    means_merged = pd.merge(means_merged, df_packet_avg_packet_per_dst_add, on='Date', how='outer')
    means_merged = pd.merge(means_merged, df_packet_avg_packet_per_src_port, on='Date', how='outer')
    means_merged = pd.merge(means_merged, df_packet_avg_packet_per_dst_port, on='Date', how='outer')
    means_merged = pd.merge(means_merged, df_avg_packet_count_per_pair, on='Date', how='outer')
    means_merged = means_merged.set_index('Date')

    merged = pd.merge(merged_pairs, df_all_add_src_count, on='Date', how='outer')
    merged = pd.merge(merged, df_all_add_dest_count, on='Date', how='outer')
    merged = pd.merge(merged, df_all_ports, on='Date', how='outer')
    merged = pd.merge(merged, df_all_packet_sum, on='Date', how='outer')
    merged = pd.merge(merged, df_internal_packet_sum, on='Date', how='outer')
    merged = pd.merge(merged, df_all_packet_count, on='Date', how='outer' )
    merged = pd.merge(merged, df_all_small_packets_count, on='Date', how='outer')
    merged = pd.merge(merged, df_internal_packet_count, on='Date', how='outer')
    merged = pd.merge(merged, df_arp_count, on='Date', how='outer' )
    merged = pd.merge(merged, df_internal_tcp_count, on='Date', how='outer')
    merged = pd.merge(merged, df_ssh_count, on='Date', how='outer')
    merged = pd.merge(merged, df_http_5xx_count, on='Date', how='outer')
    merged = pd.merge(merged, df_http_4xx_count, on='Date', how='outer')
    merged = pd.merge(merged, df_other_http_requests_count, on='Date', how='outer')


    merged ['Dst_src_address_ratio'] =  merged['All_dst_add_count'] / merged['All_src_add_count']
    merged ['Packet_count_ratio'] = merged['Internal_packets_count'] / merged['All_packets_count']
    merged ['Packet_sum_ratio'] = merged['Internal_packets_sum'] / merged['All_packets_sum']
    merged ['Small_packet_ratio'] = merged['All_small_packet_count'] / merged['All_packets_count']
    merged ['ARP_packet_ratio'] = merged['ARP_count'] /merged['All_packets_count']
    merged ['TCP_packet_ratio'] = merged['Internal_TCP_count'] /merged['Internal_packets_count']
    merged ['SSH_packet_ratio'] = merged['SSH_count'] /merged['All_packets_count']

    merged = merged.set_index('Date')

    cut_first = 1
    cut_last = 1

    merged = merged.iloc[cut_first: , :]
    merged = merged.iloc[:-cut_last,:]

    means_merged = means_merged.iloc[cut_first: , :]
    means_merged = means_merged.iloc[:-cut_last,:]

    df_top_protocols = get_top5_protocols(dataframe, frequency, internal=False)
    df_top_internal_protocols = get_top5_protocols(df_internal, frequency, internal=True)

    features_merged = pd.merge(merged, means_merged, on='Date', how='outer')
    features_merged = pd.merge(features_merged, df_top_protocols, on='Date', how='outer')
    features_merged = pd.merge(features_merged, df_top_internal_protocols, on='Date', how='outer')
    return features_merged


def get_top5_protocols(dataframe, frequency, internal):
    # top 5 protocols used among all pairs
    df_protocols_count = dataframe.groupby(['Protocol', pd.Grouper(freq=frequency, key='Date')])['Length'].count().reset_index()
    # print(df_protocols_count.to_string())
    df_top_protocols = df_protocols_count.groupby('Date')['Length'].nlargest(5).reset_index()
    protocol_list = df_protocols_count['Protocol'].iloc[df_top_protocols['level_1']]
    protocol_list = protocol_list.to_frame()
    protocol_list.reset_index(drop=True, inplace=True)

    df_top_protocols['Protocol'] = protocol_list['Protocol']
    df_top_protocols = df_top_protocols.drop(columns=['level_1', 'Length'])
    df_top_protocols = df_top_protocols.groupby('Date')['Protocol'].unique()

    df_top_protocols = df_top_protocols.to_frame()

    if internal: 
        column_name = 'Top_5_internal_protocols'
    else: 
        column_name = 'Top_5_protocols'
    df_top_protocols = df_top_protocols.rename(columns={'Protocol':column_name})

    # remove last and the first interval as they might not carry full time interval defined by frequency 
    df_top_protocols = df_top_protocols.iloc[1: , :]
    df_top_protocols = df_top_protocols.iloc[:-1,:]
    return df_top_protocols

In [6]:
def prepare_samples(dataframe, for_training):
#     dataframe = split_protocols(dataframe)


    # when preparing the data for training process, classes need to be encoded
    if for_training:
        class2_non_normalized =  dataframe['class']
        dataframe = encode_classes(dataframe)
        dataframe = dataframe.drop(columns = ['class'])

    # drop unncecessary columns
#     print(dataframe.head(5))
#     print(dataframe.columns)
#     dataframe = dataframe.drop(columns = ['Date'])
    # remove nulls
    dataframe = dataframe.fillna(0) 
    x = dataframe.values #returns a numpy array
    # normalize data 
    min_max_scaler = MinMaxScaler()
    x_scaled = min_max_scaler.fit_transform(x)
    df_normalized = pd.DataFrame(x_scaled, columns=dataframe.columns)
    if for_training:
        df_normalized['class2'] = class2_non_normalized
    return df_normalized

In [7]:
pcap_path = 'standardized_data/regular_traffic2_29_04.pcap'
frequency = '30S'
dataframe_regular_traffic = read_pcap_to_df(pcap_path)
dataframe_features_regular_traffic = generate_features(dataframe_regular_traffic, frequency)
dataframe_features_regular_traffic = split_protocols(dataframe_features_regular_traffic)





['ARP' 'NBNS' 'BROWSER']
['ARP']
['ARP']
['ARP']
['UDP' 'GQUIC' 'ARP' 'STP' 'TCP']
['UDP' 'QUIC' 'GQUIC' 'ARP' 'STP']
['UDP' 'QUIC' 'GQUIC' 'TCP' 'DNS']
['UDP' 'GQUIC' 'TCP' 'ARP' 'STP']
                     All_pairs_count  Internal_pairs_count  Pairs_ratio  \
Date                                                                      
2022-04-29 16:33:00               56                     8     0.142857   
2022-04-29 16:33:30               69                     5     0.072464   
2022-04-29 16:34:00               68                     3     0.044118   
2022-04-29 16:34:30               74                     5     0.067568   

                     All_src_add_count  All_dst_add_count  \
Date                                                        
2022-04-29 16:33:00                 26                 21   
2022-04-29 16:33:30                 31                 24   
2022-04-29 16:34:00                 30                 27   
2022-04-29 16:34:30                 31               

In [8]:
# read and prepare training data 
dataframe = pd.read_csv('features_csv_with_http/features_merged_30_05_reduced_fixed_list.csv')  
# print(dataframe)

dataframe['Top_5_internal_protocols'] = dataframe['Top_5_internal_protocols'].str.replace(']', '')
dataframe['Top_5_internal_protocols'] = dataframe['Top_5_internal_protocols'].str.replace('[', '')
dataframe['Top_5_protocols'] = dataframe['Top_5_protocols'].str.replace(']', '')
dataframe['Top_5_protocols'] = dataframe['Top_5_protocols'].str.replace('[', '')


dataframe['Top_5_internal_protocols_list'] = dataframe['Top_5_internal_protocols'].str.split(',')
dataframe['Top_5_protocols_list'] = dataframe['Top_5_protocols'].str.split(',')
dataframe = dataframe.drop(columns=['Top_5_internal_protocols', 'Top_5_protocols'])
# print(dataframe['Top_5_internal_protocols_list'].to_string())


dataframe['Top_5_internal_protocols_list'] = dataframe['Top_5_internal_protocols_list'].apply(fill_up_list)
dataframe['Top_5_protocols_list'] = dataframe['Top_5_protocols_list'].apply(fill_up_list)
print('after filling up list')
print(dataframe.head(10))

df2 = pd.DataFrame(dataframe['Top_5_internal_protocols_list'].to_list(), columns = ['i_protocol_1', 'i_protocol_2', 'i_protocol_3', 
                                                                            'i_protocol_4', 'i_protocol_5'])
df1 = pd.DataFrame(dataframe['Top_5_protocols_list'].to_list(), columns = ['protocol_1', 'protocol_2', 'protocol_3', 
                                                                            'protocol_4', 'protocol_5'])

dataframe_joined = dataframe.join(df2)
dataframe_joined = dataframe_joined.join(df1)
# print(dataframe_joined)

dataframe_joined = dataframe_joined.drop(columns=['Top_5_internal_protocols_list', 'Top_5_protocols_list'])
df_dummies = pd.get_dummies(data = dataframe_joined, columns = ['i_protocol_1', 'i_protocol_2', 'i_protocol_3', 
                                                 'i_protocol_4', 'i_protocol_5', 'protocol_1', 'protocol_2', 
                                                 'protocol_3', 'protocol_4', 'protocol_5' ])
df_dummies = df_dummies.drop(columns = ['class', 'Date', 'Unnamed: 0'])

for col in df_dummies.columns:
    df_dummies.rename(columns={col:col.replace("'","")},inplace=True)

["'ARP'", "'ICMP'"]
["'ARP'", "'ICMP'"]
["'ARP'", "'ICMP'"]
["'ARP'", "'ICMP'"]
["'ARP'", "'ICMP'"]
["'ARP'", "'ICMP'"]
["'ARP'", "'ICMP'"]
["'ARP'", "'ICMP'"]
["'ARP'", "'ICMP'"]
["'TCP'", "'HTTP'", "'ARP'", "'JSON'", "'NBNS'"]
["'TCP'", "'ARP'", "'HTTP'", "'JSON'", "'NBNS'"]
["'TCP'", "'ARP'", "'HTTP'", "'JSON'"]
["'TCP'", "'ARP'", "'HTTP'", "'JSON'", "'BROWSER'"]
["'TCP'", "'ARP'", "'HTTP'", "'JSON'"]
["'TCP'", "'ARP'", "'HTTP'", "'JSON'"]
["'TCP'", "'ARP'", "'HTTP'", "'JSON'", "'BROWSER'"]
["'TCP'", "'ARP'", "'HTTP'", "'JSON'"]
["'TCP'", "'ARP'", "'HTTP'", "'JSON'"]
["'TCP'", "'ARP'", "'FTP'", "'NBNS'", "'UDP'"]
["'TCP'", "'ARP'", "'FTP'", "'UDP'"]
["'TCP'", "'ARP'", "'FTP'", "'UDP'"]
["'TCP'", "'ARP'", "'FTP'", "'UDP'"]
["'TCP'", "'ARP'", "'FTP'", "'BROWSER'"]
["'TCP'", "'ARP'", "'FTP'"]
["'TCP'", "'ARP'", "'FTP'"]
["'TCP'", "'ARP'", "'FTP'"]
["'TCP'", "'ARP'", "'FTP'", "'NBNS'"]
["'TCP'", "'SSH'", "'ARP'", "'UDP'"]
["'TCP'", "'SSH'", "'ARP'", "'UDP'"]
["'TCP'", "'SSH'", "'ARP'", 

  dataframe['Top_5_internal_protocols'] = dataframe['Top_5_internal_protocols'].str.replace(']', '')
  dataframe['Top_5_internal_protocols'] = dataframe['Top_5_internal_protocols'].str.replace('[', '')
  dataframe['Top_5_protocols'] = dataframe['Top_5_protocols'].str.replace(']', '')
  dataframe['Top_5_protocols'] = dataframe['Top_5_protocols'].str.replace('[', '')


In [9]:
frames = [dataframe, dataframe_features_regular_traffic]
result_df = pd.concat([df_dummies, dataframe_features_regular_traffic], ignore_index=True, sort=False)
result_df
dataframe_prepared_result = prepare_samples(result_df, for_training=False)

In [21]:
# read the features list for multiclassification model 
dataframe = dataframe_prepared_result
merged_features_list = []
with open(r'features_multiclass_model.txt', 'r') as fp:
    for line in fp:
        x = line[:-1]
        merged_features_list.append(x)

required_features_dataframe = pd.DataFrame(columns=merged_features_list)
required_features_dataframe = pd.concat([required_features_dataframe,dataframe])
required_features_dataframe = required_features_dataframe.fillna(0) 
X = required_features_dataframe[merged_features_list]
X = X.drop(columns = 'class2')
print(X.shape)
print(X.columns)
# print(y)

model = load_model('multiclass_classification_model.h5')
X = np.asarray(X).astype('float32')
y_pred = model.predict(X)
y_pred_rounded = np.round(y_pred, 2)
labels_list = ['ARP_spoofing', 'Docker', 'Exploit', 'Hydra', 'Nmap_sV', 'Regular_traffic', 'SQL_injection'] 
y_pred_df = pd.DataFrame(data=y_pred_rounded, columns = labels_list )
y_pred_df = pd.DataFrame(data=y_pred_rounded, columns = labels_list )
print(y_pred_df)
# second highest prediction value 
print(y_pred_df.to_string())
y_second_pred = y_pred_df.apply(lambda row: row.nlargest(2).values[-1],axis=1)
print(y_second_pred)

# highest prediction value
y_max_pred= y_pred_df.max(axis=1)
print(y_max_pred.to_string())

# first and second predicted class
predictions_df = y_pred_df.apply(lambda s, n: pd.Series(s.nlargest(n).index), axis=1, n=2)
predictions_df = predictions_df.rename(columns = {0: "predicted_class_1", 1: "predicted_class_2"})
predictions_df['max_pred_val'] = y_max_pred
predictions_df['second_pred_val'] = y_second_pred
print(predictions_df)


# test with binary model
model_binary = load_model('binary_classification_model_2.h5')
binary_features_list = []
with open(r'features_binary_model.txt', 'r') as fp:
    for line in fp:
        x = line[:-1]
        binary_features_list.append(x)


X_binary = required_features_dataframe[binary_features_list]
print(X_binary)
y_pred_binary = model_binary.predict(X_binary)
print(y_pred_binary)

(210, 71)
Index(['Avg_packet_count_per_dst_add', 'i_protocol_5_PORTMAP',
       'Avg_packet_count_per_dst_port', 'i_protocol_4_UDP', 'i_protocol_2_SSH',
       'Internal_TCP_count', 'i_protocol_2_UDP', 'i_protocol_4_JSON',
       'All_packets_count', 'Avg_packet_count_per_pair', 'i_protocol_2_ARP',
       'Small_packet_ratio', 'Avg_packet_count_per_src_add',
       'Internal_pairs_count', 'HTTP_4xx_count', 'protocol_5_HTTP',
       'i_protocol_4_RPC', 'All_src_ports_count', 'i_protocol_3_MYSQL',
       'Pairs_ratio', 'All_pairs_count', 'i_protocol_4_', 'i_protocol_1_TCP',
       'SSH_count', 'SSH_packet_ratio', 'All_packets_mean',
       'Internal_packets_count', 'i_protocol_2_ICMP', 'protocol_2_ARP',
       'Packet_count_ratio', 'protocol_5_ICMP', 'protocol_1_TCP',
       'All_dst_add_count', 'Internal_packets_mean', 'protocol_4_HTTP',
       'protocol_3_DATA', 'Other_HTTP_count', 'protocol_1_UDP',
       'protocol_2_DNS', 'protocol_3_GQUIC', 'Internal_packets_sum',
       'protocol_2

    predicted_class_1 predicted_class_2  max_pred_val  second_pred_val
0        ARP_spoofing           Exploit          0.99             0.01
1        ARP_spoofing           Exploit          0.99             0.01
2        ARP_spoofing           Nmap_sV          0.99             0.01
3        ARP_spoofing           Nmap_sV          0.99             0.01
4        ARP_spoofing           Nmap_sV          0.99             0.01
5        ARP_spoofing           Nmap_sV          0.99             0.01
6        ARP_spoofing            Docker          0.99             0.00
7        ARP_spoofing            Docker          1.00             0.00
8        ARP_spoofing            Docker          0.99             0.00
9               Hydra            Docker          0.42             0.27
10             Docker             Hydra          0.36             0.21
11             Docker   Regular_traffic          0.39             0.23
12             Docker   Regular_traffic          0.37             0.24
13    

[[0.8669756 ]
 [0.83450556]
 [0.8380005 ]
 [0.83768237]
 [0.8442918 ]
 [0.8484546 ]
 [0.87890124]
 [0.9030732 ]
 [0.90675664]
 [0.9305819 ]
 [0.9568349 ]
 [0.9770786 ]
 [0.956882  ]
 [0.97480655]
 [0.9750576 ]
 [0.95867115]
 [0.9765377 ]
 [0.97403324]
 [0.90209675]
 [0.9379    ]
 [0.93506587]
 [0.9362351 ]
 [0.9024792 ]
 [0.9322823 ]
 [0.9336732 ]
 [0.93216157]
 [0.93804574]
 [0.95658344]
 [0.95880055]
 [0.95886827]
 [0.9796625 ]
 [0.96646345]
 [0.97451055]
 [0.974566  ]
 [0.9324412 ]
 [0.9744016 ]
 [0.9854318 ]
 [0.9670168 ]
 [0.9760562 ]
 [0.9738829 ]
 [0.97704685]
 [0.9947938 ]
 [0.97733194]
 [0.9708022 ]
 [0.97739387]
 [0.04066756]
 [0.0447872 ]
 [0.06418556]
 [0.06212947]
 [0.06594938]
 [0.04241329]
 [0.06126389]
 [0.04288024]
 [0.04100344]
 [0.04087868]
 [0.03875357]
 [0.06465843]
 [0.03422558]
 [0.07163689]
 [0.06496304]
 [0.04012308]
 [0.03885186]
 [0.03996623]
 [0.04010025]
 [0.06043229]
 [0.05141214]
 [0.23231262]
 [0.03960606]
 [0.05944499]
 [0.05376577]
 [0.08065581]
 [0.04

In [22]:
pd.set_option('display.max_rows', 500)
predictions_df['binary'] = y_pred_binary
predictions_df

Unnamed: 0,predicted_class_1,predicted_class_2,max_pred_val,second_pred_val,binary
0,ARP_spoofing,Exploit,0.99,0.01,0.866976
1,ARP_spoofing,Exploit,0.99,0.01,0.834506
2,ARP_spoofing,Nmap_sV,0.99,0.01,0.838
3,ARP_spoofing,Nmap_sV,0.99,0.01,0.837682
4,ARP_spoofing,Nmap_sV,0.99,0.01,0.844292
5,ARP_spoofing,Nmap_sV,0.99,0.01,0.848455
6,ARP_spoofing,Docker,0.99,0.0,0.878901
7,ARP_spoofing,Docker,1.0,0.0,0.903073
8,ARP_spoofing,Docker,0.99,0.0,0.906757
9,Hydra,Docker,0.42,0.27,0.930582


In [19]:
old_predictions_df = predictions_df

In [20]:
old_predictions_df

Unnamed: 0,predicted_class_1,predicted_class_2,max_pred_val,second_pred_val,binary
0,ARP_spoofing,Exploit,0.99,0.01,0.960687
1,ARP_spoofing,Exploit,0.99,0.01,0.947091
2,ARP_spoofing,Nmap_sV,0.99,0.01,0.959914
3,ARP_spoofing,Nmap_sV,0.99,0.01,0.957747
4,ARP_spoofing,Nmap_sV,0.99,0.01,0.956449
5,ARP_spoofing,Nmap_sV,0.99,0.01,0.949631
6,ARP_spoofing,Docker,0.99,0.0,0.957917
7,ARP_spoofing,Docker,1.0,0.0,0.955216
8,ARP_spoofing,Docker,0.99,0.0,0.958931
9,Hydra,Docker,0.42,0.27,0.991891
