Some Features used by "https://www.freehaven.net/anonbib/cache/wpes11-panchenko.pdf" plus filtering and cleaning

## CLEANING:
### 1 - The maximum packet size observed without running the protocol is 609, so the suggestion is to remove packets of length lower than 615
### 2 - Removed TCP acks from server

## FEATURES
### 1 - total pkts
### 2 - total pkts incoming
### 3 - total pkts incoming / total
### 4 - total pkts sent / total
### 5 - total bytes
### 6 - total bytes incoming
### 7 - total bytes incoming / tot bytes
### 8 - total bytes sent / tot bytes
### 9 - incoming pkt size mean
### 10 - Markers: how many times the direction of flow changes
### 19 - 21 Top 3 Bursts size
### 11 - 13 Top 3 Bursts time: from the mean, find the time at which the 3 biggest packets > mean size were sent
### 14 - 16 Top 3 Bursts position: which are the pkt numbers of the top 3 bursts?
### 17 - 18 Percentage fluctuation in packet size: find the middle packet. Compute the ratio middle size / first packet size and last packet size / middle packet size
### 22 Average time between incoming pkts

In [5]:
import os.path

import pandas as pd
# Column names of the feature table, in the order they appear in the frame.
columns = [
    'cell_id',
    # Packet counts and direction shares.
    'num_packets',
    'in_pkts',
    'in_pkts/total',
    'out_pkts/total',
    # Byte totals and direction shares.
    'num_bytes',
    'bytes_in',
    'bytes_in/total',
    'bytes_out/total',
    'avg_in_pkt_size',
    'markers',
    # Top-3 burst columns, grouped by kind (size, time, seq) and then by rank.
    *(
        f'burst_{kind}_{rank}'
        for kind in ('size', 'time', 'seq')
        for rank in (1, 2, 3)
    ),
    'percentage_fluctuation_start_middle',
    'percentage_fluctuation_middle_end',
    'avg_interval_pkts',
]

# Empty frame that only carries the schema; rows are added later.
df = pd.DataFrame(columns=columns)

In [6]:
from os import listdir
from os.path import isfile, join, getsize
import scapy.all as S
import heapq

def _ratio_or_nan(numerator, denominator):
    """Return numerator / denominator, or NaN when the denominator is zero."""
    if not denominator:
        return float('nan')
    return numerator / denominator


def _extract_features(packets, cell_id):
    """Compute one feature row for a single capture.

    Parameters:
        packets: indexable, sized sequence of scapy packets (as returned by
            ``S.rdpcap``). Assumes every packet carries an IP layer -- TODO
            confirm that the ./filtered captures guarantee this.
        cell_id: identifier stored in the row's 'cell_id' field.

    Returns:
        dict mapping feature name to value. Features that are undefined for
        the capture (no packets, no incoming packets, fewer than three
        incoming packets, fewer than two incoming packets for the interval)
        are NaN instead of raising ZeroDivisionError / IndexError.
    """
    nan = float('nan')
    features = {'cell_id': cell_id}

    client_pkt_count = 0
    server_pkt_count = 0
    client_pkt_size_sum = 0   # packet lengths divided by 1000
    server_pkt_size_sum = 0   # packet lengths divided by 1000
    times_pkts_in = []        # scaled timestamps of incoming packets
    incoming = []             # (length, seq, scaled time) per incoming packet
    previous_direction = None
    markers = 0               # number of direction changes in the flow

    for seq, packet in enumerate(packets):
        ip_layer = packet.getlayer('IP').fields
        pkt_size = len(packet)

        # NOTE(review): this is a substring test, so any source address that
        # contains "172" anywhere counts as the client. Presumably it is meant
        # to match the client's 172.x.x.x address -- verify.
        direction = "client" if '172' in ip_layer['src'] else "server"
        if previous_direction is not None and direction != previous_direction:
            markers += 1
        previous_direction = direction

        if direction == "client":
            client_pkt_size_sum += pkt_size / 1000
            client_pkt_count += 1
        else:
            server_pkt_size_sum += pkt_size / 1000
            server_pkt_count += 1
            # NOTE(review): timestamps are divided by 1e9 here and the average
            # interval below is multiplied by 1e9 again, so only burst_time_*
            # keeps the scaled value. Confirm the intended unit.
            scaled_time = packet.time / (10**9)
            times_pkts_in.append(scaled_time)
            incoming.append((pkt_size, seq, scaled_time))

    total_pkts = client_pkt_count + server_pkt_count
    total_size = client_pkt_size_sum + server_pkt_size_sum

    features['num_packets'] = total_pkts
    features['in_pkts'] = server_pkt_count
    features['in_pkts/total'] = _ratio_or_nan(server_pkt_count, total_pkts)
    features['out_pkts/total'] = _ratio_or_nan(client_pkt_count, total_pkts)

    features['num_bytes'] = total_size
    features['bytes_in'] = server_pkt_size_sum
    features['bytes_in/total'] = _ratio_or_nan(server_pkt_size_sum, total_size)
    features['bytes_out/total'] = _ratio_or_nan(client_pkt_size_sum, total_size)

    features['avg_in_pkt_size'] = _ratio_or_nan(server_pkt_size_sum,
                                                server_pkt_count)
    features['markers'] = markers

    # Three largest incoming packets; equal sizes are ranked by earliest
    # position. Ranking on the exact integer length avoids a lossy
    # reciprocal round-trip through floats.
    top_bursts = heapq.nsmallest(
        3, incoming, key=lambda entry: (-entry[0], entry[1]))
    # Pad so captures with fewer than three incoming packets still get a row.
    top_bursts += [(nan, nan, nan)] * (3 - len(top_bursts))
    for rank, (size, seq, when) in enumerate(top_bursts, start=1):
        features[f'burst_size_{rank}'] = size / 1000
        features[f'burst_time_{rank}'] = when
        features[f'burst_seq_{rank}'] = seq

    # NOTE(review): the feature list at the top of this file describes these
    # two features as size ratios, whereas this computes a length difference
    # divided by 100 -- confirm which one is intended.
    if len(packets) > 0:
        start_len = len(packets[0])
        middle_len = len(packets[(len(packets) - 1) // 2])
        final_len = len(packets[-1])
        features['percentage_fluctuation_start_middle'] = (
            (middle_len - start_len) / 100)
        features['percentage_fluctuation_middle_end'] = (
            (final_len - middle_len) / 100)
    else:
        features['percentage_fluctuation_start_middle'] = nan
        features['percentage_fluctuation_middle_end'] = nan

    # Mean gap between consecutive incoming packets, scaled back up by 1e9.
    # At least two incoming packets are needed for a gap to exist.
    if len(times_pkts_in) > 1:
        gap_total = 0
        for earlier, later in zip(times_pkts_in, times_pkts_in[1:]):
            gap_total += later - earlier
        features['avg_interval_pkts'] = (
            gap_total * (10**9) / (len(times_pkts_in) - 1))
    else:
        features['avg_interval_pkts'] = nan

    return features


# Walk ./filtered/cell_1 .. ./filtered/cell_100 and build one row per capture.
rows = []
for i in range(1, 101):
    path = f"./filtered/cell_{i}/"
    files = [join(path, f) for f in listdir(path) if isfile(join(path, f))]
    for file in files:
        rows.append(_extract_features(S.rdpcap(file), i))

# Build the frame once: DataFrame.append was removed in pandas 2.0 and copied
# the whole frame on every call.
new_rows = pd.DataFrame(rows, columns=df.columns)
df = new_rows if df.empty else pd.concat([df, new_rows], ignore_index=True)
df.to_csv('features.csv', index=False)

