In [1]:
input_file='data/prod-anon-021.pcap'

<h1 align='center'> DDoS Attack (Convert, Filter and) Labelling </h1>

<h2 align='center'> 
<div style="width:400px;padding:10px;border:1px dotted black;">
Goal: to label DDoS attacks.
</div>
</h2>

<img align='center' src="figs/summary.png" width="400px"/>

- The **input** is a packet-based network trace file that contains an attack, usually with the extension .pcap or pcapng.
- The **output** is a string with a label of the attack found in the input file.

## Research Question (RQ) definitions:
- **RQ1: How to efficiently read and convert packet-based network traces containing DDoS attacks for facilitating futher analysis?** DDoS attack trace contains a very large amount of records making very expensive computationally to read and analyse them. These records are nested, i.e., packets can have different set of information (network fields), which makes even more challenging to analyse them in a single manner. There are different tools and libraries for reading packet-based network traces 

- **RQ2: How to efficiently identify the main characteristics of a DDoS attack?** From a practical point of view the definition of DDoS attack is a set of network records with a same characteristics, which is the majority of the network traffic compared to the remaining traffic.

- **RQ3: How the characteristics of DDoS attacks can be used for labeling them?** There are many taxonomies to classify and label DDoS attacks. Our approach to address this question is based on the practical experience of network operators and network security specialists. In this approach they label attacks based on their observations on the attacks.

<h2 align='center'>===============================================================<br> RQ1: How to efficiently read and convert packet-based network traces containing DDoS attacks for facilitating futher analysis?</h2>

We analyse the performance of four main tools and libraries for read packet-based network traces: 
- [tcpdump (see our analysis)](analysis_tcpdump_tshark_scapy.ipynb)
- [tshark (see our analysis)](analysis_tcpdump_tshark_scapy.ipynb)
- [scapy (see our analysis)](analysis_tcpdump_tshark_scapy.ipynb)
- [dpkt (see our analysis)](additional_analysis/analysis_dpkt.ipynb)

**In the end we decided for dpkt that has the second best performance but it is easier to manipulate the output for extension purposes.**

## Converting pcap (using dpkt)

In [2]:
output_file=input_file.split('.')[0]+".txt"

In [3]:
import argparse
import dpkt
import socket
import os

In [4]:
outputfile = open(output_file,'w')

inputfile = open(input_file)
pcapfile = dpkt.pcap.Reader(inputfile)

for ts, buf in pcapfile:
    eth = dpkt.ethernet.Ethernet(buf)

    #FILTERING ONLY FOR IPv4 instead of packets ARP or IPv6
    if eth.type == dpkt.ethernet.ETH_TYPE_IP:
        ip = eth.data #Loading the content of the ethernet into a variable 'ip'
        
        timestamp = ts #1
        ip_ttl = ip.ttl #2
        
        ip_proto = ip.p #3
        sport = ""
        dport= ""
        tcp_flag =""
        http_request_method=""
        if (ip_proto != 6) and (ip_proto != 17): #It is not TCP or UDP
            continue
            
        ip_length = ip.len #4
        ip_src = socket.inet_ntoa(ip.src) #5
        ip_dst = socket.inet_ntoa(ip.dst) #6
        
        try: proto = ip.data #Loading the content of the 'ip' into a variable 'protocol' that can be for example ICMP, TCP, and UDP.
        except:
            continue
        
        sport = proto.sport #7
        dport = proto.dport #8


        if ip.p == 6 :
            try:
                tcp_flag += ("F" if (int( proto.flags & dpkt.tcp.TH_FIN ) != 0) else ".") #27
                tcp_flag += ("S" if (int( proto.flags & dpkt.tcp.TH_SYN ) != 0) else ".") #26
                tcp_flag += ("R" if (int( proto.flags & dpkt.tcp.TH_RST ) != 0) else ".") #25
                tcp_flag += ("P" if (int( proto.flags & dpkt.tcp.TH_PUSH) != 0) else ".") #24
                tcp_flag += ("A" if (int( proto.flags & dpkt.tcp.TH_ACK ) != 0) else ".") #23
                tcp_flag += ("U" if (int( proto.flags & dpkt.tcp.TH_URG ) != 0) else ".") #22
                tcp_flag += ("E" if (int( proto.flags & dpkt.tcp.TH_ECE ) != 0) else ".") #21
                tcp_flag += ("C" if (int( proto.flags & dpkt.tcp.TH_CWR ) != 0) else ".") #20
            except:
                print "EXCEPTION TCP FLAG"

            if (proto.dport == 80) or (proto.dport == 443):
                    if proto.data == '':
                        http_request_method=''
                    else:
                        try:
                            http_request_method = dpkt.http.Request(proto.data).method
                        except:
                            http_request_method = ''

            
        fragments = 1 if (int(ip.off & dpkt.ip.IP_MF)!= 0) else 0  #8 This flag is set to a 1 for all fragments except the last one            

        print >> outputfile, str(timestamp)+';'+\
        str(ip_ttl)+';'+\
        str(ip_proto)+';'+\
        str(ip_length)+';'+\
        str(ip_src)+';'+\
        str(ip_dst)+';'+\
        str(sport)+';'+\
        str(dport)+';'+\
        str(tcp_flag)+';'+\
        str(fragments)+';'+\
        str(http_request_method)

<h2>==========================================================<br>
Analysing converted pcap</h2>

In [5]:
import pandas as pd #more info at http://pandas.pydata.org/
import numpy as np #more info at http://www.numpy.org/

In [6]:
columns=['timestamp',\
         'ip_ttl',\
         'ip_proto',\
         'ip_length',\
         'ip_src',\
         'ip_dst',\
         'sport',\
         'dport',\
         'tcp_flag',\
         'fragments',\
         'http_data']

df = pd.read_csv(output_file,delimiter=";", names=columns)
len(df)

9968

In [7]:
df

Unnamed: 0,timestamp,ip_ttl,ip_proto,ip_length,ip_src,ip_dst,sport,dport,tcp_flag,fragments,http_data
0,1.419063e+09,60,6,52,231.56.247.245,179.122.158.191,53757,80,....A...,0.0,
1,1.419063e+09,59,6,52,231.100.233.183,179.122.158.191,50158,80,....A...,0.0,
2,1.419063e+09,60,6,52,231.56.247.245,179.122.158.191,53757,80,....A...,0.0,
3,1.419063e+09,59,6,52,231.100.233.183,179.122.158.191,50158,80,....A...,0.0,
4,1.419063e+09,59,6,52,231.100.233.183,179.122.158.191,50158,80,....A...,0.0,
5,1.419063e+09,60,6,52,231.56.247.245,179.122.158.191,53757,80,....A...,0.0,
6,1.419063e+09,60,6,52,231.56.247.245,179.122.158.191,53757,80,....A...,0.0,
7,1.419063e+09,60,6,52,231.56.247.245,179.122.158.191,53757,80,....A...,0.0,
8,1.419063e+09,59,6,52,231.100.233.183,179.122.158.191,50158,80,....A...,0.0,
9,1.419063e+09,59,6,52,231.100.233.183,179.122.158.191,50158,80,....A...,0.0,


In [8]:
top_ip_dst = df['ip_dst'].value_counts().index[0]
top_ip_dst

'179.122.158.191'

In [9]:
top_ip_proto = df[df['ip_dst']==top_ip_dst]['ip_proto'].value_counts().index[0]
top_ip_proto

6

In [10]:
total_packets = len(df[(df['ip_dst']==top_ip_dst) &\
                ~df['ip_src'].str.contains(".".join(top_ip_dst.split('.')[0:3]),na=False) &\
                       (df['ip_proto'] == top_ip_proto)])
total_packets

9968

In [11]:
percent_src_ports = df[(df['ip_dst']==top_ip_dst) &\
                ~df['ip_src'].str.contains(".".join(top_ip_dst.split('.')[0:3]),na=False) &\
                   (df['ip_proto'] == top_ip_proto)]['sport'].value_counts().divide(total_packets/100)
percent_src_ports.head()

53757    52.898990
50158    47.747475
51711     0.040404
Name: sport, dtype: float64

In [12]:
percent_dst_ports = df[(df['ip_dst']==top_ip_dst) &\
                ~df['ip_src'].str.contains(".".join(top_ip_dst.split('.')[0:3]),na=False) &\
                   (df['ip_proto'] == top_ip_proto)]['dport'].value_counts().divide(total_packets/100)
percent_dst_ports.head()

80    100.686869
Name: dport, dtype: float64

In [13]:
ports_pairs = df[(df['ip_dst']==top_ip_dst) &\
                ~df['ip_src'].str.contains(".".join(top_ip_dst.split('.')[0:3]),na=False) &\
                 (df['ip_proto'] == top_ip_proto)].groupby(['sport','dport']).size().divide(total_packets/100).sort_values(ascending=False).reset_index()
ports_pairs.head()

Unnamed: 0,sport,dport,0
0,53757,80,52.89899
1,50158,80,47.747475
2,51711,80,0.040404


In [14]:
http_data = df[(df['ip_dst']==top_ip_dst) &\
                ~df['ip_src'].str.contains(".".join(top_ip_dst.split('.')[0:3]),na=False) &\
                   (df['ip_proto'] == top_ip_proto)]['http_data'].value_counts().divide(total_packets/100)
http_data.head()

GET    0.010101
Name: http_data, dtype: float64

In [15]:
percent_tcp_flags = df[(df['ip_dst']==top_ip_dst) &\
                ~df['ip_src'].str.contains(".".join(top_ip_dst.split('.')[0:3]),na=False) &\
                   (df['ip_proto'] == top_ip_proto)]['tcp_flag'].value_counts().divide(total_packets/100)
percent_tcp_flags.head()

....A...    100.656566
.S......      0.010101
F...A...      0.010101
...PA...      0.010101
Name: tcp_flag, dtype: float64

In [16]:
percent_fragments = df[(df['ip_dst']==top_ip_dst) &\
                ~df['ip_src'].str.contains(".".join(top_ip_dst.split('.')[0:3]),na=False) &\
                   (df['ip_proto'] == top_ip_proto)]['fragments'].value_counts().divide(total_packets/100)
percent_fragments.head()

0.0    100.676768
Name: fragments, dtype: float64

## Unique IPs

In [17]:
total_src_ips = len(df[(df['ip_dst']==top_ip_dst) &\
                ~df['ip_src'].str.contains(".".join(top_ip_dst.split('.')[0:3]),na=False) &\
                   (df['ip_proto'] == top_ip_proto)]['ip_src'].unique())

total_src_ips

3

## TTL variations

In [18]:
ttl_variations = df[(df['ip_dst']==top_ip_dst) &\
                ~df['ip_src'].str.contains(".".join(top_ip_dst.split('.')[0:3]),na=False) &\
                   (df['ip_proto'] == top_ip_proto)]\
.groupby(['ip_src'])['ip_ttl'].agg(np.ptp).value_counts()*(100/total_src_ips)

ttl_variations 

0    99
Name: ip_ttl, dtype: int64

## IP fragments

In [19]:
percent_fragments = df[(df['ip_dst']==top_ip_dst) &\
                ~df['ip_src'].str.contains(".".join(top_ip_dst.split('.')[0:3]),na=False) &\
                   (df['ip_proto'] == top_ip_proto)]['fragments'].value_counts().divide(total_packets/100)
percent_fragments.head()

0.0    100.676768
Name: fragments, dtype: float64

## Additional information

In [20]:
df_port_name = pd.read_csv('data/port_name.txt',delimiter=",", names=['port_num','port_name'])
df_ip_proto_name = pd.read_csv('data/ip_proto_name.txt',delimiter=",", names=['proto_num','proto_name'])

## Functions for enriching the actual data with additional information

In [21]:
def get_ip_proto_name(ip_proto_number):
    try:
        return df_ip_proto_name[df_ip_proto_name['proto_num']==ip_proto_number]['proto_name'].values[0]
    except:
        return str(ip_proto_number)
    
def get_port_name(port_number):
    try:
        return df_port_name[df_port_name['port_num']==port_number]['port_name'].values[0]
    except:
        return "Port "+str(port_number)
    
def get_tcp_flag_name(tcp_flags_str):
    tcp_flags=""
    try:
        tcp_flags += ("FIN" if (tcp_flags_str.find('F') != -1) else next) 
    except:
        next
    try:
        tcp_flags += ("SYN" if (tcp_flags_str.find('S')!= -1) else next) 
    except:
        next
        
    try:
        tcp_flags += ("RST" if tcp_flags_str.find('R') != -1 else next)
    except:
        next
        
    try:
        tcp_flags += ("PUSH" if tcp_flags_str.find('P') != -1 else next) 
    except:
        next
        
    try:
        tcp_flags += ("ACK" if tcp_flags_str.find('A') != -1 else next)
    except:
        next
        
    try:
        tcp_flags += ("URG" if tcp_flags_str.find('U') != -1 else next) 
    except:
        next
        
    try:
        tcp_flags += ("ECE" if tcp_flags_str.find('E') != -1 else next)
    except:
        next
        
    try:
        tcp_flags += ("CWR" if tcp_flags_str.find('C') != -1 else next)
    except:
        next
   
        
    return tcp_flags            

## DDoS Attack Labeling Approach

In [22]:
percent_threshold=70
ttl_variation_threshold=2

if (top_ip_proto != 6) and (top_ip_proto != 17):
    attack_label = str(top_ip_proto)+" attack"

elif top_ip_proto == 6:
    ##THE Only exception in which the attack is labelled accordingly to the destination port
    if len(http_data)>0 and (percent_dst_ports.values[0]>percent_threshold) and ((percent_dst_ports.index[0]==80) or (percent_dst_ports.index[0]==443)):
        attack_label = get_port_name(percent_dst_ports.index[0]) +" "+http_data.index[0]
      
    elif (percent_src_ports.values[0]>percent_threshold) or (percent_dst_ports.values[0]>percent_threshold): 
         
        if (percent_tcp_flags.values[0]>percent_threshold):
            attack_label = get_ip_proto_name(top_ip_proto)+" "+ get_tcp_flag_name(percent_tcp_flags.index[0])    
        else:
            attack_label = get_ip_proto_name(top_ip_proto)+" "+get_port_name(percent_src_ports.index[0])
    else:
        attack_label = "Several ports (ex. "+get_port_name(percent_src_ports.index[0])+" and "+get_port_name(percent_src_ports.index[1])+") "+get_ip_proto_name(top_ip_proto)

### UDP case
elif ports_pairs[0][0] > percent_threshold:
    attack_label = get_port_name(ports_pairs['sport'][0])
else:
    attack_label = get_ip_proto_name(top_ip_proto)

### IP fragmentation    
if (percent_fragments.values[0] > percent_threshold) and (percent_fragments.index[0]==1):
    attack_label = attack_label + " IP fragmentation"

### IP spoofing
if (ttl_variations[::-1].index[0] > 5) and (ttl_variations[::-1].values[0] > ttl_variation_threshold):
    attack_label= attack_label+" spoofed"
else:
    
### Reflection and Amplification    
    if len(percent_src_ports)==1:
        attack_label= attack_label+" reflection/amplification"
    
### Adding the word attack to the end of the attack label
attack_label= attack_label+" attack"

In [23]:
print(attack_label)

HTTP GET attack
