In [1]:
input_file='data/prod-anon-040.pcap'

<h1 align='center'> DDoS Attack (Convert, Filter and) Labelling </h1>

<h2 align='center'> 
<div style="width:400px;padding:10px;border:1px dotted black;">
Goal: to label DDoS attacks.
</div>
</h2>

<img align='center' src="figs/summary.png" width="400px"/>

- The **input** is a packet-based network trace file that contains an attack, usually with the extension .pcap or pcapng.
- The **output** is a string with a label of the attack found in the input file.

## Research Question (RQ) definitions:
- **RQ1: How to efficiently read and convert packet-based network traces containing DDoS attacks for facilitating futher analysis?** DDoS attack trace contains a very large amount of records making very expensive computationally to read and analyse them. These records are nested, i.e., packets can have different set of information (network fields), which makes even more challenging to analyse them in a single manner. There are different tools and libraries for reading packet-based network traces 

- **RQ2: How to efficiently identify the main characteristics of a DDoS attack?** From a practical point of view the definition of DDoS attack is a set of network records with a same characteristics, which is the majority of the network traffic compared to the remaining traffic.

- **RQ3: How the characteristics of DDoS attacks can be used for labeling them?** There are many taxonomies to classify and label DDoS attacks. Our approach to address this question is based on the practical experience of network operators and network security specialists. In this approach they label attacks based on their observations on the attacks.

<h2 align='center'>===============================================================<br> RQ1: How to efficiently read and convert packet-based network traces containing DDoS attacks for facilitating futher analysis?</h2>

We analyse the performance of four main tools and libraries for read packet-based network traces: 
- [tcpdump (see our analysis)](analysis_tcpdump_tshark_scapy.ipynb)
- [tshark (see our analysis)](analysis_tcpdump_tshark_scapy.ipynb)
- [scapy (see our analysis)](analysis_tcpdump_tshark_scapy.ipynb)
- [dpkt (see our analysis)](additional_analysis/analysis_dpkt.ipynb)

**In the end we decided for dpkt that has the second best performance but it is easier to manipulate the output for extension purposes.**

## Converting pcap (using dpkt)

In [2]:
output_file=input_file.split('.')[0]+".txt"

In [3]:
import argparse
import dpkt
import socket
import os

In [4]:
outputfile = open(output_file,'w')

inputfile = open(input_file)
pcapfile = dpkt.pcap.Reader(inputfile)

for ts, buf in pcapfile:
    eth = dpkt.ethernet.Ethernet(buf)

    #FILTERING ONLY FOR IPv4 instead of packets ARP or IPv6
    if eth.type == dpkt.ethernet.ETH_TYPE_IP:
        ip = eth.data #Loading the content of the ethernet into a variable 'ip'
        
        timestamp = ts #1
        ip_ttl = ip.ttl #2
        
        ip_proto = ip.p #3
        sport = ""
        dport= ""
        tcp_flag =""
        http_request_method=""
        if (ip_proto != 6) and (ip_proto != 17): #It is not TCP or UDP
            continue
            
        ip_length = ip.len #4
        ip_src = socket.inet_ntoa(ip.src) #5
        ip_dst = socket.inet_ntoa(ip.dst) #6
        
        try: proto = ip.data #Loading the content of the 'ip' into a variable 'protocol' that can be for example ICMP, TCP, and UDP.
        except:
            continue
        
        sport = proto.sport #7
        dport = proto.dport #8


        if ip.p == 6 :
            try:
                tcp_flag += ("F" if (int( proto.flags & dpkt.tcp.TH_FIN ) != 0) else ".") #27
                tcp_flag += ("S" if (int( proto.flags & dpkt.tcp.TH_SYN ) != 0) else ".") #26
                tcp_flag += ("R" if (int( proto.flags & dpkt.tcp.TH_RST ) != 0) else ".") #25
                tcp_flag += ("P" if (int( proto.flags & dpkt.tcp.TH_PUSH) != 0) else ".") #24
                tcp_flag += ("A" if (int( proto.flags & dpkt.tcp.TH_ACK ) != 0) else ".") #23
                tcp_flag += ("U" if (int( proto.flags & dpkt.tcp.TH_URG ) != 0) else ".") #22
                tcp_flag += ("E" if (int( proto.flags & dpkt.tcp.TH_ECE ) != 0) else ".") #21
                tcp_flag += ("C" if (int( proto.flags & dpkt.tcp.TH_CWR ) != 0) else ".") #20
            except:
                print "EXCEPTION TCP FLAG"

            if (proto.dport == 80) or (proto.dport == 443):
                    if proto.data == '':
                        http_request_method=''
                    else:
                        try:
                            http_request_method = dpkt.http.Request(proto.data).method
                        except:
                            http_request_method = ''

            
        fragments = 1 if (int(ip.off & dpkt.ip.IP_MF)!= 0) else 0  #8 This flag is set to a 1 for all fragments except the last one            

        print >> outputfile, str(timestamp)+';'+\
        str(ip_ttl)+';'+\
        str(ip_proto)+';'+\
        str(ip_length)+';'+\
        str(ip_src)+';'+\
        str(ip_dst)+';'+\
        str(sport)+';'+\
        str(dport)+';'+\
        str(tcp_flag)+';'+\
        str(fragments)+';'+\
        str(http_request_method)

<h2>==========================================================<br>
Analysing converted pcap</h2>

In [5]:
import pandas as pd #more info at http://pandas.pydata.org/
import numpy as np #more info at http://www.numpy.org/

In [6]:
columns=['timestamp',\
         'ip_ttl',\
         'ip_proto',\
         'ip_length',\
         'ip_src',\
         'ip_dst',\
         'sport',\
         'dport',\
         'tcp_flag',\
         'fragments',\
         'http_data']

df = pd.read_csv(output_file,delimiter=";", names=columns)
len(df)

8926

In [7]:
top_ip_dst = df['ip_dst'].value_counts().index[0]
top_ip_dst

'191.190.43.172'

In [8]:
top_ip_proto = df[df['ip_dst']==top_ip_dst]['ip_proto'].value_counts().index[0]
top_ip_proto

6

In [9]:
total_packets = len(df[(df['ip_dst']==top_ip_dst) &\
                ~df['ip_src'].str.contains(".".join(top_ip_dst.split('.')[0:3]),na=False) &\
                       (df['ip_proto'] == top_ip_proto)])
total_packets

8925

In [10]:
percent_src_ports = df[(df['ip_dst']==top_ip_dst) &\
                ~df['ip_src'].str.contains(".".join(top_ip_dst.split('.')[0:3]),na=False) &\
                   (df['ip_proto'] == top_ip_proto)]['sport'].value_counts().divide(total_packets/100)
percent_src_ports.head()

80       18.662921
50902     1.516854
46961     0.921348
60117     0.898876
50473     0.898876
Name: sport, dtype: float64

In [11]:
percent_dst_ports = df[(df['ip_dst']==top_ip_dst) &\
                ~df['ip_src'].str.contains(".".join(top_ip_dst.split('.')[0:3]),na=False) &\
                   (df['ip_proto'] == top_ip_proto)]['dport'].value_counts().divide(total_packets/100)
percent_dst_ports.head()

443      42.022472
80       39.584270
50229     0.067416
29675     0.056180
86        0.056180
Name: dport, dtype: float64

In [12]:
ports_pairs = df[(df['ip_dst']==top_ip_dst) &\
                ~df['ip_src'].str.contains(".".join(top_ip_dst.split('.')[0:3]),na=False) &\
                 (df['ip_proto'] == top_ip_proto)].groupby(['sport','dport']).size().divide(total_packets/100).sort_values(ascending=False).reset_index()
ports_pairs.head()

Unnamed: 0,sport,dport,0
0,50902,443,1.516854
1,46961,80,0.921348
2,50473,80,0.898876
3,60117,443,0.898876
4,60116,443,0.730337


In [13]:
http_data = df[(df['ip_dst']==top_ip_dst) &\
                ~df['ip_src'].str.contains(".".join(top_ip_dst.split('.')[0:3]),na=False) &\
                   (df['ip_proto'] == top_ip_proto)]['http_data'].value_counts().divide(total_packets/100)
http_data.head()

GET     4.078652
POST    0.157303
HEAD    0.011236
Name: http_data, dtype: float64

In [14]:
percent_tcp_flags = df[(df['ip_dst']==top_ip_dst) &\
                ~df['ip_src'].str.contains(".".join(top_ip_dst.split('.')[0:3]),na=False) &\
                   (df['ip_proto'] == top_ip_proto)]['tcp_flag'].value_counts().divide(total_packets/100)
percent_tcp_flags.head()

....A...    51.191011
...PA...    16.235955
.S..A...    14.539326
.S......     5.921348
F...A...     5.067416
Name: tcp_flag, dtype: float64

In [15]:
percent_fragments = df[(df['ip_dst']==top_ip_dst) &\
                ~df['ip_src'].str.contains(".".join(top_ip_dst.split('.')[0:3]),na=False) &\
                   (df['ip_proto'] == top_ip_proto)]['fragments'].value_counts().divide(total_packets/100)
percent_fragments.head()

0    100.269663
1      0.011236
Name: fragments, dtype: float64

## TTL 

In [29]:
df.groupby(['ip_src'])['ip_ttl'].agg(np.ptp)

ip_src
119.247.94.164     0
162.104.89.223     0
162.74.22.204      1
162.76.189.133     0
162.84.54.140      0
163.214.107.140    0
166.58.174.207     0
166.60.191.118     0
166.64.214.197     1
166.66.222.221     0
166.96.107.156     0
166.98.119.228     0
167.218.22.198     0
167.226.110.157    0
170.254.106.207    0
171.100.55.157     0
171.104.41.142     0
178.84.255.196     1
178.90.235.213     0
178.90.251.163     0
179.102.42.238     0
179.120.253.204    0
179.84.217.155     0
182.108.110.119    0
182.108.158.164    0
182.208.238.237    0
182.210.174.214    0
182.210.219.206    1
182.210.86.190     0
182.212.219.140    0
                  ..
47.123.123.141     0
47.127.230.230     0
47.205.187.230     0
54.125.27.180      0
54.235.93.151      0
54.249.233.179     0
54.97.158.164      0
55.113.170.180     0
55.113.234.222     0
55.117.154.159     0
55.117.246.190     0
55.117.91.158      0
55.77.154.207      0
62.233.110.175     0
66.229.250.179     0
66.237.59.141      0
66.237

In [None]:
percent_fragments = df[(df['ip_dst']==top_ip_dst) &\
                ~df['ip_src'].str.contains(".".join(top_ip_dst.split('.')[0:3]),na=False) &\
                   (df['ip_proto'] == top_ip_proto)]['fragments'].value_counts().divide(total_packets/100)
percent_fragments.head()

## Additional information

In [16]:
df_port_name = pd.read_csv('data/port_name.txt',delimiter=",", names=['port_num','port_name'])
df_ip_proto_name = pd.read_csv('data/ip_proto_name.txt',delimiter=",", names=['proto_num','proto_name'])

## Functions for enriching the actual data with additional information

In [17]:
def get_ip_proto_name(ip_proto_number):
    try:
        return df_ip_proto_name[df_ip_proto_name['proto_num']==ip_proto_number]['proto_name'].values[0]
    except:
        return str(ip_proto_number)
    
def get_port_name(port_number):
    try:
        return df_port_name[df_port_name['port_num']==port_number]['port_name'].values[0]
    except:
        return "Port "+str(port_number)
    
def get_tcp_flag_name(tcp_flags_str):
    tcp_flags=""
    try:
        tcp_flags += ("FIN" if (tcp_flags_str.find('F') != -1) else next) 
    except:
        next
    try:
        tcp_flags += ("SYN" if (tcp_flags_str.find('S')!= -1) else next) 
    except:
        next
        
    try:
        tcp_flags += ("RST" if tcp_flags_str.find('R') != -1 else next)
    except:
        next
        
    try:
        tcp_flags += ("PUSH" if tcp_flags_str.find('P') != -1 else next) 
    except:
        next
        
    try:
        tcp_flags += ("ACK" if tcp_flags_str.find('A') != -1 else next)
    except:
        next
        
    try:
        tcp_flags += ("URG" if tcp_flags_str.find('U') != -1 else next) 
    except:
        next
        
    try:
        tcp_flags += ("ECE" if tcp_flags_str.find('E') != -1 else next)
    except:
        next
        
    try:
        tcp_flags += ("CWR" if tcp_flags_str.find('C') != -1 else next)
    except:
        next
   
        
    return tcp_flags            

## DDoS Attack Labeling Approach

In [18]:
percent_threshold=70

if (top_ip_proto != 6) and (top_ip_proto != 17):
    attack_label = str(top_ip_proto)+" attack"

elif top_ip_proto == 6:
    ##THE Only exception in which the attack is labelled accordingly to the destination port
    if len(http_data)>0 and (percent_dst_ports.values[0]>percent_threshold) and ((percent_dst_ports.index[0]==80) or (percent_dst_ports.index[0]==443)):
        attack_label = get_port_name(percent_dst_ports.index[0]) +" "+http_data.index[0]
      
    elif (percent_src_ports.values[0]>percent_threshold) or (percent_dst_ports.values[0]>percent_threshold): 
         
        if (percent_tcp_flags.values[0]>percent_threshold):
            attack_label = get_ip_proto_name(top_ip_proto)+" "+ get_tcp_flag_name(percent_tcp_flags.index[0])    
        else:
            attack_label = get_ip_proto_name(top_ip_proto)+" "+get_port_name(percent_src_ports.index[0])
    else:
        attack_label = "Several ports (ex. "+get_port_name(percent_src_ports.index[0])+" and "+get_port_name(percent_src_ports.index[1])+") "+get_ip_proto_name(top_ip_proto)

###UDP case
elif ports_pairs[0][0] > percent_threshold:
    attack_label = get_port_name(ports_pairs['sport'][0])
else:
    attack_label = get_ip_proto_name(top_ip_proto)+" "+get_ip_proto_name(top_ip_proto)

###IP fragmentation    
if (percent_fragments.values[0] > percent_threshold) and (percent_fragments.index[0]==1):
    attack_label = attack_label + " IP fragmentation"
    

### Adding the word attack to the end of the attack label
attack_label= attack_label+" attack"

In [19]:
print(attack_label)

Several ports (ex. HTTP and Port 50902) TCP attack
