# Consistent Labels

The purpose of this notebook is to create a dictionary of attack labels to numbers that
is consistent across all of the notebooks. The attacks are labeled by most-frequent to least-frequent
based on the frequency in the training set.  In addition, I create lists of the most frequent values 
of the service, protocol, and TCP state labels. 

The consistent labels and lists are output to a fle called consistent_labels.py to be
imported by other notebooks.

In [31]:
import pandas as pd
import numpy as np

from pathlib import Path
import pickle

In [2]:
%load_ext sql
%config SqlMagic.autopandas = True
%sql postgres://localhost/nb15

'Connected: @nb15'

In [7]:
X_train = %sql select * from full_split where train_set = True;

 * postgres://localhost/nb15
1524031 rows affected.


In [9]:
targets = ['label', 'attack_cat']
Y_train = X_train[targets]
X_train = X_train.drop(columns=targets)

In [14]:
attack_cats = Y_train.attack_cat.value_counts()
attack_cats_labeled = list(enumerate(attack_cats.index))

In [15]:
attack_cats

normal            1331257
generic            129289
exploits            26715
fuzzers             14548
dos                  9812
reconnaissance       8393
analysis             1607
backdoors            1398
shellcode             907
worms                 105
Name: attack_cat, dtype: int64

In [16]:
attack_cats_labeled

[(0, 'normal'),
 (1, 'generic'),
 (2, 'exploits'),
 (3, 'fuzzers'),
 (4, 'dos'),
 (5, 'reconnaissance'),
 (6, 'analysis'),
 (7, 'backdoors'),
 (8, 'shellcode'),
 (9, 'worms')]

In [17]:
attack_cat_encoder = dict()
for i,name in attack_cats_labeled:
    attack_cat_encoder[name] = i 

In [19]:
X_train.state.value_counts().head(10)

FIN    887217
CON    336184
INT    294381
REQ      5477
RST       319
ECO       208
CLO       106
URH        67
ACC        27
PAR        14
Name: state, dtype: int64

In [24]:
states = list(X_train.state.value_counts().head(4).index)
states

['FIN', 'CON', 'INT', 'REQ']

In [22]:
X_train.service.value_counts().head(10)

-           746998
dns         469203
http        124258
ftp-data     75436
smtp         49213
ftp          29355
ssh          28340
pop3           920
dhcp           110
ssl             90
Name: service, dtype: int64

In [25]:
services = list(X_train.state.value_counts().head(7).index)
services

['FIN', 'CON', 'INT', 'REQ', 'RST', 'ECO', 'CLO']

In [26]:
X_train.proto.value_counts().head(10)

tcp     897031
udp     594259
unas      9736
arp       6019
ospf      4676
sctp       907
icmp       320
any        258
gre        193
rsvp       173
Name: proto, dtype: int64

In [29]:
protos = list(X_train.proto.value_counts().head(5).index)
protos

['tcp', 'udp', 'unas', 'arp', 'ospf']

In [41]:
output = Path('consistent_labels.py')
with open(output, 'w') as fp:
    fp.write(f"""
import pandas as pd    

def get_attack_labels():
    return {attack_cat_encoder}
    
def get_common_services():
    return {services}
    
def get_common_protos():
    return {protos}

def get_common_states():
    return {states}
    """
            )

In [42]:
!cat consistent_labels.py


import pandas as pd    

def get_attack_labels():
    return {'normal': 0, 'generic': 1, 'exploits': 2, 'fuzzers': 3, 'dos': 4, 'reconnaissance': 5, 'analysis': 6, 'backdoors': 7, 'shellcode': 8, 'worms': 9}
    
def get_common_services():
    return ['FIN', 'CON', 'INT', 'REQ', 'RST', 'ECO', 'CLO']
    
def get_common_protos():
    return ['tcp', 'udp', 'unas', 'arp', 'ospf']

def get_common_states():
    return ['FIN', 'CON', 'INT', 'REQ']
    