# Read full dataset into Pandas and save as pickle

In [1]:
import csv
import os
from itertools import islice

import numpy as np
import pandas as pd

In [2]:
# Print the versions of the imported packages (uses the `watermark` IPython extension)
%load_ext watermark
%watermark -iv

pandas 0.24.0
numpy  1.15.4



In [3]:
# Column names for the dataset, in order, as listed in the features file

In [4]:
# Column names of the full dataset, in file order (49 entries).
names = [
    'srcip', 'sport', 'dstip', 'dsport',
    'proto', 'state', 'dur', 'sbytes',
    'dbytes', 'sttl', 'dttl', 'sloss',
    'dloss', 'service', 'sload', 'dload',
    'spkts', 'dpkts', 'swin', 'dwin',
    'stcpb', 'dtcpb', 'smeansz', 'dmeansz',
    'trans_depth', 'res_bdy_len', 'sjit', 'djit',
    'stime', 'ltime', 'sintpkt', 'dintpkt',
    'tcprtt', 'synack', 'ackdat', 'is_sm_ips_ports',
    'ct_state_ttl', 'ct_flw_http_mthd', 'is_ftp_login', 'ct_ftp_cmd',
    'ct_srv_src', 'ct_srv_dst', 'ct_dst_ltm', 'ct_src_ltm',
    'ct_src_dport_ltm', 'ct_dst_sport_ltm', 'ct_dst_src_ltm', 'attack_cat',
    'label',
]

In [5]:
# Sanity check: how many column names were listed
len(names)

49

## Read the data in as raw strings

Every column is loaded as a plain string, mainly because `pandas.read_csv` kept failing on the full file.

In [6]:
# Read every column as a raw string with the csv module, because
# pandas.read_csv kept failing on this file.
#
# The parsed frame is cached as a pickle. The slow CSV pass only runs when
# the cache is missing; delete the pickle to force a rebuild.
FULL_CSV = '../data/full.csv'
STRING_PICKLE = '../data/full_string_df'
CHUNKSIZE = 10000  # rows gathered into each intermediate DataFrame

if os.path.exists(STRING_PICKLE):
    full_df = pd.read_pickle(STRING_PICKLE)
else:
    chunks = []

    # newline='' is what the csv module asks for when it is handed a file object
    with open(FULL_CSV, 'r', newline='') as fp:
        reader = csv.DictReader(fp)
        while True:
            # Pull up to CHUNKSIZE rows at a time and build one frame per
            # chunk instead of one single-row frame per CSV line.
            rows = list(islice(reader, CHUNKSIZE))
            if not rows:
                break
            chunks.append(pd.DataFrame(rows, columns=reader.fieldnames))

    # Concatenate once at the end; appending to full_df chunk by chunk
    # re-copies everything accumulated so far on every append.
    if chunks:
        full_df = pd.concat(chunks, axis=0, ignore_index=True)
    else:
        # No data rows at all: fall back to an empty frame with the expected columns
        full_df = pd.DataFrame(columns=names)

    pd.to_pickle(full_df, STRING_PICKLE)

In [7]:
# (rows, columns) of the raw string frame
full_df.shape

(2540044, 49)

## Convert the integer columns

In [8]:
# Columns to be converted to integers.
integers = [
    'sport', 'dsport', 'sbytes', 'dbytes', 'sttl', 'dttl',
    'sloss', 'dloss', 'spkts', 'dpkts', 'swin', 'dwin',
    'stcpb', 'dtcpb', 'smeansz', 'dmeansz',
    'trans_depth', 'res_bdy_len', 'stime', 'ltime',
    'is_sm_ips_ports', 'ct_state_ttl', 'ct_flw_http_mthd',
    'is_ftp_login', 'ct_ftp_cmd', 'ct_srv_src', 'ct_srv_dst',
    'ct_dst_ltm', 'ct_src_ltm', 'ct_src_dport_ltm',
    'ct_dst_sport_ltm', 'ct_dst_src_ltm', 'label',
]

# Columns that stay textual.
strings = ['srcip', 'dstip', 'proto', 'state', 'service', 'attack_cat']

# Columns to be converted to floats.
floats = [
    'dur', 'sload', 'dload', 'sjit', 'djit',
    'sintpkt', 'dintpkt', 'tcprtt', 'synack', 'ackdat',
]

# Raw values that are treated as "missing" when converting integers.
nulls = ['-', ' ', '']

In [9]:
# Peek at the first rows before any type conversion is applied
full_df.head()

Unnamed: 0,srcip,sport,dstip,dsport,proto,state,dur,sbytes,dbytes,sttl,...,ct_ftp_cmd,ct_srv_src,ct_srv_dst,ct_dst_ltm,ct_src_ltm,ct_src_dport_ltm,ct_dst_sport_ltm,ct_dst_src_ltm,attack_cat,label
0,﻿59.166.0.0,1390,149.171.126.6,53,udp,CON,0.001055,132,164,31,...,0,3,7,1,3,1,1,1,,0
1,59.166.0.0,33661,149.171.126.9,1024,udp,CON,0.036133,528,304,31,...,0,2,4,2,3,1,1,2,,0
2,59.166.0.6,1464,149.171.126.7,53,udp,CON,0.001119,146,178,31,...,0,12,8,1,2,2,1,1,,0
3,59.166.0.5,3593,149.171.126.5,53,udp,CON,0.001209,132,164,31,...,0,6,9,1,1,1,1,1,,0
4,59.166.0.3,49664,149.171.126.0,53,udp,CON,0.001169,146,178,31,...,0,7,9,1,1,1,1,1,,0


In [10]:
# Integer columns that contain none of the placeholder values in `nulls`
# can be cast directly.
clean_ints = [
    column for column in integers
    if not full_df[column].isin(nulls).any()
]

# np.int64 rather than np.int: the np.int alias is deprecated since NumPy 1.20
# and removed in 1.24, and it follows the platform's default integer width.
for column in clean_ints:
    full_df[column] = full_df[column].astype(np.int64)
    

In [11]:
# for the last few, need a special converter

def tolerant_convert_int(s):
    """Convert one raw cell value to an int, treating placeholders as 0.

    Parameters
    ----------
    s : str or int-like
        Raw value. Strings are parsed with base 0, so the prefix selects the
        base ('0x' hex, '0o' octal, '0b' binary, otherwise decimal).
        Values that are not strings are passed through ``int()`` unchanged,
        which lets the conversion cell be re-run on columns that were
        already converted.

    Returns
    -------
    int
        0 when the value (ignoring surrounding whitespace) is one of the
        placeholders in ``nulls``; otherwise the parsed integer.

    Raises
    ------
    ValueError
        If a string is neither a placeholder nor a valid base-0 integer
        literal (this includes decimals with leading zeros such as '007').
    """
    if not isinstance(s, str):
        # int(x, 0) only accepts strings; already-numeric values go straight through
        return int(s)

    text = s.strip()
    if s in nulls or text in nulls:
        return 0
    return int(text, 0)

# Integer columns that were not converted directly, kept in their original order
dirty_ints = list(filter(lambda column: column not in clean_ints, integers))
dirty_ints

['sport', 'dsport', 'ct_flw_http_mthd', 'is_ftp_login', 'ct_ftp_cmd']

In [12]:
# Parse the remaining integer columns value by value with the tolerant converter
for dirty_column in dirty_ints:
    full_df[dirty_column] = full_df[dirty_column].map(tolerant_convert_int)

In [13]:
# Check the dtypes of the columns that needed the tolerant converter
full_df[dirty_ints].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2540044 entries, 0 to 2540043
Data columns (total 5 columns):
sport               int64
dsport              int64
ct_flw_http_mthd    int64
is_ftp_login        int64
ct_ftp_cmd          int64
dtypes: int64(5)
memory usage: 116.3 MB


## Pickle file checkpoint 

In [14]:
# Checkpoint: persist the frame after the integer conversion
full_df.to_pickle('../data/converted_ints')

In [15]:
# Reload the integer-converted checkpoint (presumably so work can resume from here
# without redoing the steps above)
full_df = pd.read_pickle('../data/converted_ints')

## Convert the string columns

In [16]:
# Clean the text columns:
#   * drop every space character
#   * drop the byte-order mark (U+FEFF) that is attached to the very first
#     srcip value; it shows up as an invisible character in the head() output
# regex=False makes both replacements literal, independent of the pandas default.
for column in strings:
    full_df[column] = (
        full_df[column]
        .str.replace('\ufeff', '', regex=False)
        .str.replace(' ', '', regex=False)
    )

In [17]:
# Recall which columns are kept as text
strings

['srcip', 'dstip', 'proto', 'state', 'service', 'attack_cat']

In [18]:
# Skip the first two entries (srcip, dstip); the remaining text columns are the
# ones given the category dtype in the following cells
categories = strings[2:]
categories

['proto', 'state', 'service', 'attack_cat']

In [19]:
# attack_cat is left out here: its labels are cleaned up first and it is cast later
for column in ('proto', 'state', 'service'):
    full_df[column] = full_df[column].astype('category')

In [20]:
# Inspect the raw attack category labels and how often each occurs
full_df['attack_cat'].value_counts()

                  2218761
Generic            215481
Exploits            44525
Fuzzers             24246
DoS                 16353
Reconnaissance      13987
Analysis             2677
Backdoor             1795
Shellcode            1511
Backdoors             534
Worms                 174
Name: attack_cat, dtype: int64

In [21]:
# The value counts above show two problems: the normal category is an empty
# string, and the backdoor attack is spelled both 'Backdoor' and 'Backdoors'.
is_label_zero = full_df['label'] == 0
full_df.loc[is_label_zero, 'attack_cat'] = 'normal'

is_backdoor_singular = full_df['attack_cat'] == 'Backdoor'
full_df.loc[is_backdoor_singular, 'attack_cat'] = 'Backdoors'

In [22]:
# Lower-case the category labels, then re-check the counts
full_df['attack_cat'] = full_df['attack_cat'].str.lower()
full_df['attack_cat'].value_counts()

normal            2218761
generic            215481
exploits            44525
fuzzers             24246
dos                 16353
reconnaissance      13987
analysis             2677
backdoors            2329
shellcode            1511
worms                 174
Name: attack_cat, dtype: int64

In [23]:
# With the labels cleaned up, store attack_cat with the category dtype
full_df['attack_cat'] = full_df['attack_cat'].astype('category')

## Pickle file checkpoint

In [24]:
# Checkpoint: persist the frame after the string/category conversion
full_df.to_pickle('../data/converted_strings')

In [25]:
# Reload the string-converted checkpoint (presumably so work can resume from here
# without redoing the steps above)
full_df = pd.read_pickle('../data/converted_strings')

## Convert float columns

In [26]:
# Only the float columns should remain to be converted
floats

['dur',
 'sload',
 'dload',
 'sjit',
 'djit',
 'sintpkt',
 'dintpkt',
 'tcprtt',
 'synack',
 'ackdat']

In [27]:
# Let pd.to_numeric parse each remaining string column into a numeric dtype
for float_column in floats:
    full_df[float_column] = pd.to_numeric(full_df[float_column])

In [28]:
# Persist the fully converted frame
full_df.to_pickle('../data/full_pandas_final')

In [29]:
# Dtype summary of the converted frame
full_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2540044 entries, 0 to 2540043
Data columns (total 49 columns):
srcip               object
sport               int64
dstip               object
dsport              int64
proto               category
state               category
dur                 float64
sbytes              int64
dbytes              int64
sttl                int64
dttl                int64
sloss               int64
dloss               int64
service             category
sload               float64
dload               float64
spkts               int64
dpkts               int64
swin                int64
dwin                int64
stcpb               int64
dtcpb               int64
smeansz             int64
dmeansz             int64
trans_depth         int64
res_bdy_len         int64
sjit                float64
djit                float64
stime               int64
ltime               int64
sintpkt             float64
dintpkt             float64
tcprtt              float64
sy