# Train Test Split 

In [1]:
import pandas as pd

In [2]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [3]:
# Record the versions of the imported packages so the run can be reproduced.
%load_ext watermark
%watermark -iv

pandas 0.24.0



In [4]:
# Load the extension that provides the %sql / %%sql magics used below.
%load_ext sql
# Have query results returned as pandas DataFrames.
%config SqlMagic.autopandas = True

In [5]:
%sql postgres://localhost/nb15

'Connected: @nb15'

In [6]:
%%sql
-- Bookkeeping table for the train/test split. It holds one row per full_data
-- row, and every row starts in the training set (train_set defaults to TRUE).
CREATE TABLE shuffle_split  (
    -- UNIQUE keeps the mapping one-to-one, so an accidental second INSERT
    -- fails loudly instead of duplicating rows in the join view. It also
    -- indexes the column that the later UPDATE and JOIN look rows up by.
    link_index INTEGER UNIQUE,
    -- NOT NULL guarantees every row is on exactly one side of the split.
    train_set BOOLEAN NOT NULL DEFAULT TRUE,
    FOREIGN KEY (link_index) REFERENCES full_data (index)
);

 * postgres://localhost/nb15
Done.


In [7]:
# put everything in the train set (train_set defaults to TRUE), then take
# 40% of each attack category out for the test holdout

In [8]:
%%sql
-- Register every full_data row in shuffle_split. Only link_index is supplied,
-- so train_set takes its default value. The NOT EXISTS guard skips rows that
-- are already registered, which makes this cell safe to run more than once.
INSERT INTO 
        shuffle_split(link_index) 
SELECT 
        index 
FROM 
        full_data
WHERE
        NOT EXISTS (
            SELECT 1
            FROM shuffle_split
            WHERE shuffle_split.link_index = full_data.index
        );

 * postgres://localhost/nb15
2540044 rows affected.


In [9]:
attack_cats = %sql select distinct(attack_cat) cats from full_data;
attack_cats = list(attack_cats.cats)

 * postgres://localhost/nb15
10 rows affected.


In [10]:
# Show the category names that the split loop iterates over.
attack_cats

['shellcode',
 'worms',
 'normal',
 'backdoors',
 'dos',
 'generic',
 'exploits',
 'fuzzers',
 'analysis',
 'reconnaissance']

In [11]:
# Fraction of each attack category's rows to hold out as test data.
test_size = 0.4

In [12]:
for cat in attack_cats:
    catrows = %sql select count(attack_cat) nrows from full_data where attack_cat = '{cat}'
    testrows = int(catrows.nrows * test_size)
    query_string = f"""
WITH
forty_percent AS (
    SELECT index 
    FROM full_data
    WHERE attack_cat = '{cat}'
    ORDER BY random()
    LIMIT {testrows}
)
UPDATE shuffle_split 
SET train_set = FALSE
WHERE link_index IN (SELECT index FROM forty_percent);
"""
    %sql $query_string

 * postgres://localhost/nb15
1 rows affected.
 * postgres://localhost/nb15
604 rows affected.
 * postgres://localhost/nb15
1 rows affected.
 * postgres://localhost/nb15
69 rows affected.
 * postgres://localhost/nb15
1 rows affected.
 * postgres://localhost/nb15
887504 rows affected.
 * postgres://localhost/nb15
1 rows affected.
 * postgres://localhost/nb15
931 rows affected.
 * postgres://localhost/nb15
1 rows affected.
 * postgres://localhost/nb15
6541 rows affected.
 * postgres://localhost/nb15
1 rows affected.
 * postgres://localhost/nb15
86192 rows affected.
 * postgres://localhost/nb15
1 rows affected.
 * postgres://localhost/nb15
17810 rows affected.
 * postgres://localhost/nb15
1 rows affected.
 * postgres://localhost/nb15
9698 rows affected.
 * postgres://localhost/nb15
1 rows affected.
 * postgres://localhost/nb15
1070 rows affected.
 * postgres://localhost/nb15
1 rows affected.
 * postgres://localhost/nb15
5594 rows affected.


In [13]:
%%sql
CREATE VIEW full_split 
AS 
    SELECT full_data.*, shuffle_split.train_set
    FROM
        full_data
        INNER JOIN shuffle_split
        ON full_data.index = shuffle_split.link_index;

 * postgres://localhost/nb15
Done.


In [14]:
%%sql
SELECT attack_cat, train_set, COUNT(train_set) 
FROM full_split
GROUP BY attack_cat, train_set
ORDER BY attack_cat, count

 * postgres://localhost/nb15
20 rows affected.


Unnamed: 0,attack_cat,train_set,count
0,analysis,False,1070
1,analysis,True,1607
2,backdoors,False,931
3,backdoors,True,1398
4,dos,False,6541
5,dos,True,9812
6,exploits,False,17810
7,exploits,True,26715
8,fuzzers,False,9698
9,fuzzers,True,14548


In [15]:
%%sql
WITH split_query AS (
    SELECT 
        attack_cat, 
        train_set,
        proto,
        cast(count(*) as float) split_count
    FROM
        full_split
    WHERE
        proto in ('tcp', 'udp')
    GROUP BY 
        attack_cat, train_set, proto
),
total_query AS (
    SELECT
        attack_cat,
        proto,
        cast(count(*) as float) proto_count
    FROM
        full_split
    WHERE
        proto in ('tcp', 'udp')
    GROUP BY
        attack_cat, proto
)
SELECT 
    split_query.attack_cat, 
    split_query.proto, 
    split_count,
    proto_count,
    split_query.train_set, 
    split_count/proto_count percent
FROM 
    split_query
    LEFT JOIN total_query
        ON
                split_query.attack_cat = total_query.attack_cat
        AND
                split_query.proto = total_query.proto
ORDER BY 
    attack_cat, proto, train_set

 * postgres://localhost/nb15
38 rows affected.


Unnamed: 0,attack_cat,proto,split_count,proto_count,train_set,percent
0,analysis,tcp,237.0,622.0,False,0.381029
1,analysis,tcp,385.0,622.0,True,0.618971
2,backdoors,tcp,141.0,323.0,False,0.436533
3,backdoors,tcp,182.0,323.0,True,0.563467
4,backdoors,udp,15.0,34.0,False,0.441176
5,backdoors,udp,19.0,34.0,True,0.558824
6,dos,tcp,1350.0,3336.0,False,0.404676
7,dos,tcp,1986.0,3336.0,True,0.595324
8,dos,udp,231.0,527.0,False,0.43833
9,dos,udp,296.0,527.0,True,0.56167


The split was stratified only on `attack_cat`, not on protocol (`proto`), so the per-protocol ratios are not exactly 60/40, but they are all close enough.