# EDA in Postgres for the full dataset


In [2]:
import pandas as pd

In [3]:
%load_ext sql

In [4]:
%load_ext watermark
%watermark -iv

pandas 0.24.0



In [5]:
%sql postgres://localhost/nb15

'Connected: @nb15'

In [6]:
%sql select count(*) from full_data;

 * postgres://localhost/nb15
1 rows affected.


count
2540044


In [22]:
%sql select * from full_data limit 5

 * postgres://localhost/nb15
5 rows affected.


index,srcip,sport,dstip,dsport,proto,state,dur,sbytes,dbytes,sttl,dttl,sloss,dloss,service,sload,dload,spkts,dpkts,swin,dwin,stcpb,dtcpb,smeansz,dmeansz,trans_depth,res_bdy_len,sjit,djit,stime,ltime,sintpkt,dintpkt,tcprtt,synack,ackdat,is_sm_ips_ports,ct_state_ttl,ct_flw_http_mthd,is_ftp_login,ct_ftp_cmd,ct_srv_src,ct_srv_dst,ct_dst_ltm,ct_src_ltm,ct_src_dport_ltm,ct_dst_sport_ltm,ct_dst_src_ltm,attack_cat,label
0,ï»¿59.166.0.0,1390,149.171.126.6,53,udp,CON,0.001055,132,164,31,29,0,0,dns,500473.9375,621800.9375,2,2,0,0,0,0,66,82,0,0,0.0,0.0,1421927414,1421927414,0.017,0.013,0.0,0.0,0.0,0,0,0,0,0,3,7,1,3,1,1,1,normal,0
1,59.166.0.0,33661,149.171.126.9,1024,udp,CON,0.036133,528,304,31,29,0,0,-,87676.08594,50480.17188,4,4,0,0,0,0,132,76,0,0,9.89101,10.682733,1421927414,1421927414,7.005,7.564333,0.0,0.0,0.0,0,0,0,0,0,2,4,2,3,1,1,2,normal,0
2,59.166.0.6,1464,149.171.126.7,53,udp,CON,0.001119,146,178,31,29,0,0,dns,521894.5313,636282.375,2,2,0,0,0,0,73,89,0,0,0.0,0.0,1421927414,1421927414,0.017,0.013,0.0,0.0,0.0,0,0,0,0,0,12,8,1,2,2,1,1,normal,0
3,59.166.0.5,3593,149.171.126.5,53,udp,CON,0.001209,132,164,31,29,0,0,dns,436724.5625,542597.1875,2,2,0,0,0,0,66,82,0,0,0.0,0.0,1421927414,1421927414,0.043,0.014,0.0,0.0,0.0,0,0,0,0,0,6,9,1,1,1,1,1,normal,0
4,59.166.0.3,49664,149.171.126.0,53,udp,CON,0.001169,146,178,31,29,0,0,dns,499572.25,609067.5625,2,2,0,0,0,0,73,89,0,0,0.0,0.0,1421927414,1421927414,0.005,0.003,0.0,0.0,0.0,0,0,0,0,0,7,9,1,1,1,1,1,normal,0


In [23]:
# This table needs to match the one in Eventlist.ipynb
%sql select attack_cat, count(*) from full_data group by attack_cat;

 * postgres://localhost/nb15
10 rows affected.


attack_cat,count
analysis,2677
backdoors,2329
dos,16353
exploits,44525
fuzzers,24246
generic,215481
normal,2218761
reconnaissance,13987
shellcode,1511
worms,174


So I switched from the training dataset to the full dataset because I noticed 
in the training set that there are a number of protocols that are either entirely
attack or entirely normal (i.e. only need to look at one column to classify 
correctly). Looks like the full dataset has the same problem. 

In [9]:
%%sql 
SELECT proto, label, count(*) 
FROM full_data 
GROUP BY proto, label 
ORDER BY proto, label;

 * postgres://localhost/nb15
138 rows affected.


proto,label,count
3pc,1,137
a/n,1,137
aes-sp3-d,1,137
any,1,411
argus,1,137
aris,1,137
arp,0,10064
ax.25,1,137
bbn-rcc,1,137
bna,1,137


In [10]:
%%sql
WITH only_bad_proto AS (
SELECT proto, COUNT(proto) as proto_cnt
FROM full_data 
WHERE LABEL = 1 
AND proto NOT IN (SELECT DISTINCT(proto) FROM full_data WHERE LABEL = 0)
GROUP BY proto
)
SELECT SUM(proto_cnt)
FROM only_bad_proto
;

 * postgres://localhost/nb15
1 rows affected.


sum
36071


In [11]:
%%sql 
SELECT COUNT(label)
FROM full_data
WHERE LABEL = 1

 * postgres://localhost/nb15
1 rows affected.


count
321283


In [12]:
36071/321283

0.11227173551043784

So about 11 percent of attacks are identifiable as attacks by protocol only

In [13]:
%%sql
WITH only_good_proto AS (
     SELECT 
           proto, COUNT(proto) as proto_cnt
     FROM 
           full_data 
     WHERE 
             LABEL = 0 
         AND proto NOT IN (SELECT DISTINCT(proto) FROM full_data WHERE LABEL = 1)
     GROUP BY 
         proto
)
SELECT SUM(proto_cnt) 
FROM only_good_proto

 * postgres://localhost/nb15
1 rows affected.


sum
10669


In [14]:
%%sql
WITH only_good_proto AS (
      SELECT 
             proto, COUNT(proto) as proto_cnt
      FROM 
             full_data 
      WHERE 
             LABEL = 0 
         AND proto NOT IN 
             (SELECT DISTINCT(proto) FROM full_data WHERE LABEL = 1)
      GROUP BY proto
)
SELECT *
FROM only_good_proto
ORDER BY proto
LIMIT 10
;

 * postgres://localhost/nb15
6 rows affected.


proto,proto_cnt
arp,10064
esp,2
icmp,524
igmp,64
rtp,7
udt,8


There are also some that are only normal traffic.

In [14]:
%%sql
WITH
attack_tbl AS (
    SELECT * 
    FROM full_data
    WHERE LABEL = 1
),
only_bad_proto AS (
    SELECT attack_cat, COUNT(attack_cat) AS proto_leak_cnt
    FROM attack_tbl
    WHERE proto NOT IN (SELECT DISTINCT(proto) FROM full_data WHERE LABEL = 0)
    GROUP BY attack_cat
),
attack_sums AS (
    SELECT attack_cat, COUNT(attack_cat) as attack_cnt
    FROM attack_tbl
    GROUP BY attack_cat
)
SELECT attack_sums.attack_cat, attack_cnt, proto_leak_cnt 
FROM only_bad_proto 
INNER JOIN attack_sums 
        ON only_bad_proto.attack_cat = attack_sums.attack_cat


 * postgres://localhost/nb15
7 rows affected.


attack_cat,attack_cnt,proto_leak_cnt
dos,16353,11546
analysis,2677,1980
backdoors,2329,1810
generic,215481,1609
exploits,44525,14704
reconnaissance,13987,1903
fuzzers,24246,2519


OK -- there are attacks of these types that cannot be detected by protocol alone. 


In [19]:
%%sql
WITH attack_proto AS (
    SELECT DISTINCT(proto)
    FROM full_data
    WHERE proto NOT IN (SELECT DISTINCT(proto) FROM full_data WHERE label = 0)
)
SELECT 
     attack_cat, proto, count(label) num_rows
FROM  
     full_data 
WHERE  
        proto in (SELECT proto FROM attack_proto)
    AND label = 1
GROUP BY 
     attack_cat, proto
ORDER BY 
    proto, attack_cat
;

 * postgres://localhost/nb15
882 rows affected.


attack_cat,proto,num_rows
analysis,3pc,8
backdoors,3pc,7
dos,3pc,44
exploits,3pc,55
fuzzers,3pc,10
generic,3pc,6
reconnaissance,3pc,7
analysis,a/n,8
backdoors,a/n,7
dos,a/n,44


OK, so the protocol tells the algorithm that it's an attack, but that's not
enough to figure out the type of attack that was launched. So, there is still an interesting problem to solve.

The exploit-times-protocol counts are really low given the fact that there are two million rows in the training set. 

Need to think about how to stratify/resample my training set for the best strategy for dealing with imbalances.

I think I will also group the 'small item count' rows into bigger groups to prevent overfitting. 

In [21]:
%%sql 
SELECT service, label, count(*) 
FROM full_data 
GROUP BY service, label;

 * postgres://localhost/nb15
24 rows affected.


service,label,count
-,0,1166518
-,1,79877
dhcp,1,172
dns,0,571037
dns,1,210631
ftp,0,46075
ftp,1,3015
ftp-data,0,123893
ftp-data,1,1890
http,0,187426


No normal dhcp or ssl traffic, but the attacks seem to use the same services as the normal traffic otherwise. 