In [1]:
import pandas as pd
from datetime import datetime, timedelta

## Data Load

In [2]:
data = pd.DataFrame(columns=["date", "time", "pro_id", "path", "sys_call", "event_id", "attack_cat", "attack_subcat", "label"])
for i in range(1,100):
    tmp=pd.read_csv("./host_log/"+str(i)+".csv", names=["date", "time", "pro_id", "path", "sys_call", "event_id", "attack_cat", "attack_subcat", "label"])
    data = data.append(tmp)

In [3]:
def re_index(df):
    tmp = df.reset_index()
    tmp = tmp.drop(tmp.columns[0], axis=1)
    return tmp

In [4]:
data = re_index(data)

## Data preprocessor

In [5]:
def make_datetime(string_date, string_time):
    date = string_date.split("/")
    time = string_time.split(":")
    return datetime(int(date[2]),int(date[1]),int(date[0]),int(time[0]),int(time[1]),int(time[2]))

In [6]:
date_time = []
for i in range(data.shape[0]):
    date_time.append(make_datetime(data.loc[i,data.columns[0]], data.loc[i,data.columns[1]]))

In [7]:
data.insert(0, "date_time", date_time)
data = data.drop("date",axis=1)
data = data.drop("time",axis=1)

In [8]:
data = data.sort_values(by=["date_time", "event_id"], axis=0)
data = re_index(data)
mapping_data = data

In [9]:
cat_mapping = {"normal":0, "Exploits":1, "Denial of Service":2, "Backdoors":3, "Generic":4, "Shellcode":5, "Worms":6, "Reconnaissance":7}
mapping_data["attack_cat"] = mapping_data["attack_cat"].map(cat_mapping)

In [10]:
mapping_data["attack_subcat"].value_counts()

normal                                          88791812
Office Document Batch                             276578
Browser                                           152319
 Clientside                                       102893
 IXIA Batch                                        79624
Clientside Microsoft Office Batch                  71920
 Clientside Microsoft Paint                        71869
All Batch                                          70712
Browser FTP Batch                                  48994
Linux Batch                                        44245
Microsoft Office Batch                             38646
Miscellaneous Batch                                35779
 Microsoft IIS Batch                               33509
Browser Batch                                      26935
 SMB Batch                                         23225
 Clientside Microsoft Office Batch                 21470
 Browser                                           17271
 NetBIOS/SMB Batch             

In [11]:
subcat_mapping = {"normal":0, "Office Document Batch":1, "Browser":2, " Clientside":3, " IXIA Batch":4, "Clientside Microsoft Office Batch":5, " Clientside Microsoft Paint":6, "All Batch":7, "Browser FTP Batch":8, "Linux Batch":9, "Microsoft Office Batch":10, "Miscellaneous Batch":11, " Microsoft IIS Batch":12, "Browser Batch":13, " SMB Batch":14, " Clientside Microsoft Office Batch":15, " Browser":16, " NetBIOS/SMB Batch":17, "All Batch":18, "Multiple OS Batch":19, "IIS Web Server":20, " Miscellaneous Batch":21, " FrontPage HTTP Batch":22, "DCERPC Batch":23, "Windows Explorer":24, " Web Application Cross-Site Scripting Batch":25, "Microsoft IIS":26, "All":27, "SMTP Batch":28, " LDAP":29, "HTTP":30, "Microsoft IIS Batch":31, " ICMP":32, " Hypervisor":33, "TCP":34, "Backup Appliance":35, "FTP":36, " SMTP":37, " Browser FTP Batch":38, "NNTP":39, "RDesktop":40, "WINS":41, " TFTP":42, "FrontPage HTTP Batch":43, " DNS":44, "SNMP ":45, "TFTP ":46, " POP3":47, " Webserver":48, " PPTP":49, " DCERPC":50, " MSSQL":51, "Microsoft IIS HTTP 200/200+A308969":52}
mapping_data["attack_subcat"] = mapping_data["attack_subcat"].map(subcat_mapping)

In [12]:
mapping_data["attack_subcat"].value_counts()

0.0     88791812
1.0       276578
2.0       152319
3.0       102893
4.0        79624
5.0        71920
6.0        71869
18.0       70712
9.0        44245
10.0       38646
11.0       35779
12.0       33509
13.0       26935
14.0       23225
15.0       21470
16.0       17271
20.0       12277
21.0       12141
22.0        9515
23.0        7880
24.0        7792
27.0        3761
28.0        3553
30.0        3129
32.0        2661
33.0        2604
34.0        2300
35.0        2158
36.0        2107
37.0        1638
38.0        1396
39.0        1378
41.0        1290
42.0        1100
43.0         917
44.0         781
47.0         732
48.0         701
50.0         654
51.0         607
52.0         258
Name: attack_subcat, dtype: int64

In [13]:
mapping_data.to_csv("./NGIDS_host_log.csv")

In [14]:
mapping_data.head(10)

Unnamed: 0,date_time,pro_id,path,sys_call,event_id,attack_cat,attack_subcat,label
0,2016-03-11 02:45:01,1951,/usr/lib/i386-linux-gnu/indicator-datetime/ind...,168,45350,0,0.0,0
1,2016-03-11 02:45:01,1966,/usr/lib/i386-linux-gnu/indicator-datetime/ind...,168,45351,0,0.0,0
2,2016-03-11 02:45:01,1885,/usr/lib/unity/unity-panel-service,168,45353,0,0.0,0
3,2016-03-11 02:45:01,1830,/sbin/upstart-dbus-bridge,142,45354,0,0.0,0
4,2016-03-11 02:45:01,1872,/usr/lib/unity/unity-panel-service,168,45355,0,0.0,0
5,2016-03-11 02:45:01,2114,/usr/bin/compiz,168,45357,0,0.0,0
6,2016-03-11 02:45:06,1804,/bin/dbus-daemon,256,45352,0,0.0,0
7,2016-03-11 02:45:06,2834,/usr/bin/update-notifier,142,45360,0,0.0,0
8,2016-03-11 02:45:06,2133,/usr/lib/i386-linux-gnu/gconf/gconfd-2,168,45372,0,0.0,0
9,2016-03-11 02:45:11,3989,/sbin/auditd,256,45374,0,0.0,0
