In [12]:
# Create an RDD from the KDD99 10% dataset
data_file = "./kddcup.data_10_percent.gz"
raw_data = sc.textFile(data_file)

In [13]:
# Count the number of lines in the raw dataset
raw_data.count()

494021

In [14]:
# Print first few entries
raw_data.take(5)

['0,tcp,http,SF,181,5450,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,8,8,0.00,0.00,0.00,0.00,1.00,0.00,0.00,9,9,1.00,0.00,0.11,0.00,0.00,0.00,0.00,0.00,normal.', '0,tcp,http,SF,239,486,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,8,8,0.00,0.00,0.00,0.00,1.00,0.00,0.00,19,19,1.00,0.00,0.05,0.00,0.00,0.00,0.00,0.00,normal.', '0,tcp,http,SF,235,1337,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,8,8,0.00,0.00,0.00,0.00,1.00,0.00,0.00,29,29,1.00,0.00,0.03,0.00,0.00,0.00,0.00,0.00,normal.', '0,tcp,http,SF,219,1337,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,6,6,0.00,0.00,0.00,0.00,1.00,0.00,0.00,39,39,1.00,0.00,0.03,0.00,0.00,0.00,0.00,0.00,normal.', '0,tcp,http,SF,217,2032,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,6,6,0.00,0.00,0.00,0.00,1.00,0.00,0.00,49,49,1.00,0.00,0.02,0.00,0.00,0.00,0.00,0.00,normal.']

In [15]:
# Filter `normal` data
normal_raw_data = raw_data.filter(lambda x: 'normal.' in x)

In [16]:
# Count the normal data and measure time
from time import time
t0 = time()
normal_count = normal_raw_data.count()
tt = time() - t0
print ("There are {} 'normal' interactions".format(normal_count))
print ("Count completed in {} seconds".format(round(tt,3)))

There are 97278 'normal' interactions
Count completed in 1.161 seconds

In [17]:
# Sample data
raw_data_sample = raw_data.sample(False, 0.1, 1234)
sample_size = raw_data_sample.count()
total_size = raw_data.count()
print ("Sample size is {} of {}".format(sample_size, total_size))

Sample size is 49493 of 494021

In [18]:
# Measuring the normal interaction in the sample dataset
from time import time

# transformations to get normal data
raw_data_sample_items = raw_data_sample.map(lambda x: x.split(","))
sample_normal_tags = raw_data_sample_items.filter(lambda x: "normal." in x)

# actions + time
t0 = time()
sample_normal_tags_count = sample_normal_tags.count()
tt = time() - t0

sample_normal_ratio = sample_normal_tags_count / float(sample_size)
print ("The ratio of 'normal' interactions is {}".format(round(sample_normal_ratio,3)))
print ("Count done in {} seconds".format(round(tt,3)))

The ratio of 'normal' interactions is 0.195
Count done in 1.491 seconds

In [19]:
# Measuring the normal interaction in the entire dataset
raw_data_items = raw_data.map(lambda x: x.split(","))
normal_tags = raw_data_items.filter(lambda x: "normal." in x)

# actions + time
t0 = time()
normal_tags_count = normal_tags.count()
tt = time() - t0

normal_ratio = normal_tags_count / float(total_size)
print ("The ratio of 'normal' interactions is {}".format(round(sample_normal_ratio,3)))
print ("Count done in {} seconds".format(round(tt,3)))

The ratio of 'normal' interactions is 0.195
Count done in 2.681 seconds

This shows that the normal interaction in the data set is 0.195 (from sampling and from the entire data set). The duration is about a second slower when operating on the entire data set.

In [20]:
# Subtract the normal data from the entire dataset to get attacks
attack_raw_data = raw_data.subtract(normal_raw_data)
print ("There are {} attack interactions".format(round(attack_raw_data.count(),3)))

There are 396743 attack interactions

In [21]:
# Extract protocols
# Isolate each collection of values in two separate RDDs. 
# For that we will use distinct on the CSV-parsed dataset. 
# From the dataset description we know that protocol is the 
# second column and service is the third (tag is the last 
# one and not the first as appears in the page).
csv_data = raw_data.map(lambda x: x.split(","))
protocols = csv_data.map(lambda x: x[1]).distinct()
protocols.collect()

['icmp', 'udp', 'tcp']

In [22]:
# Extract services
services = csv_data.map(lambda x: x[2]).distinct()
services.collect()

['finger', 'http', 'netbios_dgm', 'name', 'hostnames', 'vmnet', 'systat', 'shell', 'netbios_ssn', 'urh_i', 'pop_3', 'ctf', 'domain', 'mtp', 'remote_job', 'exec', 'supdup', 'http_443', 'sunrpc', 'urp_i', 'pop_2', 'csnet_ns', 'smtp', 'whois', 'ldap', 'daytime', 'imap4', 'nntp', 'klogin', 'rje', 'IRC', 'link', 'eco_i', 'tftp_u', 'iso_tsap', 'uucp_path', 'auth', 'ecr_i', 'other', 'domain_u', 'courier', 'discard', 'red_i', 'tim_i', 'time', 'login', 'ftp', 'telnet', 'ntp_u', 'sql_net', 'echo', 'private', 'gopher', 'efs', 'netbios_ns', 'ftp_data', 'nnsp', 'ssh', 'netstat', 'uucp', 'Z39_50', 'kshell', 'X11', 'bgp', 'pm_dump', 'printer']

In [23]:
# Print all possible combinations of protocols x services
product = protocols.cartesian(services).collect()
print ("There are {} combinations of protocol X service".format(len(product)))

There are 198 combinations of protocol X service