# Installing Spark

In [None]:
!pip3 install -qq pyspark

[K     |████████████████████████████████| 281.4 MB 34 kB/s 
[K     |████████████████████████████████| 198 kB 46.8 MB/s 
[?25h  Building wheel for pyspark (setup.py) ... [?25l[?25hdone


In [None]:
from pyspark.sql import SparkSession

# Start (or reuse) a Spark session whose master is the local machine ("local[*]").
spark = (
    SparkSession.builder
    .master("local[*]")
    .getOrCreate()
)
# The SparkContext is the entry point for the RDD API used in the rest of the notebook.
sc = spark.sparkContext
sc

# Load Data

In [None]:
!wget 'http://kdd.ics.uci.edu/databases/kddcup99/kddcup.data_10_percent.gz'

--2022-05-22 09:46:43--  http://kdd.ics.uci.edu/databases/kddcup99/kddcup.data_10_percent.gz
Resolving kdd.ics.uci.edu (kdd.ics.uci.edu)... 128.195.1.86
Connecting to kdd.ics.uci.edu (kdd.ics.uci.edu)|128.195.1.86|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2144903 (2.0M) [application/x-gzip]
Saving to: ‘kddcup.data_10_percent.gz’


2022-05-22 09:46:44 (4.14 MB/s) - ‘kddcup.data_10_percent.gz’ saved [2144903/2144903]



In [None]:
!rm kdd10.gz
!mv kddcup.data_10_percent.gz kdd10.gz

rm: cannot remove 'kdd10.gz': No such file or directory


In [None]:
# Training data: the renamed download, read into an RDD with one element per line.
data_file = "./kdd10.gz"
raw_data = sc.textFile(data_file)
print("Train data size is ", raw_data.count())

Train data size is  494021


In [None]:
!wget 'http://kdd.ics.uci.edu/databases/kddcup99/corrected.gz'

--2022-05-22 09:49:06--  http://kdd.ics.uci.edu/databases/kddcup99/corrected.gz
Resolving kdd.ics.uci.edu (kdd.ics.uci.edu)... 128.195.1.86
Connecting to kdd.ics.uci.edu (kdd.ics.uci.edu)|128.195.1.86|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1409035 (1.3M) [application/x-gzip]
Saving to: ‘corrected.gz’


2022-05-22 09:49:07 (2.67 MB/s) - ‘corrected.gz’ saved [1409035/1409035]



In [None]:
# Test data: read the downloaded corrected.gz into an RDD with one element per line.
test_data_file = "./corrected.gz"
test_raw_data = sc.textFile(test_data_file)
print("Test data size is ", test_raw_data.count())

Test data size is  311029


# Part 1: RDD Creation

In [None]:
rawData_RDD = sc.textFile(data_file)

In [None]:
rawData_RDD.count()

494021

In [None]:
rawData_RDD.take(3)

['0,tcp,http,SF,181,5450,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,8,8,0.00,0.00,0.00,0.00,1.00,0.00,0.00,9,9,1.00,0.00,0.11,0.00,0.00,0.00,0.00,0.00,normal.',
 '0,tcp,http,SF,239,486,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,8,8,0.00,0.00,0.00,0.00,1.00,0.00,0.00,19,19,1.00,0.00,0.05,0.00,0.00,0.00,0.00,0.00,normal.',
 '0,tcp,http,SF,235,1337,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,8,8,0.00,0.00,0.00,0.00,1.00,0.00,0.00,29,29,1.00,0.00,0.03,0.00,0.00,0.00,0.00,0.00,normal.']

# Part 2: RDD Basics

In [None]:
# Split the raw lines on whether the text 'normal.' occurs anywhere in the line
# (a plain substring test on the unparsed line, not a check of a specific field).
normal_rawData = rawData_RDD.filter(lambda line: 'normal.' in line)
notNormal_rawData = rawData_RDD.filter(lambda line: not ('normal.' in line))

In [None]:
normal_rawData.count()

97278

In [None]:
notNormal_rawData.count()

396743

In [None]:
# Sanity check: the two filtered RDDs together should account for every line.
missing = rawData_RDD.count() - normal_rawData.count() - notNormal_rawData.count()
print("missing ", missing)

missing  0


In [None]:
notNormal_rawData.take(3)

['184,tcp,telnet,SF,1511,2957,0,0,0,3,0,1,2,1,0,0,1,0,0,0,0,0,1,1,0.00,0.00,0.00,0.00,1.00,0.00,0.00,1,3,1.00,0.00,1.00,0.67,0.00,0.00,0.00,0.00,buffer_overflow.',
 '305,tcp,telnet,SF,1735,2766,0,0,0,3,0,1,2,1,0,0,1,0,0,0,0,0,1,1,0.00,0.00,0.00,0.00,1.00,0.00,0.00,2,4,1.00,0.00,0.50,0.50,0.00,0.00,0.00,0.00,buffer_overflow.',
 '79,tcp,telnet,SF,281,1301,0,0,0,2,0,1,1,1,0,0,4,2,0,0,0,0,1,1,0.00,0.00,0.00,0.00,1.00,0.00,0.00,1,10,1.00,0.00,1.00,0.30,0.00,0.00,0.00,0.10,loadmodule.']

In [None]:
from time import time

# Wall-clock time of the count() action on the 'normal' lines.
t0 = time()
normal_count = normal_rawData.count()
tt = time() - t0
print("Counted ", normal_count, " normal transactions in ", tt, "seconds")

# Same measurement for the remaining lines.
t0 = time()
notNormal_count = notNormal_rawData.count()
tt = time() - t0
print("Counted ", notNormal_count, " notNormal transactions in ", tt, "seconds")

Counted  97278  normal transactions in  1.1309189796447754 seconds
Counted  396743  notNormal transactions in  1.469597578048706 seconds


In [None]:
# Split every line into its comma-separated fields.
csv_data = rawData_RDD.map(lambda line: line.split(","))

# Time fetching the first five parsed rows.
t0 = time()
head_rows = csv_data.take(5)
tt = time() - t0

print("Parse completed in ", round(tt, 3), " secs")
# Show the first three parsed rows, one per line.
print(head_rows[0], head_rows[1], head_rows[2], sep="\n")

Parse completed in  0.577  secs
['0', 'tcp', 'http', 'SF', '181', '5450', '0', '0', '0', '0', '0', '1', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '8', '8', '0.00', '0.00', '0.00', '0.00', '1.00', '0.00', '0.00', '9', '9', '1.00', '0.00', '0.11', '0.00', '0.00', '0.00', '0.00', '0.00', 'normal.']
['0', 'tcp', 'http', 'SF', '239', '486', '0', '0', '0', '0', '0', '1', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '8', '8', '0.00', '0.00', '0.00', '0.00', '1.00', '0.00', '0.00', '19', '19', '1.00', '0.00', '0.05', '0.00', '0.00', '0.00', '0.00', '0.00', 'normal.']
['0', 'tcp', 'http', 'SF', '235', '1337', '0', '0', '0', '0', '0', '1', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '8', '8', '0.00', '0.00', '0.00', '0.00', '1.00', '0.00', '0.00', '29', '29', '1.00', '0.00', '0.03', '0.00', '0.00', '0.00', '0.00', '0.00', 'normal.']


In [None]:
def parse_interaction(line):
    """Turn one comma-separated record into a ``(tag, fields)`` pair.

    Args:
        line: A single raw text line of the data set.

    Returns:
        A tuple whose first item is the field at index 41 and whose second
        item is the full list of string fields.

    Raises:
        IndexError: If the line has fewer than 42 comma-separated fields.
    """
    fields = line.split(",")
    return fields[41], fields

# Build (tag, fields) pairs and look at the first five of them.
key_csv_data = rawData_RDD.map(parse_interaction)
head_rows = key_csv_data.take(5)
print(head_rows[0], head_rows[1], head_rows[2], head_rows[3], head_rows[4], sep="\n")

('normal.', ['0', 'tcp', 'http', 'SF', '181', '5450', '0', '0', '0', '0', '0', '1', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '8', '8', '0.00', '0.00', '0.00', '0.00', '1.00', '0.00', '0.00', '9', '9', '1.00', '0.00', '0.11', '0.00', '0.00', '0.00', '0.00', '0.00', 'normal.'])
('normal.', ['0', 'tcp', 'http', 'SF', '239', '486', '0', '0', '0', '0', '0', '1', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '8', '8', '0.00', '0.00', '0.00', '0.00', '1.00', '0.00', '0.00', '19', '19', '1.00', '0.00', '0.05', '0.00', '0.00', '0.00', '0.00', '0.00', 'normal.'])
('normal.', ['0', 'tcp', 'http', 'SF', '235', '1337', '0', '0', '0', '0', '0', '1', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '8', '8', '0.00', '0.00', '0.00', '0.00', '1.00', '0.00', '0.00', '29', '29', '1.00', '0.00', '0.03', '0.00', '0.00', '0.00', '0.00', '0.00', 'normal.'])
('normal.', ['0', 'tcp', 'http', 'SF', '219', '1337', '0', '0', '0', '0', '0', '1', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '6', 

In [None]:
# Time how long collect() takes to return every line of the RDD as a Python list.
t0 = time()
all_raw_data = rawData_RDD.collect()
tt = time() - t0
print("Data collected in ", round(tt, 3), "secs")

Data collected in  6.14 secs


In [None]:
# Re-read the file and key every record by its tag.
data_file = "kdd10.gz"
raw_data = spark.sparkContext.textFile(data_file)
key_csv_data = raw_data.map(parse_interaction)

# Keep only the pairs whose key is exactly "normal.".
normal_key_interactions = key_csv_data.filter(lambda pair: pair[0] == "normal.")

# Time bringing all of them back as a Python list.
t0 = time()
all_normal = normal_key_interactions.collect()
tt = time() - t0

normal_count = len(all_normal)
print("Data collected in", round(tt, 3), " secs")
print("There are ", normal_count, "'normal' interactions")
print(normal_key_interactions.take(2))

Data collected in 3.751  secs
There are  97278 'normal' interactions
[('normal.', ['0', 'tcp', 'http', 'SF', '181', '5450', '0', '0', '0', '0', '0', '1', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '8', '8', '0.00', '0.00', '0.00', '0.00', '1.00', '0.00', '0.00', '9', '9', '1.00', '0.00', '0.11', '0.00', '0.00', '0.00', '0.00', '0.00', 'normal.']), ('normal.', ['0', 'tcp', 'http', 'SF', '239', '486', '0', '0', '0', '0', '0', '1', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '8', '8', '0.00', '0.00', '0.00', '0.00', '1.00', '0.00', '0.00', '19', '19', '1.00', '0.00', '0.05', '0.00', '0.00', '0.00', '0.00', '0.00', 'normal.'])]


# Part 3: RDD Sampling

In [None]:
sc = spark.sparkContext

In [None]:
# Read the file through the name assigned in this cell. The original call used
# `data_file`, which only resolved because an earlier cell happened to define it.
datafile = "kdd10.gz"
raw_data = sc.textFile(datafile)

In [None]:
# Sample a fraction of 0.1 without replacement, seeded with 1234, and compare sizes.
raw_data_sample = raw_data.sample(withReplacement=False, fraction=0.1, seed=1234)
sample_size = raw_data_sample.count()
total_size = raw_data.count()
print("Sample size is ", sample_size, "of ", total_size)

Sample size is  49493 of  494021


In [None]:
from time import time

# Pipeline on the sample: split each line into fields, then keep the rows that
# contain a field equal to "normal." (list membership, not a substring test).
raw_data_sample_items = raw_data_sample.map(lambda line: line.split(","))
sample_normal_tags = raw_data_sample_items.filter(lambda fields: "normal." in fields)

# Time the count() action.
t0 = time()
sample_normal_tags_count = sample_normal_tags.count()
tt = time() - t0

sample_normal_ratio = sample_normal_tags_count / float(sample_size)
print("The ratio of 'normal' interactions is ", round(sample_normal_ratio, 3))
print("Count done in ", round(tt, 3), " seconds")

The ratio of 'normal' interactions is  0.195
Count done in  1.129  seconds


In [None]:
# Same pipeline as for the sample, this time over the complete RDD.
raw_data_items = raw_data.map(lambda line: line.split(","))
normal_tags = raw_data_items.filter(lambda fields: "normal." in fields)

# Time the count() action.
t0 = time()
normal_tags_count = normal_tags.count()
tt = time() - t0

normal_ratio = normal_tags_count / float(total_size)
print("The ratio of 'normal' interactions is", round(normal_ratio, 3))
print("Count done in ", round(tt, 3), " seconds")

The ratio of 'normal' interactions is 0.197
Count done in  2.076  seconds


# Part 4: RDD Set Operations

In [None]:
sc = spark.sparkContext
# Read the file through the name assigned in this cell. The original call used
# `data_file`, which only resolved because an earlier cell happened to define it.
datafile = "kdd10.gz"
raw_data = sc.textFile(datafile)

In [None]:
normal_raw_data = raw_data.filter(lambda x: "normal." in x)

In [None]:
attack_raw_data = raw_data.subtract(normal_raw_data)

In [None]:
# Count the full RDD and both halves so the subtract() result can be checked by eye.
raw_data_count = raw_data.count()
normal_raw_data_count = normal_raw_data.count()
attack_raw_data_count = attack_raw_data.count()
print(
    "raw ", raw_data_count,
    " normal raw ", normal_raw_data_count,
    " attack raw ", attack_raw_data_count,
)

raw  494021  normal raw  97278  attack raw  396743


In [None]:
# Split lines into fields, then reduce field 1 (the protocol) to its distinct values.
csv_data = raw_data.map(lambda line: line.split(","))
protocols = csv_data.map(lambda fields: fields[1]).distinct()
protocols.collect()

['tcp', 'udp', 'icmp']

In [None]:
services = csv_data.map(lambda x: x[2]).distinct()
services.collect()

['http',
 'smtp',
 'finger',
 'domain_u',
 'auth',
 'telnet',
 'ftp',
 'eco_i',
 'ntp_u',
 'ecr_i',
 'other',
 'private',
 'pop_3',
 'ftp_data',
 'rje',
 'time',
 'mtp',
 'link',
 'remote_job',
 'gopher',
 'ssh',
 'name',
 'whois',
 'domain',
 'login',
 'imap4',
 'daytime',
 'ctf',
 'nntp',
 'shell',
 'IRC',
 'nnsp',
 'http_443',
 'exec',
 'printer',
 'efs',
 'courier',
 'uucp',
 'klogin',
 'kshell',
 'echo',
 'discard',
 'systat',
 'supdup',
 'iso_tsap',
 'hostnames',
 'csnet_ns',
 'pop_2',
 'sunrpc',
 'uucp_path',
 'netbios_ns',
 'netbios_ssn',
 'netbios_dgm',
 'sql_net',
 'vmnet',
 'bgp',
 'Z39_50',
 'ldap',
 'netstat',
 'urh_i',
 'X11',
 'urp_i',
 'pm_dump',
 'tftp_u',
 'tim_i',
 'red_i']

In [None]:
services.count()

66

In [None]:
product = protocols.cartesian(services)
product.take(5)

[('tcp', 'http'),
 ('tcp', 'smtp'),
 ('tcp', 'finger'),
 ('tcp', 'domain_u'),
 ('tcp', 'auth')]

In [None]:
print ("There are ",product.count()," combinations of protocol X service")

There are  198  combinations of protocol X service


# Part 5: RDD Aggregations

In [None]:
sc = spark.sparkContext
# Read the file through the name assigned in this cell. The original call used
# `data_file`, which only resolved because an earlier cell happened to define it.
datafile = "kdd10.gz"
raw_data = sc.textFile(datafile)

In [None]:
# Split each line into fields.
csv_data = raw_data.map(lambda line: line.split(","))

# Partition on the tag stored at field index 41: "normal." versus everything else.
normal_csv_data = csv_data.filter(lambda fields: fields[41] == "normal.")
attack_csv_data = csv_data.filter(lambda fields: not (fields[41] == "normal."))

In [None]:
# Field 0 is parsed as an integer duration for each class.
normal_duration_data = normal_csv_data.map(lambda fields: int(fields[0]))
attack_duration_data = attack_csv_data.map(lambda fields: int(fields[0]))

In [None]:
normal_duration_data.take(15)

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

In [None]:
# Sum the durations of each class with a pairwise-addition reduce.
total_normal_duration = normal_duration_data.reduce(lambda left, right: left + right)
total_attack_duration = attack_duration_data.reduce(lambda left, right: left + right)

print("Total duration for 'normal' interactions is ", total_normal_duration)
print("Total duration for 'attack' interactions is ", total_attack_duration)

Total duration for 'normal' interactions is  21075991
Total duration for 'attack' interactions is  2626792


In [None]:
# Number of records in each class (the denominators for the means computed next).
# NOTE(review): the printed labels say "Mean Count", but the values are plain record counts.
normal_count = normal_duration_data.count()
attack_count = attack_duration_data.count()
print('Mean Count Normal', normal_count, " Mean Count Attack", attack_count)

Mean Count Normal 97278  Mean Count Attack 396743


In [None]:
# Mean duration per class = total duration / record count, rounded to 3 decimals.
print("Mean duration for 'normal' interactions is ", round(total_normal_duration / float(normal_count), 3))
print("Mean duration for 'attack' interactions is ", round(total_attack_duration / float(attack_count), 3))

Mean duration for 'normal' interactions is  216.657
Mean duration for 'attack' interactions is  6.621


In [None]:
# Compute (sum, count) of the 'normal' durations in a single aggregate() call.
normal_sum_count = normal_duration_data.aggregate(
    (0, 0),  # starting accumulator: (running sum, running count)
    lambda acc, value: (acc[0] + value, acc[1] + 1),  # fold one duration into an accumulator
    lambda left, right: (left[0] + right[0], left[1] + right[1]),  # merge two accumulators
)
print(normal_sum_count[0], normal_sum_count[1])
print("Mean duration for 'normal' interactions is", round(normal_sum_count[0] / float(normal_sum_count[1]), 3))

21075991 97278
Mean duration for 'normal' interactions is 216.657


In [None]:
# Compute (sum, count) of the 'attack' durations in a single aggregate() call.
attack_sum_count = attack_duration_data.aggregate(
    (0, 0),  # starting accumulator: (running sum, running count)
    lambda acc, value: (acc[0] + value, acc[1] + 1),  # fold one duration into an accumulator
    lambda left, right: (left[0] + right[0], left[1] + right[1]),  # merge two accumulators
)

print("Mean duration for 'attack' interactions is", round(attack_sum_count[0] / float(attack_sum_count[1]), 3))

Mean duration for 'attack' interactions is 6.621


# Part 6: RDD Key Value Pair

In [None]:
sc = spark.sparkContext
datafile = "kdd10.gz"
raw_data = sc.textFile(datafile)

In [None]:
# Split lines into fields, then pair every record with its tag (field index 41) as the key.
csv_data = raw_data.map(lambda line: line.split(","))
key_value_data = csv_data.map(lambda fields: (fields[41], fields))

In [None]:
key_value_data.take(1)

[('normal.',
  ['0',
   'tcp',
   'http',
   'SF',
   '181',
   '5450',
   '0',
   '0',
   '0',
   '0',
   '0',
   '1',
   '0',
   '0',
   '0',
   '0',
   '0',
   '0',
   '0',
   '0',
   '0',
   '0',
   '8',
   '8',
   '0.00',
   '0.00',
   '0.00',
   '0.00',
   '1.00',
   '0.00',
   '0.00',
   '9',
   '9',
   '1.00',
   '0.00',
   '0.11',
   '0.00',
   '0.00',
   '0.00',
   '0.00',
   '0.00',
   'normal.'])]

In [None]:
# (tag, duration) pairs: field 41 is the key, field 0 parsed as a float is the value.
key_value_duration = csv_data.map(lambda fields: (fields[41], float(fields[0])))
key_value_duration.take(5)

[('normal.', 0.0),
 ('normal.', 0.0),
 ('normal.', 0.0),
 ('normal.', 0.0),
 ('normal.', 0.0)]

In [None]:
# Total duration per tag: add up the values that share a key.
durations_by_key = key_value_duration.reduceByKey(lambda left, right: left + right)
durations_by_key.collect()

[('normal.', 21075991.0),
 ('buffer_overflow.', 2751.0),
 ('loadmodule.', 326.0),
 ('perl.', 124.0),
 ('neptune.', 0.0),
 ('smurf.', 0.0),
 ('guess_passwd.', 144.0),
 ('pod.', 0.0),
 ('teardrop.', 0.0),
 ('portsweep.', 1991911.0),
 ('ipsweep.', 43.0),
 ('land.', 0.0),
 ('ftp_write.', 259.0),
 ('back.', 284.0),
 ('imap.', 72.0),
 ('satan.', 64.0),
 ('phf.', 18.0),
 ('nmap.', 0.0),
 ('multihop.', 1288.0),
 ('warezmaster.', 301.0),
 ('warezclient.', 627563.0),
 ('spy.', 636.0),
 ('rootkit.', 1008.0)]

In [None]:
counts_by_key = key_value_data.countByKey()
#print(counts_by_key)
counts_by_key.items()

dict_items([('normal.', 97278), ('buffer_overflow.', 30), ('loadmodule.', 9), ('perl.', 3), ('neptune.', 107201), ('smurf.', 280790), ('guess_passwd.', 53), ('pod.', 264), ('teardrop.', 979), ('portsweep.', 1040), ('ipsweep.', 1247), ('land.', 21), ('ftp_write.', 8), ('back.', 2203), ('imap.', 12), ('satan.', 1589), ('phf.', 4), ('nmap.', 231), ('multihop.', 7), ('warezmaster.', 20), ('warezclient.', 1020), ('spy.', 2), ('rootkit.', 10)])

In [None]:
print("Looping ..........................")

for key, val in counts_by_key.items():
    print (key, val)

Looping ..........................
normal. 97278
buffer_overflow. 30
loadmodule. 9
perl. 3
neptune. 107201
smurf. 280790
guess_passwd. 53
pod. 264
teardrop. 979
portsweep. 1040
ipsweep. 1247
land. 21
ftp_write. 8
back. 2203
imap. 12
satan. 1589
phf. 4
nmap. 231
multihop. 7
warezmaster. 20
warezclient. 1020
spy. 2
rootkit. 10


In [None]:
# Per-tag (sum of durations, number of records), built with combineByKey.
sum_counts = key_value_duration.combineByKey(
    lambda value: (value, 1),  # first value seen for a key becomes the (sum, count) accumulator
    lambda acc, value: (acc[0] + value, acc[1] + 1),  # fold another value into the accumulator
    lambda left, right: (left[0] + right[0], left[1] + right[1]),  # merge two accumulators
)

sum_counts.collectAsMap()

{'back.': (284.0, 2203),
 'buffer_overflow.': (2751.0, 30),
 'ftp_write.': (259.0, 8),
 'guess_passwd.': (144.0, 53),
 'imap.': (72.0, 12),
 'ipsweep.': (43.0, 1247),
 'land.': (0.0, 21),
 'loadmodule.': (326.0, 9),
 'multihop.': (1288.0, 7),
 'neptune.': (0.0, 107201),
 'nmap.': (0.0, 231),
 'normal.': (21075991.0, 97278),
 'perl.': (124.0, 3),
 'phf.': (18.0, 4),
 'pod.': (0.0, 264),
 'portsweep.': (1991911.0, 1040),
 'rootkit.': (1008.0, 10),
 'satan.': (64.0, 1589),
 'smurf.': (0.0, 280790),
 'spy.': (636.0, 2),
 'teardrop.': (0.0, 979),
 'warezclient.': (627563.0, 1020),
 'warezmaster.': (301.0, 20)}

In [None]:
# Mean duration per tag: divide each (sum, count) pair and round to 3 decimals.
duration_means_by_type = sum_counts.mapValues(
    lambda sum_count: round(sum_count[0] / sum_count[1], 3)
).collectAsMap()

# Print the tags from the largest mean duration to the smallest.
for tag in sorted(duration_means_by_type, key=lambda name: duration_means_by_type[name], reverse=True):
    print(tag, duration_means_by_type[tag])



portsweep. 1915.299
warezclient. 615.258
spy. 318.0
normal. 216.657
multihop. 184.0
rootkit. 100.8
buffer_overflow. 91.7
perl. 41.333
loadmodule. 36.222
ftp_write. 32.375
warezmaster. 15.05
imap. 6.0
phf. 4.5
guess_passwd. 2.717
back. 0.129
satan. 0.04
ipsweep. 0.034
neptune. 0.0
smurf. 0.0
pod. 0.0
teardrop. 0.0
land. 0.0
nmap. 0.0


# Part 7: MLlib Statistics

In [None]:
sc = spark.sparkContext
datafile = "kdd10.gz"
raw_data = sc.textFile(datafile)

In [None]:
raw_data.take(2)

['0,tcp,http,SF,181,5450,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,8,8,0.00,0.00,0.00,0.00,1.00,0.00,0.00,9,9,1.00,0.00,0.11,0.00,0.00,0.00,0.00,0.00,normal.',
 '0,tcp,http,SF,239,486,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,8,8,0.00,0.00,0.00,0.00,1.00,0.00,0.00,19,19,1.00,0.00,0.05,0.00,0.00,0.00,0.00,0.00,normal.']

In [None]:
import numpy as np

def parse_interaction(line):
    """Convert one comma-separated record into a NumPy vector of its numeric fields.

    The fields at indexes 1, 2, 3 and 41 are skipped; every other field is
    converted with ``float``.

    NOTE(review): an earlier cell binds this same name to a parser that returns
    ``(tag, fields)``; once this cell has run, re-running that earlier pipeline
    would pick up this version instead.

    Args:
        line: A single raw text line of the data set.

    Returns:
        A ``numpy`` array holding the remaining fields as floats, in file order.

    Raises:
        ValueError: If a kept field cannot be parsed as a float.
    """
    symbolic_positions = (1, 2, 3, 41)
    numeric_values = []
    for position, field in enumerate(line.split(",")):
        if position in symbolic_positions:
            continue
        numeric_values.append(float(field))
    return np.array(numeric_values)

vector_data = raw_data.map(parse_interaction)
vector_data.take(2)

[array([0.00e+00, 1.81e+02, 5.45e+03, 0.00e+00, 0.00e+00, 0.00e+00,
        0.00e+00, 0.00e+00, 1.00e+00, 0.00e+00, 0.00e+00, 0.00e+00,
        0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00,
        0.00e+00, 8.00e+00, 8.00e+00, 0.00e+00, 0.00e+00, 0.00e+00,
        0.00e+00, 1.00e+00, 0.00e+00, 0.00e+00, 9.00e+00, 9.00e+00,
        1.00e+00, 0.00e+00, 1.10e-01, 0.00e+00, 0.00e+00, 0.00e+00,
        0.00e+00, 0.00e+00]),
 array([0.00e+00, 2.39e+02, 4.86e+02, 0.00e+00, 0.00e+00, 0.00e+00,
        0.00e+00, 0.00e+00, 1.00e+00, 0.00e+00, 0.00e+00, 0.00e+00,
        0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00,
        0.00e+00, 8.00e+00, 8.00e+00, 0.00e+00, 0.00e+00, 0.00e+00,
        0.00e+00, 1.00e+00, 0.00e+00, 0.00e+00, 1.90e+01, 1.90e+01,
        1.00e+00, 0.00e+00, 5.00e-02, 0.00e+00, 0.00e+00, 0.00e+00,
        0.00e+00, 0.00e+00])]

In [None]:
from pyspark.mllib.stat import Statistics 
from math import sqrt 

# Compute column summary statistics.
summary = Statistics.colStats(vector_data)

In [None]:
# Column 0 of the summary vectors is reported as the duration.
print("Duration Statistics:")
print(" Mean: ", round(summary.mean()[0], 3))
print(" St. deviation: ", round(sqrt(summary.variance()[0]), 3))  # std dev = sqrt(variance)
print(" Max value: ", round(summary.max()[0], 3))
print(" Min value: ", round(summary.min()[0], 3))
print(" Total value count: ", summary.count())
print(" Number of non-zero values: ", summary.numNonzeros()[0])

Duration Statistics:
 Mean:  47.979
 St. deviation:  707.746
 Max value:  58329.0
 Min value:  0.0
 Total value count:  494021
 Number of non-zero values:  12350.0


In [None]:
# Column 1 of the summary vectors is reported as the source bytes.
print("SRC Bytes Statistics:")
print(" Mean: ", round(summary.mean()[1], 3))
print(" St. deviation: ", round(sqrt(summary.variance()[1]), 3))  # std dev = sqrt(variance)
print(" Max value: ", round(summary.max()[1], 3))
print(" Min value: ", round(summary.min()[1], 3))
print(" Total value count: ", summary.count())
print(" Number of non-zero values: ", summary.numNonzeros()[1])

SRC Bytes Statistics:
 Mean:  3025.61
 St. deviation:  988218.101
 Max value:  693375640.0
 Min value:  0.0
 Total value count:  494021
 Number of non-zero values:  378679.0


In [None]:
def parse_interaction_with_key(line):
    """Convert one comma-separated record into a ``(label, vector)`` pair.

    Args:
        line: A single raw text line of the data set.

    Returns:
        A tuple of the field at index 41 and a ``numpy`` array holding every
        field except those at indexes 1, 2, 3 and 41, converted with ``float``.

    Raises:
        IndexError: If the line has fewer than 42 comma-separated fields.
        ValueError: If a kept field cannot be parsed as a float.
    """
    fields = line.split(",")
    # Read the label first so a short line fails with IndexError before any float parsing.
    label = fields[41]
    skipped = {1, 2, 3, 41}
    numeric = [float(value) for index, value in enumerate(fields) if index not in skipped]
    return label, np.array(numeric)

In [None]:
label_vector_data = raw_data.map(parse_interaction_with_key)

In [None]:
label_vector_data.take(2)

[('normal.', array([0.00e+00, 1.81e+02, 5.45e+03, 0.00e+00, 0.00e+00, 0.00e+00,
         0.00e+00, 0.00e+00, 1.00e+00, 0.00e+00, 0.00e+00, 0.00e+00,
         0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00,
         0.00e+00, 8.00e+00, 8.00e+00, 0.00e+00, 0.00e+00, 0.00e+00,
         0.00e+00, 1.00e+00, 0.00e+00, 0.00e+00, 9.00e+00, 9.00e+00,
         1.00e+00, 0.00e+00, 1.10e-01, 0.00e+00, 0.00e+00, 0.00e+00,
         0.00e+00, 0.00e+00])),
 ('normal.', array([0.00e+00, 2.39e+02, 4.86e+02, 0.00e+00, 0.00e+00, 0.00e+00,
         0.00e+00, 0.00e+00, 1.00e+00, 0.00e+00, 0.00e+00, 0.00e+00,
         0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00,
         0.00e+00, 8.00e+00, 8.00e+00, 0.00e+00, 0.00e+00, 0.00e+00,
         0.00e+00, 1.00e+00, 0.00e+00, 0.00e+00, 1.90e+01, 1.90e+01,
         1.00e+00, 0.00e+00, 5.00e-02, 0.00e+00, 0.00e+00, 0.00e+00,
         0.00e+00, 0.00e+00]))]

In [None]:
normal_label_data = label_vector_data.filter(lambda x: x[0]=="normal.")

In [None]:
normal_label_data.take(2)

[('normal.', array([0.00e+00, 1.81e+02, 5.45e+03, 0.00e+00, 0.00e+00, 0.00e+00,
         0.00e+00, 0.00e+00, 1.00e+00, 0.00e+00, 0.00e+00, 0.00e+00,
         0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00,
         0.00e+00, 8.00e+00, 8.00e+00, 0.00e+00, 0.00e+00, 0.00e+00,
         0.00e+00, 1.00e+00, 0.00e+00, 0.00e+00, 9.00e+00, 9.00e+00,
         1.00e+00, 0.00e+00, 1.10e-01, 0.00e+00, 0.00e+00, 0.00e+00,
         0.00e+00, 0.00e+00])),
 ('normal.', array([0.00e+00, 2.39e+02, 4.86e+02, 0.00e+00, 0.00e+00, 0.00e+00,
         0.00e+00, 0.00e+00, 1.00e+00, 0.00e+00, 0.00e+00, 0.00e+00,
         0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00,
         0.00e+00, 8.00e+00, 8.00e+00, 0.00e+00, 0.00e+00, 0.00e+00,
         0.00e+00, 1.00e+00, 0.00e+00, 0.00e+00, 1.90e+01, 1.90e+01,
         1.00e+00, 0.00e+00, 5.00e-02, 0.00e+00, 0.00e+00, 0.00e+00,
         0.00e+00, 0.00e+00]))]

In [None]:
normal_summary = Statistics.colStats(normal_label_data.values())

In [None]:
# Duration (column 0) statistics restricted to the records labelled "normal.".
print("Duration Statistics for label: ", "normal")
print(" Mean: ", round(normal_summary.mean()[0], 3))
print(" St. deviation: ", round(sqrt(normal_summary.variance()[0]), 3))  # std dev = sqrt(variance)
print(" Max value: ", round(normal_summary.max()[0], 3))
print(" Min value: ", round(normal_summary.min()[0], 3))
print(" Total value count: ", normal_summary.count())
print(" Number of non-zero values: ", normal_summary.numNonzeros()[0])

Duration Statistics for label:  normal
 Mean:  216.657
 St. deviation:  1359.213
 Max value:  58329.0
 Min value:  0.0
 Total value count:  97278
 Number of non-zero values:  11690.0


In [None]:
def summary_by_label(raw_data, label):
    """Compute column statistics over the records that carry a given label.

    Args:
        raw_data: RDD of raw comma-separated text lines.
        label: Label value to keep, compared for equality with each record's key.

    Returns:
        The result of ``Statistics.colStats`` over the numeric vectors of the
        matching records.
    """
    keyed_vectors = raw_data.map(parse_interaction_with_key)
    matching_vectors = keyed_vectors.filter(lambda pair: pair[0] == label).values()
    return Statistics.colStats(matching_vectors)

In [None]:
# Same "normal." duration statistics as before, this time obtained through summary_by_label.
normal_sum = summary_by_label(raw_data, "normal.")

print("Duration Statistics for label: ", "normal")
print(" Mean: ", round(normal_sum.mean()[0], 3))
print(" St. deviation: ", round(sqrt(normal_sum.variance()[0]), 3))  # std dev = sqrt(variance)
print(" Max value: ", round(normal_sum.max()[0], 3))
print(" Min value: ", round(normal_sum.min()[0], 3))
print(" Total value count: ", normal_sum.count())
print(" Number of non-zero values: ", normal_sum.numNonzeros()[0])

Duration Statistics for label:  normal
 Mean:  216.657
 St. deviation:  1359.213
 Max value:  58329.0
 Min value:  0.0
 Total value count:  97278
 Number of non-zero values:  11690.0


In [None]:
# Duration statistics for a single attack label; change the label in both
# places below (e.g. to "guess_passwd.") to inspect another class.
generic_sum = summary_by_label(raw_data, "teardrop.")

print("Duration Statistics for label: ", "teardrop.")
print(" Mean: ", round(generic_sum.mean()[0], 3))
print(" St. deviation: ", round(sqrt(generic_sum.variance()[0]), 3))  # std dev = sqrt(variance)
print(" Max value: ", round(generic_sum.max()[0], 3))
print(" Min value: ", round(generic_sum.min()[0], 3))
print(" Total value count: ", generic_sum.count())
print(" Number of non-zero values: ", generic_sum.numNonzeros()[0])

Duration Statistics for label:  teardrop.
 Mean:  0.0
 St. deviation:  0.0
 Max value:  0.0
 Min value:  0.0
 Total value count:  979
 Number of non-zero values:  0.0


In [None]:
label_list = ["back.","buffer_overflow.","ftp_write.","guess_passwd.",
              "imap.","ipsweep.","land.","loadmodule.","multihop.",
              "neptune.","nmap.","normal.","perl.","phf.","pod.","portsweep.",
              "rootkit.","satan.","smurf.","spy.","teardrop.","warezclient.",
              "warezmaster."]

In [None]:
from time import time

In [None]:
# Build one (label, summary) pair per entry of label_list and time the whole run.
# NOTE(review): summary_by_label re-maps raw_data on every call, so the file is
# parsed once per label; persisting the parsed RDD would presumably cut the run time.
t0 = time()
stats_by_label = list(
    zip(label_list, (summary_by_label(raw_data, name) for name in label_list))
)
tt = time() - t0
print('time taken ', tt, 'seconds')


time taken  205.84382891654968 seconds


In [None]:
stats_by_label

[('back.',
  <pyspark.mllib.stat._statistics.MultivariateStatisticalSummary at 0x7f69dc43fd50>),
 ('buffer_overflow.',
  <pyspark.mllib.stat._statistics.MultivariateStatisticalSummary at 0x7f69dc4026d0>),
 ('ftp_write.',
  <pyspark.mllib.stat._statistics.MultivariateStatisticalSummary at 0x7f69dc40ced0>),
 ('guess_passwd.',
  <pyspark.mllib.stat._statistics.MultivariateStatisticalSummary at 0x7f69dc3e8a10>),
 ('imap.',
  <pyspark.mllib.stat._statistics.MultivariateStatisticalSummary at 0x7f69dc402fd0>),
 ('ipsweep.',
  <pyspark.mllib.stat._statistics.MultivariateStatisticalSummary at 0x7f69dc3f9c50>),
 ('land.',
  <pyspark.mllib.stat._statistics.MultivariateStatisticalSummary at 0x7f69dc442110>),
 ('loadmodule.',
  <pyspark.mllib.stat._statistics.MultivariateStatisticalSummary at 0x7f69dc39b910>),
 ('multihop.',
  <pyspark.mllib.stat._statistics.MultivariateStatisticalSummary at 0x7f69dc39b890>),
 ('neptune.',
  <pyspark.mllib.stat._statistics.MultivariateStatisticalSummary at 0x7f69dc

In [None]:
# One (label, mean, std dev, min, max, count) tuple per label, all read from
# column 0 of that label's summary and converted to plain Python numbers.
duration_by_label2 = [
    (
        entry[0],
        float(entry[1].mean()[0]),
        float(sqrt(entry[1].variance()[0])),
        float(entry[1].min()[0]),
        float(entry[1].max()[0]),
        int(entry[1].count()),
    )
    for entry in stats_by_label
]

In [None]:
duration_by_label2

[('back.', 0.1289151157512483, 1.1100621667887005, 0.0, 14.0, 2203),
 ('buffer_overflow.', 91.70000000000003, 97.51468501258388, 0.0, 321.0, 30),
 ('ftp_write.', 32.375, 47.44903280664121, 0.0, 134.0, 8),
 ('guess_passwd.', 2.7169811320754715, 11.879810537356365, 0.0, 60.0, 53),
 ('imap.', 6.0, 14.174240399721281, 0.0, 41.0, 12),
 ('ipsweep.', 0.034482758620689655, 0.4384391926659592, 0.0, 7.0, 1247),
 ('land.', 0.0, 0.0, 0.0, 0.0, 21),
 ('loadmodule.', 36.22222222222222, 41.40886915196362, 0.0, 103.0, 9),
 ('multihop.', 184.00000000000003, 253.8510061696296, 0.0, 718.0, 7),
 ('neptune.', 0.0, 0.0, 0.0, 0.0, 107201),
 ('nmap.', 0.0, 0.0, 0.0, 0.0, 231),
 ('normal.', 216.65732231336938, 1359.213468917662, 0.0, 58329.0, 97278),
 ('perl.', 41.333333333333336, 14.843629385474879, 25.0, 54.0, 3),
 ('phf.', 4.5, 5.744562646538029, 0.0, 12.0, 4),
 ('pod.', 0.0, 0.0, 0.0, 0.0, 264),
 ('portsweep.', 1915.299038461538, 7285.125158537256, 0.0, 42448.0, 1040),
 ('rootkit.', 100.8, 216.185003077354

In [None]:
import pandas as pd
pd.set_option('display.max_columns', 50)

stats_by_label_df2 = pd.DataFrame(duration_by_label2, columns=[' ',"Mean", "Std Dev", "Min", "Max", "Count"])

In [None]:
print ("Duration statistics, by label")
stats_by_label_df2

Duration statistics, by label


Unnamed: 0,Unnamed: 1,Mean,Std Dev,Min,Max,Count
0,back.,0.128915,1.110062,0.0,14.0,2203
1,buffer_overflow.,91.7,97.514685,0.0,321.0,30
2,ftp_write.,32.375,47.449033,0.0,134.0,8
3,guess_passwd.,2.716981,11.879811,0.0,60.0,53
4,imap.,6.0,14.17424,0.0,41.0,12
5,ipsweep.,0.034483,0.438439,0.0,7.0,1247
6,land.,0.0,0.0,0.0,0.0,21
7,loadmodule.,36.222222,41.408869,0.0,103.0,9
8,multihop.,184.0,253.851006,0.0,718.0,7
9,neptune.,0.0,0.0,0.0,0.0,107201


In [None]:
def get_variable_stats_df2(stats_by_label, column_i):
    """Tabulate one column's statistics for every label.

    Args:
        stats_by_label: Sequence of ``(label, summary)`` pairs, where each
            summary exposes ``mean()``, ``variance()``, ``min()``, ``max()``
            (indexable by column) and ``count()``.
        column_i: Index of the column to report.

    Returns:
        A pandas DataFrame with one row per label and the columns
        ``' '`` (the label), ``Mean``, ``Std Dev``, ``Min``, ``Max``, ``Count``.
    """
    header = [' ', "Mean", "Std Dev", "Min", "Max", "Count"]
    rows = []
    for entry in stats_by_label:
        label, summary = entry[0], entry[1]
        rows.append((
            label,
            float(summary.mean()[column_i]),
            float(sqrt(summary.variance()[column_i])),  # std dev = sqrt(variance)
            float(summary.min()[column_i]),
            float(summary.max()[column_i]),
            int(summary.count()),
        ))
    return pd.DataFrame(rows, columns=header)

In [None]:
get_variable_stats_df2(stats_by_label,0)

Unnamed: 0,Unnamed: 1,Mean,Std Dev,Min,Max,Count
0,back.,0.128915,1.110062,0.0,14.0,2203
1,buffer_overflow.,91.7,97.514685,0.0,321.0,30
2,ftp_write.,32.375,47.449033,0.0,134.0,8
3,guess_passwd.,2.716981,11.879811,0.0,60.0,53
4,imap.,6.0,14.17424,0.0,41.0,12
5,ipsweep.,0.034483,0.438439,0.0,7.0,1247
6,land.,0.0,0.0,0.0,0.0,21
7,loadmodule.,36.222222,41.408869,0.0,103.0,9
8,multihop.,184.0,253.851006,0.0,718.0,7
9,neptune.,0.0,0.0,0.0,0.0,107201


In [None]:
print ("src_bytes statistics, by label")
get_variable_stats_df2(stats_by_label,1)

src_bytes statistics, by label


Unnamed: 0,Unnamed: 1,Mean,Std Dev,Min,Max,Count
0,back.,54156.355878,3159.36,13140.0,54540.0,2203
1,buffer_overflow.,1400.433333,1337.133,0.0,6274.0,30
2,ftp_write.,220.75,267.7476,0.0,676.0,8
3,guess_passwd.,125.339623,3.03786,104.0,126.0,53
4,imap.,347.583333,629.926,0.0,1492.0,12
5,ipsweep.,10.0834,5.231658,0.0,18.0,1247
6,land.,0.0,0.0,0.0,0.0,21
7,loadmodule.,151.888889,127.7453,0.0,302.0,9
8,multihop.,435.142857,540.9604,0.0,1412.0,7
9,neptune.,0.0,0.0,0.0,0.0,107201


In [None]:
# Sample a fraction of 0.1 without replacement, seeded with 1234, and compare sizes.
raw_data_sample = raw_data.sample(withReplacement=False, fraction=0.1, seed=1234)
sample_size = raw_data_sample.count()
total_size = raw_data.count()
print("Sample size is ", sample_size, "of ", total_size)

Sample size is  49493 of  494021


In [None]:
import numpy as np

def parse_interaction(line):
    """Convert one comma-separated record into a NumPy vector of its numeric fields.

    NOTE(review): this repeats a definition that already appears earlier in the
    notebook under the same name.

    Args:
        line: A single raw text line of the data set.

    Returns:
        A ``numpy`` array holding every field except those at indexes 1, 2, 3
        and 41, converted with ``float``.

    Raises:
        ValueError: If a kept field cannot be parsed as a float.
    """
    fields = line.split(",")
    skipped = {1, 2, 3, 41}
    kept = (fields[index] for index in range(len(fields)) if index not in skipped)
    return np.array(list(map(float, kept)))

vector_data_sample = raw_data_sample.map(parse_interaction)
vector_data_sample.take(2)

[array([0.00e+00, 1.81e+02, 5.45e+03, 0.00e+00, 0.00e+00, 0.00e+00,
        0.00e+00, 0.00e+00, 1.00e+00, 0.00e+00, 0.00e+00, 0.00e+00,
        0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00,
        0.00e+00, 8.00e+00, 8.00e+00, 0.00e+00, 0.00e+00, 0.00e+00,
        0.00e+00, 1.00e+00, 0.00e+00, 0.00e+00, 9.00e+00, 9.00e+00,
        1.00e+00, 0.00e+00, 1.10e-01, 0.00e+00, 0.00e+00, 0.00e+00,
        0.00e+00, 0.00e+00]),
 array([0.00e+00, 2.10e+02, 1.51e+02, 0.00e+00, 0.00e+00, 0.00e+00,
        0.00e+00, 0.00e+00, 1.00e+00, 0.00e+00, 0.00e+00, 0.00e+00,
        0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00,
        0.00e+00, 8.00e+00, 8.00e+00, 0.00e+00, 0.00e+00, 0.00e+00,
        0.00e+00, 1.00e+00, 0.00e+00, 0.00e+00, 8.00e+00, 8.90e+01,
        1.00e+00, 0.00e+00, 1.20e-01, 4.00e-02, 0.00e+00, 0.00e+00,
        0.00e+00, 0.00e+00])]

In [None]:
from pyspark.mllib.stat import Statistics

# Spearman is a rank correlation: Spark has to sort each column to obtain its
# ranks, so the input RDD is traversed repeatedly.  Caching the parsed sample
# first avoids re-reading and re-parsing the source file on every pass.
vector_data_sample.cache()

# Column-by-column Spearman correlation matrix, computed on the smaller sample
# rather than on the full data set.
correlation_matrix = Statistics.corr(vector_data_sample, method="spearman")

In [None]:
import pandas as pd
# Allow pandas to display up to 50 columns so wide frames are not truncated.
pd.set_option("display.max_columns", 50)

# Labels for the rows and columns of the correlation matrix, in column order.
# NOTE(review): "is_hot_login" looks like a misspelling of the KDD Cup 99
# feature "is_host_login" - confirm before renaming, since other cells may
# refer to the current label.
col_names = [
    "duration",
    "src_bytes",
    "dst_bytes",
    "land",
    "wrong_fragment",
    "urgent",
    "hot",
    "num_failed_logins",
    "logged_in",
    "num_compromised",
    "root_shell",
    "su_attempted",
    "num_root",
    "num_file_creations",
    "num_shells",
    "num_access_files",
    "num_outbound_cmds",
    "is_hot_login",
    "is_guest_login",
    "count",
    "srv_count",
    "serror_rate",
    "srv_serror_rate",
    "rerror_rate",
    "srv_rerror_rate",
    "same_srv_rate",
    "diff_srv_rate",
    "srv_diff_host_rate",
    "dst_host_count",
    "dst_host_srv_count",
    "dst_host_same_srv_rate",
    "dst_host_diff_srv_rate",
    "dst_host_same_src_port_rate",
    "dst_host_srv_diff_host_rate",
    "dst_host_serror_rate",
    "dst_host_srv_serror_rate",
    "dst_host_rerror_rate",
    "dst_host_srv_rerror_rate",
]

# Wrap the raw matrix in a DataFrame labelled with the names on both axes.
corr_df = pd.DataFrame(
    data=correlation_matrix,
    index=col_names,
    columns=col_names,
)

corr_df

Unnamed: 0,duration,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,root_shell,su_attempted,num_root,num_file_creations,num_shells,num_access_files,num_outbound_cmds,is_hot_login,is_guest_login,count,srv_count,serror_rate,srv_serror_rate,rerror_rate,srv_rerror_rate,same_srv_rate,diff_srv_rate,srv_diff_host_rate,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate
duration,1.0,0.014854,0.297922,-0.001017,-0.007621,0.028233,0.101065,-0.001439,0.15692,0.01203,0.033684,,0.015708,0.072401,0.008206,0.004575,,,0.196682,-0.258991,-0.250785,-0.073436,-0.073242,-0.022575,-0.023619,0.061804,-0.05015,0.129145,-0.161715,-0.220296,-0.215577,0.237118,-0.065232,0.104728,-0.058296,-0.056524,-0.007542,-0.014086
src_bytes,0.014854,1.0,-0.169473,-0.00895,-0.022158,-0.004041,0.106511,-0.007582,-0.093463,0.114493,0.002368,,-0.005549,0.023594,0.020213,-0.002262,,,0.023215,0.673891,0.72892,-0.657138,-0.651919,-0.344384,-0.335915,0.74651,-0.74259,-0.105738,0.134125,0.74601,0.732665,-0.717587,0.818764,-0.143084,-0.645579,-0.640117,-0.305603,-0.307658
dst_bytes,0.297922,-0.169473,1.0,-0.002882,-0.02159,0.011808,0.187336,0.017105,0.883522,0.163355,0.026866,,-0.004147,0.039026,-0.000986,0.061647,,,0.079851,-0.63739,-0.495782,-0.206125,-0.200039,-0.100882,-0.082943,0.228386,-0.221267,0.526787,-0.605922,0.026204,0.053968,-0.033624,-0.397466,0.574511,-0.164738,-0.156567,-0.01156,-0.006518
land,-0.001017,-0.00895,-0.002882,1.0,-0.000303,-2.9e-05,-0.000497,-5.7e-05,-0.00263,-0.00041,-7e-05,,-0.000231,-0.00014,-8.1e-05,-0.000181,,,-0.00022,-0.010684,-0.01009,0.013622,0.013664,-0.001607,-0.001625,0.003415,-0.003419,0.011359,-0.019338,-0.010545,0.004034,-0.00405,0.005183,0.020627,0.013307,0.010495,-0.001766,-0.00174
wrong_fragment,-0.007621,-0.022158,-0.02159,-0.000303,1.0,-0.000214,-0.003725,-0.000428,-0.019703,-0.003071,-0.000524,,-0.001727,-0.001049,-0.000606,-0.001354,,,-0.001645,-0.054439,-0.026845,-0.007755,-0.022209,-0.001948,-0.012171,0.00959,-0.008084,0.009094,-0.028674,-0.056104,-0.047199,0.055455,-0.015379,0.004921,0.015411,-0.022931,0.051155,-0.013032
urgent,0.028233,-0.004041,0.011808,-2.9e-05,-0.000214,1.0,-0.000352,-4e-05,0.010865,0.069688,-4.9e-05,,0.123854,-9.9e-05,-5.7e-05,0.158048,,,-0.000155,-0.007555,-0.007587,-0.002109,-0.002096,-0.001137,-0.001149,0.002414,-0.002418,-0.001234,-0.013534,-0.009236,-0.004762,0.009611,-0.001708,-0.001528,-0.002181,-0.002164,-0.001248,-0.00123
hot,0.101065,0.106511,0.187336,-0.000497,-0.003725,-0.000352,1.0,0.114307,0.184684,0.799647,0.140633,,0.004374,0.045624,0.019523,-0.002225,,,0.443815,-0.117548,-0.111706,-0.033359,-0.033131,0.017551,0.055817,0.040439,-0.039853,0.038803,-0.069,-0.013506,0.019205,-0.015684,-0.083343,-0.010144,0.002638,-0.002728,0.182773,0.178575
num_failed_logins,-0.001439,-0.007582,0.017105,-5.7e-05,-0.000428,-4e-05,0.114307,1.0,-0.003719,-0.00058,-9.9e-05,,-0.000326,-0.000198,-0.000114,-0.000256,,,-0.000311,-0.014698,-0.014722,-0.004218,-0.004192,0.035713,0.035298,0.004829,-0.004836,-0.002468,-0.024911,-0.010999,0.005705,-0.005728,-0.005109,-0.003056,0.014629,0.014834,0.031014,0.031362
logged_in,0.15692,-0.093463,0.883522,-0.00263,-0.019703,0.010865,0.184684,-0.003719,1.0,0.155888,0.026616,,0.087655,0.053241,0.030734,0.068745,,,0.083506,-0.575376,-0.435923,-0.187335,-0.180977,-0.091605,-0.073307,0.21657,-0.213788,0.510475,-0.674912,0.084897,0.11602,-0.0961,-0.360047,0.655313,-0.140352,-0.130404,-0.002099,0.005451
num_compromised,0.01203,0.114493,0.163355,-0.00041,-0.003071,0.069688,0.799647,-0.00058,0.155888,1.0,0.085353,,0.032517,0.04163,0.024032,0.009234,,,-0.002228,-0.093844,-0.08848,-0.028906,-0.02872,0.014927,0.060239,0.034067,-0.03379,0.041642,-0.032615,0.006644,0.037112,-0.037242,-0.075873,-0.020783,0.004655,0.004613,0.203364,0.205928


In [None]:
# Boolean frame: True where two *different* variables are highly correlated
# (absolute correlation above 0.8).
# The diagonal (each variable against itself) is masked out by position rather
# than with `corr_df < 1.0`: that comparison would also discard a perfect
# positive correlation between two distinct variables while still keeping a
# perfect negative one.
highly_correlated_df = (corr_df.abs() > .8) & ~np.eye(len(corr_df), dtype=bool)
highly_correlated_df

Unnamed: 0,duration,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,root_shell,su_attempted,num_root,num_file_creations,num_shells,num_access_files,num_outbound_cmds,is_hot_login,is_guest_login,count,srv_count,serror_rate,srv_serror_rate,rerror_rate,srv_rerror_rate,same_srv_rate,diff_srv_rate,srv_diff_host_rate,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate
duration,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
src_bytes,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False
dst_bytes,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
land,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
wrong_fragment,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
urgent,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
hot,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
num_failed_logins,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
logged_in,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
num_compromised,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False


In [None]:
# Collapse the pairwise matrix to one flag per variable: True when that
# variable's column contains at least one True.  The resulting boolean Series
# is what later cells use to pick variable names for slicing.
correlated_vars_index = highly_correlated_df.any(axis=0)
correlated_vars_index

duration                       False
src_bytes                       True
dst_bytes                       True
land                           False
wrong_fragment                 False
urgent                         False
hot                            False
num_failed_logins              False
logged_in                       True
num_compromised                False
root_shell                     False
su_attempted                   False
num_root                       False
num_file_creations             False
num_shells                     False
num_access_files               False
num_outbound_cmds              False
is_hot_login                   False
is_guest_login                 False
count                           True
srv_count                       True
serror_rate                     True
srv_serror_rate                 True
rerror_rate                     True
srv_rerror_rate                 True
same_srv_rate                   True
diff_srv_rate                   True
s

In [None]:
# Names of the variables flagged as highly correlated with at least one other
# variable (the boolean Series is used directly as the row mask).
correlated_var_names = correlated_vars_index.loc[correlated_vars_index].index
correlated_var_names

Index(['src_bytes', 'dst_bytes', 'logged_in', 'count', 'srv_count',
       'serror_rate', 'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate',
       'same_srv_rate', 'diff_srv_rate', 'dst_host_count',
       'dst_host_srv_count', 'dst_host_same_srv_rate',
       'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate',
       'dst_host_srv_diff_host_rate', 'dst_host_serror_rate',
       'dst_host_srv_serror_rate', 'dst_host_rerror_rate',
       'dst_host_srv_rerror_rate'],
      dtype='object')

In [None]:
# Names of the variables that were not flagged, i.e. those with no highly
# correlated partner (the inverted boolean Series is the row mask).
not_correlated_var_names = correlated_vars_index.loc[~correlated_vars_index].index
not_correlated_var_names

Index(['duration', 'land', 'wrong_fragment', 'urgent', 'hot',
       'num_failed_logins', 'num_compromised', 'root_shell', 'su_attempted',
       'num_root', 'num_file_creations', 'num_shells', 'num_access_files',
       'num_outbound_cmds', 'is_hot_login', 'is_guest_login',
       'srv_diff_host_rate'],
      dtype='object')

In [None]:
# Slice the boolean matrix down to the flagged variables on both rows and
# columns, dropping variables that have no highly correlated partner.
highly_correlated_df.loc[correlated_var_names,correlated_var_names]

Unnamed: 0,src_bytes,dst_bytes,logged_in,count,srv_count,serror_rate,srv_serror_rate,rerror_rate,srv_rerror_rate,same_srv_rate,diff_srv_rate,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate
src_bytes,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False
dst_bytes,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
logged_in,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
count,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
srv_count,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False
serror_rate,False,False,False,False,False,False,True,False,False,True,True,False,False,False,False,False,False,True,True,False,False
srv_serror_rate,False,False,False,False,False,True,False,False,False,True,True,False,False,False,False,False,False,True,True,False,False
rerror_rate,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,True,True
srv_rerror_rate,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,True,True
same_srv_rate,False,False,False,False,False,True,True,False,False,False,True,False,True,True,True,False,False,True,True,False,False


In [None]:
# The four steps above, combined into a single cell.

# 1. Boolean frame: True where two *different* variables have an absolute
#    correlation above 0.8.  The diagonal is masked out by position rather than
#    with `corr_df < 1.0`, which would also discard a perfect positive
#    correlation between two distinct variables while keeping a perfect
#    negative one.
highly_correlated_df = (corr_df.abs() > .8) & ~np.eye(len(corr_df), dtype=bool)

# 2. One flag per variable (True if its column contains any True), then the
#    names of the flagged variables.
correlated_vars_index = highly_correlated_df.any()
correlated_var_names = correlated_vars_index[correlated_vars_index].index

# 3. Keep only the flagged variables on both rows and columns.
highly_correlated_df.loc[correlated_var_names, correlated_var_names]

Unnamed: 0,src_bytes,dst_bytes,logged_in,count,srv_count,serror_rate,srv_serror_rate,rerror_rate,srv_rerror_rate,same_srv_rate,diff_srv_rate,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate
src_bytes,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False
dst_bytes,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
logged_in,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
count,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
srv_count,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False
serror_rate,False,False,False,False,False,False,True,False,False,True,True,False,False,False,False,False,False,True,True,False,False
srv_serror_rate,False,False,False,False,False,True,False,False,False,True,True,False,False,False,False,False,False,True,True,False,False
rerror_rate,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,True,True
srv_rerror_rate,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,True,True
same_srv_rate,False,False,False,False,False,True,True,False,False,False,True,False,True,True,True,False,False,True,True,False,False
