# **KDDCup Data Analytics with PySpark RDD: A structured case study**

### Udemy Course: Best Hands-on Big Data Practices and Use Cases using PySpark

### Author: Amin Karami (PhD, FHEA)

##### data source: http://kdd.ics.uci.edu/databases/kddcup99/kddcup99.html


In [None]:
########## ONLY in Colab ##########
!pip3 install pyspark
########## ONLY in Colab ##########

In [None]:
########## ONLY in Ubuntu Machine ##########
# Load Spark engine
# !pip3 install -q findspark
# import findspark
# findspark.init()
########## ONLY in Ubuntu Machine ##########

In [1]:
from pyspark import SparkContext, SparkConf

# Initializing Spark
conf = SparkConf().setAppName("KDDCup_PySpark").setMaster("local[*]")
sc = SparkContext(conf=conf)
print(sc)
print("Ready to go!")

<SparkContext master=local[*] appName=KDDCup_PySpark>
Ready to go!


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Read and Load Data to Spark
# Data source: http://kdd.ics.uci.edu/databases/kddcup99/kddcup99.html
rdd = sc.textFile('/content/drive/MyDrive/Colab Notebooks/Udemy - Best Hands-on Big Data Practices with PySpark & Spark Tuning 2022-8/kddcup.data.gz')

In [10]:
print(rdd.getNumPartitions())
print(sc.defaultParallelism)

10
2


In [11]:
# Repartition and Cache Data:
from pyspark import StorageLevel
rdd = rdd.repartition(10)
rdd.persist(StorageLevel.MEMORY_AND_DISK_2)

MapPartitionsRDD[18] at coalesce at NativeMethodAccessorImpl.java:0

## Question 1: Get ten records randomly


In [12]:
rdd.takeSample(False, 10, seed=1234)

['0,icmp,ecr_i,SF,1032,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,510,510,0.00,0.00,0.00,0.00,1.00,0.00,0.00,255,255,1.00,0.00,1.00,0.00,0.00,0.00,0.00,0.00,smurf.',
 '0,icmp,ecr_i,SF,1032,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,511,511,0.00,0.00,0.00,0.00,1.00,0.00,0.00,255,255,1.00,0.00,1.00,0.00,0.00,0.00,0.00,0.00,smurf.',
 '0,icmp,ecr_i,SF,520,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,432,432,0.00,0.00,0.00,0.00,1.00,0.00,0.00,255,255,1.00,0.00,1.00,0.00,0.00,0.00,0.00,0.00,smurf.',
 '0,tcp,http,SF,234,2899,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,24,28,0.00,0.00,0.00,0.00,1.00,0.00,0.07,248,255,1.00,0.00,0.00,0.01,0.00,0.00,0.00,0.00,normal.',
 '0,icmp,ecr_i,SF,520,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,511,511,0.00,0.00,0.00,0.00,1.00,0.00,0.00,255,255,1.00,0.00,1.00,0.00,0.00,0.00,0.00,0.00,smurf.',
 '0,icmp,urp_i,SF,183,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,2,0.00,0.00,0.00,0.00,1.00,0.00,1.00,255,16,0.06,0.01,0.08,0.00,0.00,0.00,0.00,0.00,normal.',
 '0,tcp,http,REJ,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0.00,0.00

## Question 2: Count elements

In [13]:
rdd.count()

4898431

## Question 3: Calculate the ratio of `normal` connections


In [16]:
rdd.map(lambda x: x.split(",")[-1]).filter(lambda x: "normal." in x).count() / rdd.count() * 100

19.859032412623552

## Question 4: Get the list of labels


In [21]:
rdd_labels = rdd.map(lambda x: x.split(",")[-1]).distinct().collect()
rdd_labels

['nmap.',
 'multihop.',
 'neptune.',
 'guess_passwd.',
 'rootkit.',
 'spy.',
 'land.',
 'ftp_write.',
 'warezmaster.',
 'perl.',
 'normal.',
 'pod.',
 'warezclient.',
 'buffer_overflow.',
 'smurf.',
 'ipsweep.',
 'satan.',
 'phf.',
 'teardrop.',
 'back.',
 'loadmodule.',
 'portsweep.',
 'imap.']

## Question 5: Count the number of connections for each label

In [28]:
%%time
rdd_pairs = rdd.map(lambda x: (x.split(",")[-1], 1)).reduceByKey(lambda x, y: x + y)

keys = rdd_pairs.keys().collect()
values = rdd_pairs.values().collect()

import pandas as pd

pd.DataFrame({"label": keys, "count": values}).sort_values(by="count", ascending=False)

CPU times: user 100 ms, sys: 16.4 ms, total: 117 ms
Wall time: 20 s


Unnamed: 0,label,count
14,smurf.,2807886
2,neptune.,1072017
10,normal.,972781
16,satan.,15892
15,ipsweep.,12481
21,portsweep.,10413
0,nmap.,2316
19,back.,2203
12,warezclient.,1020
18,teardrop.,979


## Question 6: Get the connection type with successful `root_shell` connections to servers, where the number of data bytes from source (`src_bytes`) is 500 times more than from server (`dst_bytes`)

In [40]:
split_rdd = rdd.filter(lambda line: line.split(',')[13] == '1')\
  .map(lambda x: (x.split(',')[1], x.split(',')[4], x.split(',')[5]))\
  .filter(lambda x: int(x[2]) > 500 * int(x[1]))
split_rdd.take(3)

[('tcp', '433', '1524348'), ('tcp', '351', '759161'), ('tcp', '296', '507534')]

## Question 7:  Get the list of `Protocols`that are `normal` and `vulnerable to attacks`, where there is NOT `guest login` to the destination addresses


In [48]:
normal_rdd = rdd.filter(lambda line: 'normal' in line.split(',')[-1] and line.split(',')[21] != '1')\
    .map(lambda line: (line.split(',')[1], 1))\
    .reduceByKey(lambda x, y: x + y)

attacks_rdd = rdd.filter(lambda line: 'normal' not in line.split(',')[-1] and line.split(',')[21] != '1')\
    .map(lambda line: (line.split(',')[1], 1))\
    .reduceByKey(lambda x, y: x + y)

pd.concat([
    pd.DataFrame({
        'label': normal_rdd.keys().collect(),
        'state': 'normal',
        'count': normal_rdd.values().collect()
    })
    ,
    pd.DataFrame({
        'label': attacks_rdd.keys().collect(),
        'state': 'attacks',
        'count': attacks_rdd.values().collect()
    })
]).sort_values(by='label', ascending=False)

Unnamed: 0,label,state,count
0,udp,normal,191348
0,udp,attacks,2940
1,tcp,normal,764894
1,tcp,attacks,1101613
2,icmp,normal,12763
2,icmp,attacks,2820782


## Question 8: Get a summary statistics for the sum of `tcp` connections to the same destination IP address (hint: `protocol_type` and `dst_host_count` features)

In [72]:
# Source: https://spark.apache.org/docs/latest/mllib-statistics.html
import numpy as np
from pyspark.mllib.stat import Statistics

stats_rdd = rdd.filter(lambda x: x.split(',')[1]=='tcp')\
    .map(lambda line: np.array(int(line.split(',')[31])))

summary = Statistics.colStats(stats_rdd)
print('tcp mean', summary.mean())
print('tcp std', np.sqrt(summary.variance()))
print('tcp min', summary.min())
print('tcp max', summary.max())

# para este tipo de operaciones estadisticas es mejor trabajar sobre dataframes

tcp mean [201.7520146]
tcp std [90.72575504]
tcp min [0.]
tcp max [255.]


## [challenge] Question 9: Filter the number of `icmp`-based attacks for each `service`

In [87]:
icmp_rdd = rdd.filter(lambda line: 'icmp' in line.split(',')[1])\
    .map(lambda line: (line.split(',')[2], 1))\
    .reduceByKey(lambda x, y: x + y)\

pd.DataFrame({
    'service': icmp_rdd.keys().collect(),
    'count': icmp_rdd.values().collect()
}).sort_values(by='count', ascending=False)

Unnamed: 0,service,count
1,ecr_i,2811660
2,eco_i,16338
0,urp_i,5378
5,urh_i,148
4,tim_i,12
3,red_i,9
