# Chapter 5: Anomaly Detection in Network Traffic with K-means Clustering
http://kdd.ics.uci.edu/databases/kddcup99/kddcup99.html

In [1]:
from pprint import pprint

In [2]:
from pyspark.sql import SparkSession
# Obtain the Spark session for this notebook (application name "Ch05").
spark = (
    SparkSession.builder
    .appName("Ch05")
    .getOrCreate()
)
# SparkContext belonging to that session.
sc = spark.sparkContext

### A First Take on Clustering

In [4]:
# Read the 10% sample of the data set (about 500k rows).
# The file has no header row; column types are inferred from the data.
dataWithoutHeader = (
    spark.read
    .option('inferSchema', 'true')
    .option('header', 'false')
    .csv('./data/kddcup.data_10_percent_corrected')
)

In [5]:
# Name the columns of the headerless file: 41 feature columns followed by "label".
data = dataWithoutHeader.toDF(*[
    "duration", "protocol_type", "service",
    "flag", "src_bytes", "dst_bytes",
    "land", "wrong_fragment", "urgent",
    "hot", "num_failed_logins", "logged_in",
    "num_compromised", "root_shell", "su_attempted",
    "num_root", "num_file_creations", "num_shells",
    "num_access_files", "num_outbound_cmds", "is_host_login",
    "is_guest_login", "count", "srv_count",
    "serror_rate", "srv_serror_rate", "rerror_rate",
    "srv_rerror_rate", "same_srv_rate", "diff_srv_rate",
    "srv_diff_host_rate", "dst_host_count", "dst_host_srv_count",
    "dst_host_same_srv_rate", "dst_host_diff_srv_rate", "dst_host_same_src_port_rate",
    "dst_host_srv_diff_host_rate", "dst_host_serror_rate", "dst_host_srv_serror_rate",
    "dst_host_rerror_rate", "dst_host_srv_rerror_rate",
    "label",
])

In [11]:
# Number of rows in the named DataFrame.
data.count()

494021

In [8]:
# Rows per label, most frequent label first (up to 25 labels shown).
(
    data.select("label")
    .groupBy("label")
    .count()
    .orderBy("count", ascending=False)
    .show(25)
)

+----------------+------+
|           label| count|
+----------------+------+
|          smurf.|280790|
|        neptune.|107201|
|         normal.| 97278|
|           back.|  2203|
|          satan.|  1589|
|        ipsweep.|  1247|
|      portsweep.|  1040|
|    warezclient.|  1020|
|       teardrop.|   979|
|            pod.|   264|
|           nmap.|   231|
|   guess_passwd.|    53|
|buffer_overflow.|    30|
|           land.|    21|
|    warezmaster.|    20|
|           imap.|    12|
|        rootkit.|    10|
|     loadmodule.|     9|
|      ftp_write.|     8|
|       multihop.|     7|
|            phf.|     4|
|           perl.|     3|
|            spy.|     2|
+----------------+------+



### KMeans

In [12]:
from pyspark.ml import Pipeline
from pyspark.ml.clustering import KMeans, KMeansModel
from pyspark.ml.feature import VectorAssembler

In [45]:
# Alternative kept for reference (additionally drops rows that contain nulls):
# numericOnly = data.drop("protocol_type", "service", "flag").dropna().cache()
# Categorical columns have to be one-hot encoded before they can be used -- that is
# done later in the notebook, so the three categorical columns are dropped for now.
numericOnly = data.drop("protocol_type", "service", "flag").cache()

In [46]:
# Every column except the label is used as a clustering feature.
inputCols = [name for name in numericOnly.columns if name != 'label']

In [47]:
# Packs the individual feature columns into one vector column named "featureVector".
assembler = VectorAssembler(inputCols=inputCols, outputCol="featureVector")

In [48]:
# K-means estimator: reads "featureVector", writes the assigned cluster to "cluster".
# All other parameters (including k) are left at their defaults.
kmeans = KMeans(predictionCol="cluster", featuresCol="featureVector")

In [49]:
# Pipeline stages are a transformer (assembler) followed by an estimator (kmeans).
pipeline = Pipeline(stages=[assembler, kmeans])
pipelineModel = pipeline.fit(numericOnly)

# The fitted K-means model is the last stage of the fitted pipeline.
kmeansModel = pipelineModel.stages[-1]

# Cluster centres; k was left at its default (k=2).
kmeansModel.clusterCenters()

[array([4.79793956e+01, 1.62207883e+03, 8.68534183e+02, 4.45326100e-05,
        6.43293794e-03, 1.41694668e-05, 3.45168212e-02, 1.51815716e-04,
        1.48247035e-01, 1.02121372e-02, 1.11331525e-04, 3.64357718e-05,
        1.13517671e-02, 1.08295211e-03, 1.09307315e-04, 1.00805635e-03,
        0.00000000e+00, 0.00000000e+00, 1.38658354e-03, 3.32286248e+02,
        2.92907143e+02, 1.76685418e-01, 1.76607809e-01, 5.74330999e-02,
        5.77183920e-02, 7.91548844e-01, 2.09816404e-02, 2.89968625e-02,
        2.32470732e+02, 1.88666046e+02, 7.53781203e-01, 3.09056111e-02,
        6.01935529e-01, 6.68351484e-03, 1.76753957e-01, 1.76441622e-01,
        5.81176268e-02, 5.74111170e-02]),
 array([2.0000000e+00, 6.9337564e+08, 0.0000000e+00, 0.0000000e+00,
        0.0000000e+00, 0.0000000e+00, 1.0000000e+00, 0.0000000e+00,
        0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00,
        0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00,
        0.0000000e+00, 0.0000000e+00, 

In [55]:
# Run the fitted pipeline over the data; the result gains the "featureVector" and
# "cluster" columns configured on the assembler and K-means stages.
withCluster = pipelineModel.transform(numericOnly)

[Row(duration=0, src_bytes=181, dst_bytes=5450, land=0, wrong_fragment=0, urgent=0, hot=0, num_failed_logins=0, logged_in=1, num_compromised=0, root_shell=0, su_attempted=0, num_root=0, num_file_creations=0, num_shells=0, num_access_files=0, num_outbound_cmds=0, is_host_login=0, is_guest_login=0, count=8, srv_count=8, serror_rate=0.0, srv_serror_rate=0.0, rerror_rate=0.0, srv_rerror_rate=0.0, same_srv_rate=1.0, diff_srv_rate=0.0, srv_diff_host_rate=0.0, dst_host_count=9, dst_host_srv_count=9, dst_host_same_srv_rate=1.0, dst_host_diff_srv_rate=0.0, dst_host_same_src_port_rate=0.11, dst_host_srv_diff_host_rate=0.0, dst_host_serror_rate=0.0, dst_host_srv_serror_rate=0.0, dst_host_rerror_rate=0.0, dst_host_srv_rerror_rate=0.0, label='normal.', featureVector=SparseVector(38, {1: 181.0, 2: 5450.0, 8: 1.0, 19: 8.0, 20: 8.0, 25: 1.0, 28: 9.0, 29: 9.0, 30: 1.0, 32: 0.11}), cluster=0)]

In [53]:
# Label counts per cluster: clusters ascending, largest label group first within each.
(
    withCluster
    .select("cluster", "label")
    .groupBy("cluster", "label")
    .count()
    .orderBy(["cluster", "count"], ascending=[1, 0])
    .show(25)
)

+-------+----------------+------+
|cluster|           label| count|
+-------+----------------+------+
|      0|          smurf.|280790|
|      0|        neptune.|107201|
|      0|         normal.| 97278|
|      0|           back.|  2203|
|      0|          satan.|  1589|
|      0|        ipsweep.|  1247|
|      0|      portsweep.|  1039|
|      0|    warezclient.|  1020|
|      0|       teardrop.|   979|
|      0|            pod.|   264|
|      0|           nmap.|   231|
|      0|   guess_passwd.|    53|
|      0|buffer_overflow.|    30|
|      0|           land.|    21|
|      0|    warezmaster.|    20|
|      0|           imap.|    12|
|      0|        rootkit.|    10|
|      0|     loadmodule.|     9|
|      0|      ftp_write.|     8|
|      0|       multihop.|     7|
|      0|            phf.|     4|
|      0|           perl.|     3|
|      0|            spy.|     2|
|      1|      portsweep.|     1|
+-------+----------------+------+



### Choosing k

In [20]:
import random

def clusteringScore0(data, k):
    """Fit K-means with ``k`` clusters on ``data`` and return the mean cost per row.

    Args:
        data: Spark DataFrame with a ``label`` column; every other column is
            assembled into the feature vector.
        k: Number of clusters (int).

    Returns:
        float: the model's clustering cost divided by ``data.count()``.
    """
    inputCols = data.columns.copy()
    inputCols.remove('label')

    assembler = VectorAssembler(inputCols=inputCols, outputCol="featureVector")

    kmeans = KMeans(
        seed=42,
        k=k,
        predictionCol="cluster",
        featuresCol="featureVector",
    )

    kmeansModel = Pipeline(stages=[assembler, kmeans]).fit(data).stages[-1]

    if hasattr(kmeansModel, "computeCost"):
        cost = kmeansModel.computeCost(assembler.transform(data))
    else:
        # computeCost was deprecated in Spark 2.4 and is not available on Spark 3.x
        # models; there the training summary reports the cost on the fitted data.
        cost = kmeansModel.summary.trainingCost
    return cost / data.count()

In [21]:
# Mean cost for k = 20, 40, ..., 100.
# Stored as a list: a lazy map() would be exhausted after being displayed once,
# leaving nothing behind in `scores0` for later cells.
# The resulting values are very large (the features need to be normalized).
scores0 = [(k, clusteringScore0(numericOnly, k)) for k in range(20, 101, 20)]
scores0

[(20, 69889100.71522829),
 (40, 69889095.22569679),
 (60, 32232875.75527558),
 (80, 31553254.56382279),
 (100, 26254419.772138733)]

In [22]:
def clusteringScore1(data, k):
    """Like ``clusteringScore0`` but with more iterations, a tighter tolerance and a random seed.

    A fresh seed is drawn with ``random.randint(0, 1000)`` on every call, so
    repeated calls with the same arguments can return different scores.

    Args:
        data: Spark DataFrame with a ``label`` column; every other column is
            assembled into the feature vector.
        k: Number of clusters (int).

    Returns:
        float: the model's clustering cost divided by ``data.count()``.
    """
    inputCols = data.columns.copy()
    inputCols.remove('label')

    assembler = VectorAssembler(inputCols=inputCols, outputCol="featureVector")

    kmeans = KMeans(
        seed=random.randint(0,1000),
        k=k,
        maxIter=40,
        tol=1.0e-5,
        predictionCol="cluster",
        featuresCol="featureVector",
    )

    kmeansModel = Pipeline(stages=[assembler, kmeans]).fit(data).stages[-1]

    if hasattr(kmeansModel, "computeCost"):
        cost = kmeansModel.computeCost(assembler.transform(data))
    else:
        # computeCost was deprecated in Spark 2.4 and is not available on Spark 3.x
        # models; there the training summary reports the cost on the fitted data.
        cost = kmeansModel.summary.trainingCost
    return cost / data.count()

In [23]:
# Mean cost for k = 20, 40, ..., 100 using clusteringScore1.
# Stored as a list: a lazy map() would be exhausted after being displayed once,
# leaving nothing behind in `scores1` for later cells.
# The resulting values are very large (the features need to be normalized).
scores1 = [(k, clusteringScore1(numericOnly, k)) for k in range(20, 101, 20)]
scores1

[(20, 25353139.00444269),
 (40, 48613146.20434586),
 (60, 8114452.259562982),
 (80, 34134663.67454684),
 (100, 1911346.3927472366)]

### Feature Normalization

In [24]:
from pyspark.ml.feature import StandardScaler

In [25]:
def clusteringScore2(data, k):
    """Mean K-means cost per row after scaling every feature to unit standard deviation.

    Args:
        data: Spark DataFrame with a ``label`` column; every other column is
            assembled into the feature vector.
        k: Number of clusters (int).

    Returns:
        float: the model's clustering cost on the scaled features divided by
        ``data.count()``.
    """
    inputCols = data.columns.copy()
    inputCols.remove("label")

    assembler = VectorAssembler(inputCols=inputCols, outputCol="featureVector")

    # Normalizer: divides by the standard deviation, does not centre on the mean.
    scaler = StandardScaler(
        inputCol="featureVector",
        outputCol="scaledFeatureVector",
        withStd=True,
        withMean=False,
    )

    kmeans = KMeans(
        seed=42,
        k=k,
        maxIter=40,
        tol=1.0e-5,
        predictionCol="cluster",
        featuresCol="scaledFeatureVector",
    )

    pipelineModel = Pipeline(stages=[assembler, scaler, kmeans]).fit(data)
    kmeansModel = pipelineModel.stages[-1]

    if hasattr(kmeansModel, "computeCost"):
        cost = kmeansModel.computeCost(pipelineModel.transform(data))
    else:
        # computeCost was deprecated in Spark 2.4 and is not available on Spark 3.x
        # models; there the training summary reports the cost on the fitted data.
        cost = kmeansModel.summary.trainingCost
    return cost / data.count()

In [26]:
# Mean cost on scaled features for k = 60, 90, ..., 270.
# Stored as a list: a lazy map() would be exhausted after being displayed once,
# leaving nothing behind in `scores2` for later cells.
scores2 = [(k, clusteringScore2(numericOnly, k)) for k in range(60, 271, 30)]
scores2

[(60, 1.045256310031205),
 (90, 0.6672592567377787),
 (120, 0.48436324930446734),
 (150, 0.3656394040579684),
 (180, 0.30840490659866215),
 (210, 0.2607871194536823),
 (240, 0.2313484694456669),
 (270, 0.20208789727372622)]

### Categorical Variables

In [30]:
from pyspark.ml.feature import OneHotEncoder, StringIndexer

def oneHotPipeline(inputCol):
    """Build a two-stage pipeline that indexes and then one-hot encodes ``inputCol``.

    Args:
        inputCol: Name of the column to encode (str).

    Returns:
        tuple: ``(pipeline, outputColumnName)`` where the pipeline is unfitted and
        the output column name is ``inputCol + "_vec"``.
    """
    indexedCol = inputCol + "_indexed"
    vectorCol = inputCol + "_vec"
    stages = [
        StringIndexer(inputCol=inputCol, outputCol=indexedCol),
        OneHotEncoder(inputCol=indexedCol, outputCol=vectorCol),
    ]
    return (Pipeline(stages=stages), vectorCol)

In [31]:
def clusteringScore3(data, k):
    """Mean K-means cost per row using scaled numeric and one-hot encoded categorical features.

    Args:
        data: Spark DataFrame containing ``protocol_type``, ``service``, ``flag``
            and ``label`` columns; all remaining columns are used as features
            alongside the encoded categorical columns.
        k: Number of clusters (int).

    Returns:
        float: the model's clustering cost divided by ``data.count()``.
    """
    categoricalCols = ["protocol_type", "service", "flag"]
    (protoTypeEncoder, protoTypeVecCol) = oneHotPipeline("protocol_type")
    (serviceEncoder, serviceVecCol) = oneHotPipeline("service")
    (flagEncoder, flagVecCol) = oneHotPipeline("flag")

    # Replace the raw categorical columns by the encoded columns reported by
    # oneHotPipeline, so the names cannot drift from what the encoders produce.
    inputCols = data.columns.copy()
    for c in categoricalCols + ["label"]:
        inputCols.remove(c)
    inputCols.extend([protoTypeVecCol, serviceVecCol, flagVecCol])

    assembler = VectorAssembler(inputCols=inputCols, outputCol="featureVector")

    scaler = StandardScaler(
        inputCol="featureVector",
        outputCol="scaledFeatureVector",
        withStd=True,
        withMean=False,
    )

    kmeans = KMeans(
        seed=42,
        k=k,
        maxIter=40,
        tol=1.0e-5,
        predictionCol="cluster",
        featuresCol="scaledFeatureVector",
    )

    pipeline = Pipeline(stages=[
        protoTypeEncoder, serviceEncoder, flagEncoder, assembler, scaler, kmeans,
    ])
    pipelineModel = pipeline.fit(data)
    kmeansModel = pipelineModel.stages[-1]

    if hasattr(kmeansModel, "computeCost"):
        cost = kmeansModel.computeCost(pipelineModel.transform(data))
    else:
        # computeCost was deprecated in Spark 2.4 and is not available on Spark 3.x
        # models; there the training summary reports the cost on the fitted data.
        cost = kmeansModel.summary.trainingCost
    return cost / data.count()

In [58]:
# Lazily pair each k in 60, 90, ..., 270 with its clusteringScore3 value ...
scores3 = ((k, clusteringScore3(data, k)) for k in range(60, 271, 30))
# ... and materialise the pairs so they can be reused afterwards.
sc3 = list(scores3)

In [None]:
from matplotlib import pyplot as plt  # the package name is all lower-case

# sc3 holds (k, score) pairs; unzip them so that k is the x axis and the score the
# y axis (plotting the pairs directly would draw both columns against the row index).
ks, costs = zip(*sc3)

fig, ax = plt.subplots()
ax.plot(ks, costs, marker="o")
ax.set_xlabel("k (number of clusters)")
ax.set_ylabel("mean cost per row")
ax.set_title("clusteringScore3 by k")
plt.show()

### Using Labels with Entropy

In [33]:
import math
# Entropy helpers.
# Note: in Python 3, map() returns a lazy iterator, so it is consumed via sum().
def calc_each_entropy(v, n):
    """Return the entropy term ``-p * ln(p)`` for one class, where ``p = v / n``."""
    proportion = v / n
    return -proportion * math.log(proportion)


def entropy(counts):
    """Return the entropy (natural log) of the distribution given by ``counts``.

    Args:
        counts: Iterable of class counts; entries that are not positive are ignored.

    Returns:
        The sum of ``-p * ln(p)`` over the positive counts, with ``p`` being each
        count's share of their total. An input without positive counts gives ``0``.
    """
    positive = [count for count in counts if count > 0]
    total = sum(map(float, positive))
    return sum(calc_each_entropy(count, total) for count in positive)

In [34]:
def fitPipeline4(data, k):
    """Fit the full pipeline: one-hot encode, assemble, scale, then K-means with ``k`` clusters.

    Args:
        data: Spark DataFrame containing ``protocol_type``, ``service``, ``flag``
            and ``label`` columns; all remaining columns are used as features
            alongside the encoded categorical columns.
        k: Number of clusters (int).

    Returns:
        The fitted pipeline model; its last stage is the fitted K-means model and
        its output has ``scaledFeatureVector`` and ``cluster`` columns.
    """
    categoricalCols = ["protocol_type", "service", "flag"]
    (protoTypeEncoder, protoTypeVecCol) = oneHotPipeline("protocol_type")
    (serviceEncoder, serviceVecCol) = oneHotPipeline("service")
    (flagEncoder, flagVecCol) = oneHotPipeline("flag")

    # Replace the raw categorical columns by the encoded columns reported by
    # oneHotPipeline, so the names cannot drift from what the encoders produce.
    inputCols = data.columns.copy()
    for c in categoricalCols + ["label"]:
        inputCols.remove(c)
    inputCols.extend([protoTypeVecCol, serviceVecCol, flagVecCol])

    assembler = VectorAssembler(inputCols=inputCols, outputCol="featureVector")

    scaler = StandardScaler(
        inputCol="featureVector",
        outputCol="scaledFeatureVector",
        withStd=True,
        withMean=False,
    )

    kmeans = KMeans(
        seed=42,
        k=k,
        maxIter=40,
        tol=1.0e-5,
        predictionCol="cluster",
        featuresCol="scaledFeatureVector",
    )

    pipeline = Pipeline(stages=[
        protoTypeEncoder, serviceEncoder, flagEncoder, assembler, scaler, kmeans,
    ])
    return pipeline.fit(data)

In [35]:
from collections import Counter

def clusteringScore4(data, k):
    """Return the size-weighted average label entropy of the clusters for ``k`` clusters.

    Lower values mean the clusters are purer with respect to the labels.

    Args:
        data: Spark DataFrame accepted by ``fitPipeline4`` (must include ``label``).
        k: Number of clusters (int).

    Returns:
        float: sum over clusters of ``cluster_size * entropy(label counts)``,
        divided by the total number of rows.
    """
    pipelineModel = fitPipeline4(data, k)

    # Let Spark aggregate the (cluster, label) counts; only one small row per pair is
    # collected, instead of shuffling every label through an RDD groupByKey.
    pairCounts = (
        pipelineModel.transform(data)
        .groupBy("cluster", "label")
        .count()
        .collect()
    )

    countsByCluster = {}
    for row in pairCounts:
        # row["count"] rather than row.count: the attribute is the tuple method.
        countsByCluster.setdefault(row["cluster"], []).append(row["count"])

    totalRows = sum(sum(labelCounts) for labelCounts in countsByCluster.values())

    # Average entropy weighted by cluster size
    weightedClusterEntropy = sum(
        sum(labelCounts) * entropy(labelCounts)
        for labelCounts in countsByCluster.values()
    )
    return weightedClusterEntropy / totalRows

In [36]:
# Weighted label entropy for k = 60, 90, ..., 270.
# Stored as a list: a lazy map() would be exhausted after being displayed once,
# leaving nothing behind in `scores4` for later cells.
scores4 = [(k, clusteringScore4(data, k)) for k in range(60, 271, 30)]
scores4

[(60, 0.08610395783128089),
 (90, 0.04635306786629947),
 (120, 0.041960857374731125),
 (150, 0.04060728429783955),
 (180, 0.02367598139007656),
 (210, 0.020198154875377766),
 (240, 0.0128627536767686),
 (270, 0.01394629790110366)]

### Clustering in Action

In [37]:
# Fit the full pipeline with k = 180 and tabulate the labels found in each cluster.
pipelineModel = fitPipeline4(data, 180)
countByClusterLabel = (
    pipelineModel.transform(data)
    .select("cluster", "label")
    .groupBy("cluster", "label")
    .count()
    .orderBy(["cluster", "label"])
)
countByClusterLabel.show(200)

+-------+----------------+------+
|cluster|           label| count|
+-------+----------------+------+
|      0|           back.|     5|
|      0|         normal.|  5783|
|      1|         normal.|     1|
|      2|        neptune.|   101|
|      3|        rootkit.|     1|
|      4|       teardrop.|   711|
|      5|        neptune.|   106|
|      6|         normal.|     2|
|      6|    warezmaster.|    15|
|      7|        neptune.|    18|
|      8|           back.|     2|
|      8|         normal.|    50|
|      9|         normal.|     1|
|      9|      portsweep.|    49|
|     10|           imap.|     4|
|     10|           nmap.|   102|
|     11|        ipsweep.|     4|
|     11|           nmap.|     1|
|     11|         normal.|   337|
|     11|      portsweep.|     1|
|     11|          smurf.|280787|
|     12|        neptune.| 45524|
|     13|         normal.|   160|
|     14|        neptune.|   101|
|     14|      portsweep.|     1|
|     15|        neptune.|   102|
|     15|     

In [38]:
# Data with the pipeline's output columns (scaled features and assigned cluster).
clustered = pipelineModel.transform(data)

# Fitted K-means stage of the pipeline and its cluster centres.
kMeansModel = pipelineModel.stages[-1]
centroids = kMeansModel.clusterCenters()

In [57]:
import numpy as np
def sqdist(a, b):
    """Return the Euclidean distance between the 1-D arrays ``a`` and ``b`` as a float.

    Note: despite the name, the square root is applied, so this is the distance
    itself and not the squared distance.
    """
    difference = a - b
    sumOfSquares = np.sum(difference ** 2, axis=0)
    return float(np.sqrt(sumOfSquares))
    
# Distance from every point to the centre of its assigned cluster, used to find the
# 100 most outlying points. RDD.top() returns the 100 largest values in descending
# order without sorting the whole data set first.
thresholds = (
    clustered.select("cluster", "scaledFeatureVector").rdd
    .map(lambda row: sqdist(centroids[row[0]], np.array(row[1])))
    .top(100)
)
# The smallest of those 100 distances is the anomaly cut-off.
threshold = thresholds[-1]
threshold

25.882188620759905

In [40]:
# Checking every row takes too long, so only a 1% sample is used.
# The seed is fixed so that re-running the notebook draws a reproducible sample.
samples = clustered.sample(fraction=0.01, seed=42)
samples.count()

4940

In [41]:
# Keep the sampled rows whose distance to their cluster centre reaches the threshold ...
anomalies = (
    samples.select("cluster", "scaledFeatureVector", "label").rdd
    .filter(lambda row: sqdist(centroids[row[0]], np.array(row[1])) >= threshold)
    .toDF()
)
# ... and count them per label, most frequent first.
(
    anomalies.select("cluster", "label")
    .groupBy('label')
    .count()
    .orderBy("count", ascending=False)
    .show()
)

+----------------+-----+
|           label|count|
+----------------+-----+
|         normal.|   52|
|        neptune.|   16|
|    warezclient.|    7|
|      portsweep.|    5|
|          satan.|    5|
|        ipsweep.|    2|
|           back.|    2|
|   guess_passwd.|    1|
|           nmap.|    1|
|      ftp_write.|    1|
|buffer_overflow.|    1|
|           land.|    1|
+----------------+-----+

