# Chapter 5: Anomaly Detection in Network Traffic with K-means Clustering
http://kdd.ics.uci.edu/databases/kddcup99/kddcup99.html

In [1]:
from pprint import pprint

In [2]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("Ch05").getOrCreate()
sc = spark.sparkContext

### A First Take on Clustering

In [3]:
dataWithoutHeader = spark.read.option('inferSchema', 'true') \
                            .option('header', 'false') \
                            .csv('kddcup.data_10_percent_corrected')
# 10% sampling dataser --> 500k rows

In [4]:
data = dataWithoutHeader.toDF(
"duration", "protocol_type", "service", "flag",
"src_bytes", "dst_bytes", "land", "wrong_fragment", "urgent",
"hot", "num_failed_logins", "logged_in", "num_compromised",
"root_shell", "su_attempted", "num_root", "num_file_creations",
"num_shells", "num_access_files", "num_outbound_cmds",
"is_host_login", "is_guest_login", "count", "srv_count",
"serror_rate", "srv_serror_rate", "rerror_rate", "srv_rerror_rate",
"same_srv_rate", "diff_srv_rate", "srv_diff_host_rate",
"dst_host_count", "dst_host_srv_count",
"dst_host_same_srv_rate", "dst_host_diff_srv_rate",
"dst_host_same_src_port_rate", "dst_host_srv_diff_host_rate",
"dst_host_serror_rate", "dst_host_srv_serror_rate",
"dst_host_rerror_rate", "dst_host_srv_rerror_rate",
"label")

In [5]:
data.select("label").groupBy("label").count().orderBy("count", ascending=False).show(25)

+----------------+------+
|           label| count|
+----------------+------+
|          smurf.|280790|
|        neptune.|107201|
|         normal.| 97278|
|           back.|  2203|
|          satan.|  1589|
|        ipsweep.|  1247|
|      portsweep.|  1040|
|    warezclient.|  1020|
|       teardrop.|   979|
|            pod.|   264|
|           nmap.|   231|
|   guess_passwd.|    53|
|buffer_overflow.|    30|
|           land.|    21|
|    warezmaster.|    20|
|           imap.|    12|
|        rootkit.|    10|
|     loadmodule.|     9|
|      ftp_write.|     8|
|       multihop.|     7|
|            phf.|     4|
|           perl.|     3|
|            spy.|     2|
+----------------+------+



In [6]:
from pyspark.ml import Pipeline
from pyspark.ml.clustering import KMeans, KMeansModel
from pyspark.ml.feature import VectorAssembler

In [7]:
# numericOnly = data.drop("protocol_type", "service", "flag").dropna().cache()
numericOnly = data.drop("protocol_type", "service", "flag").cache()

In [8]:
inputCols = numericOnly.columns
inputCols.remove('label')

In [9]:
assembler = VectorAssembler() \
    .setInputCols(inputCols)\
    .setOutputCol("featureVector")

In [10]:
kmeans = KMeans() \
    .setPredictionCol("cluster") \
    .setFeaturesCol("featureVector")

In [11]:
pipeline = Pipeline().setStages([assembler, kmeans]) # stages -> transformer, estimator
# pipeline = Pipeline(stages=[assembler, kmeans])
pipelineModel = pipeline.fit(numericOnly)
kmeansModel = pipelineModel.stages[-1]

kmeansModel.clusterCenters() # k=2

[array([4.79793956e+01, 1.62207883e+03, 8.68534183e+02, 4.45326100e-05,
        6.43293794e-03, 1.41694668e-05, 3.45168212e-02, 1.51815716e-04,
        1.48247035e-01, 1.02121372e-02, 1.11331525e-04, 3.64357718e-05,
        1.13517671e-02, 1.08295211e-03, 1.09307315e-04, 1.00805635e-03,
        0.00000000e+00, 0.00000000e+00, 1.38658354e-03, 3.32286248e+02,
        2.92907143e+02, 1.76685418e-01, 1.76607809e-01, 5.74330999e-02,
        5.77183920e-02, 7.91548844e-01, 2.09816404e-02, 2.89968625e-02,
        2.32470732e+02, 1.88666046e+02, 7.53781203e-01, 3.09056111e-02,
        6.01935529e-01, 6.68351484e-03, 1.76753957e-01, 1.76441622e-01,
        5.81176268e-02, 5.74111170e-02]),
 array([2.0000000e+00, 6.9337564e+08, 0.0000000e+00, 0.0000000e+00,
        0.0000000e+00, 0.0000000e+00, 1.0000000e+00, 0.0000000e+00,
        0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00,
        0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00,
        0.0000000e+00, 0.0000000e+00, 

In [12]:
withCluster = pipelineModel.transform(numericOnly)

In [13]:
withCluster.select("cluster", "label") \
    .groupBy("cluster", "label").count() \
    .orderBy(["cluster", "count"], ascending=[1, 0]) \
    .show(25)

+-------+----------------+------+
|cluster|           label| count|
+-------+----------------+------+
|      0|          smurf.|280790|
|      0|        neptune.|107201|
|      0|         normal.| 97278|
|      0|           back.|  2203|
|      0|          satan.|  1589|
|      0|        ipsweep.|  1247|
|      0|      portsweep.|  1039|
|      0|    warezclient.|  1020|
|      0|       teardrop.|   979|
|      0|            pod.|   264|
|      0|           nmap.|   231|
|      0|   guess_passwd.|    53|
|      0|buffer_overflow.|    30|
|      0|           land.|    21|
|      0|    warezmaster.|    20|
|      0|           imap.|    12|
|      0|        rootkit.|    10|
|      0|     loadmodule.|     9|
|      0|      ftp_write.|     8|
|      0|       multihop.|     7|
|      0|            phf.|     4|
|      0|           perl.|     3|
|      0|            spy.|     2|
|      1|      portsweep.|     1|
+-------+----------------+------+



### Choosing k

In [14]:
import random

def clusteringScore0(data, k): # (data: DataFrame, k: Int): Double 
    inputCols = data.columns.copy()
    inputCols.remove('label')

    assembler = VectorAssembler() \
        .setInputCols(inputCols) \
        .setOutputCol("featureVector")
        
    kmeans = KMeans() \
        .setSeed(42) \
        .setK(k) \
        .setPredictionCol("cluster") \
        .setFeaturesCol("featureVector")
    
    pipeline = Pipeline().setStages([assembler, kmeans])
    kmeansModel = pipeline.fit(data).stages[-1]
    return kmeansModel.computeCost(assembler.transform(data)) / data.count()

In [15]:
scores0 = map(lambda x: (x, clusteringScore0(numericOnly, x)) ,range(20, 101, 20))
list(scores0)

[(20, 69904862.91838956),
 (40, 49086774.42579959),
 (60, 34134988.99795592),
 (80, 27900808.810133304),
 (100, 19701999.771263387)]

In [16]:
def clusteringScore1(data, k): # (data: DataFrame, k: Int): Double 
    inputCols = data.columns.copy()
    inputCols.remove('label')

    assembler = VectorAssembler() \
        .setInputCols(inputCols) \
        .setOutputCol("featureVector")
        
    kmeans = KMeans() \
        .setSeed(random.randint(0,1000)) \
        .setK(k) \
        .setMaxIter(40) \
        .setTol(1.0e-5) \
        .setPredictionCol("cluster") \
        .setFeaturesCol("featureVector")
    
    pipeline = Pipeline().setStages([assembler, kmeans])
    kmeansModel = pipeline.fit(data).stages[-1]
    return kmeansModel.computeCost(assembler.transform(data)) / data.count()

In [17]:
scores1 = map(lambda x: (x, clusteringScore1(numericOnly, x)) ,range(20, 101, 20))
list(scores1)

[(20, 54857135.05911902),
 (40, 69889095.30833218),
 (60, 27004024.474778596),
 (80, 14753113.896101408),
 (100, 6495327.519716123)]

### Feature Normalization

In [18]:
from pyspark.ml.feature import StandardScaler

In [19]:
def clusteringScore2(data, k): #def clusteringScore2(data: DataFrame, k: Int): Double = {
    inputCols = data.columns.copy()
    inputCols.remove("label")

    assembler = VectorAssembler() \
        .setInputCols(inputCols) \
        .setOutputCol("featureVector")
    
    scaler = StandardScaler() \
        .setInputCol("featureVector") \
        .setOutputCol("scaledFeatureVector") \
        .setWithStd(True) \
        .setWithMean(False)
        
    kmeans = KMeans() \
        .setSeed(42) \
        .setK(k) \
        .setMaxIter(40) \
        .setTol(1.0e-5) \
        .setPredictionCol("cluster") \
        .setFeaturesCol("scaledFeatureVector")
    
    pipeline = Pipeline().setStages([assembler, scaler, kmeans])
    pipelineModel = pipeline.fit(data)
    kmeansModel = pipelineModel.stages[-1]
    return kmeansModel.computeCost(pipelineModel.transform(data)) / data.count()

In [20]:
scores2 = map(lambda x: (x, clusteringScore2(numericOnly, x)) ,range(60, 271, 30))
list(scores2)

[(60, 1.1441261032075585),
 (90, 0.6880893675555909),
 (120, 0.46846759420005024),
 (150, 0.3715874199476205),
 (180, 0.31340704586915813),
 (210, 0.26199805630434914),
 (240, 0.22810044559997783),
 (270, 0.20502721183457584)]

### Categorical Variables

In [21]:
from pyspark.ml.feature import OneHotEncoder, StringIndexer

def oneHotPipeline(inputCol): # (inputCol: String): (Pipeline, String)
    indexer = StringIndexer(inputCol=inputCol, outputCol=inputCol+"_indexed")   
    encoder = OneHotEncoder(inputCol=inputCol+"_indexed", outputCol=inputCol+"_vec")

    pipeline = Pipeline().setStages([indexer, encoder])
    return (pipeline, inputCol + "_vec")

In [22]:
def clusteringScore3(data, k): # data: DataFrame, k: Int): Double = {
    (protoTypeEncoder, protoTypeVecCol) = oneHotPipeline("protocol_type")
    (serviceEncoder, serviceVecCol) = oneHotPipeline("service")
    (flagEncoder, flagVecCol) = oneHotPipeline("flag")
       
    inputCols = data.columns.copy()
    for c in ["protocol_type", "service", "flag", "label"]:
        inputCols.remove(c)
    inputCols.extend(["protocol_type_vec", "service_vec", "flag_vec"])

    assembler = VectorAssembler() \
        .setInputCols(inputCols) \
        .setOutputCol("featureVector")
    
    scaler = StandardScaler() \
        .setInputCol("featureVector") \
        .setOutputCol("scaledFeatureVector") \
        .setWithStd(True) \
        .setWithMean(False)
        
    kmeans = KMeans() \
        .setSeed(42) \
        .setK(k) \
        .setMaxIter(40) \
        .setTol(1.0e-5) \
        .setPredictionCol("cluster") \
        .setFeaturesCol("scaledFeatureVector")
    
    pipeline = Pipeline().setStages([protoTypeEncoder, serviceEncoder, flagEncoder, assembler, scaler, kmeans])
    pipelineModel = pipeline.fit(data)
    kmeansModel = pipelineModel.stages[-1]
    return kmeansModel.computeCost(pipelineModel.transform(data)) / data.count()

In [23]:
scores3 = map(lambda x: (x, clusteringScore3(data, x)), range(60, 271, 30))
list(scores3)

[(60, 34.40873843218975),
 (90, 17.395607124276115),
 (120, 2.7341559849041),
 (150, 2.073111509455972),
 (180, 1.5623811454432166),
 (210, 1.1769339814927156),
 (240, 0.9597538974191357),
 (270, 0.7760935648355177)]

### Using Labels with Entropy

In [24]:
import math
#  Calc entropy
#  파이썬 map은 제너레이터
def calc_each_entropy(v, n):
    p = v/n
    return -p*math.log(p)

def entropy(counts): # (counts: iterable[int]): Double
    values = [x for x in counts if x > 0]
    n = sum(map(float, values))
    entropys = map(lambda v: calc_each_entropy(v, n), values)
    return sum(entropys)

In [25]:
def fitPipeline4(data, k):
    (protoTypeEncoder, protoTypeVecCol) = oneHotPipeline("protocol_type")
    (serviceEncoder, serviceVecCol) = oneHotPipeline("service")
    (flagEncoder, flagVecCol) = oneHotPipeline("flag")

    inputCols = data.columns.copy()
    for c in ["protocol_type", "service", "flag", "label"]:
        inputCols.remove(c)
    inputCols.extend(["protocol_type_vec", "service_vec", "flag_vec"])

    assembler = VectorAssembler() \
        .setInputCols(inputCols) \
        .setOutputCol("featureVector")
    
    scaler = StandardScaler() \
        .setInputCol("featureVector") \
        .setOutputCol("scaledFeatureVector") \
        .setWithStd(True) \
        .setWithMean(False)
        
    kmeans = KMeans() \
        .setSeed(42) \
        .setK(k) \
        .setMaxIter(40) \
        .setTol(1.0e-5) \
        .setPredictionCol("cluster") \
        .setFeaturesCol("scaledFeatureVector")
    
    pipeline = Pipeline().setStages([protoTypeEncoder, serviceEncoder, flagEncoder, assembler, scaler, kmeans])
    return pipeline.fit(data)  

In [26]:
from collections import Counter

def clusteringScore4(data, k): # (data: DataFrame, k: Int): Double 
    pipelineModel = fitPipeline4(data, k)

    clusterLabel = pipelineModel.transform(data).select(["cluster", "label"])
    labels_grouped = clusterLabel.rdd.groupByKey()
    labels_counted = labels_grouped.map(lambda x: (x[0], len(x[1]), list(Counter(x[1]).values())))
    weightedClusterEntropy = labels_counted.map(lambda x: x[1]*entropy((x[2])))

    # Average entropy weighted by cluster size 
    return sum(weightedClusterEntropy.collect())/data.count()

In [27]:
scores4 = map(lambda x: (x, clusteringScore4(data, x)), range(60, 271, 30))
list(scores4)

[(60, 0.15744539217936912),
 (90, 0.04303023524872978),
 (120, 0.03993253904916126),
 (150, 0.022212582441225457),
 (180, 0.015327587317161325),
 (210, 0.020206916403309093),
 (240, 0.012232202534867242),
 (270, 0.009810213344877838)]

### Clustering in Action

In [28]:
pipelineModel = fitPipeline4(data, 180)
countByClusterLabel = pipelineModel.transform(data) \
    .select("cluster", "label") \
    .groupBy("cluster", "label").count() \
    .orderBy(["cluster", "label"])
countByClusterLabel.show(200)

+-------+----------------+------+
|cluster|           label| count|
+-------+----------------+------+
|      0|        neptune.| 82130|
|      0|      portsweep.|    10|
|      1|         normal.|     9|
|      1|          smurf.|280773|
|      2|        ipsweep.|     2|
|      2|        neptune.|   105|
|      3|        neptune.|   106|
|      3|      portsweep.|     1|
|      4|           imap.|     7|
|      4|        neptune.|   105|
|      5|        neptune.|    97|
|      5|         normal.|     2|
|      5|          satan.|     1|
|      6|         normal.|  2284|
|      7|        ipsweep.|     2|
|      7|        neptune.|    28|
|      7|      portsweep.|     1|
|      8|buffer_overflow.|     1|
|      8|        neptune.|    20|
|      8|         normal.|     7|
|      8|      portsweep.|     1|
|      9|           nmap.|    23|
|      9|         normal.|  7215|
|      9|          satan.|    28|
|     10|          satan.|  1220|
|     11|        neptune.|   105|
|     11|     

In [29]:
kMeansModel = pipelineModel.stages[-1]
centroids = kMeansModel.clusterCenters()
clustered = pipelineModel.transform(data)

In [30]:
import numpy as np
def sqdist(a,b):
    return float(np.sqrt(np.sum((a-b)**2, axis=0)))
    
thresholds = clustered.select("cluster", "scaledFeatureVector").rdd \
    .map(lambda x: sqdist(centroids[x[0]], np.array(x[1]))).take(100)
threshold = sorted(thresholds)[99]
threshold

3.1897948162756258

In [31]:
samples = clustered.sample(0.01) # 너무 오래걸려서  1%만 샘플링
samples.count()

5021

In [32]:
anomalies = samples.select("cluster", "scaledFeatureVector", "label").rdd \
    .filter(lambda x: sqdist(centroids[x[0]], np.array(x[1])) >= threshold).toDF()
anomalies.select("cluster", "label").groupBy('label') \
    .count().orderBy("count", ascending=False).show()

+----------------+-----+
|           label|count|
+----------------+-----+
|         normal.|   50|
|        neptune.|   17|
|           back.|    6|
|      portsweep.|    4|
|        ipsweep.|    2|
|          smurf.|    1|
|            pod.|    1|
|buffer_overflow.|    1|
|           nmap.|    1|
+----------------+-----+

