In [2]:
import sys
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
from pyspark.sql.types import *
from pyspark.sql import Row
import operator
from pyspark.mllib.clustering import KMeans

In [5]:
# Build (or reuse) a SparkSession running on the "local" master.
# The chain is wrapped in parentheses so no line-continuation backslashes are needed.
spark = (
    SparkSession.builder
    .master("local")
    .appName("Anomalies Detection")
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)

# Handle to the underlying SparkContext of the session.
sparkCt = spark.sparkContext

In [16]:
def readData(filename):
    """Read the Parquet data at ``filename`` with the module-level ``spark`` session.

    The resulting DataFrame is marked for caching before being returned.
    """
    return spark.read.parquet(filename).cache()
    
# Path handed to readData, which reads it as Parquet.
file_path = "logs-features-sample/"
rawDF = readData(file_path)
# Preview the first 7 rows.
rawDF.show(7)

+-----+--------------------+
|   id|         rawFeatures|
+-----+--------------------+
|44263|[udp, SF, -0.1585...|
|44264|[tcp, SF, -0.1585...|
|44265|[tcp, SF, -0.1585...|
|44266|[tcp, SF, -0.1585...|
|44267|[tcp, SF, -0.1585...|
|44268|[udp, SF, -0.1585...|
|44269|[tcp, SF, -0.1585...|
+-----+--------------------+
only showing top 7 rows



In [10]:
def to_onehot(lst, indices, unique_values, c):
    """Turn one raw feature list into a flat list of floats with one-hot categoricals.

    Args:
        lst: The raw feature values of a single record. Entries at the
            positions in ``indices`` are categorical; every other entry must
            be convertible with ``float()``.
        indices: Positions in ``lst`` that hold categorical values.
        unique_values: The vocabulary, a list of 1-tuples (pyspark ``Row``
            objects qualify, since ``Row`` is a tuple subclass). The position
            of a value in this list is its one-hot slot.
        c: Length of the one-hot prefix (normally ``len(unique_values)``).

    Returns:
        A list of ``c`` one-hot floats followed by the non-categorical entries
        of ``lst`` converted to ``float``, in their original order.

    Raises:
        ValueError: If a categorical value is missing from ``unique_values``
            or a non-categorical entry cannot be converted to ``float``.
    """
    categorical = set(indices)
    onehot = [0.0] * c
    for pos in indices:
        # Compare as a plain 1-tuple: it is equal to Row(value), so the lookup
        # works for collected Rows without constructing a Row here.
        try:
            slot = unique_values.index((lst[pos],))
        except ValueError:
            raise ValueError(
                "categorical value %r at position %d is not in the known vocabulary"
                % (lst[pos], pos)
            )
        onehot[slot] = 1.0
    # Append the remaining (numeric) entries, preserving their order.
    onehot.extend(float(value) for k, value in enumerate(lst) if k not in categorical)
    return onehot
    
    
def cat2Num(df, indices):
    """Add a ``features`` column holding the one-hot encoded ``rawFeatures``.

    The entries of ``rawFeatures`` at the positions in ``indices`` are treated
    as categorical and replaced by a one-hot prefix such as
    ``[0.0, 0.0, 1.0, 0.0, 1.0]``; the remaining entries are appended after it,
    converted to doubles (see ``to_onehot``).

    Args:
        df: DataFrame with a ``rawFeatures`` column of indexable values.
        indices: Positions inside ``rawFeatures`` that hold categorical values.

    Returns:
        ``df`` with an extra ``features`` column of type ``array<double>``.

    Note:
        The vocabulary is one flat list shared by all categorical positions,
        so a value that occurs at two different positions maps to the slot of
        its first occurrence.
    """
    unique_values = []
    for i in indices:
        # The lambda closes over ``i``; that is safe because collect() below
        # evaluates it eagerly within this iteration.
        d = udf(lambda r: r[i], StringType())
        dt = df.select(d(df.rawFeatures)).distinct().collect()
        # distinct() gives no ordering guarantee. Sort (nulls last) so the
        # one-hot layout, and therefore seeded clustering, is reproducible.
        unique_values.extend(sorted(dt, key=lambda row: (row[0] is None, row[0] or "")))

    unique_count = len(unique_values)
    convertUDF = udf(
        lambda r: to_onehot(r, indices, unique_values, unique_count),
        ArrayType(DoubleType()),
    )
    return df.withColumn("features", convertUDF(df.rawFeatures))


def addScore(df):
    """Add an anomaly ``score`` column based on the size of each row's cluster.

    For a row whose cluster has ``N_x`` members the score is
    ``(N_max - N_x) / (N_max - N_min)``, where ``N_max`` / ``N_min`` are the
    sizes of the largest / smallest cluster. Rows in the largest cluster score
    0.0 and rows in the smallest cluster score 1.0.

    Args:
        df: DataFrame with a ``prediction`` column (the cluster label).

    Returns:
        ``df`` with an extra ``score`` column of type double. When all
        clusters have the same size (including the single-cluster case) every
        score is 0.0; an empty ``df`` is returned with an empty ``score`` column.
    """
    # Count on the executors; only one row per cluster is brought to the driver.
    # ``row["count"]`` is used because ``row.count`` is the tuple method.
    cluster_sizes = {
        row["prediction"]: row["count"]
        for row in df.groupBy("prediction").count().collect()
    }

    if cluster_sizes:
        n_max = max(cluster_sizes.values())
        n_min = min(cluster_sizes.values())
    else:
        n_max = n_min = 0
    spread = float(n_max - n_min)

    def score(prediction):
        # Equal-sized clusters: no cluster is rarer than another, and the
        # formula would divide by zero.
        if spread == 0.0:
            return 0.0
        return (n_max - cluster_sizes[prediction]) / spread

    score_udf = udf(score, DoubleType())
    return df.withColumn("score", score_udf(df.prediction))


def detect(rawDF, k, t):
    """Run the full pipeline: encode, cluster, score, and filter anomalies.

    Args:
        rawDF: DataFrame with a ``rawFeatures`` column whose entries at
            positions 0 and 1 are categorical.
        k: Number of clusters passed to ``KMeans.train``.
        t: Score threshold; only rows with ``score > t`` are returned.

    Returns:
        DataFrame of the rows whose anomaly score exceeds ``t``, including the
        ``features``, ``prediction`` and ``score`` columns.

    Side effects:
        Prints a preview of each intermediate DataFrame and caches them. Uses
        the module-level ``sparkCt`` to broadcast the trained model.
    """
    # Encoding categorical features using one-hot.
    df1 = cat2Num(rawDF, [0, 1]).cache()
    df1.show(n=2, truncate=False)

    # Clustering points using KMeans.
    # ``runs`` is not passed: it has no effect since Spark 2.0 and the
    # parameter no longer exists in Spark 3.
    features = df1.select("features").rdd.map(lambda row: row[0]).cache()
    model = KMeans.train(features, k, maxIterations=40, initializationMode="random", seed=410)

    # Adding the prediction column to df1
    modelBC = sparkCt.broadcast(model)
    predictUDF = udf(lambda x: modelBC.value.predict(x), StringType())
    df2 = df1.withColumn("prediction", predictUDF(df1.features)).cache()
    df2.show(n=3, truncate=False)

    # Adding the score column to df2; the higher the score, the more likely it is an anomaly.
    # addScore is a module-level function, so it is called directly (there is no ``self`` here).
    df3 = addScore(df2).cache()
    df3.show(n=3, truncate=False)

    return df3.where(df3.score > t)

In [17]:
# One-hot encode the categorical entries at positions 0 and 1 of rawFeatures.
df1 = cat2Num(rawDF, [0, 1]).cache()
df1.show(7)

+-----+--------------------+--------------------+
|   id|         rawFeatures|            features|
+-----+--------------------+--------------------+
|44263|[udp, SF, -0.1585...|[0.0, 1.0, 0.0, 0...|
|44264|[tcp, SF, -0.1585...|[1.0, 0.0, 0.0, 0...|
|44265|[tcp, SF, -0.1585...|[1.0, 0.0, 0.0, 0...|
|44266|[tcp, SF, -0.1585...|[1.0, 0.0, 0.0, 0...|
|44267|[tcp, SF, -0.1585...|[1.0, 0.0, 0.0, 0...|
|44268|[udp, SF, -0.1585...|[0.0, 1.0, 0.0, 0...|
|44269|[tcp, SF, -0.1585...|[1.0, 0.0, 0.0, 0...|
+-----+--------------------+--------------------+
only showing top 7 rows



In [15]:
# Cluster the encoded points with MLlib KMeans.
k = 8
# Each Row carries only the "features" column, so row[0] is the feature list.
features = (
    df1.select("features")
    .rdd
    .map(lambda row: row[0])
    .cache()
)
model = KMeans.train(
    features,
    k,
    maxIterations=40,
    initializationMode="random",
    seed=410,
)

In [19]:
# Attach each row's cluster label as a "prediction" column on top of df1.
# The trained model is wrapped in a broadcast variable and read inside the UDF.
modelBC = sparkCt.broadcast(model)
predictUDF = udf(lambda vec: modelBC.value.predict(vec), StringType())
df2 = df1.withColumn("prediction", predictUDF(df1["features"])).cache()
df2.show(7)

+-----+--------------------+--------------------+----------+
|   id|         rawFeatures|            features|prediction|
+-----+--------------------+--------------------+----------+
|44263|[udp, SF, -0.1585...|[0.0, 1.0, 0.0, 0...|         3|
|44264|[tcp, SF, -0.1585...|[1.0, 0.0, 0.0, 0...|         5|
|44265|[tcp, SF, -0.1585...|[1.0, 0.0, 0.0, 0...|         3|
|44266|[tcp, SF, -0.1585...|[1.0, 0.0, 0.0, 0...|         5|
|44267|[tcp, SF, -0.1585...|[1.0, 0.0, 0.0, 0...|         3|
|44268|[udp, SF, -0.1585...|[0.0, 1.0, 0.0, 0...|         3|
|44269|[tcp, SF, -0.1585...|[1.0, 0.0, 0.0, 0...|         7|
+-----+--------------------+--------------------+----------+
only showing top 7 rows



In [21]:
# Attach the "score" column to df2.
# A higher score means the row's cluster has fewer members,
# i.e. the row is more likely to be an anomaly.
df3 = addScore(df2).cache()
df3.show(7)

+-----+--------------------+--------------------+----------+-------------------+
|   id|         rawFeatures|            features|prediction|              score|
+-----+--------------------+--------------------+----------+-------------------+
|44263|[udp, SF, -0.1585...|[0.0, 1.0, 0.0, 0...|         3|                0.0|
|44264|[tcp, SF, -0.1585...|[1.0, 0.0, 0.0, 0...|         5|0.02132185100643287|
|44265|[tcp, SF, -0.1585...|[1.0, 0.0, 0.0, 0...|         3|                0.0|
|44266|[tcp, SF, -0.1585...|[1.0, 0.0, 0.0, 0...|         5|0.02132185100643287|
|44267|[tcp, SF, -0.1585...|[1.0, 0.0, 0.0, 0...|         3|                0.0|
|44268|[udp, SF, -0.1585...|[0.0, 1.0, 0.0, 0...|         3|                0.0|
|44269|[tcp, SF, -0.1585...|[1.0, 0.0, 0.0, 0...|         7| 0.8150031126789791|
+-----+--------------------+--------------------+----------+-------------------+
only showing top 7 rows



In [22]:
# Keep only the rows whose score is strictly above the threshold.
t = 0.79
df3.filter(df3["score"] > t).show(7)

+-----+--------------------+--------------------+----------+------------------+
|   id|         rawFeatures|            features|prediction|             score|
+-----+--------------------+--------------------+----------+------------------+
|44269|[tcp, SF, -0.1585...|[1.0, 0.0, 0.0, 0...|         7|0.8150031126789791|
|44281|[tcp, SF, -0.1585...|[1.0, 0.0, 0.0, 0...|         7|0.8150031126789791|
|44286|[udp, SF, -0.1585...|[0.0, 1.0, 0.0, 0...|         6|0.9452946669433493|
|44290|[tcp, SF, -0.1585...|[1.0, 0.0, 0.0, 0...|         6|0.9452946669433493|
|44302|[tcp, REJ, -0.158...|[1.0, 0.0, 0.0, 0...|         1|0.8564795600747043|
|44304|[tcp, REJ, -0.158...|[1.0, 0.0, 0.0, 0...|         1|0.8564795600747043|
|44305|[tcp, SF, -0.1585...|[1.0, 0.0, 0.0, 0...|         7|0.8150031126789791|
+-----+--------------------+--------------------+----------+------------------+
only showing top 7 rows

