In [1]:
# Import SparkConf class into program
from pyspark import SparkConf
import os
from pyspark.sql.functions import unix_timestamp
from pyspark.sql.functions import to_timestamp
from pyspark.ml.pipeline import PipelineModel
from pyspark.sql.types import StructType,StructField, IntegerType, DateType, StringType, DoubleType, LongType
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode
from pyspark.sql.functions import split
from pyspark.sql import functions as F
from pyspark.sql.types import *
# Ask PySpark to pull in the Kafka connector packages when it launches the JVM.
os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages org.apache.spark:spark-streaming-kafka-0-10_2.12:3.0.0,org.apache.spark:spark-sql-kafka-0-10_2.12:3.0.0 pyspark-shell'


# "local[k]" runs Spark locally with k worker threads; "local[*]" would use one
# thread per logical core. Two threads are requested here.
master = "local[2]"
# Name under which the application is registered with Spark.
app_name = "Linux system hacking Detection"
# Bundle both settings into the configuration object used to build the session.
spark_conf = SparkConf().setAppName(app_name).setMaster(master)


In [2]:
# Import SparkContext and SparkSession classes
from pyspark import SparkContext # Spark
from pyspark.sql import SparkSession # Spark SQL

# Create (or reuse) the SparkSession from spark_conf and pin the SQL session
# time zone to UTC.
spark = (
    SparkSession.builder
    .config(conf=spark_conf)
    .config("spark.sql.session.timeZone", "UTC")
    .getOrCreate()
)
# Keep a handle on the underlying SparkContext and restrict logging to errors.
sc = spark.sparkContext
sc.setLogLevel('ERROR')

In [3]:
# Streaming source: subscribe to the "process" Kafka topic on the local broker.
topic = "process"
df_process = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "127.0.0.1:9092")
    .option("subscribe", topic)
    .load()
)

In [4]:
# Streaming source: subscribe to the "memory" Kafka topic on the local broker.
# Note that `topic` is rebound here; it no longer refers to "process".
topic = "memory"
df_memory = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "127.0.0.1:9092")
    .option("subscribe", topic)
    .load()
)

In [5]:
# Show the raw Kafka record schema (key/value are still binary at this point).
df_process.printSchema()

root
 |-- key: binary (nullable = true)
 |-- value: binary (nullable = true)
 |-- topic: string (nullable = true)
 |-- partition: integer (nullable = true)
 |-- offset: long (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- timestampType: integer (nullable = true)



In [6]:
# Keep only the Kafka key and value, decoded from binary to string.
df_process = df_process.selectExpr(
    "CAST(key AS STRING)",
    "CAST(value AS STRING)",
)
df_process.printSchema()

root
 |-- key: string (nullable = true)
 |-- value: string (nullable = true)



In [7]:
# Keep only the Kafka key and value, decoded from binary to string.
df_memory = df_memory.selectExpr(
    "CAST(key AS STRING)",
    "CAST(value AS STRING)",
)
df_memory.printSchema()

root
 |-- key: string (nullable = true)
 |-- value: string (nullable = true)



In [8]:
# Each "process" message is parsed as a JSON array of records. The pairs below
# list every record field with its Spark type, in declaration order; all fields
# are nullable.
_process_fields = [
    ('sequence', IntegerType()),
    ('PID', LongType()),
    ('TRUN', IntegerType()),
    ('TSLPI', IntegerType()),
    ('TSLPU', IntegerType()),
    ('State', StringType()),
    ('POLI', StringType()),
    ('NICE', IntegerType()),
    ('PRI', IntegerType()),
    ('RTPR', IntegerType()),
    ('CPUNR', IntegerType()),
    ('machine', IntegerType()),
    ('Status', StringType()),
    ('EXC', IntegerType()),
    ('CPU', FloatType()),
    ('CMD', StringType()),
    ('ts', StringType()),
]
schema_process = ArrayType(StructType(
    [StructField(field_name, field_type, True) for field_name, field_type in _process_fields]
))

In [9]:
# Parse the JSON text in `value` into an array of structs using schema_process.
df_process = df_process.select(
    F.from_json(F.col("value").cast("string"), schema_process).alias('parsed_value')
)
df_process.printSchema()

root
 |-- parsed_value: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- sequence: integer (nullable = true)
 |    |    |-- PID: long (nullable = true)
 |    |    |-- TRUN: integer (nullable = true)
 |    |    |-- TSLPI: integer (nullable = true)
 |    |    |-- TSLPU: integer (nullable = true)
 |    |    |-- State: string (nullable = true)
 |    |    |-- POLI: string (nullable = true)
 |    |    |-- NICE: integer (nullable = true)
 |    |    |-- PRI: integer (nullable = true)
 |    |    |-- RTPR: integer (nullable = true)
 |    |    |-- CPUNR: integer (nullable = true)
 |    |    |-- machine: integer (nullable = true)
 |    |    |-- Status: string (nullable = true)
 |    |    |-- EXC: integer (nullable = true)
 |    |    |-- CPU: float (nullable = true)
 |    |    |-- CMD: string (nullable = true)
 |    |    |-- ts: string (nullable = true)



In [10]:
# Unnest the array so every record in a message becomes its own row.
df_process = df_process.select(
    F.explode(F.col("parsed_value")).alias('value_column')
)
df_process.printSchema()

root
 |-- value_column: struct (nullable = true)
 |    |-- sequence: integer (nullable = true)
 |    |-- PID: long (nullable = true)
 |    |-- TRUN: integer (nullable = true)
 |    |-- TSLPI: integer (nullable = true)
 |    |-- TSLPU: integer (nullable = true)
 |    |-- State: string (nullable = true)
 |    |-- POLI: string (nullable = true)
 |    |-- NICE: integer (nullable = true)
 |    |-- PRI: integer (nullable = true)
 |    |-- RTPR: integer (nullable = true)
 |    |-- CPUNR: integer (nullable = true)
 |    |-- machine: integer (nullable = true)
 |    |-- Status: string (nullable = true)
 |    |-- EXC: integer (nullable = true)
 |    |-- CPU: float (nullable = true)
 |    |-- CMD: string (nullable = true)
 |    |-- ts: string (nullable = true)



In [11]:
# Promote every field of the `value_column` struct to a top-level column.
# The list fixes the output column order.
_process_columns = [
    "sequence", "machine", "PID", "TRUN", "TSLPI", "TSLPU",
    "POLI", "NICE", "PRI", "RTPR", "CPUNR", "Status",
    "EXC", "State", "CPU", "CMD", "ts",
]
df_process_formatted = df_process.select(
    [F.col("value_column." + field).alias(field) for field in _process_columns]
)
df_process_formatted.printSchema()

root
 |-- sequence: integer (nullable = true)
 |-- machine: integer (nullable = true)
 |-- PID: long (nullable = true)
 |-- TRUN: integer (nullable = true)
 |-- TSLPI: integer (nullable = true)
 |-- TSLPU: integer (nullable = true)
 |-- POLI: string (nullable = true)
 |-- NICE: integer (nullable = true)
 |-- PRI: integer (nullable = true)
 |-- RTPR: integer (nullable = true)
 |-- CPUNR: integer (nullable = true)
 |-- Status: string (nullable = true)
 |-- EXC: integer (nullable = true)
 |-- State: string (nullable = true)
 |-- CPU: float (nullable = true)
 |-- CMD: string (nullable = true)
 |-- ts: string (nullable = true)



In [12]:
# Start a streaming query that prints newly arrived process rows to the
# console, triggering a micro-batch every five seconds.
query = (
    df_process_formatted.writeStream
    .outputMode("append")
    .format("console")
    .trigger(processingTime='5 seconds')
    .start()
)

#### For memory

In [None]:
# Each "memory" message is parsed as a JSON array of records; all fields are nullable.
#
# MINFLT, MAJFLT, VSTEXT, RSIZE, VGROW and RGROW are deliberately read as strings.
# Later cells strip "K"/"M" suffixes from exactly these columns and then cast them
# to long/double. If they were declared numeric here, from_json would yield null
# for any suffixed value (e.g. "224K") before that clean-up could run. Plain
# numeric JSON values are still accepted and survive the later casts unchanged.
schema_memory = ArrayType(StructType([
    StructField('sequence', IntegerType(), True),
    StructField('machine', IntegerType(), True),
    StructField('PID', LongType(), True),
    StructField('MINFLT', StringType(), True),   # cast to long after K/M clean-up
    StructField('MAJFLT', StringType(), True),   # cast to long after K/M clean-up
    StructField('VSTEXT', StringType(), True),   # cast to long after K/M clean-up
    StructField('VSIZE', DoubleType(), True),    # not part of the K/M clean-up; kept numeric
    StructField('RSIZE', StringType(), True),    # cast to double after K/M clean-up
    StructField('VGROW', StringType(), True),    # cast to double after K/M clean-up
    StructField('RGROW', StringType(), True),    # cast to double after K/M clean-up
    StructField('MEM', FloatType(), True),
    StructField('CMD', StringType(), True),
    StructField('ts', StringType(), True)
]))

In [None]:
# Parse the JSON text in `value` into an array of structs using schema_memory.
df_memory = df_memory.select(
    F.from_json(F.col("value").cast("string"), schema_memory).alias('parsed_value')
)
df_memory.printSchema()

In [None]:
# Unnest the array so every record in a message becomes its own row.
df_memory = df_memory.select(
    F.explode(F.col("parsed_value")).alias('unnested_value')
)
df_memory.printSchema()

In [None]:
# Promote every field of the `unnested_value` struct to a top-level column.
# The list fixes the output column order.
_memory_columns = [
    "sequence", "machine", "PID", "MINFLT", "MAJFLT", "VSTEXT", "VSIZE",
    "RSIZE", "VGROW", "RGROW", "MEM", "CMD", "ts",
]
df_memory_formatted = df_memory.select(
    [F.col("unnested_value." + field).alias(field) for field in _memory_columns]
)

In [None]:
# Inspect the flattened memory schema before any clean-up is applied.
df_memory_formatted.printSchema()

#### 3. Data cleaning and transformation

In [None]:
def remove_all_whitespace(col):
    """Return a column expression equal to `col` with every run of whitespace removed."""
    whitespace_run = "\\s+"
    return F.regexp_replace(col, whitespace_run, "")
def replacek(col):
    """Return a column expression equal to `col` with every "K" replaced by "000".

    NOTE(review): this is a purely textual substitution, so a value such as
    "1.5K" becomes "1.5000" rather than 1500. Confirm the inputs never carry
    a decimal part.
    """
    suffix, zeros = "K", "000"
    return F.regexp_replace(col, suffix, zeros)

def replacem(col):
    """Return a column expression equal to `col` with every "M" replaced by "000000".

    NOTE(review): this is a purely textual substitution, so a value such as
    "2.5M" becomes "2.5000000" rather than 2500000. Confirm the inputs never
    carry a decimal part.
    """
    suffix, zeros = "M", "000000"
    return F.regexp_replace(col, suffix, zeros)

In [None]:
# Expand the "K" and then the "M" suffix in each memory counter column, in place.
columns = ['MINFLT', 'MAJFLT', 'VSTEXT', 'RSIZE', 'VGROW', 'RGROW']
for column in columns:
    # First pass: "K" -> "000"
    df_memory_formatted = df_memory_formatted.withColumn(column, replacek(F.col(column)))
    # Second pass, on the result of the first: "M" -> "000000"
    df_memory_formatted = df_memory_formatted.withColumn(column, replacem(F.col(column)))


In [None]:
# The cleaned-up fault/text-size counters are whole numbers: cast them to long.
columns = ['MINFLT', 'MAJFLT', 'VSTEXT']
for column in columns:
    df_memory_formatted = df_memory_formatted.withColumn(
        column, F.col(column).cast(LongType())
    )

In [None]:
# The cleaned-up size/growth columns are cast to double.
columns = ['RSIZE', 'VGROW', 'RGROW']
for column in columns:
    df_memory_formatted = df_memory_formatted.withColumn(
        column, F.col(column).cast(DoubleType())
    )

In [None]:
# Overwrite NICE on every row with PRI - 120.
# NOTE(review): the incoming NICE value is discarded unconditionally. Confirm
# that is intended rather than only correcting selected rows.
df_process_formatted = df_process_formatted.withColumn(
    'NICE', F.col('PRI') - 120
)

In [None]:
# cmd_pid: CMD and PID concatenated (null if either input is null).
# event_time: `ts` parsed by unix_timestamp with its default pattern, then
# converted to a timestamp.
# NOTE(review): no format is passed to unix_timestamp. Confirm `ts` arrives in
# the default 'yyyy-MM-dd HH:mm:ss' form rather than as raw epoch seconds.
df_process_formatted = (
    df_process_formatted
    .withColumn('cmd_pid', F.concat(F.col('CMD'), F.col('PID')))
    .withColumn('event_time', to_timestamp(unix_timestamp(F.col('ts'))))
)

In [None]:
# cmd_pid: CMD and PID concatenated (null if either input is null).
# event_time: `ts` parsed by unix_timestamp with its default pattern, then
# converted to a timestamp.
# NOTE(review): no format is passed to unix_timestamp. Confirm `ts` arrives in
# the default 'yyyy-MM-dd HH:mm:ss' form rather than as raw epoch seconds.
df_memory_formatted = (
    df_memory_formatted
    .withColumn('cmd_pid', F.concat(F.col('CMD'), F.col('PID')))
    .withColumn('event_time', to_timestamp(unix_timestamp(F.col('ts'))))
)

In [None]:
# Inspect the memory schema after the clean-up casts and the derived
# cmd_pid / event_time columns have been added.
df_memory_formatted.printSchema()

In [None]:
# Declare event_time as the event-time column of the memory stream, tolerating
# data that arrives up to 20 seconds late.
memory_watermark = df_memory_formatted.withWatermark(
    eventTime='event_time',
    delayThreshold='20 seconds',
)

In [None]:
# Declare event_time as the event-time column of the process stream, tolerating
# data that arrives up to 20 seconds late.
process_watermark = df_process_formatted.withWatermark(
    eventTime='event_time',
    delayThreshold='20 seconds',
)

In [None]:
# Persist the watermarked memory stream as parquet files, appending new rows.
# NOTE(review): the checkpoint directory is the same as the output directory.
# Consider a dedicated checkpoint location so streaming state and data files
# are kept apart.
mem_paraquet_query = (
    memory_watermark.writeStream
    .format("parquet")
    .outputMode("append")
    .option("path", r"./memory")
    .option("checkpointLocation", r"./memory")
    .start()
)

In [None]:
# Persist the watermarked process stream as parquet files, appending new rows.
# NOTE(review): the checkpoint directory is the same as the output directory.
# Consider a dedicated checkpoint location so streaming state and data files
# are kept apart.
pro_paraquet_query = (
    process_watermark.writeStream
    .format("parquet")
    .outputMode("append")
    .option("path", r"./process")
    .option("checkpointLocation", r"./process")
    .start()
)

In [None]:
# Load the previously saved ML pipeline for memory data from the parent directory.
memorymodel = PipelineModel.load('../memory_pipeline_model')

In [None]:
# Load the previously saved ML pipeline for process data from the parent directory.
processmodel = PipelineModel.load('../process_pipeline_model')

In [None]:
# Apply the loaded memory pipeline to the formatted memory stream
# (the DataFrame without the watermark, not memory_watermark).
mem_predict = memorymodel.transform(df_memory_formatted)

In [None]:
# Apply the loaded process pipeline to the formatted process stream
# (the DataFrame without the watermark, not process_watermark).
pro_predict = processmodel.transform(df_process_formatted)