1. SparkSession is created using a SparkConf object, which would use two local cores
with a proper application name, and use UTC as the timezone 3

In [57]:
# Import SparkConf class into program
from pyspark import SparkConf
import os
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode
from pyspark.sql.functions import split
from pyspark.sql import functions as F
from pyspark.sql.types import *
os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages org.apache.spark:spark-streaming-kafka-0-10_2.12:3.0.0,org.apache.spark:spark-sql-kafka-0-10_2.12:3.0.0 pyspark-shell'


# local[*]: run Spark in local mode with as many working processors as logical cores on your machine
# If we want Spark to run locally with 'k' worker threads, we can specify as "local[k]".
master = "local[2]"
# The `appName` field is a name to be shown on the Spark cluster UI page
app_name = "Linux system hacking Detection"
# Setup configuration parameters for Spark
spark_conf = SparkConf().setMaster(master).setAppName(app_name)


In [58]:
# Import SparkContext and SparkSession classes
from pyspark import SparkContext # Spark
from pyspark.sql import SparkSession # Spark SQL

#Method 1: Using SparkSession
#We have to keep the maxPartition bytes by seeing the size of our csv files and in such a way we get 4 partitions
spark = SparkSession.builder.config(conf=spark_conf).config("spark.sql.session.timeZone", "UTC").getOrCreate()
sc = spark.sparkContext
sc.setLogLevel('ERROR')

2. From the Kafka producers in Task 1.1 and 1.2, ingest the streaming data into Spark
Streaming for both process and memory activities.

In [59]:
topic = "process"
df_process = spark \
    .readStream \
    .format("kafka") \
    .option("kafka.bootstrap.servers", "127.0.0.1:9092") \
    .option("subscribe", topic) \
    .load()

In [60]:
topic = "memory"
df_memory = spark \
    .readStream \
    .format("kafka") \
    .option("kafka.bootstrap.servers", "127.0.0.1:9092") \
    .option("subscribe", topic) \
    .load()

In [61]:
df_process.printSchema()

root
 |-- key: binary (nullable = true)
 |-- value: binary (nullable = true)
 |-- topic: string (nullable = true)
 |-- partition: integer (nullable = true)
 |-- offset: long (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- timestampType: integer (nullable = true)



In [62]:
df_memory.printSchema()

root
 |-- key: binary (nullable = true)
 |-- value: binary (nullable = true)
 |-- topic: string (nullable = true)
 |-- partition: integer (nullable = true)
 |-- offset: long (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- timestampType: integer (nullable = true)



In [63]:
df_process = df_process.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)")

In [64]:
df_memory = df_memory.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)")

In [65]:
df_process.printSchema()

root
 |-- key: string (nullable = true)
 |-- value: string (nullable = true)



In [66]:
df_memory.printSchema()

root
 |-- key: string (nullable = true)
 |-- value: string (nullable = true)



Defining the schemas for both the activities.

In [67]:
schema_process = ArrayType(StructType([    
    StructField('sequence', IntegerType(), True), 
    StructField('machine', IntegerType(), True),
    StructField('PID', LongType(), True),
    StructField('TRUN', IntegerType(), True),
    StructField('TSLPI', IntegerType(), True),
    StructField('TSLPU', IntegerType(), True),
    StructField('POLI', StringType(), True),
    StructField('NICE', IntegerType(), True),
    StructField('PRI', IntegerType(), True),
    StructField('RTPR', IntegerType(), True),
    StructField('CPUNR', IntegerType(), True),
    StructField('Status', StringType(), True),
    StructField('EXC', IntegerType(), True),
    StructField('State', StringType(), True),
    StructField('CPU', FloatType(), True),
    StructField('CMD', StringType(), True),
    StructField('ts', StringType(), True)
]))

In [68]:
df_process=df_process.select(F.from_json(F.col("value").cast("string"), schema_process).alias('parsed_value'))

In [69]:
df_process.printSchema()

root
 |-- parsed_value: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- sequence: integer (nullable = true)
 |    |    |-- machine: integer (nullable = true)
 |    |    |-- PID: long (nullable = true)
 |    |    |-- TRUN: integer (nullable = true)
 |    |    |-- TSLPI: integer (nullable = true)
 |    |    |-- TSLPU: integer (nullable = true)
 |    |    |-- POLI: string (nullable = true)
 |    |    |-- NICE: integer (nullable = true)
 |    |    |-- PRI: integer (nullable = true)
 |    |    |-- RTPR: integer (nullable = true)
 |    |    |-- CPUNR: integer (nullable = true)
 |    |    |-- Status: string (nullable = true)
 |    |    |-- EXC: integer (nullable = true)
 |    |    |-- State: string (nullable = true)
 |    |    |-- CPU: float (nullable = true)
 |    |    |-- CMD: string (nullable = true)
 |    |    |-- ts: string (nullable = true)



In [70]:
df_process = df_process.select(F.explode(F.col("parsed_value")).alias('unnested_value'))      

In [71]:
df_process.printSchema()

root
 |-- unnested_value: struct (nullable = true)
 |    |-- sequence: integer (nullable = true)
 |    |-- machine: integer (nullable = true)
 |    |-- PID: long (nullable = true)
 |    |-- TRUN: integer (nullable = true)
 |    |-- TSLPI: integer (nullable = true)
 |    |-- TSLPU: integer (nullable = true)
 |    |-- POLI: string (nullable = true)
 |    |-- NICE: integer (nullable = true)
 |    |-- PRI: integer (nullable = true)
 |    |-- RTPR: integer (nullable = true)
 |    |-- CPUNR: integer (nullable = true)
 |    |-- Status: string (nullable = true)
 |    |-- EXC: integer (nullable = true)
 |    |-- State: string (nullable = true)
 |    |-- CPU: float (nullable = true)
 |    |-- CMD: string (nullable = true)
 |    |-- ts: string (nullable = true)



In [72]:
df_process_formatted = df_process.select(
                    F.col("unnested_value.sequence").alias("sequence"),
                    F.col("unnested_value.machine").alias("machine"),
                    F.col("unnested_value.PID").alias("PID"),
                    F.col("unnested_value.TRUN").alias("TRUN"),
                    F.col("unnested_value.TSLPI").alias("TSLPI"),
                    F.col("unnested_value.TSLPU").alias("TSLPU"),
                    F.col("unnested_value.POLI").alias("POLI"),
                    F.col("unnested_value.NICE").alias("NICE"),
                    F.col("unnested_value.PRI").alias("PRI"),
                    F.col("unnested_value.RTPR").alias("RTPR"),
                    F.col("unnested_value.CPUNR").alias("CPUNR"),
                    F.col("unnested_value.Status").alias("Status"),
                    F.col("unnested_value.EXC").alias("EXC"),
                    F.col("unnested_value.State").alias("State"),
                    F.col("unnested_value.CPU").alias("CPU"),
                    F.col("unnested_value.CMD").alias("CMD"),
                    F.col("unnested_value.ts").alias("ts")
                )

In [73]:
df_process_formatted.printSchema()

root
 |-- sequence: integer (nullable = true)
 |-- machine: integer (nullable = true)
 |-- PID: long (nullable = true)
 |-- TRUN: integer (nullable = true)
 |-- TSLPI: integer (nullable = true)
 |-- TSLPU: integer (nullable = true)
 |-- POLI: string (nullable = true)
 |-- NICE: integer (nullable = true)
 |-- PRI: integer (nullable = true)
 |-- RTPR: integer (nullable = true)
 |-- CPUNR: integer (nullable = true)
 |-- Status: string (nullable = true)
 |-- EXC: integer (nullable = true)
 |-- State: string (nullable = true)
 |-- CPU: float (nullable = true)
 |-- CMD: string (nullable = true)
 |-- ts: string (nullable = true)



Now lets do the same for the memory activity.

In [74]:
schema_memory = ArrayType(StructType([    
    StructField('sequence', IntegerType(), True), 
    StructField('machine', IntegerType(), True),
    StructField('PID', LongType(), True),
    StructField('MINFLT', LongType(), True),
    StructField('MAJFLT', LongType(), True),
    StructField('VSTEXT', LongType(), True),
    StructField('VSIZE', DoubleType(), True),
    StructField('RSIZE', DoubleType(), True),
    StructField('VGROW', DoubleType(), True),
    StructField('RGROW', DoubleType(), True),
    StructField('MEM', FloatType(), True),
    StructField('CMD', StringType(), True),
    StructField('ts', StringType(), True)
]))

In [75]:
df_memory=df_memory.select(F.from_json(F.col("value").cast("string"), schema_memory).alias('parsed_value'))

In [76]:
df_memory.printSchema()

root
 |-- parsed_value: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- sequence: integer (nullable = true)
 |    |    |-- machine: integer (nullable = true)
 |    |    |-- PID: long (nullable = true)
 |    |    |-- MINFLT: long (nullable = true)
 |    |    |-- MAJFLT: long (nullable = true)
 |    |    |-- VSTEXT: long (nullable = true)
 |    |    |-- VSIZE: double (nullable = true)
 |    |    |-- RSIZE: double (nullable = true)
 |    |    |-- VGROW: double (nullable = true)
 |    |    |-- RGROW: double (nullable = true)
 |    |    |-- MEM: float (nullable = true)
 |    |    |-- CMD: string (nullable = true)
 |    |    |-- ts: string (nullable = true)



In [77]:
df_memory = df_memory.select(F.explode(F.col("parsed_value")).alias('unnested_value'))      

In [78]:
df_memory.printSchema()

root
 |-- unnested_value: struct (nullable = true)
 |    |-- sequence: integer (nullable = true)
 |    |-- machine: integer (nullable = true)
 |    |-- PID: long (nullable = true)
 |    |-- MINFLT: long (nullable = true)
 |    |-- MAJFLT: long (nullable = true)
 |    |-- VSTEXT: long (nullable = true)
 |    |-- VSIZE: double (nullable = true)
 |    |-- RSIZE: double (nullable = true)
 |    |-- VGROW: double (nullable = true)
 |    |-- RGROW: double (nullable = true)
 |    |-- MEM: float (nullable = true)
 |    |-- CMD: string (nullable = true)
 |    |-- ts: string (nullable = true)



In [79]:
df_memory_formatted = df_memory.select(
                    F.col("unnested_value.sequence").alias("sequence"),
                    F.col("unnested_value.machine").alias("machine"),
                    F.col("unnested_value.PID").alias("PID"),
                    F.col("unnested_value.MINFLT").alias("MINFLT"),
                    F.col("unnested_value.MAJFLT").alias("MAJFLT"),
                    F.col("unnested_value.VSTEXT").alias("VSTEXT"),
                    F.col("unnested_value.VSIZE").alias("VSIZE"),
                    F.col("unnested_value.RSIZE").alias("RSIZE"),
                    F.col("unnested_value.VGROW").alias("VGROW"),
                    F.col("unnested_value.RGROW").alias("RGROW"),
                    F.col("unnested_value.MEM").alias("MEM"),
                    F.col("unnested_value.CMD").alias("CMD"),
                    F.col("unnested_value.ts").alias("ts")
                )

In [80]:
df_memory_formatted.printSchema()

root
 |-- sequence: integer (nullable = true)
 |-- machine: integer (nullable = true)
 |-- PID: long (nullable = true)
 |-- MINFLT: long (nullable = true)
 |-- MAJFLT: long (nullable = true)
 |-- VSTEXT: long (nullable = true)
 |-- VSIZE: double (nullable = true)
 |-- RSIZE: double (nullable = true)
 |-- VGROW: double (nullable = true)
 |-- RGROW: double (nullable = true)
 |-- MEM: float (nullable = true)
 |-- CMD: string (nullable = true)
 |-- ts: string (nullable = true)



In [81]:
query = df_memory_formatted \
    .writeStream \
    .outputMode("append") \
    .format("console") \
    .start()

In [82]:
query.stop()

3. Then the streaming data format should be transformed into the proper formats
following the metadata file schema for both process and memory, similar to
assignment 2A 4 (3%)
- The numeric values with extra spaces or “K” / “M” / “G” should be properly
transformed into their correct values
- The NICE value should also be restored based on the PRI values using their
relationship 5
- Hint - There is a mapping between PRI (priority) and NICE, as long as
the process is not yet finished during the last interval. For example,
- PRI 100 maps to NICE -20
- PRI 101 maps toNICE -19
- …
- PRI 139 maps to NICE 19
- Hint - If the process is finished, PRI and NICE would both be 0.

In [21]:
test = udf(lambda z:float(z[0:-1])*1000 if 'K' in z\
           else (float(z[0:-1])*1000000 if 'M' in z\
                 else (float(z[0:-1])*1000000000 if 'G' in z\
                     else z)))

cols = ['MINFLT', 'MAJFLT', 'VSTEXT', 'RSIZE', 'VGROW', 'RGROW']
for column in cols:
    df_memory_formatted = df_memory_formatted.withColumn(column, test(col(column)))
    

In [None]:
df_process_formatted.createOrReplaceTempView("process_formatted_sql")

df_process_formatted = spark.sql("""SELECT sequence,machine,PID,TRUN,TSLPI,TSLPU,POLI,PRI,RTPR,CPUNR,Status,EXC,State,
                                    CPU,CMD,
                                    CASE 
                                        WHEN PRI = 100 THEN NICE = -20
                                        WHEN PRI = 109 THEN NICE = -19
                                        ELSE NICE
                                    END AS NICE,
            
                                    FROM process_formatted_sql """)

In [None]:
df_memory_formatted["CMD_PID"] = df_memory_formatted["CMD"] + df_memory_formatted["PID"]


In [None]:
df_process_formatted["CMD_PID"] = df_process_formatted["CMD"] + df_process_formatted["PID"]


In [None]:
windowedCounts = df_process_formatted \
    .withWatermark("ts", "10 seconds") \
    .groupBy(window(df_formatted.ts, "10 seconds"))\
    .agg(F.sum("Impressions").alias("total"))\
    .select("window","total")

In [None]:
windowedCounts = df_process_formatted \
    .withWatermark("ts", "10 seconds") \
    .groupBy(window(df_formatted.ts, "10 seconds"))\
    .agg(F.sum("Impressions").alias("total"))\
    .select("window","total")

In [None]:
query_file_sink = df_formatted.writeStream.format("parquet")\
        .outputMode("append")\
        .option("path", "parquet/clickstream_df")\
        .option("checkpointLocation", "parquet/clickstream_df/checkpoint")\
        .start()

In [None]:
query_file_sink.stop()

In [None]:
query_file_sink = df_formatted.writeStream.format("parquet")\
        .outputMode("append")\
        .option("path", "parquet/clickstream_df")\
        .option("checkpointLocation", "parquet/clickstream_df/checkpoint")\
        .start()

In [None]:
query_file_sink.stop()

In [52]:
# Create function to show values received from input dataframe
def foreach_batch_function(df, epoch_id):
    df.show(20,False)

In [53]:
query1 = df_process_formatted.writeStream.outputMode("append")\
        .foreachBatch(foreach_batch_function)\
        .trigger(processingTime='5 seconds')\
        .start()

+--------+-------+---+----+-----+-----+----+----+---+----+-----+------+---+-----+---+---+
|sequence|machine|PID|TRUN|TSLPI|TSLPU|POLI|NICE|PRI|RTPR|CPUNR|Status|EXC|State|CPU|CMD|
+--------+-------+---+----+-----+-----+----+----+---+----+-----+------+---+-----+---+---+
+--------+-------+---+----+-----+-----+----+----+---+----+-----+------+---+-----+---+---+

+--------+-------+---+----+-----+-----+----+----+---+----+-----+------+---+-----+---+---+
|sequence|machine|PID|TRUN|TSLPI|TSLPU|POLI|NICE|PRI|RTPR|CPUNR|Status|EXC|State|CPU|CMD|
+--------+-------+---+----+-----+-----+----+----+---+----+-----+------+---+-----+---+---+
+--------+-------+---+----+-----+-----+----+----+---+----+-----+------+---+-----+---+---+

+--------+-------+---+----+-----+-----+----+----+---+----+-----+------+---+-----+---+---+
|sequence|machine|PID|TRUN|TSLPI|TSLPU|POLI|NICE|PRI|RTPR|CPUNR|Status|EXC|State|CPU|CMD|
+--------+-------+---+----+-----+-----+----+----+---+----+-----+------+---+-----+---+---+
+-------

In [54]:
query1.stop()