In [None]:
# import findspark
# findspark.init()
import json
import os

import numpy as np
import pandas as pd
import pyspark
import pyspark.sql.functions as F
from pyspark.ml.feature import VectorAssembler, StandardScaler 
from pyspark.ml.linalg import Vectors, VectorUDT
from pyspark.sql import SparkSession
from pyspark.sql.functions import from_utc_timestamp
from pyspark.sql.types import *

In [None]:
# Spark session & context
#
# The MongoDB connection string is taken from the MALWARE_DETECTION_MONGO_URI
# environment variable so that credentials do not have to live in the notebook.
# The fallback is the original development URI, so existing setups keep working
# unchanged. NOTE(review): the fallback still embeds a username/password --
# remove it once the environment variable is set everywhere.
MONGO_URI = os.environ.get(
    "MALWARE_DETECTION_MONGO_URI",
    "mongodb://ta:ta@127.0.0.1:27017/MalwareDetection.data?authSource=admin",
)

spark = (SparkSession
         .builder
         .master('spark://192.168.100.38:7077')
         .appName('MalwareDetection')
         .config("spark.driver.memory", "512m")
         # Extra packages: MongoDB connector and the Kafka source for Structured Streaming
         .config("spark.jars.packages", "org.mongodb.spark:mongo-spark-connector_2.12:3.0.1,org.apache.spark:spark-sql-kafka-0-10_2.12:3.1.1")
         # The same URI is used for both reading and writing
         .config("spark.mongodb.input.uri", MONGO_URI)
         .config("spark.mongodb.output.uri", MONGO_URI)
         .getOrCreate())
# Resolve column names case-sensitively
spark.conf.set("spark.sql.caseSensitive", "true")
sc = spark.sparkContext

In [None]:
# Load the resampled benign flows and discard the 'Label' column, leaving only
# the remaining columns of the CSV.
benign_resample_1400 = (
    pd.read_csv('/work/ta/FinalProject/pa/data/Benign_resample_1400.csv')
    .drop(columns=['Label'])
)

In [None]:
# FIXME: this cell originally read `model = ` with no right-hand side, which is
# a SyntaxError. Assign the trained classifier here.
# NOTE(review): predict_udf further down calls `clf.predict(...)`, and no `clf`
# is defined in the visible cells -- confirm which name the loaded model should
# be bound to.
model = None

In [None]:
# Read schema/schema.json with Spark's JSON reader. In the visible cells only
# the resulting DataFrame's `.schema` is used: it is passed to from_json when
# the Kafka messages are parsed.
jsonFormatSchema = spark.read.json("schema/schema.json")

In [None]:
# Streaming source: subscribe to the Kafka topic, starting from the latest
# offsets.
rawData = (
    spark.readStream
    .format("kafka")
    .options(**{
        "kafka.bootstrap.servers": "127.0.0.1:9092",  # kafka server
        "subscribe": "netflowmeter",                  # topic
        "startingOffsets": "latest",
    })
    .load()
)

In [None]:
# Decode each Kafka message value as a JSON string, parse it with the schema
# loaded from the sample file, and promote the parsed fields to top-level columns.
parsedData = (
    rawData
    .selectExpr("cast (value as string) as json")
    .select(F.from_json("json", jsonFormatSchema.schema).alias("data"))
    .select("data.*")
)

In [None]:
# Keep the flow identification columns and expand every field of the
# `extractFeature` struct next to them.
featureExtraction = parsedData.select(
    'flow_id',
    'src_ip',
    'src_port',
    'dst_ip',
    'dst_port',
    'protocol',
    'timestamp',
    "extractFeature.*",
)

In [None]:
def flatten_df(nested_df):
    """Flatten one level of struct columns into top-level columns.

    Columns whose dtype string does not start with ``'struct'`` are kept as
    they are. Every field of a struct column ``parent`` is selected as its own
    column named ``parent_field``. Only one level is expanded; a struct nested
    inside a struct stays a struct.

    Args:
        nested_df: DataFrame exposing ``dtypes``, ``select`` and ``columns``.

    Returns:
        The result of ``nested_df.select`` with the plain columns first,
        followed by the expanded struct fields.
    """
    plain_names = []
    struct_names = []
    for col_name, col_type in nested_df.dtypes:
        if col_type.startswith('struct'):
            struct_names.append(col_name)
        else:
            plain_names.append(col_name)

    expanded = []
    for parent in struct_names:
        # Selecting "parent.*" yields the struct's field names as columns.
        for child in nested_df.select(parent + '.*').columns:
            expanded.append(F.col(parent + '.' + child).alias(parent + '_' + child))

    return nested_df.select(plain_names + expanded)

In [None]:
# Run flatten_df on the selected columns: any struct column is expanded into
# "<struct>_<field>" columns, all other columns are kept as they are.
data_flat = flatten_df(featureExtraction)


In [None]:
# Columns removed before the feature vector is assembled.
# NOTE(review): the name says these are duplicates of other features; what they
# duplicate is not visible in this notebook -- confirm.
duplicate_columns = [
    'ActivePacket_max',
    'ActivePacket_mean',
    'ActivePacket_min',
    'ActivePacket_std',
    'IdlePacket_max',
    'IdlePacket_mean',
    'IdlePacket_min',
    'IdlePacket_std',
]
data_flat = data_flat.drop(*duplicate_columns)

In [None]:
# Assemble the feature columns into a single vector column, "SS_features".
# NOTE(review): `df_train` is not defined in the visible cells; it has to exist
# (with the training feature columns) before this cell runs -- confirm where it
# comes from.
vector_assembler_1 = VectorAssembler(inputCols=df_train.columns, outputCol="SS_features")
# Use the assembler created on the line above. The previous code called
# `vector_assembler`, a name that is never defined in this notebook (NameError).
data_flat = vector_assembler_1.transform(data_flat)

In [None]:
# Normalise the assembled features: fit `scaler` on `df_train`, then apply the
# fitted model to the streaming frame.
# NOTE(review): neither `scaler` nor `df_train` is defined in the visible cells.
# Later cells read a 'scaledFeatures' column, so `scaler` presumably writes to
# that name (StandardScaler is imported at the top but never instantiated) --
# confirm and define both before this cell.
data_flat = scaler.fit(df_train).transform(data_flat)

In [None]:
# Print the schema of the frame after assembling and scaling, for inspection.
data_flat.printSchema()

In [None]:
def predict_udf(x):
    """Label a single feature vector using the global classifier ``clf``.

    The vector is wrapped in a one-element list, passed to ``clf.predict``,
    and the first prediction is converted to ``int``.

    NOTE(review): ``clf`` is not defined in the visible cells -- it must be
    bound before the UDF runs.

    Args:
        x: One feature vector, in whatever form ``clf.predict`` accepts as a row.

    Returns:
        'Benign' when the predicted label equals 1, otherwise 'Anomaly'.
    """
    prediction = clf.predict([x])
    return 'Benign' if int(prediction[0]) == 1 else 'Anomaly'

In [None]:
# Wrap predict_udf as a Spark UDF that returns a string, then add its result as
# the 'Label' column, computed row by row from the 'scaledFeatures' column.
label_udf = F.udf(predict_udf, StringType())
data = data_flat.withColumn('Label', label_udf(data_flat['scaledFeatures']))

In [None]:
from datetime import datetime 
from decimal import Decimal

def date_udf(x):
    """Format an epoch timestamp in milliseconds as 'YYYY-MM-DD HH:MM:SS'.

    Args:
        x: Epoch time in milliseconds, in any form ``Decimal`` accepts
            (int, numeric string, Decimal), or None.

    Returns:
        The formatted timestamp in the local timezone of the process (as
        produced by ``datetime.fromtimestamp``), or None when ``x`` is None.

    Raises:
        decimal.InvalidOperation: if ``x`` is a string that is not a number.
    """
    # A null timestamp reaches a Spark UDF as None; Decimal(None) would raise
    # TypeError and fail the whole streaming query, so pass the null through.
    if x is None:
        return None
    millis = int(Decimal(x))
    return datetime.fromtimestamp(millis / 1000).strftime("%Y-%m-%d %H:%M:%S")

In [None]:
# Wrap date_udf as a Spark UDF that returns a string, then add a 'datetime'
# column derived from the 'timestamp' column.
datetime_udf = F.udf(date_udf, StringType())
pred_df= data.withColumn('datetime', datetime_udf(data['timestamp']))

In [None]:
# Drop the two intermediate feature-vector columns and the raw 'timestamp'
# column (a formatted 'datetime' column was added from it in the previous cell).
kolom = ['SS_features','scaledFeatures','timestamp']
pred_df = pred_df.drop(*kolom)

In [None]:
# pred_df_1 = data.select(F.col('flow_id'), F.col('src_ip'), F.col('src_port'), F.col('dst_ip'), F.col('dst_port'), F.col('protocol'), F.col('timestamp'), F.col('Label'))

In [None]:
# Print the schema of the frame that will be written out, for inspection.
pred_df.printSchema()

In [None]:
def write_mongo_row(df, epoch_id):
    """Append one micro-batch to the ``data`` collection of ``MalwareDetection``.

    Intended as a ``foreachBatch`` callback.

    Args:
        df: The micro-batch to write, using the "mongo" format in append mode.
        epoch_id: Batch identifier passed by ``foreachBatch``; not used.

    Returns:
        None.
    """
    writer = df.write.format("mongo").mode("append")
    writer = writer.option("database", "MalwareDetection")
    writer = writer.option("collection", "data")
    writer.save()

# Start the streaming query, handing every micro-batch to write_mongo_row, and
# block this cell until the query terminates.
query=pred_df.writeStream.foreachBatch(write_mongo_row).start()
query.awaitTermination()