In [1]:
from datetime import datetime 
from decimal import Decimal

import findspark
import json
import pyspark
import pyspark.sql.functions as F

from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.ml.linalg import Vectors, VectorUDT
from pyspark.ml.feature import VectorAssembler, StandardScaler 
from pyspark.sql.functions import from_utc_timestamp

from pyspark.sql.types import *
from pyspark.ml.linalg import Vectors, VectorUDT
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.feature import StringIndexer

In [2]:
# Let findspark set up the Spark environment for this kernel.
# NOTE(review): pyspark is already imported in the first cell, i.e. before
# this call runs — confirm whether the call is still needed or should be
# moved ahead of the pyspark imports.
findspark.init()

In [3]:
# Flatten the data so it has no nested branches --> used on the extracted features
def flatten_df(nested_df):
    """Expand every struct column of ``nested_df`` by one level.

    Non-struct columns are kept as they are and come first in the result.
    Each field ``child`` of a struct column ``parent`` becomes a top-level
    column named ``parent_child``. Only one level is expanded; a struct
    nested inside another struct stays a struct.

    Parameters
    ----------
    nested_df : DataFrame
        Frame whose ``dtypes`` may contain ``struct...`` entries.

    Returns
    -------
    DataFrame
        A new frame with the struct columns replaced by their fields.
    """
    plain_names = []
    struct_names = []
    for col_name, col_type in nested_df.dtypes:
        if col_type.startswith('struct'):
            struct_names.append(col_name)
        else:
            plain_names.append(col_name)

    expanded = []
    for parent in struct_names:
        # Selecting "parent.*" is used only to list the field names.
        for child in nested_df.select(parent + '.*').columns:
            expanded.append(F.col(parent + '.' + child).alias(parent + '_' + child))

    return nested_df.select(plain_names + expanded)

In [4]:
def date_udf(x):
    """Format an epoch timestamp given in milliseconds as ``YYYY-MM-DD HH:MM:SS``.

    Parameters
    ----------
    x : str, int, Decimal or None
        Epoch time in milliseconds; any value ``Decimal`` accepts.
        A fractional part is truncated.

    Returns
    -------
    str or None
        The timestamp rendered in the local time zone of the process running
        the function, or ``None`` when ``x`` is ``None``.

    Raises
    ------
    decimal.InvalidOperation
        If ``x`` is a string that does not represent a number.
    """
    # A null timestamp previously raised TypeError inside Decimal(); pass the
    # null through so one incomplete record does not fail the caller.
    if x is None:
        return None
    millis = int(Decimal(x))
    # fromtimestamp() takes seconds and converts to local time.
    return datetime.fromtimestamp(millis / 1000).strftime("%Y-%m-%d %H:%M:%S")

In [5]:
def write_mongo_row(df, epoch_id):
    """Append one batch to the ``results`` collection of ``IoTMalwareDetection``.

    Takes the ``(df, epoch_id)`` pair that ``foreachBatch`` passes in;
    ``epoch_id`` is not used. Returns ``None``.
    """
    writer = df.write.format("mongo").mode("append")
    writer = writer.option("database", "IoTMalwareDetection")
    writer = writer.option("collection", "results")
    writer.save()

In [6]:
import os

# MongoDB connection string shared by the connector's input and output
# settings. It can be overridden with the MONGO_URI environment variable so
# credentials do not have to live in the notebook; the fallback keeps the
# previous behaviour when the variable is unset.
# NOTE(review): the fallback still embeds a user/password — remove it once
# every environment provides MONGO_URI.
MONGO_URI = os.environ.get(
    "MONGO_URI",
    "mongodb://ta:ta@192.168.100.29:27018/IoTMalwareDetection.results?authSource=admin",
)

# Local Spark session with the MongoDB and Kafka connector packages requested
# through spark.jars.packages.
spark = (SparkSession
         .builder
         .master('local')
         .appName('IoTMalwareDetection')
         .config("spark.jars.packages", "org.mongodb.spark:mongo-spark-connector_2.12:3.0.1,org.apache.spark:spark-sql-kafka-0-10_2.12:3.1.1")
         .config("spark.mongodb.input.uri", MONGO_URI)
         .config("spark.mongodb.output.uri", MONGO_URI)
         .getOrCreate())

# Resolve column names case-sensitively. Presumably needed because later
# cells hold a 'Label' and a 'label' column side by side — confirm.
spark.conf.set("spark.sql.caseSensitive", "true")

sc = spark.sparkContext

In [7]:
# Load the sampled training CSV, using its header row and inferred column types.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
df_train = spark.read.csv(
    path='/media/kmdr7/Seagate/TA/DATASETS/DatasetSampled.csv',
    inferSchema=True,
    header="true",
)

In [8]:
# Cast every column except the last one to double and keep 'Label' as is.
# Assumes 'Label' is the last column of the CSV — TODO confirm.
df_train = df_train.select(
    *[F.col(name).cast('double') for name in df_train.columns[:-1]],
    'Label'
)

In [9]:
# Feature columns fed to the model. The list order is the order in which the
# values are packed into the assembled feature vector.
featureImp = [
    'FWD Init Win Bytes',
    'Idle Max',
    'Fwd Header Length',
    'Bwd Packets/s',
    'Flow Bytes/s',
    'Bwd Init Win Bytes',
    'Flow IAT Max',
]
# Keep only those features plus the class label.
dfImp = df_train.select(*[F.col(name) for name in featureImp], F.col('Label'))

In [10]:
# Pack the selected feature columns into a single vector column "features".
vector_assembler = VectorAssembler(outputCol="features", inputCols=featureImp)
dfImp = vector_assembler.transform(dfImp)
# Scaler definition only (not fitted here): reads "features", writes
# "s_features", with both centring (withMean) and scaling (withStd) enabled.
scaler = StandardScaler(
    withMean=True,
    withStd=True,
    inputCol="features",
    outputCol="s_features",
)

In [11]:
# Fit the scaler on the assembled training features, then add the scaled
# "s_features" column to the same frame.
scaler_model = scaler.fit(dfImp)
train = scaler_model.transform(dfImp)

In [12]:
# The raw feature columns are no longer needed once they are packed into the
# vector columns.
raw_feature_columns = (
    'FWD Init Win Bytes',
    'Idle Max',
    'Fwd Header Length',
    'Bwd Packets/s',
    'Flow Bytes/s',
    'Bwd Init Win Bytes',
    'Flow IAT Max',
)
train = train.drop(*raw_feature_columns)

In [13]:
# Encode the string column 'Label' as a numeric column 'label'.
l_indexer = StringIndexer(outputCol="label", inputCol="Label")
label_model = l_indexer.fit(train)
train = label_model.transform(train)
train.show(3)

+---------+--------------------+--------------------+-----+
|    Label|            features|          s_features|label|
+---------+--------------------+--------------------+-----+
|Malicious|[35766.0,1.551387...|[1.71228477974615...|  1.0|
|Malicious|[35766.0,1.551389...|[1.71228477974615...|  1.0|
|Malicious|[35766.0,1.551388...|[1.71228477974615...|  1.0|
+---------+--------------------+--------------------+-----+
only showing top 3 rows



In [14]:
# Keep the two feature vectors and the numeric label; this leaves out the
# original string 'Label' column, the only other column at this point.
train = train.select('features', 's_features', 'label')
train

DataFrame[features: vector, s_features: vector, label: double]

In [15]:
# 80/20 train/test split with a fixed seed.
training_data, test_data = train.randomSplit([0.8, 0.2], seed=2020)
print("Training Dataset Count: ", training_data.count(), sep="")
print("Test Dataset Count: ", test_data.count(), sep="")

Training Dataset Count: 190360
Test Dataset Count: 47587


In [16]:
# Print the column names and types of the training split.
training_data.printSchema()

root
 |-- features: vector (nullable = true)
 |-- s_features: vector (nullable = true)
 |-- label: double (nullable = false)



In [17]:
# Decision tree classifier on the scaled feature vector, maximum depth 20.
dt = DecisionTreeClassifier(
    maxDepth=20,
    labelCol='label',
    featuresCol='s_features',
)

In [18]:
# Train the decision tree on the training split.
dtModel = dt.fit(training_data)
# Apply the fitted model to the held-out split; the result is evaluated in the next cell.
dt_predictions = dtModel.transform(test_data)

In [19]:
# Accuracy of the decision tree on the test split.
multi_evaluator = MulticlassClassificationEvaluator(metricName='accuracy', labelCol='label')
dt_accuracy = multi_evaluator.evaluate(dt_predictions)
print('Decision Tree Accuracy:', dt_accuracy)

Decision Tree Accuracy: 0.9997058020047492


In [20]:
# Read a JSON file from the relative path schema/schema.json. Only the
# resulting frame's .schema is used later, to parse the Kafka messages.
jsonFormatSchema = spark.read.json("schema/schema.json")

In [21]:
# Streaming source: subscribe to one Kafka topic, starting from the latest offsets.
kafka_reader = spark.readStream.format("kafka")
kafka_reader = kafka_reader.option("kafka.bootstrap.servers", "192.168.100.29:29092")  # kafka server
kafka_reader = kafka_reader.option("subscribe", "iot23_34_1")  # topic
kafka_reader = kafka_reader.option("startingOffsets", "latest")
rawData = kafka_reader.load()

In [22]:
# Cast the Kafka value to a string, parse it as JSON with the schema read
# earlier, and lift the parsed fields to top-level columns.
json_strings = rawData.selectExpr("cast (value as string) as json")
parsed_struct = json_strings.select(F.from_json("json", jsonFormatSchema.schema).alias("data"))
parsedData = parsed_struct.select("data.*")
# Keep the flow identification columns and expand the extractFeature struct by one level.
featureExtraction = parsedData.select(
    'flow_id', 'src_ip', 'src_port', 'dst_ip', 'dst_port', 'protocol', 'timestamp',
    "extractFeature.*"
)

In [23]:
# Capitalised ActivePacket_*/IdlePacket_* columns to remove after flattening.
# NOTE(review): presumably duplicates of the lowercase activePacket_*/
# idlePacket_* columns that remain in the output — confirm.
duplicate_columns = [
    'ActivePacket_max', 'ActivePacket_mean', 'ActivePacket_min', 'ActivePacket_std',
    'IdlePacket_max', 'IdlePacket_mean', 'IdlePacket_min', 'IdlePacket_std',
]
# Flatten the remaining struct columns into parent_child columns, then drop the duplicates.
data_flat = flatten_df(featureExtraction).drop(*duplicate_columns)

In [24]:
# Display the flattened streaming frame's schema.
data_flat

DataFrame[flow_id: string, src_ip: string, src_port: bigint, dst_ip: string, dst_port: bigint, protocol: bigint, timestamp: string, average_packet_size: double, bwd_IAT_total: double, bwd_PSH_flags: double, bwd_URG_flags: double, bwd_header_length: double, bwd_packets_per_second: double, bwd_segment_size_avg: double, bwd_win_bytes: bigint, download_upload_ratio: double, flow_bytes_per_second: double, flow_duration: bigint, flow_pkts_per_second: double, fwd_IAT_total: double, fwd_PSH_flags: double, fwd_URG_flags: double, fwd_act_data_pkts: bigint, fwd_header_length: double, fwd_packets_per_second: double, fwd_seg_size_min: bigint, fwd_segment_size_avg: double, fwd_win_bytes: bigint, packet_length_variance: double, activePacket_max: double, activePacket_mean: double, activePacket_min: double, activePacket_std: double, bwd_IAT_max: double, bwd_IAT_mean: double, bwd_IAT_min: double, bwd_IAT_std: double, bwd_bulk_bulk_rate: bigint, bwd_bulk_bytes_per_bulk: bigint, bwd_bulk_packet_per_bulk: 

In [25]:
# Keep the flow identification columns and the seven streaming columns that
# are fed to the model.
dataSelect = data_flat.select(
    'flow_id', 'src_ip', 'src_port', 'dst_ip', 'dst_port', 'protocol', 'timestamp',
    'fwd_win_bytes', 'idlePacket_max', 'fwd_header_length',
    'bwd_packets_per_second', 'flow_bytes_per_second', 'bwd_win_bytes', 'flow_IAT_max'
)

In [26]:
# Display the schema of the selected streaming columns.
dataSelect

DataFrame[flow_id: string, src_ip: string, src_port: bigint, dst_ip: string, dst_port: bigint, protocol: bigint, timestamp: string, fwd_win_bytes: bigint, idlePacket_max: double, fwd_header_length: double, bwd_packets_per_second: double, flow_bytes_per_second: double, bwd_win_bytes: bigint, flow_IAT_max: double]

In [27]:
# Streaming feature columns, listed in the order they are packed into the
# "features" vector.
# NOTE(review): the order looks like it mirrors the training list featureImp
# position by position — confirm the name mapping.
featureImpo = [
    'fwd_win_bytes',
    'idlePacket_max',
    'fwd_header_length',
    'bwd_packets_per_second',
    'flow_bytes_per_second',
    'bwd_win_bytes',
    'flow_IAT_max',
]
# Rebinds vector_assembler to an assembler for the streaming column names.
vector_assembler = VectorAssembler(outputCol="features", inputCols=featureImpo)
dataSelect = vector_assembler.transform(dataSelect)
dataSelect

DataFrame[flow_id: string, src_ip: string, src_port: bigint, dst_ip: string, dst_port: bigint, protocol: bigint, timestamp: string, fwd_win_bytes: bigint, idlePacket_max: double, fwd_header_length: double, bwd_packets_per_second: double, flow_bytes_per_second: double, bwd_win_bytes: bigint, flow_IAT_max: double, features: vector]

In [28]:
# Display the schema again (same frame as shown at the end of the previous cell).
dataSelect

DataFrame[flow_id: string, src_ip: string, src_port: bigint, dst_ip: string, dst_port: bigint, protocol: bigint, timestamp: string, fwd_win_bytes: bigint, idlePacket_max: double, fwd_header_length: double, bwd_packets_per_second: double, flow_bytes_per_second: double, bwd_win_bytes: bigint, flow_IAT_max: double, features: vector]

In [29]:
#normalisasi data seleksi fitur
dataSelect = scaler.fit(dfImp).transform(dataSelect)
pred_df = dtModel.transform(dataSelect)

In [30]:
# Wrap date_udf as a Spark UDF returning a string and use it to add a
# 'datetime' column derived from 'timestamp'.
datetime_udf = F.udf(date_udf, StringType())
timestamp_col = pred_df['timestamp']
pred_df = pred_df.withColumn('datetime', datetime_udf(timestamp_col))
# Columns that are written out: flow identification, datetime and the prediction.
pred_df_1 = pred_df.select(
    'flow_id', 'src_ip', 'src_port', 'dst_ip', 'dst_port', 'protocol',
    'datetime', 'prediction'
)

In [31]:
# Start the streaming query; every micro-batch is handed to write_mongo_row.
query = pred_df_1.writeStream.foreachBatch(write_mongo_row).start()
try:
    # Blocks until the query terminates or the cell is interrupted.
    query.awaitTermination()
finally:
    # Interrupting the cell only unblocks awaitTermination(). Without an
    # explicit stop() the query would keep running and writing in the
    # background, and re-running the cell would start a second one.
    query.stop()

KeyboardInterrupt: 