Software to detect network intrusions protects a computer network from unauthorized users, including perhaps insiders. This project aims to build a network intrusion detector: a predictive model capable of distinguishing between bad connections, called intrusions or attacks, and good normal connections. Model this problem as a binary classification problem, and use the models below to detect bad connections.

#### Requirement: We will be using a KDD dataset to try to classify a connection as 'normal' or others.

In [1]:
# findspark.init() is called here, before the pyspark imports in the cells
# below, so that the pyspark package can be located and imported.
import findspark
findspark.init()

In [None]:
import pandas as pd

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler, StringIndexer, OneHotEncoder
from pyspark.ml.classification import DecisionTreeClassifier, RandomForestClassifier, GBTClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml import Pipeline

from pyspark.mllib.evaluation import MulticlassMetrics

In [3]:
# Get the active Spark session if one exists, otherwise start one named 'kdd'
session_builder = SparkSession.builder.appName('kdd')
spark = session_builder.getOrCreate()

In [4]:
# Load the 10% KDD sample. The file has no header row, so the columns get the
# default names _c0 .. _c41, and each column's type is inferred from the data.
kdd_path = "../../Data/kddcup.data_10_percent.gz"
df = spark.read.csv(kdd_path, inferSchema=True, header=False)

In [5]:
# Number of rows loaded into df
df.count()

494021

In [6]:
# Show the column names as one string (default names _c0 .. _c41, since the
# file was read without a header)
str(df.columns)

"['_c0', '_c1', '_c2', '_c3', '_c4', '_c5', '_c6', '_c7', '_c8', '_c9', '_c10', '_c11', '_c12', '_c13', '_c14', '_c15', '_c16', '_c17', '_c18', '_c19', '_c20', '_c21', '_c22', '_c23', '_c24', '_c25', '_c26', '_c27', '_c28', '_c29', '_c30', '_c31', '_c32', '_c33', '_c34', '_c35', '_c36', '_c37', '_c38', '_c39', '_c40', '_c41']"

In [None]:
# All feature columns (_c41, the label, is deliberately not listed).
cols =['_c0', '_c1', '_c2', '_c3', '_c4', '_c5', '_c6', '_c7', '_c8', '_c9', 
       '_c10', '_c11', '_c12', '_c13', '_c14', '_c15', '_c16', '_c17', '_c18', '_c19', 
       '_c20', '_c21', '_c22', '_c23', '_c24', '_c25', '_c26', '_c27', '_c28', '_c29', 
       '_c30', '_c31', '_c32', '_c33', '_c34', '_c35', '_c36', '_c37', '_c38', '_c39', 
       '_c40']

# _c1, _c2 and _c3 hold categorical strings: they are string-indexed and
# one-hot encoded further down. They must NOT be cast, because casting a
# non-numeric string to float yields null and would wipe those features out.
categorical_cols = {'_c1', '_c2', '_c3'}

# Cast only the remaining (numeric) feature columns to float
for col_name in cols:
    if col_name in categorical_cols:
        continue
    df = df.withColumn(col_name, col(col_name).cast('float'))

In [None]:
# Print the first 3 rows to check the result of the casts
df.show(3)

In [None]:
# Row count per distinct value of _c41 (the connection label); prints up to 30 groups
df.groupBy('_c41').count().show(30)

In [None]:
# 80/20 train/test split. The fixed seed makes the split repeatable, so the
# counts and metrics computed below do not change every time the cell is re-run.
train_data, test_data = df.randomSplit([0.8, 0.2], seed=42)

In [None]:
# Row count per distinct _c41 value in the test split; prints up to 30 groups
test_data.groupBy('_c41').count().show(30)

In [None]:
# Convert categorical strings to index values.
# handleInvalid='keep' routes values that were not seen while fitting (a rare
# category can land entirely in the test split, or show up only in new data)
# and NULLs to one extra index instead of raising an error at transform time.
indexer1 = StringIndexer(inputCol='_c1', outputCol='c1_idx', handleInvalid='keep')
indexer2 = StringIndexer(inputCol='_c2', outputCol='c2_idx', handleInvalid='keep')
indexer3 = StringIndexer(inputCol='_c3', outputCol='c3_idx', handleInvalid='keep')
# Label column: the indexed label is what the classifier is trained on
indexer41 = StringIndexer(inputCol='_c41', outputCol='c41_idx', handleInvalid='keep')

In [None]:
# One-hot encode index values.
# Uses OneHotEncoder, the class this notebook imports from pyspark.ml.feature
# (OneHotEncoderEstimator was never imported). The multi-column form takes the
# plural keywords inputCols / outputCols (Spark 3.0+).
onehot = OneHotEncoder(inputCols=['c1_idx', 'c2_idx', 'c3_idx'],
                       outputCols=['c1_dummy', 'c2_dummy', 'c3_dummy'])

In [None]:
# Assemble all feature columns into a single vector column.
# The output column must be named 'features' because that is the column the
# classifier reads (featuresCol='features').
assembler = VectorAssembler(inputCols=['_c0', 'c1_dummy', 'c2_dummy', 'c3_dummy', '_c4', '_c5', '_c6', '_c7', '_c8', '_c9', 
                                       '_c10', '_c11', '_c12', '_c13', '_c14', '_c15', '_c16', '_c17', '_c18', '_c19', 
                                       '_c20', '_c21', '_c22', '_c23', '_c24', '_c25', '_c26', '_c27', '_c28', '_c29', 
                                       '_c30', '_c31', '_c32', '_c33', '_c34', '_c35', '_c36', '_c37', '_c38', '_c39', 
                                       '_c40'], outputCol='features')

In [None]:
# Decision tree classifier: reads the 'features' vector column, is trained on
# the indexed label 'c41_idx', and writes its output to 'prediction'
dtc = DecisionTreeClassifier(
    labelCol='c41_idx',
    featuresCol='features',
    predictionCol='prediction',
)

In [None]:
# Construct a pipeline. Stages run in list order: the four string indexers,
# the one-hot encoder, the vector assembler, and finally the classifier.
pipeline_stages = [indexer1, indexer2, indexer3, indexer41, onehot, assembler, dtc]
pipeline = Pipeline(stages=pipeline_stages)

In [None]:
# Train the pipeline on the training data. A Pipeline is an estimator: fit()
# returns the fitted PipelineModel, and only that model can transform data.
pipeline_model = pipeline.fit(train_data)

# Score the held-out test data with the fitted model
predictions = pipeline_model.transform(test_data)

In [None]:
# Inspect results: print the first 5 rows of predicted class next to the
# indexed true label
predictions.select('prediction', "c41_idx").show(5)

In [None]:
# Evaluator that scores the 'prediction' column against the 'c41_idx' label
# column using the accuracy metric
acc_evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy",
    predictionCol="prediction",
    labelCol="c41_idx",
)

In [None]:
# Important: need to cast the label to float type, and order by prediction.
# The chain is wrapped in parentheses so it can span several lines; the column
# produced by the classifier is 'prediction' (singular), and orderBy() is a
# DataFrame method, so it is applied after withColumn() rather than to the
# cast expression.
preds_and_labels = (
    predictions.select('prediction', 'c41_idx')
    .withColumn('c41_idx', col('c41_idx').cast('float'))
    .orderBy('prediction')
)

In [None]:
# Keep just the prediction and label columns, in that order
pred_label_cols = ['prediction', 'c41_idx']
preds_and_labels = preds_and_labels.select(pred_label_cols)

In [None]:
# Compute the evaluator's metric (accuracy) on the prediction/label pairs
acc_evaluator.evaluate(preds_and_labels)

In [None]:
# Confusion matrix: MulticlassMetrics takes an RDD, so turn each
# (prediction, c41_idx) row into a plain tuple first
pred_label_rdd = preds_and_labels.rdd.map(tuple)
metrics = MulticlassMetrics(pred_label_rdd)

In [None]:
# Print the confusion matrix converted to an array
print(metrics.confusionMatrix().toArray())

In [None]:
# Let pandas display up to 30 columns before truncating the output
pd.options.display.max_columns = 30

In [None]:
# Wrap the confusion-matrix array in a pandas DataFrame for tabular display
confusion_array = metrics.confusionMatrix().toArray()
matrix = pd.DataFrame(confusion_array)

In [None]:
# Display the confusion-matrix DataFrame
matrix

### Make new prediction

In [None]:
# Load the unlabeled data to score. The CSV reader is reached through
# spark.read (spark.read.csv); SparkSession has no read_csv method.
df_new = spark.read.csv('../../Data/kddcup.testdata.unlabeled_10_percent.gz', inferSchema=True, header=False)

In [None]:
# Number of rows in the new data
df_new.count()

In [None]:
# Show the new data's column names as one string
str(df_new.columns)

In [None]:
# Make predictions on the new, unlabeled data.
# A Pipeline has no transform(); it must be fitted first. Fit on the training
# data and score the new data with the resulting PipelineModel.
new_data_model = pipeline.fit(train_data)
predictions_new = new_data_model.transform(df_new)

In [None]:
# Print the feature vector and predicted class for the first rows of the new data
predictions_new.select('features', 'prediction').show()