Software to detect network intrusions protects a computer network from unauthorized users, including perhaps insiders. This project aims to build a network intrusion detector, a predictive model capable of distinguishing between bad connections, called intrusions or attacks, and good normal connections.Model this problem as a BINARY classification problem. Use the following models to detect bad connections.

#### Requirement: We will be using a KDD dataset to try to classify a connection as 'normal' or others.

In [1]:
from IPython.core.display import HTML
display(HTML("<style>pre { white-space: pre !important; }</style>"))

In [2]:
import findspark
findspark.init()

In [3]:
import pandas as pd

In [4]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler, StringIndexer, OneHotEncoder
from pyspark.ml.classification import DecisionTreeClassifier, RandomForestClassifier, GBTClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml import Pipeline

from pyspark.mllib.evaluation import MulticlassMetrics

In [5]:
spark = SparkSession.builder.appName('kdd').getOrCreate()

In [6]:
df = spark.read.csv("../../Data/kddcup.data_10_percent.gz", header=False, inferSchema=True)

In [7]:
df.count()

494021

In [8]:
str(df.columns)

"['_c0', '_c1', '_c2', '_c3', '_c4', '_c5', '_c6', '_c7', '_c8', '_c9', '_c10', '_c11', '_c12', '_c13', '_c14', '_c15', '_c16', '_c17', '_c18', '_c19', '_c20', '_c21', '_c22', '_c23', '_c24', '_c25', '_c26', '_c27', '_c28', '_c29', '_c30', '_c31', '_c32', '_c33', '_c34', '_c35', '_c36', '_c37', '_c38', '_c39', '_c40', '_c41']"

In [9]:
cols =['_c0', '_c4', '_c5', '_c6', '_c7', '_c8', '_c9', 
       '_c10', '_c11', '_c12', '_c13', '_c14', '_c15', '_c16', '_c17', '_c18', '_c19', 
       '_c20', '_c21', '_c22', '_c23', '_c24', '_c25', '_c26', '_c27', '_c28', '_c29', 
       '_c30', '_c31', '_c32', '_c33', '_c34', '_c35', '_c36', '_c37', '_c38', '_c39', 
       '_c40']
for col_name in cols:
    df = df.withColumn(col_name, col(col_name).cast('float'))

In [10]:
df.show(3)

+---+---+----+---+-----+------+---+---+---+---+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+-------+
|_c0|_c1| _c2|_c3|  _c4|   _c5|_c6|_c7|_c8|_c9|_c10|_c11|_c12|_c13|_c14|_c15|_c16|_c17|_c18|_c19|_c20|_c21|_c22|_c23|_c24|_c25|_c26|_c27|_c28|_c29|_c30|_c31|_c32|_c33|_c34|_c35|_c36|_c37|_c38|_c39|_c40|   _c41|
+---+---+----+---+-----+------+---+---+---+---+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+-------+
|0.0|tcp|http| SF|181.0|5450.0|0.0|0.0|0.0|0.0| 0.0| 1.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 8.0| 8.0| 0.0| 0.0| 0.0| 0.0| 1.0| 0.0| 0.0| 9.0| 9.0| 1.0| 0.0|0.11| 0.0| 0.0| 0.0| 0.0| 0.0|normal.|
|0.0|tcp|http| SF|239.0| 486.0|0.0|0.0|0.0|0.0| 0.0| 1.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 8.0| 8.0| 0.0| 0.0| 0.0| 0.0| 1.0| 0.0| 0.0|19.0

In [11]:
df.groupBy('_c41').count().show(30)

+----------------+------+
|            _c41| count|
+----------------+------+
|    warezmaster.|    20|
|          smurf.|280790|
|            pod.|   264|
|           imap.|    12|
|           nmap.|   231|
|   guess_passwd.|    53|
|        ipsweep.|  1247|
|      portsweep.|  1040|
|          satan.|  1589|
|           land.|    21|
|     loadmodule.|     9|
|      ftp_write.|     8|
|buffer_overflow.|    30|
|        rootkit.|    10|
|    warezclient.|  1020|
|       teardrop.|   979|
|           perl.|     3|
|            phf.|     4|
|       multihop.|     7|
|        neptune.|107201|
|           back.|  2203|
|            spy.|     2|
|         normal.| 97278|
+----------------+------+



In [12]:
train_data, test_data = df.randomSplit([0.8, 0.2])

In [13]:
test_data.groupBy('_c41').count().show(30)

+----------------+-----+
|            _c41|count|
+----------------+-----+
|    warezmaster.|    2|
|          smurf.|56387|
|            pod.|   52|
|           nmap.|   45|
|           imap.|    2|
|   guess_passwd.|   11|
|        ipsweep.|  256|
|      portsweep.|  213|
|          satan.|  308|
|           land.|    7|
|     loadmodule.|    1|
|      ftp_write.|    3|
|buffer_overflow.|    8|
|        rootkit.|    1|
|    warezclient.|  186|
|       teardrop.|  198|
|           perl.|    1|
|            phf.|    1|
|       multihop.|    3|
|        neptune.|21839|
|           back.|  438|
|         normal.|19324|
+----------------+-----+



In [14]:
# Convert categorical strings to index values
indexer1 = StringIndexer(inputCol='_c1', outputCol='c1_idx')
indexer2 = StringIndexer(inputCol='_c2', outputCol='c2_idx')
indexer3 = StringIndexer(inputCol='_c3', outputCol='c3_idx')
indexer41 = StringIndexer(inputCol='_c41', outputCol='c41_idx')

In [15]:
# One-hot encode index values
onehot = OneHotEncoder(inputCols=['c1_idx', 'c2_idx' , 'c3_idx'],
                                outputCols=['c1_dummy', 'c2_dummy', 'c3_dummy'])

In [16]:
# Assemble predictions into a single column
assembler = VectorAssembler(inputCols=['_c0', 'c1_dummy', 'c2_dummy', 'c3_dummy', '_c4', '_c5', '_c6', '_c7', '_c8', '_c9', 
                                       '_c10', '_c11', '_c12', '_c13', '_c14', '_c15', '_c16', '_c17', '_c18', '_c19', 
                                       '_c20', '_c21', '_c22', '_c23', '_c24', '_c25', '_c26', '_c27', '_c28', '_c29', 
                                       '_c30', '_c31', '_c32', '_c33', '_c34', '_c35', '_c36', '_c37', '_c38', '_c39', 
                                       '_c40'], outputCol='features')

In [17]:
# A linear regression object
dtc = DecisionTreeClassifier(featuresCol='features', labelCol='c41_idx', predictionCol='prediction')

In [18]:
# Construct a pipeline
pipeline = Pipeline(stages=[indexer1, indexer2, indexer3, indexer41, onehot, assembler, dtc])

In [19]:
# Train the pipeline on the training data
model = pipeline.fit(train_data)

In [20]:
# Make predictions on testing data
predictions = model.transform(test_data)

In [21]:
# Inspect results
predictions.select('prediction', "c41_idx").show(5)

+----------+-------+
|prediction|c41_idx|
+----------+-------+
|       5.0|    5.0|
|       5.0|    5.0|
|       5.0|    5.0|
|       5.0|    5.0|
|       5.0|    5.0|
+----------+-------+
only showing top 5 rows



In [22]:
# Select (prediction, true label) and compute test error
acc_evaluator = MulticlassClassificationEvaluator(labelCol="c41_idx",
                                                  predictionCol="prediction",
                                                  metricName="accuracy")

In [23]:
# important: need to cast to float type, and other by prediction, else it won't work
preds_and_labels = predictions.select(['prediction', 'c41_idx'])\
                              .withColumn('c41_idx', col('c41_idx')\
                                          .cast("float")).orderBy('prediction')

In [24]:
# Select only prediction and label columns
preds_and_labels = preds_and_labels.select(['prediction', 'c41_idx'])

In [25]:
acc_evaluator.evaluate(preds_and_labels)

0.9932921056342284

In [26]:
# Confusion matrix
metrics = MulticlassMetrics(preds_and_labels.rdd.map(tuple))

In [27]:
print(metrics.confusionMatrix().toArray())

[[5.6387e+04 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00
  0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00
  0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00
  0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00]
 [0.0000e+00 2.1719e+04 1.1400e+02 0.0000e+00 6.0000e+00 0.0000e+00
  0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00
  0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00
  0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00]
 [0.0000e+00 0.0000e+00 1.9322e+04 0.0000e+00 0.0000e+00 2.0000e+00
  0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00
  0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00
  0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00]
 [0.0000e+00 0.0000e+00 1.8000e+01 4.2000e+02 0.0000e+00 0.0000e+00
  0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00
  0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00
  0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+

In [28]:
pd.set_option('display.max_columns', 30)

In [29]:
matrix = pd.DataFrame(metrics.confusionMatrix().toArray())

In [30]:
matrix

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21
0,56387.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,21719.0,114.0,0.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,19322.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,18.0,420.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,14.0,0.0,294.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,41.0,0.0,0.0,215.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,186.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,1.0,194.0,0.0,7.0,0.0,0.0,11.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,198.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,50.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Make new prediction

In [33]:
df_new = spark.read.csv('../../Data/kddcup.testdata.unlabeled_10_percent.gz', inferSchema=True, header=False)

In [34]:
df_new.count()

311029

In [35]:
str(df_new.columns)

"['_c0', '_c1', '_c2', '_c3', '_c4', '_c5', '_c6', '_c7', '_c8', '_c9', '_c10', '_c11', '_c12', '_c13', '_c14', '_c15', '_c16', '_c17', '_c18', '_c19', '_c20', '_c21', '_c22', '_c23', '_c24', '_c25', '_c26', '_c27', '_c28', '_c29', '_c30', '_c31', '_c32', '_c33', '_c34', '_c35', '_c36', '_c37', '_c38', '_c39', '_c40']"

In [37]:
# Make predictions on the testing data
predictions_new = model.transform(df_new)

In [38]:
predictions_new.select('features', 'prediction').show()

+--------------------+----------+
|            features|prediction|
+--------------------+----------+
|(115,[4,68,78,79,...|       2.0|
|(115,[4,68,78,79,...|       2.0|
|(115,[4,68,78,79,...|       2.0|
|(115,[4,68,78,79,...|       2.0|
|(115,[4,68,78,79,...|       2.0|
|(115,[4,68,78,79,...|       2.0|
|(115,[8,68,78,96,...|       2.0|
|(115,[4,68,78,79,...|       2.0|
|(115,[4,68,78,79,...|       2.0|
|(115,[2,5,68,78,7...|       2.0|
|(115,[4,68,78,79,...|       2.0|
|(115,[2,5,68,78,7...|       2.0|
|(115,[4,68,78,79,...|       2.0|
|(115,[4,68,78,79,...|       2.0|
|(115,[0,2,6,68,78...|       2.0|
|(115,[2,5,68,78,7...|       2.0|
|(115,[2,5,68,78,7...|       2.0|
|(115,[2,5,68,78,7...|       2.0|
|(115,[4,68,78,79,...|       2.0|
|(115,[4,68,78,79,...|       2.0|
+--------------------+----------+
only showing top 20 rows

