![GMV](https://www.gmv.com/export/system/modules/com.gmv.teresa.site/resources/theme/img/logo_gmv.svg)  ![Apache Spark](http://spark.apache.org/images/spark-logo.png)

# KDD99 Supervised Learning I

# Apache Spark Initialization

In [None]:
# findspark.init() is called before `import pyspark` on purpose: it makes the
# pyspark package importable when it is not already on sys.path.
import findspark
findspark.init()
import pyspark
# SparkContext used below to build the SQLContext and to read the input file.
# NOTE(review): re-running this cell while `sc` is still alive presumably fails,
# because only one SparkContext may be active per process — confirm.
sc = pyspark.SparkContext(appName="SecurityDataScience")

In [None]:
%matplotlib inline

## 0. Libraries

In [None]:
import numpy as np
import pandas as pd
from pyspark.sql import SQLContext
from pyspark.sql.types import *
from pyspark.sql.functions import * 
sqlContext = SQLContext(sc)

## 1. Data Description

**Intrinsic attributes**

These attributes are extracted from the headers' area of the network packets.

Col|Feature name  | description |	type
---|--------------|-------------|------------
1  |duration 	  |length (number of seconds) of the connection |continuous
2  |protocol_type |type of the protocol, e.g. tcp, udp, etc. |discrete
3  |service 	  |network service on the destination, e.g., http, telnet, etc. |discrete
4  |flag 	      |normal or error status of the connection. The possible statuses are: SF, S0, S1, S2, S3, OTH, REJ, RSTO, RSTOS0, SH, RSTRH, SHR 	|discrete 
5  |src_bytes 	  |number of data bytes from source to destination 	|continuous
6  |dst_bytes 	  |number of data bytes from destination to source 	|continuous
7  |land 	      |1 if connection is from/to the same host/port; 0 otherwise 	|discrete
8  |wrong_fragment|sum of bad checksum packets in a connection 	|continuous
9  |urgent 	      |number of urgent packets. Urgent packets are packets with the urgent bit activated 	|continuous


**Class attribute**

The 42nd attribute is the ***class_attack*** attribute; it indicates the type of connection each instance represents: normal, or a specific attack. The values it can take are the following: *anomaly, dict, dict_simple, eject, eject-fail, ffb, ffb_clear, format, format_clear, format-fail, ftp-write, guest, imap, land, load_clear, loadmodule, multihop, perl_clear, perlmagic, phf, rootkit, spy, syslog, teardrop, warez, warezclient, warezmaster, pod, back, ipsweep, neptune, nmap, portsweep, satan, smurf and normal*.

**Categories of class attribute**


class_attack |Category
-------|--------------
smurf| dos
neptune| dos
back| dos
teardrop| dos
pod| dos
land| dos
normal|normal
satan|probe
ipsweep|probe
portsweep|probe
nmap|probe
warezclient|r2l
guess_passwd|r2l
warezmaster|r2l
imap|r2l
ftp_write|r2l
multihop|r2l
phf|r2l
spy|r2l
buffer_overflow|u2r
rootkit|u2r
loadmodule|u2r
perl|u2r

## 2. Load Data

In [None]:
# Read the KDD training file as an RDD of text lines; the second argument (4)
# is the minimum number of partitions requested from Spark.
textFileConn = sc.textFile('./data/KDD/KDDTrain+.txt', 4)


In [None]:
#Creating the schema

#we define the name of the columns

columnNames=["class_attack", "duration","protocol_type","service","flag","src_bytes","dst_bytes","land",
                 "wrong_fragment","urgent"]

In [None]:
# Quick field initialisation: every column starts out as a nullable FloatType field
connFields = [StructField(colName, FloatType(), True) for colName in columnNames]

In [None]:
# Override the data type of the fields that hold text rather than numbers.
# The indices refer to positions in columnNames:
# 0 = class_attack, 2 = protocol_type, 3 = service, 4 = flag.
connFields[0].dataType = StringType()
connFields[2].dataType = StringType()
connFields[3].dataType = StringType()
connFields[4].dataType = StringType()

In [None]:
# we can construct our schema, which we will use later below for building the data frame
connSchema = StructType(connFields)

In [None]:
#Parsing the file
def parseReg(p):
    """Turn one comma-split record into a tuple ordered like ``columnNames``.

    ``p`` is the list of string fields of a single input line. Field 41 is
    the attack label and is placed first; field 0 (duration) and fields 4-8
    are converted to ``float``; fields 1-3 (protocol_type, service, flag)
    are kept as strings.
    """
    label = p[41]
    duration = float(p[0])
    categorical = tuple(p[1:4])
    numeric = tuple(float(field) for field in p[4:9])
    return (label, duration) + categorical + numeric

In [None]:
connParsedFile = (textFileConn.map(lambda line: line.split(','))
                              .map(parseReg))

In [None]:
# We are now ready to build our data frame, using the connParsedFile RDD computed above and the schema 
# variable already calculated:
conn = sqlContext.createDataFrame(connParsedFile, connSchema)
conn.cache()

In [None]:
conn.take(3)

In [None]:
conn.limit(4).toPandas()

In [None]:
# Get all the distinct values of class_attack
conn.select("class_attack").distinct().toPandas()

-----------------

## 3. Data Preparation

### 3.1 Encoding categorical features

In [None]:
from pyspark.sql import functions as F

In [None]:
def encodeCategorical(df, catName):
    """One-hot encode the categorical column ``catName`` of ``df``.

    For every distinct non-null value of ``catName`` a new integer column
    named after that value is appended; it holds 1 on the rows where
    ``catName`` equals the value and 0 everywhere else (rows whose
    ``catName`` is null therefore get 0 in every indicator column).
    The original ``catName`` column is dropped from the result.

    Parameters:
        df: Spark DataFrame containing the column ``catName``.
        catName: name of the categorical column to encode.

    Returns:
        A new DataFrame with the indicator columns appended, in sorted
        category order, and without ``catName``.

    Raises:
        ValueError: if a category value would overwrite an existing column.
    """
    # distinct() gives no ordering guarantee, so sort the values: the
    # indicator columns (and any positional feature vector built from them)
    # then come out in the same order on every run.
    categories = sorted(
        row[0]
        for row in df.select(catName).distinct().collect()
        if row[0] is not None
    )

    # withColumn() silently replaces an existing column of the same name,
    # which would destroy data. Compare case-insensitively because Spark
    # resolves column names case-insensitively by default.
    existing = {name.lower() for name in df.columns}
    clashes = [c for c in categories if str(c).lower() in existing]
    if clashes:
        raise ValueError(
            "Values of '%s' clash with existing column names: %s"
            % (catName, clashes)
        )

    aux = df
    for c in categories:
        aux = aux.withColumn(c, F.when(df[catName] == c, 1).otherwise(0))

    return aux.drop(catName)

### Encoding *protocol_type*

In [None]:
conn.select("protocol_type").distinct().toPandas()

In [None]:
connEncoded = encodeCategorical(conn, "protocol_type")

In [None]:
connEncoded.limit(10).toPandas()

### Encoding *service*

In [None]:
connEncoded.select("service").distinct().toPandas()

In [None]:
connEncoded = encodeCategorical(connEncoded, "service")

In [None]:
connEncoded.limit(10).toPandas()

### Encoding *flag*

In [None]:
connEncoded.select("flag").distinct().toPandas()

In [None]:
connEncoded = encodeCategorical(connEncoded, "flag")

In [None]:
connEncoded.limit(10).toPandas()

###  Encoding *class_attack* (**label**) like Integers

In [None]:
connEncoded.select("class_attack").distinct().toPandas()

In [None]:
categories = connEncoded.select("class_attack").distinct().toPandas()["class_attack"]

In [None]:
# Map each class_attack name to the integer position it has in `categories`
dictCategories = {name: int(pos) for pos, name in categories.to_dict().items()}

In [None]:
from pyspark.sql.functions import udf

In [None]:
def categoriesToInt(cat):
    """Return the integer code stored in ``dictCategories`` for the name ``cat``.

    A name that is not a key of ``dictCategories`` raises ``KeyError``.
    """
    return dictCategories[cat]

# Wrap the lookup as a Spark UDF declared to return IntegerType, so it can be
# applied to a DataFrame column.
udfCategoriesToInt = udf(categoriesToInt, IntegerType())

In [None]:
connEncoded = connEncoded.withColumn("class_attack", udfCategoriesToInt("class_attack") )

In [None]:
connEncoded.limit(10).toPandas()

## 4. Decision Trees

### 4.1 Data preparation for Decision Trees

In [None]:
# Take roughly 30% of the data to reduce the training time.
# Sample WITHOUT replacement: sampling with replacement can emit the same row
# several times, and randomSplit() may then place copies of one connection in
# both the training and the test set, which inflates the reported metrics.
connSample = connEncoded.sample(withReplacement = False, fraction = 0.30)

In [None]:
# Turn class_attack into a binary label: 0.0 for "normal", 1.0 for any attack.
# The integer code of "normal" is looked up in dictCategories rather than
# hard-coded: those codes follow the row order of a distinct() query, which
# Spark does not guarantee, so a literal index may denote another class on a
# different run. The condition is built on connSample itself, the DataFrame
# being transformed.
connDT = connSample.withColumn(
    'class_attack',
    when(connSample['class_attack'] == dictCategories["normal"], 0.0).otherwise(1.0))
connDT.cache()

In [None]:
connDT.limit(10).toPandas()

In [None]:
# Adapt the DataFrame to the algorithm API
from pyspark.ml.linalg import DenseVector

# Define a DataFrame with columns "label" and "features": the first column of
# each row becomes the label and all remaining columns are packed, in order,
# into a single DenseVector.
connDT = connDT.rdd.map(lambda row: (row[0], DenseVector(row[1:]))).toDF(['label','features'])

In [None]:
connDT.limit(10).toPandas()

### 4.2 Training Decision Tree

**Parameters**:
* **Problem specification parameters** 
    * *algo*: Classification or Regression
    * *numClasses*: Number of classes (for Classification only)
    * *categoricalFeaturesInfo*: Specifies which features are categorical and how many categorical values each of those features can take. This is given as a map from feature indices to feature arity (number of categories). Any features not in this map are treated as continuous.
* **Stopping criteria**
    * *maxDepth*: Maximum depth of a tree. Deeper trees are more expressive (potentially allowing higher accuracy), but they are also more costly to train and are more likely to overfit.
    * *minInstancesPerNode*: For a node to be split further, each of its children must receive at least this number of training instances. This is commonly used with RandomForest since those are often trained deeper than individual trees.
    * *minInfoGain*: For a node to be split further, the split must improve at least this much (in terms of information gain).
* **Tunable parameters**
    * *maxBins*: Number of bins used when discretizing continuous features.
    * *maxMemoryInMB*: Amount of memory to be used for collecting sufficient statistics.
    * *impurity*: Impurity measure (Gini impurity, Entropy or Variance) used to choose between candidate splits. This measure must match the algo parameter.

In [None]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.feature import StringIndexer, VectorIndexer
from pyspark.mllib.evaluation import BinaryClassificationMetrics
from pyspark.mllib.evaluation import MulticlassMetrics

In [None]:
# Index labels, adding metadata to the label column.
# Fit on whole dataset to include all labels in index.
labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(connDT)

In [None]:
# Automatically identify categorical features, and index them.
# We specify maxCategories so features with > 20 distinct values are treated as continuous.
featureIndexer =\
    VectorIndexer(inputCol='features', outputCol="indexedFeatures", maxCategories=20).fit(connDT)

In [None]:
# Split the data into training and test sets (30% held out for testing)
(trainingData, testData) = connDT.randomSplit([0.7, 0.3])

In [None]:
# Train a DecisionTree model.
dt = DecisionTreeClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures")

In [None]:
# Chain indexers and tree in a Pipeline
pipeline = Pipeline(stages=[labelIndexer, featureIndexer, dt])

In [None]:
# Train model with trainingData.  This also runs the indexers.
dtModel = pipeline.fit(trainingData)

In [None]:
# Make predictions with testData.
predictions = dtModel.transform(testData)

In [None]:
# Select example rows to display.
predictions.select("prediction", "indexedLabel", "features").limit(10).toPandas()

In [None]:
# Pair each predicted class with its indexed label, as an RDD of Rows.
# These are hard class predictions, not raw scores or probabilities.
predictionAndLabels = predictions.select("prediction", "indexedLabel").rdd

In [None]:
predictionAndLabels.take(3)

In [None]:
# Instantiate Basic Metrics object
basicMetrics = MulticlassMetrics(predictionAndLabels)

In [None]:
print("Summary Stats")
print("Weighted Precision = %s" % basicMetrics.weightedPrecision)
print("Weighted Recall = %s" % basicMetrics.weightedRecall)
print("Weighted F1 Score = %s" % basicMetrics.weightedFMeasure())
print("Confusion Matrix:")
pd.DataFrame(basicMetrics.confusionMatrix().toArray())

In [None]:
# Instantiate Advanced Metrics object.
# BinaryClassificationMetrics expects (score, label) pairs with a continuous
# score. Feeding it the hard 0/1 prediction yields a curve with a single
# threshold, so the class-1 probability produced by the model is used as the
# score instead.
scoreAndLabels = predictions.select("probability", "indexedLabel").rdd.map(
    lambda row: (float(row["probability"][1]), float(row["indexedLabel"])))
advMetrics = BinaryClassificationMetrics(scoreAndLabels)

In [None]:
# Area under precision-recall curve
print("Area under PR = %s" % advMetrics.areaUnderPR)

# Area under ROC curve
print("Area under ROC = %s" % advMetrics.areaUnderROC)

In [None]:
# Show rules
treeModel = dtModel.stages[2]
print(treeModel.toDebugString)

### 4.3 Model Selection via Cross-Validation to Decision Tree

In [None]:
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [None]:
# Index labels, adding metadata to the label column.
# Fit on whole dataset to include all labels in index.
labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(connDT)

# Automatically identify categorical features, and index them.
# We specify maxCategories so features with > 20 distinct values are treated as continuous.
featureIndexer =\
    VectorIndexer(inputCol='features', outputCol="indexedFeatures", maxCategories=20).fit(connDT)
    
# Train a DecisionTree model.
dt = DecisionTreeClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures")

# Chain indexers and tree in a Pipeline
pipeline = Pipeline(stages=[labelIndexer, featureIndexer, dt])

In [None]:
grid = ParamGridBuilder().addGrid(dt.maxDepth, range(3,7)) \
                         .addGrid(dt.maxBins, range(20,51, 10)) \
                         .build()

In [None]:
evaluator = BinaryClassificationEvaluator(metricName="areaUnderPR")

In [None]:
cv = CrossValidator(estimator=pipeline, estimatorParamMaps=grid, evaluator=evaluator)

In [None]:
%%time 
cvModel = cv.fit(connDT)

In [None]:
%%time
evaluator.evaluate(cvModel.transform(connDT))

In [None]:
cvModel.bestModel.stages[2]

In [None]:
evaluator.evaluate(cvModel.transform(connDT), {evaluator.metricName: "areaUnderPR"})

## 5. Random Forests
Random forests are ensembles of decision trees. Random forests are one of the most successful machine learning models for classification and regression. They combine many decision trees in order to **reduce the risk of overfitting**. Like decision trees, random forests handle categorical features, extend to the multiclass classification setting, do not require feature scaling, and are able to capture non-linearities and feature interactions.

Random forests train a set of decision trees separately, so the training can be done in parallel. The algorithm injects randomness into the training process so that each decision tree is a bit different. Combining the predictions from each tree reduces the variance of the predictions, improving the performance on test data.

**Parameters:**
* **numTrees**: Number of trees in the forest. Increasing the number of trees will decrease the variance in predictions, improving the model's test-time accuracy. Training time increases roughly linearly in the number of trees.
* **maxDepth**: Maximum depth of each tree in the forest. Increasing the depth makes the model more expressive and powerful. However, deep trees take longer to train and are also more prone to overfitting. In general, it is acceptable to train deeper trees when using random forests than when using a single decision tree. One tree is more likely to overfit than a random forest (because of the variance reduction from averaging multiple trees in the forest).
* **subsamplingRate**: This parameter specifies the size of the dataset used for training each tree in the forest, as a fraction of the size of the original dataset. The default (1.0) is recommended, but decreasing this fraction can speed up training.
* **featureSubsetStrategy**: Number of features to use as candidates for splitting at each tree node. The number is specified as a fraction or function of the total number of features. Decreasing this number will speed up training, but can sometimes impact performance if too low.

### 5.1 Data preparation for RandomForest

In [None]:
# Take roughly 30% of the data to reduce the training time.
# Sample WITHOUT replacement: sampling with replacement can emit the same row
# several times, and randomSplit() may then place copies of one connection in
# both the training and the test set, which inflates the reported metrics.
connSample = connEncoded.sample(withReplacement = False, fraction = 0.30)

In [None]:
# Turn class_attack into a binary label: 0.0 for "normal", 1.0 for any attack.
# The integer code of "normal" is looked up in dictCategories rather than
# hard-coded: those codes follow the row order of a distinct() query, which
# Spark does not guarantee, so a literal index may denote another class on a
# different run. The condition is built on connSample itself, the DataFrame
# being transformed.
connRF = connSample.withColumn(
    'class_attack',
    when(connSample['class_attack'] == dictCategories["normal"], 0.0).otherwise(1.0))
connRF.cache()

In [None]:
connRF.limit(10).toPandas()

In [None]:
# Adapt the DataFrame to the algorithm API
from pyspark.ml.linalg import DenseVector

# Define a DataFrame with columns "label" and "features": the first column of
# each row becomes the label and all remaining columns are packed, in order,
# into a single DenseVector.
connRF = connRF.rdd.map(lambda row: (row[0], DenseVector(row[1:]))).toDF(['label','features'])

In [None]:
connRF.limit(10).toPandas()

### 5.2 RandomForest Model

In [None]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.feature import StringIndexer, VectorIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.mllib.evaluation import BinaryClassificationMetrics

In [None]:
# Index labels, adding metadata to the label column.
# Fit on whole dataset to include all labels in index.
labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(connRF)

In [None]:
# Automatically identify categorical features, and index them.
# We specify maxCategories so features with > 20 distinct values are treated as continuous.
featureIndexer =\
    VectorIndexer(inputCol='features', outputCol="indexedFeatures", maxCategories=20).fit(connRF)

In [None]:
# Split the data into training and test sets (30% held out for testing)
(trainingData, testData) = connRF.randomSplit([0.7, 0.3])

In [None]:
# Train a RandomForest model.
rf = RandomForestClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures",
                           numTrees = 10, maxDepth = 5, impurity = "gini")

In [None]:
# Chain indexers and tree in a Pipeline
pipeline = Pipeline(stages=[labelIndexer, featureIndexer, rf])

In [None]:
# Train model with trainingData.  This also runs the indexers.
rfModel = pipeline.fit(trainingData)

In [None]:
# Make predictions with testData.
predictions = rfModel.transform(testData)

In [None]:
# Select example rows to display.
predictions.select("prediction", "indexedLabel", "features").limit(10).toPandas()

In [None]:
# Pair each predicted class with its indexed label, as an RDD of Rows.
# These are hard class predictions, not raw scores or probabilities.
predictionAndLabels = predictions.select("prediction", "indexedLabel").rdd

In [None]:
predictionAndLabels.take(3)

In [None]:
# Instantiate Basic Metrics object
basicMetrics = MulticlassMetrics(predictionAndLabels)

In [None]:
print("Summary Stats")
print("Weighted Precision = %s" % basicMetrics.weightedPrecision)
print("Weighted Recall = %s" % basicMetrics.weightedRecall)
print("Weighted F1 Score = %s" % basicMetrics.weightedFMeasure())
print("Confusion Matrix:")
pd.DataFrame(basicMetrics.confusionMatrix().toArray())

In [None]:
# Instantiate Advanced Metrics object.
# BinaryClassificationMetrics expects (score, label) pairs with a continuous
# score. Feeding it the hard 0/1 prediction yields a curve with a single
# threshold, so the class-1 probability produced by the model is used as the
# score instead.
scoreAndLabels = predictions.select("probability", "indexedLabel").rdd.map(
    lambda row: (float(row["probability"][1]), float(row["indexedLabel"])))
advMetrics = BinaryClassificationMetrics(scoreAndLabels)

In [None]:
# Area under precision-recall curve
print("Area under PR = %s" % advMetrics.areaUnderPR)

# Area under ROC curve
print("Area under ROC = %s" % advMetrics.areaUnderROC)

In [None]:
# Show rules
treeModel = rfModel.stages[2]
print(treeModel.toDebugString) # summary only

### 5.3 Model Selection via Cross-Validation to RandomForest

In [None]:
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [None]:
# Index labels, adding metadata to the label column.
# Fit on whole dataset to include all labels in index.
labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(connRF)

# Automatically identify categorical features, and index them.
# We specify maxCategories so features with > 20 distinct values are treated as continuous.
featureIndexer =\
    VectorIndexer(inputCol='features', outputCol="indexedFeatures", maxCategories=20).fit(connRF)
    
# Configure the RandomForest estimator with default hyper-parameters.
# numTrees and maxDepth are supplied by the parameter grid defined below.
rf = RandomForestClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures")

# Chain indexers and tree in a Pipeline
pipeline = Pipeline(stages=[labelIndexer, featureIndexer, rf])

In [None]:
grid = ParamGridBuilder().addGrid(rf.numTrees, range(10, 41, 10)) \
                         .addGrid(rf.maxDepth, range(2, 7, 2)) \
                         .build()

In [None]:
evaluator = BinaryClassificationEvaluator(metricName="areaUnderPR")

In [None]:
cv = CrossValidator(estimator=pipeline, estimatorParamMaps=grid, evaluator=evaluator)

In [None]:
%%time 
cvModel = cv.fit(connRF)

In [None]:
%%time
evaluator.evaluate(cvModel.transform(connRF))

In [None]:
cvModel.bestModel.stages[2]

In [None]:
evaluator.evaluate(cvModel.transform(connRF), {evaluator.metricName: "areaUnderPR"})

----------