In [2]:
# Load the gzipped training file through the Spark context.
data_file = "./kddcup.data.gz"
raw_data = sc.textFile(data_file)

# Report how many records the training set holds.
print(
    "Train data size is {}".format(
        raw_data.count()
    )
)

Train data size is 4898431


In [3]:
# Load the gzipped test file through the Spark context.
test_data_file = "./corrected.gz"
test_raw_data = sc.textFile(test_data_file)

# Report how many records the test set holds.
print(
    "Test data size is {}".format(
        test_raw_data.count()
    )
)

Test data size is 311029


### Preparing the data

In [4]:
from pyspark.mllib.regression import LabeledPoint
import numpy as np

# Split every comma-separated record into a list of string fields.
csv_data = raw_data.map(lambda line: line.split(','))
test_csv_data = test_raw_data.map(lambda line: line.split(','))

# Distinct values of columns 1, 2 and 3, collected from the training set only;
# their list positions are later used as numeric codes for those columns.
protocols = csv_data.map(lambda fields: fields[1]).distinct().collect()
services = csv_data.map(lambda fields: fields[2]).distinct().collect()
flags = csv_data.map(lambda fields: fields[3]).distinct().collect()

And now we can use these Python lists in our create_labeled_point function. If a factor level is not in the training data, we assign a special level. Remember that we cannot use testing data for training our model, not even the factor levels. The testing data represents the unknown to us in a real case.

In [5]:
def create_labeled_point(line_split):
    """Convert one split CSV record into a binary-labelled LabeledPoint.

    The first 41 fields are used as features. Fields 1, 2 and 3 (protocol,
    service and flag) are replaced by their position in the module-level
    ``protocols``, ``services`` and ``flags`` lists. A value that is not in
    the corresponding list is encoded as ``len(list)``, i.e. one extra level
    reserved for values never seen in the training data.

    Args:
        line_split: list of string fields for one record; field 41 holds
            the label.

    Returns:
        LabeledPoint whose label is 0.0 when field 41 equals ``'normal.'``
        and 1.0 otherwise, and whose features are the 41 fields as floats.

    Raises:
        IndexError: if the record has too few fields.
        ValueError: if a remaining feature field cannot be parsed as a float.
    """
    clean_line_split = line_split[0:41]

    # convert protocol, service and flag to numeric categorical variables
    for position, levels in ((1, protocols), (2, services), (3, flags)):
        try:
            clean_line_split[position] = levels.index(clean_line_split[position])
        except ValueError:
            # list.index raises ValueError for a level that is not in the
            # training data; only that case maps to the extra level.
            clean_line_split[position] = len(levels)

    # convert label to binary label
    attack = 0.0 if line_split[41] == 'normal.' else 1.0

    return LabeledPoint(attack, np.array([float(x) for x in clean_line_split]))

# Turn the parsed test and training records into RDDs of LabeledPoint.
test_data = test_csv_data.map(create_labeled_point)
training_data = csv_data.map(create_labeled_point)

In [6]:
from pyspark.mllib.tree import DecisionTree,DecisionTreeModel
from time import time

# Build model
# Features 1, 2 and 3 are declared categorical, each with an arity equal to
# the number of levels collected from the training data.
# NOTE(review): create_labeled_point encodes unseen levels as len(levels),
# which lies outside 0..arity-1 -- confirm MLlib tolerates that at prediction
# time, or declare arity + 1.
t0 = time()
tree_model = DecisionTree.trainClassifier(
    training_data,
    numClasses=2,
    categoricalFeaturesInfo={
        1: len(protocols),
        2: len(services),
        3: len(flags),
    },
    impurity='gini',
    maxDepth=4,
    maxBins=100,
)
tt = time() - t0
print("Classifier trained in {} seconds".format(round(tt, 3)))

Classifier trained in 104.302 seconds


In [14]:
# Predict on the feature vectors only, then pair every true label with its
# prediction as (label, prediction).
predictions = tree_model.predict(
    test_data.map(lambda point: point.features)
)
labels_and_preds = test_data.map(lambda point: point.label).zip(predictions)

In [16]:
# Accuracy = matching (label, prediction) pairs / total test points.
# Both counts are inside the timed region.
t0 = time()
test_accuracy = (
    labels_and_preds.filter(lambda pair: pair[0] == pair[1]).count()
    / float(test_data.count())
)
tt = time() - t0

print(
    "Prediction made in {} seconds. Test accuracy is {}".format(
        round(tt, 3), round(test_accuracy, 4)
    )
)

Prediction made in 8.109 seconds. Test accuracy is 0.915


In [17]:
# Print the model's debug string: the learned tree as nested if/else rules
# over feature indices.
print("Learned classification tree model:")
print(tree_model.toDebugString())

Learned classification tree model:
DecisionTreeModel classifier of depth 4 with 23 nodes
  If (feature 22 <= 88.5)
   If (feature 38 <= 0.7949999999999999)
    If (feature 36 <= 0.49)
     If (feature 34 <= 0.9550000000000001)
      Predict: 0.0
     Else (feature 34 > 0.9550000000000001)
      Predict: 1.0
    Else (feature 36 > 0.49)
     If (feature 2 in {0.0,5.0,24.0,25.0,14.0,20.0,29.0,1.0,21.0,13.0,2.0,17.0,22.0,27.0,7.0,3.0,11.0,26.0,23.0,8.0,19.0,4.0})
      Predict: 0.0
     Else (feature 2 not in {0.0,5.0,24.0,25.0,14.0,20.0,29.0,1.0,21.0,13.0,2.0,17.0,22.0,27.0,7.0,3.0,11.0,26.0,23.0,8.0,19.0,4.0})
      Predict: 1.0
   Else (feature 38 > 0.7949999999999999)
    If (feature 3 in {0.0,1.0,6.0,2.0,8.0})
     Predict: 0.0
    Else (feature 3 not in {0.0,1.0,6.0,2.0,8.0})
     Predict: 1.0
  Else (feature 22 > 88.5)
   If (feature 5 <= 2.0)
    If (feature 11 <= 0.5)
     Predict: 1.0
    Else (feature 11 > 0.5)
     If (feature 2 in {14.0})
      Predict: 0.0
     Else (feature

In [18]:
def create_labeled_point_minimal(line_split):
    """Convert one split CSV record into a three-feature LabeledPoint.

    Only fields 3, 5 and 22 of the record are kept, in that order. Field 3
    (the flag) is replaced by its position in the module-level ``flags``
    list; a value that is not in the list is encoded as ``len(flags)``,
    i.e. one extra level for values never seen in the training data.

    Args:
        line_split: list of string fields for one record; field 41 holds
            the label.

    Returns:
        LabeledPoint whose label is 0.0 when field 41 equals ``'normal.'``
        and 1.0 otherwise, and whose features are the three kept fields as
        floats.

    Raises:
        IndexError: if the record has too few fields.
        ValueError: if a remaining feature field cannot be parsed as a float.
    """
    clean_line_split = line_split[3:4] + line_split[5:6] + line_split[22:23]

    # convert flag to numeric categorical variable
    try:
        clean_line_split[0] = flags.index(clean_line_split[0])
    except ValueError:
        # list.index raises ValueError for a flag that is not in the
        # training data; only that case maps to the extra level.
        clean_line_split[0] = len(flags)

    # convert label to binary label
    attack = 0.0 if line_split[41] == 'normal.' else 1.0

    return LabeledPoint(attack, np.array([float(x) for x in clean_line_split]))

# Build the reduced-feature test and training RDDs of LabeledPoint.
test_data_minimal = test_csv_data.map(create_labeled_point_minimal)
training_data_minimal = csv_data.map(create_labeled_point_minimal)

In [20]:
# Train a shallower tree on the reduced feature set; feature 0 (the flag) is
# the only categorical feature here.
t0 = time()
tree_model_minial = DecisionTree.trainClassifier(
    training_data_minimal,
    numClasses=2,
    categoricalFeaturesInfo={0: len(flags)},
    impurity='gini',
    maxDepth=3,
    maxBins=32,
)
tt = time() - t0
print("Classifier trained in {} seconds".format(round(tt, 3)))

Classifier trained in 66.613 seconds


In [24]:
# Predict on the reduced feature vectors, then pair every true label with its
# prediction as (label, prediction).
predictions_minimal = tree_model_minial.predict(
    test_data_minimal.map(lambda point: point.features)
)
labels_and_preds_minimal = (
    test_data_minimal.map(lambda point: point.label).zip(predictions_minimal)
)

In [25]:
# Accuracy of the reduced model = matching (label, prediction) pairs / total
# test points. Both counts are inside the timed region.
t0 = time()
test_accuracy = (
    labels_and_preds_minimal.filter(lambda pair: pair[0] == pair[1]).count()
    / float(test_data_minimal.count())
)
tt = time() - t0

print(
    "Prediction made in {} seconds. Test accuracy is {}".format(
        round(tt, 3), round(test_accuracy, 4)
    )
)

Prediction made in 4.373 seconds. Test accuracy is 0.9055
