In [2]:
from pyspark import SparkContext,SparkConf

In [3]:
# Driver-side entry point to Spark, built with default settings; every RDD below hangs off it.
sc = SparkContext()

In [4]:
# Training file: one comma-separated record per line.
data_file = "/data/kdd/kddcup.data_10_percent"
raw_data = sc.textFile(data_file)

In [5]:
# Single-argument print(...) is valid on both Python 2 and Python 3,
# unlike the Python 2-only print statement.
print("Train data size is {}".format(raw_data.count()))

Train data size is 494021


In [6]:
# Test file, read the same way as the training file.
test_data_file = "/data/kdd/corrected"
test_raw_data = sc.textFile(test_data_file)
# Single-argument print(...) is valid on both Python 2 and Python 3.
print("Test data size is {}".format(test_raw_data.count()))

Test data size is 311029


In [7]:
## 1. Preparing the data
from pyspark.mllib.regression import LabeledPoint
from numpy import array

def _distinct_column_values(rows, column):
    """Collect to the driver the distinct values found at position `column` of each row."""
    return rows.map(lambda fields: fields[column]).distinct().collect()


# Split every text line into its list of comma-separated fields.
csv_data = raw_data.map(lambda line: line.split(","))
test_csv_data = test_raw_data.map(lambda line: line.split(","))

# Vocabularies of the three string-valued columns, taken from the training
# set only; a value's position in its list is later used as its numeric code.
protocols = _distinct_column_values(csv_data, 1)
services = _distinct_column_values(csv_data, 2)
flags = _distinct_column_values(csv_data, 3)

In [8]:
def create_labeled_point(line_split):
    """Turn one split CSV record into a binary-labelled MLlib ``LabeledPoint``.

    The first 41 fields become the feature vector. Fields 1, 2 and 3
    (protocol, service and flag) are replaced by their position in the
    module-level ``protocols``, ``services`` and ``flags`` lists; a value that
    is not in the list is encoded as ``len(<list>)``.

    Args:
        line_split: list of field strings with the label at index 41.

    Returns:
        LabeledPoint whose label is 0.0 when field 41 equals ``'normal.'`` and
        1.0 otherwise, and whose features are the 41 fields as floats.

    Raises:
        IndexError: if the record has fewer than 42 fields.
        ValueError: if a remaining feature field cannot be parsed by ``float``.
    """
    clean_line_split = line_split[0:41]

    # Convert each categorical column to its numeric code.
    for position, known_values in ((1, protocols), (2, services), (3, flags)):
        try:
            clean_line_split[position] = known_values.index(clean_line_split[position])
        except ValueError:
            # list.index raises ValueError for a value not seen when the list
            # was built; give all such values one shared extra code.
            clean_line_split[position] = len(known_values)

    # Binary label: anything that is not 'normal.' counts as an attack.
    attack = 0.0 if line_split[41] == 'normal.' else 1.0

    return LabeledPoint(attack, array([float(x) for x in clean_line_split]))

# Encode both splits with the same function so categorical codes agree.
training_data, test_data = (
    split.map(create_labeled_point) for split in (csv_data, test_csv_data)
)

In [9]:
from pyspark.mllib.tree import DecisionTree, DecisionTreeModel
from time import time

# Build the model: a depth-4 Gini decision tree, timing the fit.
# Features 1, 2 and 3 hold the index-encoded protocol, service and flag, so
# they are declared categorical with one category per value seen in training.
# NOTE(review): create_labeled_point encodes unseen values as len(<list>),
# one past the arity declared here -- confirm MLlib treats that as intended.
categorical_features = {1: len(protocols), 2: len(services), 3: len(flags)}

t0 = time()
tree_model = DecisionTree.trainClassifier(
    training_data,
    numClasses=2,
    categoricalFeaturesInfo=categorical_features,
    impurity='gini',
    maxDepth=4,
    maxBins=100
)
tt = time() - t0

# Single-argument print(...) is valid on both Python 2 and Python 3.
print("Classifier trained in {} seconds".format(round(tt, 3)))

Classifier trained in 19.522 seconds


In [9]:
## Evaluating the model
# Score the feature vectors, then pair each true label with its prediction.
test_features = test_data.map(lambda point: point.features)
test_labels = test_data.map(lambda point: point.label)
predictions = tree_model.predict(test_features)
labels_and_preds = test_labels.zip(predictions)

In [10]:
t0 = time()
# Accuracy = matching (label, prediction) pairs / number of test records.
# The pair is indexed rather than unpacked in the lambda signature because
# tuple parameters (`lambda (v, p): ...`) are a SyntaxError on Python 3.
correct_count = labels_and_preds.filter(lambda pair: pair[0] == pair[1]).count()
test_accuracy = correct_count / float(test_data.count())
tt = time() - t0

# Single-argument print(...) is valid on both Python 2 and Python 3.
print("Prediction made in {} seconds. Test accuracy is {}".format(round(tt, 3), round(test_accuracy, 4)))

Prediction made in 9.665 seconds. Test accuracy is 0.9221


In [11]:
## Interpreting the model
# Dump the learned tree as nested if/else rules.
# Single-argument print(...) is valid on both Python 2 and Python 3.
print("Learned classification tree model:")
print(tree_model.toDebugString())

Learned classification tree model:
DecisionTreeModel classifier of depth 4 with 27 nodes
  If (feature 22 <= 84.0)
   If (feature 4 <= 28.0)
    If (feature 2 in {0.0,10.0,56.0,42.0,24.0,25.0,52.0,20.0,46.0,57.0,29.0,61.0,1.0,60.0,28.0,38.0,21.0,33.0,53.0,13.0,41.0,2.0,32.0,34.0,64.0,17.0,22.0,44.0,59.0,27.0,54.0,49.0,3.0,35.0,48.0,18.0,50.0,16.0,31.0,11.0,43.0,40.0,26.0,55.0,23.0,8.0,58.0,36.0,30.0,51.0,19.0,47.0,15.0,62.0})
     If (feature 37 <= 0.55)
      Predict: 0.0
     Else (feature 37 > 0.55)
      Predict: 1.0
    Else (feature 2 not in {0.0,10.0,56.0,42.0,24.0,25.0,52.0,20.0,46.0,57.0,29.0,61.0,1.0,60.0,28.0,38.0,21.0,33.0,53.0,13.0,41.0,2.0,32.0,34.0,64.0,17.0,22.0,44.0,59.0,27.0,54.0,49.0,3.0,35.0,48.0,18.0,50.0,16.0,31.0,11.0,43.0,40.0,26.0,55.0,23.0,8.0,58.0,36.0,30.0,51.0,19.0,47.0,15.0,62.0})
     If (feature 11 <= 0.0)
      Predict: 1.0
     Else (feature 11 > 0.0)
      Predict: 0.0
   Else (feature 4 > 28.0)
    If (feature 9 <= 0.0)
     If (feature 7 <= 0.0)
   

In [12]:
# Map two numeric service codes back to their names via the services list.
# Single-argument print(...) is valid on both Python 2 and Python 3.
print("Service 0 is {}".format(services[0]))
print("Service 52 is {}".format(services[52]))

Service 0 is domain
Service 52 is time


In [13]:
def create_labeled_point_minimal(line_split):
    """Build a ``LabeledPoint`` from only three fields of a split CSV record.

    The feature vector is, in order, fields 3 (flag), 5 and 22. The flag is
    replaced by its position in the module-level ``flags`` list; a flag that
    is not in the list is encoded as ``len(flags)``.

    Args:
        line_split: list of field strings with the label at index 41.

    Returns:
        LabeledPoint whose label is 0.0 when field 41 equals ``'normal.'`` and
        1.0 otherwise, and whose features are the three selected fields as
        floats.

    Raises:
        IndexError: if the record is too short to hold the selected fields
            or the label.
        ValueError: if field 5 or 22 cannot be parsed by ``float``.
    """
    # One-element slices keep each selected field as a list so they concatenate.
    clean_line_split = line_split[3:4] + line_split[5:6] + line_split[22:23]

    # Convert flag to its numeric code.
    try:
        clean_line_split[0] = flags.index(clean_line_split[0])
    except ValueError:
        # list.index raises ValueError for a flag not seen when the list was
        # built; give all such flags one shared extra code.
        clean_line_split[0] = len(flags)

    # Binary label: anything that is not 'normal.' counts as an attack.
    attack = 0.0 if line_split[41] == 'normal.' else 1.0

    return LabeledPoint(attack, array([float(x) for x in clean_line_split]))

# Apply the three-feature encoding to both splits.
training_data_minimal, test_data_minimal = (
    split.map(create_labeled_point_minimal) for split in (csv_data, test_csv_data)
)

In [14]:
# Build the model: a depth-3 Gini tree on the three-feature data, timing the fit.
# Feature 0 is the index-encoded flag, so it is declared categorical.
# NOTE(review): create_labeled_point_minimal encodes unseen flags as
# len(flags), one past the arity declared here -- confirm this is intended.
t0 = time()
tree_model_minimal = DecisionTree.trainClassifier(
    training_data_minimal,
    numClasses=2,
    categoricalFeaturesInfo={0: len(flags)},
    impurity='gini',
    maxDepth=3,
    maxBins=32
)
tt = time() - t0

# Single-argument print(...) is valid on both Python 2 and Python 3.
print("Classifier trained in {} seconds".format(round(tt, 3)))

Classifier trained in 11.414 seconds


In [15]:
# Score the three-feature vectors, then pair each true label with its prediction.
test_features_minimal = test_data_minimal.map(lambda point: point.features)
test_labels_minimal = test_data_minimal.map(lambda point: point.label)
predictions_minimal = tree_model_minimal.predict(test_features_minimal)
labels_and_preds_minimal = test_labels_minimal.zip(predictions_minimal)

In [16]:
t0 = time()
# Accuracy = matching (label, prediction) pairs / number of test records.
# The pair is indexed rather than unpacked in the lambda signature because
# tuple parameters (`lambda (v, p): ...`) are a SyntaxError on Python 3.
correct_count = labels_and_preds_minimal.filter(lambda pair: pair[0] == pair[1]).count()
test_accuracy = correct_count / float(test_data_minimal.count())
tt = time() - t0

# Single-argument print(...) is valid on both Python 2 and Python 3.
print("Prediction made in {} seconds. Test accuracy is {}".format(round(tt, 3), round(test_accuracy, 4)))

Prediction made in 5.228 seconds. Test accuracy is 0.9152
