In [1]:
from pyspark import SparkContext,SparkConf

In [2]:
sc=SparkContext()

In [3]:
data_file="/data/kdd/kddcup.data_10_percent"
raw_data=sc.textFile(data_file)

In [4]:
print "Train data size is {}".format(raw_data.count())

Train data size is 494021


In [5]:
test_data_file="/data/kdd/corrected"
test_raw_data=sc.textFile(test_data_file)
print "Test data size is {}".format(test_raw_data.count())

Test data size is 311029


In [6]:
from pyspark.mllib.regression import LabeledPoint
from numpy import array

csv_data = raw_data.map(lambda x: x.split(","))
test_csv_data = test_raw_data.map(lambda x: x.split(","))

protocols = csv_data.map(lambda x: x[1]).distinct().collect()
services = csv_data.map(lambda x: x[2]).distinct().collect()
flags = csv_data.map(lambda x: x[3]).distinct().collect()

In [7]:
def create_labeled_point(line_split):
    # leave_out = [41]
    clean_line_split = line_split[0:41]
    
    # convert protocol to numeric categorical variable
    try: 
        clean_line_split[1] = protocols.index(clean_line_split[1])
    except:
        clean_line_split[1] = len(protocols)
        
    # convert service to numeric categorical variable
    try:
        clean_line_split[2] = services.index(clean_line_split[2])
    except:
        clean_line_split[2] = len(services)
    
    # convert flag to numeric categorical variable
    try:
        clean_line_split[3] = flags.index(clean_line_split[3])
    except:
        clean_line_split[3] = len(flags)
    
    # convert label to binary label
    attack = 1.0
    if line_split[41]=='normal.':
        attack = 0.0
        
    return LabeledPoint(attack, array([float(x) for x in clean_line_split]))

training_data = csv_data.map(create_labeled_point)
test_data = test_csv_data.map(create_labeled_point)

In [9]:
from pyspark.mllib.tree import RandomForest, RandomForestModel
from time import time

t0 = time()
model = RandomForest.trainClassifier(training_data, numClasses=2, 
                                     categoricalFeaturesInfo={1: len(protocols), 2: len(services), 3: len(flags)},
                                     numTrees=3, featureSubsetStrategy="auto",
                                     impurity='gini', maxDepth=4, maxBins=100)
tt=time()-t0
print "Classifier trained in {} seconds".format(round(tt,3))

Classifier trained in 18.931 seconds


In [10]:
predictions = model.predict(test_data.map(lambda p: p.features))
labels_and_preds = test_data.map(lambda p: p.label).zip(predictions)

In [11]:
t0 = time()
test_accuracy = labels_and_preds.filter(lambda (v, p): v == p).count() / float(test_data.count())
tt = time() - t0

print "Prediction made in {} seconds. Test accuracy is {}".format(round(tt,3), round(test_accuracy,4))

Prediction made in 9.78 seconds. Test accuracy is 0.923


In [13]:
print "Learned classification tree model:"
print model.toDebugString()

Learned classification tree model:
TreeEnsembleModel classifier with 3 trees

  Tree 0:
    If (feature 22 <= 59.0)
     If (feature 35 <= 0.99)
      If (feature 25 <= 0.2)
       If (feature 40 <= 0.01)
        Predict: 0.0
       Else (feature 40 > 0.01)
        Predict: 0.0
      Else (feature 25 > 0.2)
       If (feature 32 <= 86.0)
        Predict: 1.0
       Else (feature 32 > 86.0)
        Predict: 0.0
     Else (feature 35 > 0.99)
      If (feature 2 in {10.0,24.0,25.0,52.0,20.0,46.0,38.0,33.0,65.0,41.0,2.0,45.0,22.0,49.0,3.0,35.0,63.0,50.0,43.0,8.0,58.0,36.0,19.0,4.0,62.0})
       If (feature 4 <= 1028.0)
        Predict: 0.0
       Else (feature 4 > 1028.0)
        Predict: 1.0
      Else (feature 2 not in {10.0,24.0,25.0,52.0,20.0,46.0,38.0,33.0,65.0,41.0,2.0,45.0,22.0,49.0,3.0,35.0,63.0,50.0,43.0,8.0,58.0,36.0,19.0,4.0,62.0})
       If (feature 36 <= 0.06)
        Predict: 1.0
       Else (feature 36 > 0.06)
        Predict: 1.0
    Else (feature 22 > 59.0)
     If (featur