In [1]:
from pyspark import SparkContext,SparkConf

In [2]:
sc=SparkContext()

In [3]:
data_file="/data/kdd/kddcup.data_10_percent"
raw_data=sc.textFile(data_file)

In [4]:
print "Train data size is {}".format(raw_data.count())

Train data size is 494021


In [5]:
test_data_file="/data/kdd/corrected"
test_raw_data=sc.textFile(test_data_file)
print "Test data size is {}".format(test_raw_data.count())

Test data size is 311029


In [6]:
from pyspark.mllib.regression import LabeledPoint
from numpy import array

csv_data = raw_data.map(lambda x: x.split(","))
test_csv_data = test_raw_data.map(lambda x: x.split(","))

protocols = csv_data.map(lambda x: x[1]).distinct().collect()
services = csv_data.map(lambda x: x[2]).distinct().collect()
flags = csv_data.map(lambda x: x[3]).distinct().collect()

In [7]:
def create_labeled_point(line_split):
    # leave_out = [41]
    clean_line_split = line_split[0:41]
    
    # convert protocol to numeric categorical variable
    try: 
        clean_line_split[1] = protocols.index(clean_line_split[1])
    except:
        clean_line_split[1] = len(protocols)
        
    # convert service to numeric categorical variable
    try:
        clean_line_split[2] = services.index(clean_line_split[2])
    except:
        clean_line_split[2] = len(services)
    
    # convert flag to numeric categorical variable
    try:
        clean_line_split[3] = flags.index(clean_line_split[3])
    except:
        clean_line_split[3] = len(flags)
    
    # convert label to binary label
    attack = 1.0
    if line_split[41]=='normal.':
        attack = 0.0
        
    return LabeledPoint(attack, array([float(x) for x in clean_line_split]))

training_data = csv_data.map(create_labeled_point)
test_data = test_csv_data.map(create_labeled_point)

In [10]:
from pyspark.mllib.tree import GradientBoostedTrees, GradientBoostedTreesModel
from time import time

t0=time()
tree_model = GradientBoostedTrees.trainClassifier(training_data,
                                                 categoricalFeaturesInfo={1: len(protocols), 2: len(services), 3: len(flags)}, 
                                                 numIterations=3,maxBins=100)

tt = time() - t0

print "Classifier trained in {} seconds".format(round(tt,3))

Classifier trained in 22.223 seconds


In [11]:
predictions = tree_model.predict(test_data.map(lambda p: p.features))
labels_and_preds = test_data.map(lambda p: p.label).zip(predictions)

In [12]:
t0 = time()
test_accuracy = labels_and_preds.filter(lambda (v, p): v == p).count() / float(test_data.count())
tt = time() - t0

print "Prediction made in {} seconds. Test accuracy is {}".format(round(tt,3), round(test_accuracy,4))

Prediction made in 9.707 seconds. Test accuracy is 0.9207


In [13]:
print "Learned classification tree model:"
print tree_model.toDebugString()

Learned classification tree model:
TreeEnsembleModel classifier with 3 trees

  Tree 0:
    If (feature 22 <= 84.0)
     If (feature 3 in {10.0,1.0,6.0,2.0,3.0,4.0})
      If (feature 9 <= 1.0)
       Predict: -0.9190060376228731
      Else (feature 9 > 1.0)
       Predict: 0.7106441612125387
     Else (feature 3 not in {10.0,1.0,6.0,2.0,3.0,4.0})
      If (feature 2 in {52.0,57.0,3.0})
       Predict: -0.43859649122807015
      Else (feature 2 not in {52.0,57.0,3.0})
       Predict: 0.9779353821907013
    Else (feature 22 > 84.0)
     If (feature 5 <= 0.0)
      If (feature 11 <= 0.0)
       Predict: 1.0
      Else (feature 11 > 0.0)
       Predict: -1.0
     Else (feature 5 > 0.0)
      If (feature 2 in {0.0,65.0,3.0,8.0,19.0})
       Predict: -0.9968
      Else (feature 2 not in {0.0,65.0,3.0,8.0,19.0})
       Predict: 0.6666666666666666
  Tree 1:
    If (feature 2 in {56.0,25.0,46.0,57.0,6.0,65.0,9.0,41.0,45.0,27.0,3.0,8.0,19.0,4.0})
     If (feature 12 <= 0.0)
      If (feature 28