In [1]:
# Path to the input file (a gzip archive, going by its extension).
data_file = "./kddcup.data_10_percent.gz"

# Read the file through the SparkContext `sc`, which this cell expects to
# already exist in the session; the result is what later cells transform.
raw_data = sc.textFile(data_file)

#### filter transformation

In [2]:
# Keep only the lines in which the substring "normal." occurs anywhere.
normal_raw_data = raw_data.filter(lambda line: "normal." in line)

Now we can count how many elements the new, filtered RDD contains.

In [3]:
from time import time

# Count the elements of the filtered RDD and measure how long the call takes.
t0 = time()
normal_count = normal_raw_data.count()
tt = time() - t0  # elapsed wall-clock time in seconds

print("There are {} 'normal' interactions".format(normal_count))
# Fixed the misspelled unit ("secends") in the message below.
print("Count completed in {} seconds".format(round(tt, 3)))

There are 97278 'normal' interactions
Count completed in 2.038 secends


#### map transformation
Python's lambdas are especially expressive for this kind of per-element transformation.

In [5]:
from pprint import pprint

# Split every raw line on commas so each element becomes a list of fields.
csv_data = raw_data.map(lambda line: line.split(","))

# Time how long it takes to fetch the first five parsed rows.
t0 = time()
head_rows = csv_data.take(5)
tt = time() - t0

print("Parse completed in {} seconds".format(round(tt, 3)))
pprint(head_rows[0])

Parse completed in 0.152 seconds
['0',
 'tcp',
 'http',
 'SF',
 '181',
 '5450',
 '0',
 '0',
 '0',
 '0',
 '0',
 '1',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '8',
 '8',
 '0.00',
 '0.00',
 '0.00',
 '0.00',
 '1.00',
 '0.00',
 '0.00',
 '9',
 '9',
 '1.00',
 '0.00',
 '0.11',
 '0.00',
 '0.00',
 '0.00',
 '0.00',
 '0.00',
 'normal.']


In [6]:
# Same measurement as before, this time asking for up to 100000 parsed rows.
t0 = time()
head_rows = csv_data.take(100000)
tt = time() - t0

print("Parse completed in {} seconds".format(round(tt, 3)))

Parse completed in 1.533 seconds


#### using map and predefined functions

In [7]:
def parse_interaction(line, tag_index=41):
    """Split a comma-separated record and pair it with its tag field.

    Args:
        line: One comma-separated record, as a string.
        tag_index: Position of the tag within the split record. Defaults to
            41, the position this function originally hard-coded; pass -1 to
            use the last field regardless of how many fields the record has.

    Returns:
        A ``(tag, elems)`` tuple in which ``elems`` is the list of every
        field of the record (the tag included) and ``tag`` is
        ``elems[tag_index]``.

    Raises:
        IndexError: If the record has no field at ``tag_index``.
    """
    elems = line.split(',')
    return (elems[tag_index], elems)

# Turn every raw line into the (tag, fields) pair built by parse_interaction.
key_csv_data = raw_data.map(parse_interaction)
# Fetch the first five pairs and pretty-print the first of them.
head_rows = key_csv_data.take(5)
pprint(head_rows[0])

('normal.',
 ['0',
  'tcp',
  'http',
  'SF',
  '181',
  '5450',
  '0',
  '0',
  '0',
  '0',
  '0',
  '1',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '8',
  '8',
  '0.00',
  '0.00',
  '0.00',
  '0.00',
  '1.00',
  '0.00',
  '0.00',
  '9',
  '9',
  '1.00',
  '0.00',
  '0.11',
  '0.00',
  '0.00',
  '0.00',
  '0.00',
  '0.00',
  'normal.'])


#### collect action

In [10]:
from timeit import default_timer as timer

# Pull every element of the RDD into local memory and time the transfer.
t0 = timer()
all_raw_data = raw_data.collect()
tt = timer() - t0

print("Data collected in {} seconds".format(round(tt, 3)))

Data collected in 2.458 seconds


In [12]:
# Start again from the input file.
data_file = './kddcup.data_10_percent.gz'
raw_data = sc.textFile(data_file)

# Build (tag, fields) pairs, then keep those whose tag is exactly "normal.".
key_csv_data = raw_data.map(parse_interaction)
normal_key_interactions = key_csv_data.filter(lambda pair: pair[0] == 'normal.')

# Time the collect() that brings the filtered pairs into local memory.
t0 = timer()
all_normal = normal_key_interactions.collect()
tt = timer() - t0

# The collected result is measured with len(), so no second Spark job is needed.
normal_count = len(all_normal)
print("Data collected in {} seconds".format(round(tt, 3)))
print("There are {} 'normal' interactions".format(normal_count))

Data collected in 2.478 seconds
There are 97278 'normal' interactions


In [14]:
# Display the first collected (tag, fields) pair.
all_normal[0]

('normal.',
 ['0',
  'tcp',
  'http',
  'SF',
  '181',
  '5450',
  '0',
  '0',
  '0',
  '0',
  '0',
  '1',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '8',
  '8',
  '0.00',
  '0.00',
  '0.00',
  '0.00',
  '1.00',
  '0.00',
  '0.00',
  '9',
  '9',
  '1.00',
  '0.00',
  '0.11',
  '0.00',
  '0.00',
  '0.00',
  '0.00',
  '0.00',
  'normal.'])

In [15]:
# Show which Python type the collect() call above returned.
type(all_normal)

list