We can aggregate RDD data in Spark by using three different actions: `reduce`, `fold`, and `aggregate`. The last one is the most general and, in a sense, subsumes the first two.

In [1]:
# Relative path to the gzip-compressed data set used throughout this notebook.
data_file = "./kddcup.data_10_percent.gz"
# `sc` is not defined in this notebook; presumably it is the SparkContext
# supplied by the PySpark shell/kernel -- TODO confirm.
raw_data = sc.textFile(data_file)

### Inspecting interaction duration by tag

In [2]:
# Break every comma-separated line into its list of fields.
csv_data = raw_data.map(lambda line: line.split(","))

# Field index 41 is compared against the tag "normal."; rows carrying any
# other tag go into the attack RDD.
normal_csv_data = csv_data.filter(lambda fields: fields[41] == "normal.")
attack_csv_data = csv_data.filter(lambda fields: fields[41] != "normal.")

In [3]:
# Keep only the first field of each row, parsed as an integer duration.
normal_duration_data = normal_csv_data.map(lambda fields: int(fields[0]))
attack_duration_data = attack_csv_data.map(lambda fields: int(fields[0]))

In [6]:
# Peek at the first five 'normal' durations as a quick sanity check.
normal_duration_data.take(5)

[0, 0, 0, 0, 0]

In [7]:
# Peek at the first five 'attack' durations as a quick sanity check.
attack_duration_data.take(5)

[184, 305, 79, 25, 0]

In [8]:
# Sum all durations of each class by pairwise addition with reduce().
total_normal_duration = normal_duration_data.reduce(lambda left, right: left + right)
total_attack_duration = attack_duration_data.reduce(lambda left, right: left + right)

In [9]:
# Report the summed duration of each class of interaction.
print("Total duration for 'normal' interactions is {}".format(
    total_normal_duration))
print("Total duration for 'attack' interactions is {}".format(
    total_attack_duration))

Total duration for 'normal' interactions is 21075991
Total duration for 'attack' interactions is 2626792


#### 第二种方法，采用add 官网文档

In [10]:
# Second approach: hand operator.add to reduce() instead of writing a lambda.
from operator import add

total_normal_duration_1 = normal_duration_data.reduce(add)
total_attack_duration_1 = attack_duration_data.reduce(add)

print('total duration for "normal" interactions is {}'.format(
    total_normal_duration_1))
print('total duration for "attack" interactions is {}'.format(
    total_attack_duration_1))

total duration for "normal" interactions is 21075991
total duration for "attack" interactions is 2626792


In [12]:
# Number of records in each class.
normal_count = normal_duration_data.count()
attack_count = attack_duration_data.count()

# Mean = total / count, rounded to three decimals. The float() conversion
# guarantees true (non-integer) division.
print("Mean duration for 'normal' interactions is {}".format(
    round(total_normal_duration / float(normal_count), 3)))
print("Mean duration for 'attack' interactions is {}".format(
    round(total_attack_duration / float(attack_count), 3)))

Mean duration for 'normal' interactions is 216.657
Mean duration for 'attack' interactions is 6.621


### A better way, using aggregate

In [13]:
# Build the pair (sum of durations, number of records) with one aggregate().
normal_sum_count = normal_duration_data.aggregate(
    (0, 0),  # zero value: (running sum, running count)
    lambda partial, duration: (partial[0] + duration, partial[1] + 1),  # fold one value into a partial
    lambda left, right: (left[0] + right[0], left[1] + right[1])  # merge two partials
)

# Mean = sum / count, rounded to three decimals.
print("Mean duration for 'normal' interactions is {}".format(
    round(normal_sum_count[0] / float(normal_sum_count[1]), 3)))

Mean duration for 'normal' interactions is 216.657


In [14]:
# Display the raw (sum, count) pair for 'normal' interactions.
normal_sum_count

(21075991, 97278)

In [15]:
# Build the pair (sum of durations, number of records) with one aggregate().
attack_sum_count = attack_duration_data.aggregate(
    (0, 0),  # zero value: (running sum, running count)
    lambda partial, duration: (partial[0] + duration, partial[1] + 1),  # fold one value into a partial
    lambda left, right: (left[0] + right[0], left[1] + right[1])  # merge two partials
)

# Mean = sum / count, rounded to three decimals.
print("Mean duration for 'attack' interactions is {}".format(
    round(attack_sum_count[0] / float(attack_sum_count[1]), 3)))

Mean duration for 'attack' interactions is 6.621


In [16]:
# Display the raw (sum, count) pair for 'attack' interactions.
attack_sum_count

(2626792, 396743)

aggregate函数的应用

aggregate(zeroValue, seqOp, combOp)

zeroValue 是初始值

seqOp 是对每个分区的操作

combOp 是对分区结果聚合的操作

In [17]:
# Small sample list used to illustrate aggregate().
data = [1, 12, 34, -5, 5, 25, 26, 89, 64]
# Distribute the list over 3 partitions.
rdd = sc.parallelize(data, 3)

这里要求最大值和所有分区的总和

In [18]:
def seqOp(x, y):
    """Fold one element ``y`` into the partial result ``x = (max, sum)``.

    Returns a new ``(max, sum)`` pair: the larger of the running maximum and
    ``y``, and the running sum increased by ``y``.
    """
    # max() keeps its first argument on ties, so listing y first returns the
    # running maximum only when it is strictly greater than y.
    return (max(y, x[0]), x[1] + y)


def combOP(x, y):
    """Merge two partial results ``x`` and ``y``, each a ``(max, sum)`` pair.

    Returns the larger of the two maxima together with the sum of both sums.
    """
    # Same tie rule as seqOp: x's maximum wins only when strictly greater.
    return (max(y[0], x[0]), x[1] + y[1])

In [19]:
# Seed the running maximum with -inf instead of 0: a seed of 0 would be
# reported as the maximum whenever every element is negative. The running sum
# still starts at 0. The result is a (max, sum) pair.
rdd.aggregate((float("-inf"), 0), seqOp, combOP)

(89, 251)