In [1]:
"""
spark rdd 常用函数练习
"""

'\nspark rdd \xe5\xb8\xb8\xe7\x94\xa8\xe5\x87\xbd\xe6\x95\xb0\xe7\xbb\x83\xe4\xb9\xa0\n'

In [14]:
import random
import json
import pandas as pd
# Show DataFrames in full: a value of None removes pandas' row/column display limits.
pd.set_option('display.max_rows',None)
pd.set_option('display.max_columns', None)

### 提交spark任务

In [3]:
from pyspark.sql import SparkSession
from pyspark import SparkConf, StorageLevel
from pyspark.sql.types import *
from pyspark.sql import functions

# Cluster mode. The "yarn-client" master URL has been deprecated since Spark 2.0
# (later releases no longer accept it); the supported spelling is master "yarn"
# together with an explicit client deploy mode.
conf = SparkConf().setMaster("yarn").setAppName("spark exercise rdd")
conf.set("spark.submit.deployMode", "client")
# conf = SparkConf().setMaster("local[*]").setAppName("antilaunder demo")  # local mode
# Executor / driver sizing for this exercise (all values passed as strings).
conf.set("spark.executor.instances", "10")
conf.set("spark.executor.memory", "5g")
conf.set("spark.executor.cores", "1")
conf.set("spark.driver.memory", "5g")

# Reuse an existing session if one is already running, otherwise create it
# from the configuration above; `sc` is the session's underlying SparkContext.
spark = SparkSession.builder.config(conf=conf).getOrCreate()
sc = spark.sparkContext

### 读取文件  

In [4]:
# Load the raw log as an RDD of text lines and decode every line with json.loads.
# NOTE: the name `rdd` is rebound to a different dataset in a later cell
# (`rdd = df.rdd.map(list)`), so cells below do not all refer to this log.
rdd = sc.textFile("/data/fresh_train/rawlog").map(json.loads)
# Peek at the first decoded record.
rdd.take(1)

[{u'device_model': u'pc',
  u'event_type': u'download_resource',
  u'gps': u'',
  u'ip': u'117.152.153.142',
  u'os_type': u'Windows 7',
  u'os_version': u'NT 6.1',
  u'resource_category': u'\u5176\u5b83',
  u'resource_id': u'9805867',
  u'resource_owner': u'xiaoyu5256',
  u'resource_score': u'18',
  u'resource_title': u'Python\u6570\u636e\u5206\u6790\u57fa\u7840\u6559\u7a0b\uff1aNumPy\u5b66\u4e60\u6307\u5357\uff08\u7b2c2\u7248\uff09.pdf',
  u'resource_type': u'pdf',
  u'resource_user_type': u'VIP',
  u'time_stamp': u'2018-07-01 22:47:17',
  u'uid': u'03bb7a85aa4bda74ec990b99c1a217e0',
  u'user_agent': u'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
  u'user_id': u'72318725',
  u'user_name': u'robert_1993'}]

In [5]:
# Count the number of records in the RDD.
rdd.count()

4195696

### df转rdd

In [24]:
# Small two-column DataFrame (columns "A" and "B") used by the examples below.
# Key 'b' appears twice, presumably so that the key-based examples have
# something to group.
# NOTE(review): some recorded outputs further down still show a row ('c', 3)
# that is not in this data -- they look stale; re-run the notebook top to
# bottom to refresh them.
df = spark.createDataFrame([
    ['a', 1],
    ['b', 2],
    ['b', 3],
    ['d', 4],
], schema=["A", "B"])

In [25]:
# Convert the DataFrame to an RDD and turn every Row into a plain Python list.
# This rebinds `rdd`, which previously referred to the raw-log RDD.
rdd = df.rdd.map(list)
rdd.take(2)

[[u'a', 1], [u'b', 2]]

### sc.parallelize

In [11]:
# Build an RDD directly from a local Python list (elements of mixed types are allowed).
rdd1 = sc.parallelize(["a", 1, 2, "c"])
rdd1.take(2)

['a', 1]

### persist

In [9]:
# Mark the RDD for caching; no storage level is passed, so Spark's default level applies.
rdd.persist()

PythonRDD[19] at RDD at PythonRDD.scala:48

### unpersist

In [18]:
# Undo the persist() above: release the cached copy of the RDD.
rdd.unpersist()

PythonRDD[19] at RDD at PythonRDD.scala:48

### collect

In [12]:
# Caution: collect() brings the whole RDD into the driver's memory; if the
# driver does not have enough memory this can fail with an OOM error.
data = rdd.collect()
data

[[u'a', 1], [u'b', 2], [u'c', 3], [u'd', 4]]

### filter

In [13]:
# Keep only the rows whose second element is greater than 2.
rdd1 = rdd.filter(lambda x: x[1]>2)
rdd1.take(2)

[[u'c', 3], [u'd', 4]]

### map

In [15]:
def map_(x, num):
    """Return a new list holding the items of ``x`` followed by ``num``.

    The result is built with ``+``, so a list passed as ``x`` is left unmodified.
    """
    tail = [num]
    extended = x + tail
    return extended

# Append a random float to every row; map produces exactly one output element per input row.
# No random seed is set, so the appended values are not reproducible across runs.
rdd1 = rdd.map(lambda x: map_(x, random.random()))
rdd1.take(2)

[[u'a', 1, 0.21632663060770863], [u'b', 2, 0.6889301695179241]]

### flatMap

In [17]:
# Same per-row function as the map example, but flatMap flattens each returned
# list, so the result is one RDD of individual values instead of an RDD of lists.
# Relies on map_ being defined by the map cell.
rdd1 = rdd.flatMap(lambda x: map_(x, random.random()))
rdd1.take(5)

[u'a', 1, 0.21632663060770863, u'b', 2]

### countByValue

In [16]:
# Count how often each distinct first-column value occurs; the counts come back as a dict-like object.
rdd.map(lambda x: x[0]).countByValue()

defaultdict(int, {u'a': 1, u'b': 1, u'c': 1, u'd': 1})

### zipWithIndex

In [22]:
# Attach a sequential index to every record, starting at 0 and increasing by 1.
# The RDD is sorted by its first element in descending order first, so the index follows that order.
rdd1 = rdd.sortBy(lambda x: x[0], ascending=False).zipWithIndex()
rdd1.take(5)

[([u'd', 4], 0), ([u'c', 3], 1), ([u'b', 2], 2), ([u'a', 1], 3)]

### groupByKey

In [27]:
# Group the values of each key (rows are [key, value] pairs). groupByKey yields
# an iterable per key, so mapValues(list) materialises it for display while
# leaving the keys untouched.
rdd.groupByKey().mapValues(list).take(5)

[(u'b', [2, 3]), (u'd', [4]), (u'a', [1])]

### join

In [23]:
# Join two pair RDDs on their keys: each result is (key, (value_from_x, value_from_y)).
# Key "b" has no counterpart in y, so it is absent from the result.
x = sc.parallelize([("a", 1), ("b", 4)])
y = sc.parallelize([("a", 2), ("a", 3)])
# sorted() gives the collected pairs a stable display order.
sorted(x.join(y).collect())

[('a', (1, 2)), ('a', (1, 3))]

### union

In [28]:
# union concatenates the two RDDs: every element of both inputs is kept and no key matching is done.
x = sc.parallelize([("a", 1), ("b", 4)])
y = sc.parallelize([("a", 2), ("a", 3)])
z = x.union(y)
z.take(5)

[('a', 1), ('b', 4), ('a', 2), ('a', 3)]

### save

In [None]:
# Write the RDD out as text files.
rdd.saveAsTextFile("...") # placeholder: fill in the output path before running

In [None]:
sc.stop()  # shut down the Spark application