In [1]:
from pyspark import SparkContext, SparkConf

In [2]:
# Build the connection settings for the standalone-mode cluster.
conf = (
    SparkConf()
    .setAppName('sparkRddDemo')
    .setMaster("spark://sparkstandalone:7077")
)
# Set the driver address explicitly; the original author flags this as very
# important when connecting to a standalone-mode cluster.
conf.set("spark.driver.host", "192.168.88.1")
# Obtain the Spark context. getOrCreate reuses the context that is already
# running when this cell is executed again, instead of failing because a second
# SparkContext cannot be started in the same kernel.
sc = SparkContext.getOrCreate(conf=conf)

In [None]:
# # Create the connection configuration for a local connection
# conf = SparkConf().setAppName('sparkRddDemo').setMaster("local[2]")
# # Obtain the Spark context
# sc =  SparkContext(conf=conf)

In [None]:
# RDDs can be created in two ways: by parallelizing an iterable/collection
# through the context, or by loading from a storage system.
# This cell uses the first way, starting from an in-memory list.
data = list(range(1, 6))
distData = sc.parallelize(data)
print(type(distData))
# Sum all elements of the distributed collection
print(distData.reduce(lambda total, item: total + item))

In [None]:
# Create an RDD of lines from the text files matching ./*.md
distFile = sc.textFile("./*.md")
# Add up the length of every line (a character total, not a word count)
print(
    distFile
    .map(lambda line: len(line))
    .reduce(lambda total, length: total + length)
)

In [None]:
# Create an RDD from the text files found under the current directory
distFiles = sc.wholeTextFiles("./")
# distFiles.map(lambda line:len(line)).reduce(lambda a,b:a+b)
# Bring every record back to the driver and print the whole list
print(distFiles.collect())

In [None]:
# Persist distFile (the RDD built with sc.textFile, not distFiles) in pickle
# format under the path "ts.pickle"
distFile.saveAsPickleFile("ts.pickle")

In [None]:
# Read the pickled data saved under "ts.pickle" back into an RDD
ds = sc.pickleFile("ts.pickle")
print(type(ds))
# Last expression of the cell: display the loaded records
ds.collect()

In [None]:
# Saving and loading SequenceFiles: build (n, "a" repeated n times) pairs
# for n = 1, 2, 3 and write them out as a SequenceFile
rdd = (
    sc.parallelize(range(1, 4))
    .map(lambda n: (n, n * "a"))
)
print(type(rdd))
rdd.saveAsSequenceFile('sequence/to/file')

In [None]:
# Read the SequenceFile back and display its records in sorted order
sorted(sc.sequenceFile('sequence/to/file').collect())

In [None]:
# Load data.txt as an RDD of lines and derive the length of each line
lines = sc.textFile("data.txt")
lineLengths = lines.map(lambda line: len(line))
# This prints the RDD object itself, not the computed lengths
print(lineLengths)
# reduce adds the per-line lengths together into a single total
totalLengths = lineLengths.reduce(lambda total, length: total + length)
print(totalLengths)

In [14]:
# Read the data through a Hadoop InputFormat (CombineTextInputFormat),
# with LongWritable keys and Text values
path = "hdfs://sparkstandalone:8020/data/"
rdd = sc.newAPIHadoopFile(
    path,
    "org.apache.hadoop.mapreduce.lib.input.CombineTextInputFormat",
    "org.apache.hadoop.io.LongWritable",
    "org.apache.hadoop.io.Text",
    conf={
        # 4194304 = 4 * 1024 * 1024
        "mapreduce.input.fileinputformat.split.maxsize": "4194304",
        # "mapreduce.input.fileinputformat.split.minsize":" 4194304"
    },
)

In [15]:
# Split the value of each (key, value) record on single spaces and pair every
# token with a count of 1
word_kv = (
    rdd.flatMap(lambda record: record[1].split(' '))
    .map(lambda word: (word, 1))
)
# Sum the counts per token and display the result
word_kv.reduceByKey(lambda left, right: left + right).collect()

[('hadoop', 574560),
 ('yiqiatguigu', 3),
 ('hello', 21),
 ('spark', 7),
 ('atguigu', 1149117),
 ('world', 14),
 ('yiqi', 574557)]

In [11]:
# Read the data through a Hadoop InputFormat (TextInputFormat),
# with LongWritable keys and Text values
path = "hdfs://sparkstandalone:8020/data/"
rdd = sc.newAPIHadoopFile(
    path,
    "org.apache.hadoop.mapreduce.lib.input.TextInputFormat",
    "org.apache.hadoop.io.LongWritable",
    "org.apache.hadoop.io.Text",
)

In [13]:
# Tokenise the value part of every record on single spaces, then emit (token, 1)
tokens = lambda record: record[1].split(' ')
word_kv = rdd.flatMap(tokens).map(lambda token: (token, 1))
del tokens  # keep the notebook namespace as it was
# Add up the 1s per token and display the totals
word_kv.reduceByKey(lambda x, y: x + y).collect()

[('yiqiatguigu', 3),
 ('hello', 21),
 ('spark', 7),
 ('atguigu', 1149117),
 ('yiqi', 574557),
 ('hadoop', 574560),
 ('world', 14)]

In [12]:
def wordCount(s:str) -> int:
    """Return the number of whitespace-separated words in ``s``.

    Splitting without an explicit separator ignores leading, trailing and
    repeated whitespace, so an empty or blank line counts as 0 words instead
    of 1 and "a  b" counts as 2 words instead of 3.

    Args:
        s: One line of text.

    Returns:
        The number of words found in ``s``.
    """
    return len(s.split())

# Apply wordCount to every line and display the per-line results.
# `lines` is created by a sc.textFile cell; the recorded NameError shows that
# cell had not been run in the kernel when this one was executed.
lines.map(wordCount).collect()

NameError: name 'lines' is not defined

In [None]:
class MyClass(object):
    """Demo class that hands one of its own bound methods to ``rdd.map``."""

    def func(self, s):
        """Return ``s`` unchanged."""
        return s

    def doStuff(self, rdd):
        """Map ``self.func`` over ``rdd`` and return what ``rdd.map`` returns.

        The callable handed to ``map`` is a bound method, so it holds a
        reference to this instance.
        """
        element_transform = self.func
        return rdd.map(element_transform)
# Instantiate the class defined in this cell, map its func over `lines` and
# display the collected results
handler = MyClass()
handler.doStuff(lines).collect()

In [None]:
class MyClass(object):
    """Demo class whose mapping function reads an attribute through ``self``."""

    def __init__(self):
        # Prefix prepended to every element by doStuff
        self.field = "Hello"

    def doStuff(self, rdd):
        """Prepend ``self.field`` to every element of ``rdd``.

        The mapping function looks up ``self.field`` each time it is called,
        so it keeps a reference to the whole instance.
        """
        def prepend_field(s):
            return self.field + s

        return rdd.map(prepend_field)
# Instantiate the class defined in this cell and prepend its field to every
# element of `lines`, displaying the collected results
handler = MyClass()
handler.doStuff(lines).collect()

In [None]:
class MyClass(object):
    """Demo class that copies its attribute to a local before mapping."""

    def __init__(self):
        # Prefix prepended to every element by doStuff
        self.field = "Hello"

    def doStuff(self, rdd):
        """Prepend the value of ``self.field`` to every element of ``rdd``.

        The attribute is copied into a local variable first, so the mapping
        function refers only to that local and not to the instance.
        """
        prefix = self.field
        return rdd.map(lambda text: prefix + text)
# Instantiate the class defined in this cell and run the local-variable
# variant of doStuff over `lines`, displaying the collected results
handler = MyClass()
handler.doStuff(lines).collect()

In [None]:
# Deliberately wrong example: a module-level counter is updated from inside a
# function that is passed to rdd.foreach.
counter = 0
rdd = sc.parallelize(data)

# Wrong: Don't do this!!
# NOTE(review): when the function runs on cluster executors, each one
# presumably updates its own copy of `counter`, so the value printed on the
# driver would not be the sum of the elements — confirm against the Spark
# closure documentation. An accumulator is the supported alternative.
def increment_counter(x):
    # Adds x to the global `counter` visible in the process running this function
    global counter
    counter += x
rdd.foreach(increment_counter)

print("Counter value: ", counter)

In [None]:
# Count how many times each distinct line occurs in data.txt
lines = sc.textFile('data.txt')
pairs = lines.map(lambda line: (line, 1))
counts = pairs.reduceByKey(lambda left, right: left + right)
print(counts.collect())
# The same counts again, ordered by key (the line text) in descending order
descending_counts = counts.sortByKey(ascending=False).collect()
print(descending_counts)
del descending_counts  # keep the notebook namespace as it was

In [None]:
# Wrap a small list in a broadcast variable
a = [1,2,3,4]
broadcastVar = sc.broadcast(a)

In [None]:
# Display the value held by the broadcast variable
broadcastVar.value

In [None]:
# Rebind broadcastVar to a new broadcast wrapping the list 1..9 followed by 0,
# then display its value
b = list(range(1, 10)) + [0]
broadcastVar = sc.broadcast(b)
broadcastVar.value

In [None]:
# Destroy the broadcast variable.
# NOTE(review): True is passed positionally, presumably as the `blocking`
# flag — confirm against the installed PySpark version.
broadcastVar.destroy(True)

In [None]:
# Read the value again after the destroy() cell.
# NOTE(review): presumably kept to show what happens when a destroyed
# broadcast variable is accessed — confirm intent.
broadcastVar.value

In [None]:
# Create an accumulator with the initial value 0
accum = sc.accumulator(0)
# Print the type like the other cells do; a bare `type(accum)` in the middle
# of a cell is evaluated and discarded without showing anything.
print(type(accum))
print(accum)
# Add every element of the RDD to the accumulator
sc.parallelize([1,2,3,4,5,6,7,8,9]).foreach(lambda x: accum.add(x))

In [None]:
from pyspark.accumulators import AccumulatorParam

class VectorAccumulatorParam(AccumulatorParam):
    """Accumulator parameter that adds list values element by element."""

    def zero(self, value: list) -> list:
        """Return a list of 0.0 with the same length as ``value``."""
        return [0.0 for _ in range(len(value))]

    # The accumulator exposes an add method; this is the implementation behind it.
    def addInPlace(self, value1: list, value2: list) -> list:
        """Add ``value2`` into ``value1`` position by position.

        ``value1`` is modified in place and returned. Every index of
        ``value1`` is visited, so ``value2`` must be at least as long.
        """
        for position, _ in enumerate(value1):
            value1[position] += value2[position]
        return value1

In [None]:
# Initialise a list-valued accumulator that uses VectorAccumulatorParam
va = sc.accumulator([1, 2, 3], VectorAccumulatorParam())
print("init:", va.value)
# Three rows, each repeating its number three times: [[1,1,1],[2,2,2],[3,3,3]]
data = [[x, x, x] for x in range(1, 4)]
# data = [1,2,3]
rdd = sc.parallelize(data)
# # Define a function that performs this accumulation
# def g(x):
#     global va
#     va.add([x]*3) # with data = [1,2,3] it can be done this way
# rdd.foreach(g)
# print("after oper:",va.value)

In [None]:
# Only defines the mapping; no action such as collect() is called in this cell
rdd.map(lambda x:va.add(x))

In [None]:
# Same mapping followed by collect(), which runs it and displays the values
# returned by va.add for each row
rdd.map(lambda x:va.add(x)).collect()

In [None]:
# Display the current value of the vector accumulator
va.value

In [None]:
# Rebuild the RDD from the current contents of `data`
rdd = sc.parallelize(data)

In [None]:
# Display the first element of the RDD
rdd.take(1)

In [None]:
# Print the accumulator `accum`
print(accum)

In [None]:
# An accumulator update placed inside a mapping function only happens once the
# mapping is actually executed.
accum = sc.accumulator(0)

def g(x):
    """Add ``x`` to ``accum`` as a side effect and return ``x`` unchanged.

    The original body returned ``f(x)``, but no ``f`` is defined in this
    notebook, so the element is passed through as-is.
    """
    accum.add(x)
    return x

# `data` is a plain Python list here and has no .map method, so the mapping is
# built on an RDD of numbers instead. No action is called on the result in
# this cell.
sc.parallelize([1, 2, 3, 4]).map(g)

In [None]:
# Stop the Spark context
sc.stop()

In [None]:
# Sum the length of every line in distFile.
# NOTE(review): this cell comes after sc.stop(); it presumably cannot run
# unless a context is created again — confirm whether that is the point of
# the demonstration.
distFile.map(lambda s:len(s)).reduce(lambda a,b:a+b)

In [None]:
# Stop the Spark context (a sc.stop() cell already appears earlier in the notebook)
sc.stop()

In [None]:
from pyspark.sql import SparkSession

In [None]:
# reduce already returns the summed value itself, so there is nothing to
# call .collect() on; the trailing .collect() has been removed.
# NOTE(review): the preceding cells call sc.stop(); a live SparkContext is
# presumably needed again before this cell can run — confirm.
distData.reduce(lambda a, b: a + b)