In [1]:
from pathlib import Path
import findspark

# Point findspark at the Spark 3.4.0 distribution under ../ext, resolved
# relative to the current working directory.
findspark.init(
    spark_home=str(Path.cwd() / ".." / "ext" / "spark-3.4.0-bin-hadoop3")
)

# NOTE(review): keep this import below findspark.init() - presumably pyspark
# only becomes importable once the Spark home above has been registered;
# confirm against the local environment before reordering.
from pyspark import SparkContext

## Creating RDDs

In [2]:
# Input collection: the integers 0..9.
nums = range(10)

# change me: try 'local[1]', 'local[2]', 'local[15]' as the master
with SparkContext('local[1]') as sc:
    # change me (presumably: pass an explicit partition count as 2nd argument)
    rdd = sc.parallelize(nums)

    # Report how the context and the RDD are partitioned.
    print(f"Default parallelism: {sc.defaultParallelism}")
    print(f"Number of partitions: {rdd.getNumPartitions()}")
    print(f"Partitioner: {rdd.partitioner}")
    # glom() groups the elements of each partition into one list.
    print(f"Partitions structure: {rdd.glom().collect()}")

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/06/14 22:08:23 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Default parallelism: 1
Number of partitions: 1
Partitioner: None


                                                                                

Partitions structure: [[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]]


In [3]:
# Build an RDD from an in-memory Python collection.
# `sentence` is reused by the cells that follow.
sentence = "Hurtownie Danych i Big Data i Coś Jeszcze"

with SparkContext('local') as sc:
    # Split on single spaces and spread the words over two partitions.
    words = sc.parallelize(sentence.split(" "), numSlices=2)

    # Flat view of all elements ...
    print(words.collect())
    # ... and the same elements grouped per partition.
    print(words.glom().collect())

['Hurtownie', 'Danych', 'i', 'Big', 'Data', 'i', 'Coś', 'Jeszcze']
[['Hurtownie', 'Danych', 'i', 'Big'], ['Data', 'i', 'Coś', 'Jeszcze']]


In [4]:
# Build an RDD from a text file and show its first line.
with SparkContext('local') as sc:
    file_rdd = sc.textFile("../README.md")
    first_line = file_rdd.first()
    print(first_line)

# Apache Spark


## Manipulating RDDs

In [5]:
# Drop duplicate words (the result order is not the input order).
with SparkContext('local') as sc:
    words_rdd = sc.parallelize(sentence.split(" "), numSlices=2)
    t1_rdd = words_rdd.distinct()
    unique_words = t1_rdd.collect()
    print(unique_words)

['Hurtownie', 'Danych', 'Big', 'Coś', 'Jeszcze', 'i', 'Data']


In [6]:
# filtering
def starts_with(word, letter):
    """Return True when `word` begins with `letter`, False otherwise.

    Thin wrapper around ``word.startswith(letter)``; the check is
    case-sensitive.
    """
    has_prefix = word.startswith(letter)
    return has_prefix

# Keep only the words that begin with a lower-case 'i'.
with SparkContext('local') as sc:
    words_rdd = sc.parallelize(sentence.split(" "), numSlices=2)
    t2_rdd = words_rdd.filter(lambda w: starts_with(w, 'i'))
    matching_words = t2_rdd.collect()
    print(matching_words)

['i', 'i']


In [7]:
# Map every word to a (word, first character, starts-with-"B" flag) triple.
with SparkContext('local') as sc:
    words_rdd = sc.parallelize(sentence.split(" "), numSlices=2)
    t3_rdd = words_rdd.map(
        lambda w: (w, w[0], w.startswith("B"))
    )
    triples = t3_rdd.collect()
    print(triples)

[('Hurtownie', 'H', False), ('Danych', 'D', False), ('i', 'i', False), ('Big', 'B', True), ('Data', 'D', False), ('i', 'i', False), ('Coś', 'C', False), ('Jeszcze', 'J', False)]


In [8]:
# Longest words first: sort ascending on the negated length, then take three.
with SparkContext('local') as sc:
    words_rdd = sc.parallelize(sentence.split(" "), numSlices=2)
    t4_rdd = words_rdd.sortBy(lambda w: -len(w))
    longest_three = t4_rdd.take(3)
    print(longest_three)

['Hurtownie', 'Jeszcze', 'Danych']


In [9]:
# Randomly split the RDD into two parts using 0.8 / 0.2 weights.
# A fixed seed keeps the split repeatable between runs.
seed = 123

with SparkContext('local') as sc:
    words_rdd = sc.parallelize(sentence.split(" "), numSlices=2)
    splits = words_rdd.randomSplit([0.8, 0.2], seed=seed)

    # Print each part in turn.
    for rdd_split in splits:
        print(rdd_split.collect())

['Danych', 'i', 'Big', 'Data', 'i', 'Coś', 'Jeszcze']
['Hurtownie']


In [10]:
# sampling
# Draw roughly 30% of the words without replacement. The fraction is a
# per-element probability, so the sample size is not exactly 30%.
with SparkContext('local') as sc:
    words_rdd = sc.parallelize(sentence.split(" "), 2)

    # Pass the `seed` defined in the random-split cell so that re-running the
    # notebook yields the same sample instead of a different one every time.
    t5_rdd = words_rdd.sample(False, .3, seed=seed)
    print(t5_rdd.collect())

['Hurtownie']


## Actions

In [11]:
# Sum the integers 1..20 by folding the RDD with a pairwise addition.
with SparkContext('local') as sc:
    t6_rdd = sc.parallelize(range(1, 21))
    total = t6_rdd.reduce(lambda left, right: left + right)
    print(total)

210


In [12]:
# Basic actions: element count, first element, first two elements.
with SparkContext('local') as sc:
    words_rdd = sc.parallelize(sentence.split(" "), numSlices=2)

    print(words_rdd.count())   # number of elements
    print(words_rdd.first())   # first element only
    print(words_rdd.take(2))   # list with the first two elements

8
Hurtownie
['Hurtownie', 'Danych']


In [13]:
# Remove the output directory left by a previous run before the next cell
# writes to it again (presumably the save fails if the directory exists).
!rm -rf /tmp/words_rdd

In [14]:
# Write the words to the local file system as text files.
with SparkContext('local') as sc:
    words_rdd = sc.parallelize(sentence.split(" "), numSlices=2)
    # The target is a directory; the listing below shows one part file
    # per partition.
    words_rdd.saveAsTextFile("file:/tmp/words_rdd")

In [15]:
# List the files written by saveAsTextFile in the previous cell.
!ls /tmp/words_rdd/

_SUCCESS    part-00000  part-00001


In [16]:
# Show the content of the first part file. Reading it in Python instead of
# shelling out to `cat` keeps the output plain text: on machines where `cat`
# resolves to a pager/prettifier, the shell version prints a frame and ANSI
# colour codes into the notebook output.
print(Path("/tmp/words_rdd/part-00000").read_text(encoding="utf-8"), end="")

Hurtownie
Danych
i
Big


## Key-Value pairs

In [17]:
# Pair RDD: key = first character of the word, value = lower-cased word.
with SparkContext('local') as sc:
    words_rdd = sc.parallelize(sentence.split(" "), numSlices=2)
    t7_rdd = words_rdd.map(lambda w: (w[0], w.lower()))

    # The (key, value) pairs themselves.
    print(t7_rdd.collect())
    # Number of values stored under each key.
    print(dict(t7_rdd.countByKey()))
    # Upper-case the values only; the keys are passed through unchanged.
    upper_rdd = t7_rdd.mapValues(lambda value: value.upper())
    print(upper_rdd.collect())
    # Optional: uncomment to try grouping the values by key as well.
    # print(t7_rdd.groupByKey().collect())

[('H', 'hurtownie'), ('D', 'danych'), ('i', 'i'), ('B', 'big'), ('D', 'data'), ('i', 'i'), ('C', 'coś'), ('J', 'jeszcze')]
{'H': 1, 'D': 2, 'i': 2, 'B': 1, 'C': 1, 'J': 1}
[('H', 'HURTOWNIE'), ('D', 'DANYCH'), ('i', 'I'), ('B', 'BIG'), ('D', 'DATA'), ('i', 'I'), ('C', 'COŚ'), ('J', 'JESZCZE')]
