In [1]:
from time import sleep

from pyspark import StorageLevel
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *

import findspark
findspark.init()

spark = SparkSession.builder.master("local").getOrCreate()
sc = spark.sparkContext

Spark cluster parallelism 
executors_num
memory_per_ex
cores_per_execut
s = executors_num * cores_per_execut = 400 slotes
20 block => 20 slotes ~ 95%

# 1. HOW TO CREATE RDD
# we can build RDDs out of local collections

In [15]:
numbers = range(1, 1000000)
numbers_parent_rdd = sc.parallelize(numbers, 4)

# Dependency: numbers_rdd => numbers_rdd_2
# Linage: partition: block => numbers_rdd => numbers_rdd_2
numbers_parent_rdd.take(10)

numbers_parent_rdd.getNumPartitions()

4

How to read a file in parallel

In [16]:
stocks_rdd_v2 = sc.textFile("data/stocks/aapl.csv"). \
    map(lambda row: row.split(",")). \
    filter(lambda tokens: float(tokens[2]) > 15)

stocks_rdd_v2.take(10)

# protected def getPartitions: Array[Partition]
# Partition -> adress block of file -> 
# aapl.csv
# block 1  -> Partition
# block 2  
# block 3
# ...


[['AAPL', 'Jan 1 2000', '25.94'],
 ['AAPL', 'Feb 1 2000', '28.66'],
 ['AAPL', 'Mar 1 2000', '33.95'],
 ['AAPL', 'Apr 1 2000', '31.01'],
 ['AAPL', 'May 1 2000', '21'],
 ['AAPL', 'Jun 1 2000', '26.19'],
 ['AAPL', 'Jul 1 2000', '25.41'],
 ['AAPL', 'Aug 1 2000', '30.47'],
 ['AAPL', 'Jun 1 2004', '16.27'],
 ['AAPL', 'Jul 1 2004', '16.17']]

In [17]:
# read from a DF
stocks_df = spark.read.csv("data/stocks"). \
    withColumnRenamed("_c0", "company"). \
    withColumnRenamed("_c1", "date"). \
    withColumnRenamed("_c2", "price")

stocks_rdd_v3 = stocks_df.rdd

prices_rdd = stocks_rdd_v3.map(lambda row: row.price)
prices_rdd.toDebugString()
prices_rdd.take(10)

['25.94',
 '28.66',
 '33.95',
 '31.01',
 '21',
 '26.19',
 '25.41',
 '30.47',
 '12.88',
 '9.78']

In [18]:
# RDD to DF
# condition: the RDD must contain Spark Rows (data structures conforming to a schema)
stocks_df_v2 = spark.createDataFrame(stocks_rdd_v3)
stocks_df_v2.take(10)

[Row(company='AAPL', date='Jan 1 2000', price='25.94'),
 Row(company='AAPL', date='Feb 1 2000', price='28.66'),
 Row(company='AAPL', date='Mar 1 2000', price='33.95'),
 Row(company='AAPL', date='Apr 1 2000', price='31.01'),
 Row(company='AAPL', date='May 1 2000', price='21'),
 Row(company='AAPL', date='Jun 1 2000', price='26.19'),
 Row(company='AAPL', date='Jul 1 2000', price='25.41'),
 Row(company='AAPL', date='Aug 1 2000', price='30.47'),
 Row(company='AAPL', date='Sep 1 2000', price='12.88'),
 Row(company='AAPL', date='Oct 1 2000', price='9.78')]

    Use cases for RDDs
    - the computations that cannot work on DFs/Spark SQL API
    - very custom perf optimizations

In [6]:
# RDD transformations
# map, filter, flatMap

# distinct
company_names_rdd = stocks_rdd_v3 \
    .map(lambda row: row.company) \
    .distinct()
company_names_rdd.take(10)


['AAPL', 'AMZN', 'MSFT', 'IBM', 'GOOG']

In [19]:
# counting
total_entries = stocks_rdd_v3.count()  # action - the RDD must be evaluated
total_entries

560

In [8]:
# min and max
aapl_stocks_rdd = stocks_rdd_v3 \
    .filter(lambda row: row.company == "AAPL") \
    .map(lambda row: float(row.price))


# narrow transformation
# filter: RDD => RDD
# map: RDD => RDD

# action
# count: RDD => integer
# max: RDD => max



max_aapl = aapl_stocks_rdd.max()
min_aapl = aapl_stocks_rdd.min()
print(max_aapl)
print(min_aapl)

223.02
7.07


In [20]:
# reduce ACTION
sum_prices = aapl_stocks_rdd \
    .reduce(lambda x, y: x + y)  # can use ANY Python function here  1,2,3,4 => 1+2 = 3 + 3 = 6 + 4
print(sum_prices)

7961.850000000001


In [21]:
# grouping
grouped_stocks_rdd = stocks_rdd_v3 \
    .groupBy(lambda row: row.company)  # can use ANY grouping criterion as a Python function
# grouping is expensive - involves a shuffle
grouped_stocks_rdd.take(10)

[('AAPL', <pyspark.resultiterable.ResultIterable at 0x2119a967110>),
 ('AMZN', <pyspark.resultiterable.ResultIterable at 0x2119a9d4990>),
 ('MSFT', <pyspark.resultiterable.ResultIterable at 0x2119aa03b10>),
 ('IBM', <pyspark.resultiterable.ResultIterable at 0x2119aa13810>),
 ('GOOG', <pyspark.resultiterable.ResultIterable at 0x2119aa23550>)]

In [12]:
# partitioning
repartitioned_stocks_rdd = stocks_rdd_v3.repartition(4).coalesce(2) #.repartition(row.company)  #.coalesce(2)

repartitioned_stocks_rdd.getNumPartitions()
# repartitioned_stocks_rdd.getNumPartitions()

# RDD
#  part1 => |||||| 20           
#  part2 => |||||||||||||| 40   
#  part3 => ||||| 10            
#  part4 => ||||| 10       


# nums of executor * nums of cores = max_parallesim = number slot = 100
#  I/O or filter, map, ....
.repartition(2) # shuffle
#  part1 => ||||| 40
#  part2 => ||||| 40

# 50 block => 50 partitions => 50 tasks 
# 50 slot will 
# 50 slots will 

# 1 GB ~  size of each row * count
# size of each row = 1MB  => 200 MB

.coalesce(5) # 5 parttions => 
# part1 => |||||| 20 + |||||||||||||| 40 => 60
# part2 => ||||| 10 +   ||||| 10 => 20

.coalesce(1)


.repartition(col("company")) # 200
#  part1 => |||||| 20           
#  part2 => |||||||||||||| 40   
#  part3 => ||||| 10            
#  part4 => ||||| 10       



repartitioned_stocks_rdd.join()

repartitioned_stocks_rdd.group()

repartitioned_stocks_rdd.group()

# getHash from "company" => number - 12312312312321 % 200 => rest 1-200 => 1
# part1 => | => groups with a equal  "company"  "Coca Cola"
# .....
# part36 => ||| => "BMW"
# .....
# part200  => |||||||| "VW"

# groupBy = "company"
# join = "company"




# TODO: SLIDE
# .repartition(30)  # involves a shuffle
# involves a shuffle
#  .repartition(5) 100
#  part1 => |||||| 20           20 2  =>
#  part2 => |||||||||||||| 40   20 2  => |||||||||||||| 40 + |||||| 20 = 60
#  part3 => ||||| 10            20 2
#  part4 => |||||||||| 30       20 2  => |||||||||| 30 + ||||| 10 = 40
#  part5 =>                     20 2


SyntaxError: invalid syntax (4174633332.py, line 16)

Exercises

1. Read the movies dataset as an RDD

2. Show the distinct genres as an RDD

3. Print all the movies in the Drama genre with IMDB rating > 6


In [14]:
movies_df = spark.read.json("data/movies")
movies_rdd = movies_df.rdd

movies_rdd.take(5)

[Row(Creative_Type=None, Director=None, Distributor='Gramercy', IMDB_Rating=6.1, IMDB_Votes=1071, MPAA_Rating='R', Major_Genre=None, Production_Budget=8000000, Release_Date='12-Jun-98', Rotten_Tomatoes_Rating=None, Running_Time_min=None, Source=None, Title='The Land Girls', US_DVD_Sales=None, US_Gross=146083, Worldwide_Gross=146083),
 Row(Creative_Type=None, Director=None, Distributor='Strand', IMDB_Rating=6.9, IMDB_Votes=207, MPAA_Rating='R', Major_Genre='Drama', Production_Budget=300000, Release_Date='7-Aug-98', Rotten_Tomatoes_Rating=None, Running_Time_min=None, Source=None, Title='First Love, Last Rites', US_DVD_Sales=None, US_Gross=10876, Worldwide_Gross=10876),
 Row(Creative_Type=None, Director=None, Distributor='Lionsgate', IMDB_Rating=6.8, IMDB_Votes=865, MPAA_Rating=None, Major_Genre='Comedy', Production_Budget=250000, Release_Date='28-Aug-98', Rotten_Tomatoes_Rating=None, Running_Time_min=None, Source=None, Title='I Married a Strange Person', US_DVD_Sales=None, US_Gross=20313

# 2. HOW TO SAVE AND PERSIST RDD

In [60]:
r = [1, 2, 3, 4, 5, 6, 7, 8]
ints = sc.parallelize(r, 4).coalesce(2)

ints.getNumPartitions()

ints.saveAsTextFile("data/output/ints")


# ints = sc.parallelize(r).coalesce(1)
# ints.coalesce(2) \
#     .saveAsTextFile("data/output/ints")

# 3. HOW TO GROUP AND JOIN RDD

In [69]:
data = [("Ivan", 240), ("Petr", 39), ("Elena", 290), ("Elena", 300)]
codeRows = sc.parallelize(data)
codeRows.collect()

[('Ivan', 240), ('Petr', 39), ('Elena', 290), ('Elena', 300)]

In [70]:
# how to reduce
reduced = codeRows.reduceByKey(lambda x, y: x + y)
print(reduced.collect())

[('Ivan', 240), ('Petr', 39), ('Elena', 590)]


In [71]:
# how to deduplicate
deduplicated = codeRows.reduceByKey(lambda x, y: x if (x > y) else y)
print(deduplicated.collect())

[('Ivan', 240), ('Petr', 39), ('Elena', 300)]


In [42]:
# how to fold by key
folded = codeRows.foldByKey(1000, lambda x, y: x + y)


# TODO Sliding
#     How to foldByKey works
#     part1 (k1:2, k2:2, k3:2, k1:2) => shufle => reduce (k1:2, k1:2, k1:2) => k1:6
#     part2 (k2:2, k2:2, k3:2, k1:2) shufle => (k2:2, k2:2, k2:2) => k2:6, (k3:2, k3:2) => k3:4

folded.collect()

[('Ivan', 1240), ('Petr', 1039), ('Elena', 1590)]

In [72]:
# Aggregated
aggregated = codeRows.aggregateByKey(1000, lambda x, y: x + y, lambda x, y: x + y)
aggregated.collect()

#     How to aggregateByKey works, shuffle less
#     part1 (k1:2, k2:2, k3:2, k1:2) => (k1:4, k2:2, k3:2) =>  shuffle => (k1:4, k1:2) => k1:6
#     part2 (k2:2, k2:2, k3:2, k1:2) => (k1:2, k2:4, k3:2) => shuffle => (k2:4, k2:2) => k2:6, (k3:2, k3:2) => k3:4

[('Ivan', 1240), ('Petr', 1039), ('Elena', 1590)]

In [72]:
# # groupByKey works
grouped = codeRows.groupByKey()
# TODO show the inner array
print(grouped.collect())

grouped.toDebugString().decode("utf-8")

# b'(1) PythonRDD[19] at collect at C:/Users/VOpolskiy/PycharmProjects/another/eas-017-RDD-py/lection/01-RDD.py:208 []\n |
# MapPartitionsRDD[18] at mapPartitions at PythonRDD.scala:145 []\n |
# ShuffledRDD[17] at partitionBy at NativeMethodAccessorImpl.java:0 []
# \n +-(1) PairwiseRDD[16] at groupByKey at C:/Users/VOpolskiy/PycharmProjects/another/eas-017-RDD-py/lection/01-RDD.py:207 []
# \n    |  PythonRDD[15] at groupByKey at C:/Users/VOpolskiy/PycharmProjects/another/eas-017-RDD-py/lection/01-RDD.py:207 []
# \n    |  ParallelCollectionRDD[0] at readRDDFromFile at PythonRDD.scala:274 []'
# # Don't forget about joins with preferred languages
#

[('Ivan', <pyspark.resultiterable.ResultIterable object at 0x7fc7ee232430>), ('Petr', <pyspark.resultiterable.ResultIterable object at 0x7fc7ee2322b0>), ('Elena', <pyspark.resultiterable.ResultIterable object at 0x7fc7ee2323a0>)]


'(1) PythonRDD[290] at collect at /tmp/ipykernel_109/2934942739.py:3 []\n |  MapPartitionsRDD[289] at mapPartitions at PythonRDD.scala:145 []\n |  ShuffledRDD[288] at partitionBy at NativeMethodAccessorImpl.java:0 []\n +-(1) PairwiseRDD[287] at groupByKey at /tmp/ipykernel_109/2934942739.py:2 []\n    |  PythonRDD[286] at groupByKey at /tmp/ipykernel_109/2934942739.py:2 []\n    |  ParallelCollectionRDD[235] at readRDDFromFile at PythonRDD.scala:274 []'

# Joining

In [73]:
profileData = [("Ivan", "Java"), ("Elena", "Scala"), ("Petr", "Scala")]
programmerProfiles = sc.parallelize(profileData)
programmerProfiles.collect()

[('Ivan', 'Java'), ('Elena', 'Scala'), ('Petr', 'Scala')]

In [74]:
# RDD joining possible for only tuples

joined = programmerProfiles.join(codeRows)
print(joined.toDebugString().decode("utf-8"))
joined.collect()

(2) PythonRDD[202] at RDD at PythonRDD.scala:53 []
 |  MapPartitionsRDD[201] at mapPartitions at PythonRDD.scala:158 []
 |  ShuffledRDD[200] at partitionBy at NativeMethodAccessorImpl.java:0 []
 +-(2) PairwiseRDD[199] at join at C:\Users\kokhrime\AppData\Local\Temp\ipykernel_8112\3472480033.py:3 []
    |  PythonRDD[198] at join at C:\Users\kokhrime\AppData\Local\Temp\ipykernel_8112\3472480033.py:3 []
    |  UnionRDD[197] at union at NativeMethodAccessorImpl.java:0 []
    |  PythonRDD[195] at RDD at PythonRDD.scala:53 []
    |  ParallelCollectionRDD[194] at readRDDFromFile at PythonRDD.scala:287 []
    |  PythonRDD[196] at RDD at PythonRDD.scala:53 []
    |  ParallelCollectionRDD[178] at readRDDFromFile at PythonRDD.scala:287 []


[('Elena', ('Scala', 290)),
 ('Elena', ('Scala', 300)),
 ('Petr', ('Scala', 39)),
 ('Ivan', ('Java', 240))]

In [75]:
# cogroup is fullouter join with dividing array

data = [("Ivan", 240), ("Petr", 39), ("Elena", 290), ("Elena", 300)]
codeRows = sc.parallelize(data)
codeRows = programmerProfiles.cogroup(codeRows)
codeRows.take(5)

[('Elena',
  (<pyspark.resultiterable.ResultIterable at 0x2119ceba650>,
   <pyspark.resultiterable.ResultIterable at 0x2119cfa0b50>)),
 ('Petr',
  (<pyspark.resultiterable.ResultIterable at 0x2119d098950>,
   <pyspark.resultiterable.ResultIterable at 0x2119cf2e9d0>)),
 ('Ivan',
  (<pyspark.resultiterable.ResultIterable at 0x2119d15ea10>,
   <pyspark.resultiterable.ResultIterable at 0x2119d15fa50>))]

In [77]:
# Sorting
programmerProfiles.cogroup(codeRows).sortByKey(False).collect()

# TODO Write code to show inner arry

[('Petr',
  (<pyspark.resultiterable.ResultIterable at 0x2119a9d7350>,
   <pyspark.resultiterable.ResultIterable at 0x2119cdac8d0>)),
 ('Ivan',
  (<pyspark.resultiterable.ResultIterable at 0x2119cfe96d0>,
   <pyspark.resultiterable.ResultIterable at 0x2119d06b550>)),
 ('Elena',
  (<pyspark.resultiterable.ResultIterable at 0x2119cc13010>,
   <pyspark.resultiterable.ResultIterable at 0x2119d122ed0>))]

In [80]:
print("== CountByKey")
print(str(joined.countByKey()))

== CountByKey
defaultdict(<class 'int'>, {'Elena': 2, 'Petr': 1, 'Ivan': 1})


In [85]:
# codeRows keys only
print("== Keys")
codeRows.keys().collect()

== Keys


['Elena', 'Petr', 'Ivan']

In [86]:
# Print values only
print("== Value")
codeRows.values().take(4)

== Value


[(<pyspark.resultiterable.ResultIterable at 0x2119d1fea10>,
  <pyspark.resultiterable.ResultIterable at 0x2119d1fe110>),
 (<pyspark.resultiterable.ResultIterable at 0x2119d1ff210>,
  <pyspark.resultiterable.ResultIterable at 0x2119d1fc110>),
 (<pyspark.resultiterable.ResultIterable at 0x2119d1d9710>,
  <pyspark.resultiterable.ResultIterable at 0x2119d209090>)]

In [22]:
df = spark.read.option("inferSchema", "true").json("data/movies")
movies_rdd = df.rdd

print(movies_rdd.take(5))

[Row(Creative_Type=None, Director=None, Distributor='Gramercy', IMDB_Rating=6.1, IMDB_Votes=1071, MPAA_Rating='R', Major_Genre=None, Production_Budget=8000000, Release_Date='12-Jun-98', Rotten_Tomatoes_Rating=None, Running_Time_min=None, Source=None, Title='The Land Girls', US_DVD_Sales=None, US_Gross=146083, Worldwide_Gross=146083), Row(Creative_Type=None, Director=None, Distributor='Strand', IMDB_Rating=6.9, IMDB_Votes=207, MPAA_Rating='R', Major_Genre='Drama', Production_Budget=300000, Release_Date='7-Aug-98', Rotten_Tomatoes_Rating=None, Running_Time_min=None, Source=None, Title='First Love, Last Rites', US_DVD_Sales=None, US_Gross=10876, Worldwide_Gross=10876), Row(Creative_Type=None, Director=None, Distributor='Lionsgate', IMDB_Rating=6.8, IMDB_Votes=865, MPAA_Rating=None, Major_Genre='Comedy', Production_Budget=250000, Release_Date='28-Aug-98', Rotten_Tomatoes_Rating=None, Running_Time_min=None, Source=None, Title='I Married a Strange Person', US_DVD_Sales=None, US_Gross=203134,

In [23]:
dist_movies = movies_rdd.map(lambda row: row.Major_Genre).distinct()
dist_movies.take(5)

[None, 'Drama', 'Comedy', 'Musical', 'Thriller/Suspense']

In [55]:
# spark_dsl_only_df = col("Major_Genre") == "Drama" && col("IMDB_Rating") > 6
python_lambda_rdd = lambda movie: (movie.Major_Genre == "Drama") and (movie.IMDB_Rating is not None and movie.IMDB_Rating > 6)

s_movies = movies_rdd.filter(python_lambda_rdd)

print(s_movies.take(5))

[Row(Creative_Type=None, Director=None, Distributor='Strand', IMDB_Rating=6.9, IMDB_Votes=207, MPAA_Rating='R', Major_Genre='Drama', Production_Budget=300000, Release_Date='7-Aug-98', Rotten_Tomatoes_Rating=None, Running_Time_min=None, Source=None, Title='First Love, Last Rites', US_DVD_Sales=None, US_Gross=10876, Worldwide_Gross=10876), Row(Creative_Type=None, Director='Sidney Lumet', Distributor='United Artists', IMDB_Rating=8.9, IMDB_Votes=119101, MPAA_Rating=None, Major_Genre='Drama', Production_Budget=340000, Release_Date='13-Apr-57', Rotten_Tomatoes_Rating=None, Running_Time_min=None, Source='Original Screenplay', Title='12 Angry Men', US_DVD_Sales=None, US_Gross=0, Worldwide_Gross=0), Row(Creative_Type='Science Fiction', Director='Terry Gilliam', Distributor='Universal', IMDB_Rating=8.1, IMDB_Votes=169858, MPAA_Rating='R', Major_Genre='Drama', Production_Budget=29000000, Release_Date='27-Dec-95', Rotten_Tomatoes_Rating=None, Running_Time_min=None, Source='Based on Short Film', T

In [3]:
data = [("Ivan", 240), ("Petr", 39), ("Elena", 290), ("Elena", 300)]
codeRows = sc.parallelize(data)

grouped = codeRows.groupByKey()

grouped.toDebugString().decode("utf-8")

grouped.count()

3