In [0]:
from pyspark.sql.functions import col, rand

In [0]:
%sql
-- List the databases visible to this session.
SHOW DATABASES

databaseName
default


In [0]:
%sql
-- List the tables in the current database before any new tables are written.
SHOW TABLES

database,tableName,isTemporary
default,countries,False
default,flights_data,False
default,happiness_scores,False
default,loan_data,False
default,loan_data_females_under_30,False
default,olympics,False
default,partitioned_loan_data_females_under_30,False


In [0]:
# Build a large single-column DataFrame ("id") of customer IDs.
# The end bound is exclusive, so ids run from 1 to 14,999,999;
# the rows are spread over 15 partitions.
customer_id = spark.range(start=1, end=15_000_000, step=1, numPartitions=15)

display(customer_id)

id
1
2
3
4
5
6
7
8
9
10


In [0]:
# Derive the 2019 dataset with two columns:
#   customer_id          - the generated id, renamed
#   pct_investment_2019  - a pseudo-random value from rand() with a fixed seed of 1
pct_invest_2019 = customer_id.select(
    col("id").alias("customer_id"),
    rand(seed=1).alias("pct_investment_2019"),
)

display(pct_invest_2019)

customer_id,pct_investment_2019
1,0.6363787615254752
2,0.5993846534021868
3,0.134842710012538
4,0.076841639054609
5,0.8539211111755448
6,0.7167704217972344
7,0.2473902407597975
8,0.1367450741851369
9,0.3869569887491171
10,0.6051540605040805


In [0]:
# Persist the dataset as a table in Parquet format, without bucketing.
# mode("overwrite") keeps this cell re-runnable: with the default save mode,
# saveAsTable raises once the table already exists (e.g. on Restart & Run All).
(
    pct_invest_2019.write
    .format("parquet")
    .mode("overwrite")
    .saveAsTable("pct_invest_2019_unbucketed")
)

In [0]:
# Persist the same dataset as a second Parquet table, this time bucketed:
# rows are distributed into 15 buckets by customer_id and sorted by
# pct_investment_2019 within each bucket.
# mode("overwrite") keeps this cell re-runnable: with the default save mode,
# saveAsTable raises once the table already exists (e.g. on Restart & Run All).
(
    pct_invest_2019.write
    .format("parquet")
    .mode("overwrite")
    .bucketBy(15, "customer_id")
    .sortBy("pct_investment_2019")
    .saveAsTable("pct_invest_2019_bucketed")
)

In [0]:
%sql
-- Confirm that the bucketed and unbucketed 2019 tables now exist.
SHOW TABLES

database,tableName,isTemporary
default,countries,False
default,flights_data,False
default,happiness_scores,False
default,loan_data,False
default,loan_data_females_under_30,False
default,olympics,False
default,partitioned_loan_data_females_under_30,False
default,pct_invest_2019_bucketed,False
default,pct_invest_2019_unbucketed,False


In [0]:
%sql
-- Preview the bucketed 2019 table (columns listed explicitly rather than via *).
SELECT
  customer_id,
  pct_investment_2019
FROM pct_invest_2019_bucketed

customer_id,pct_investment_2019
3394715,3.3500006213493805e-06
3925063,2.5082757760030997e-05
3616448,4.721477589331169e-05
3353138,0.00010022960771494116
3704865,0.00011444282188599608
3831339,0.00012513462393792807
3205115,0.00013116886429109798
3420937,0.00017840825513182337
3367163,0.00018359268950907115
3910888,0.0001963006651413135


In [0]:
%sql
-- Preview the unbucketed 2019 table (columns listed explicitly rather than via *).
SELECT
  customer_id,
  pct_investment_2019
FROM pct_invest_2019_unbucketed

customer_id,pct_investment_2019
9000000,0.1709497137955568
9000001,0.8051143958005459
9000002,0.5775925576589018
9000003,0.9476047869880924
9000004,0.2093704977577
9000005,0.3666422261794781
9000006,0.8078688178371882
9000007,0.7135143433452461
9000008,0.7195325566306053
9000009,0.3133529231117545


In [0]:
%sql
-- Smallest and largest customer_id stored in the unbucketed table.
SELECT
  min(customer_id),
  max(customer_id)
FROM pct_invest_2019_unbucketed

min(customer_id),max(customer_id)
1,14999999


In [0]:
%sql
-- Smallest and largest pct_investment_2019 stored in the unbucketed table.
SELECT
  min(pct_investment_2019),
  max(pct_investment_2019)
FROM pct_invest_2019_unbucketed

min(pct_investment_2019),max(pct_investment_2019)
1.3972530576999986e-08,0.9999999933863358


In [0]:
# Build the 2020 counterpart from the same customer ids:
#   customer_id          - the generated id, renamed
#   pct_investment_2020  - a pseudo-random value from rand() with a fixed seed of 2
pct_invest_2020 = customer_id.select(
    col("id").alias("customer_id"),
    rand(seed=2).alias("pct_investment_2020"),
)

display(pct_invest_2020)

customer_id,pct_investment_2020
1,0.5311207224659675
2,0.2861372051669987
3,0.4944306372895662
4,0.4553707744971322
5,0.8792399632068049
6,0.3644632675391507
7,0.4501968242181094
8,0.4199726628902539
9,0.7051587870577706
10,0.0150881458780699


In [0]:
# Persist the 2020 dataset as a bucketed Parquet table: 15 buckets by
# customer_id, sorted by pct_investment_2020 within each bucket. This is the
# same bucket count and bucket column as pct_invest_2019_bucketed.
# mode("overwrite") keeps this cell re-runnable: with the default save mode,
# saveAsTable raises once the table already exists (e.g. on Restart & Run All).
(
    pct_invest_2020.write
    .format("parquet")
    .mode("overwrite")
    .bucketBy(15, "customer_id")
    .sortBy("pct_investment_2020")
    .saveAsTable("pct_invest_2020_bucketed")
)

In [0]:
%sql
-- Confirm that the bucketed 2020 table now exists alongside the 2019 tables.
SHOW TABLES

database,tableName,isTemporary
default,countries,False
default,flights_data,False
default,happiness_scores,False
default,loan_data,False
default,loan_data_females_under_30,False
default,olympics,False
default,partitioned_loan_data_females_under_30,False
default,pct_invest_2019_bucketed,False
default,pct_invest_2019_unbucketed,False
default,pct_invest_2020_bucketed,False


In [0]:
# Load the three saved tables back as DataFrames, so the joins below read
# from the stored (bucketed or unbucketed) tables.
pct_2019_unbucketed, pct_2019_bucketed, pct_2020_bucketed = (
    spark.table(table_name)
    for table_name in (
        "pct_invest_2019_unbucketed",
        "pct_invest_2019_bucketed",
        "pct_invest_2020_bucketed",
    )
)

In [0]:
# Print the physical plan for joining the unbucketed 2019 table with the
# bucketed 2020 table on customer_id.
# Expectation from the original note: both sides are repartitioned and
# shuffled in the process; confirm against the plan output.
unbucketed_bucketed_join = pct_2019_unbucketed.join(
    pct_2020_bucketed, on="customer_id", how="inner"
)
unbucketed_bucketed_join.explain()

In [0]:
# Run the unbucketed-to-bucketed join and show the joined rows.
unbucketed_bucketed_rows = pct_2019_unbucketed.join(
    pct_2020_bucketed, on="customer_id", how="inner"
)
display(unbucketed_bucketed_rows)

customer_id,pct_investment_2019,pct_investment_2020
6,0.7167704217972344,0.3644632675391507
16,0.9643107647469809,0.091261354181826
63,0.2970280250906274,0.9409547568198008
64,0.3992368306476186,0.2326988506357845
70,0.2519628438595569,0.3222232236580341
80,0.1001956565216214,0.990574198197933
123,0.0644410787913165,0.1884219435268566
130,0.942723722436596,0.0575830742136311
148,0.7218504473666008,0.8206723867477947
163,0.5427676535722059,0.6064469726646279


In [0]:
# Repartition the unbucketed side into 15 partitions on customer_id, matching
# the bucket count of the 2020 table, then print the join's physical plan.
# Expectation from the original note: only the unbucketed table is now
# repartitioned and shuffled; confirm against the plan output.
pct_2019_repartitioned_15 = pct_2019_unbucketed.repartition(15, "customer_id")
pct_2019_repartitioned_15.join(
    pct_2020_bucketed, on="customer_id", how="inner"
).explain()

In [0]:
# Run the join with the unbucketed side repartitioned to 15 partitions on
# customer_id, and show the joined rows.
repartitioned_join_rows = pct_2019_unbucketed.repartition(15, "customer_id").join(
    pct_2020_bucketed, on="customer_id", how="inner"
)
display(repartitioned_join_rows)

customer_id,pct_investment_2019,pct_investment_2020
6,0.7167704217972344,0.3644632675391507
16,0.9643107647469809,0.091261354181826
63,0.2970280250906274,0.9409547568198008
64,0.3992368306476186,0.2326988506357845
70,0.2519628438595569,0.3222232236580341
80,0.1001956565216214,0.990574198197933
123,0.0644410787913165,0.1884219435268566
130,0.942723722436596,0.0575830742136311
148,0.7218504473666008,0.8206723867477947
163,0.5427676535722059,0.6064469726646279


In [0]:
# Repartition the unbucketed side on customer_id WITHOUT giving a partition
# count, then print the join's physical plan.
# NOTE(review): with no explicit count the partition number presumably comes
# from the session's shuffle-partition setting rather than the 15 buckets;
# compare this plan with the repartition(15, ...) one.
pct_2019_repartitioned_default = pct_2019_unbucketed.repartition("customer_id")
pct_2019_repartitioned_default.join(
    pct_2020_bucketed, on="customer_id", how="inner"
).explain()

In [0]:
# Print the physical plan for joining the two bucketed tables. Both were
# written with 15 buckets on customer_id.
# Expectation from the original note: no repartitioning or shuffling is
# performed at all; confirm against the plan output.
bucketed_join = pct_2019_bucketed.join(
    pct_2020_bucketed, on="customer_id", how="inner"
)
bucketed_join.explain()

In [0]:
# Run the bucketed-to-bucketed join and show the joined rows.
bucketed_join_rows = pct_2019_bucketed.join(
    pct_2020_bucketed, on="customer_id", how="inner"
)
display(bucketed_join_rows)

customer_id,pct_investment_2019,pct_investment_2020
6,0.7167704217972344,0.3644632675391507
16,0.9643107647469809,0.091261354181826
63,0.2970280250906274,0.9409547568198008
64,0.3992368306476186,0.2326988506357845
70,0.2519628438595569,0.3222232236580341
80,0.1001956565216214,0.990574198197933
123,0.0644410787913165,0.1884219435268566
130,0.942723722436596,0.0575830742136311
148,0.7218504473666008,0.8206723867477947
163,0.5427676535722059,0.6064469726646279
