In [0]:
import pandas as pd

In [0]:
# Read the CSV straight into a Spark DataFrame. By default up to 128 MB of file
# data is placed in a single partition, so this file ends up in one partition.
loan_data = (
    spark.read.format("csv")
    .options(inferSchema=True, header=True, sep=",")
    .load("/FileStore/datasets/loan_data.csv")
)

In [0]:
# Number of partitions backing the DataFrame that was read directly from the CSV file.
loan_data.rdd.getNumPartitions()

In [0]:
# One way to end up with the data spread over multiple partitions is to load it into a
# pandas DataFrame first and build the Spark DataFrame from that.
# Note: this path will not resolve in a local IDE.
loan_pandas_df = pd.read_csv("/dbfs/FileStore/datasets/loan_data.csv")  # pandas needs the "/dbfs/..." spelling of the path used by spark.read

In [0]:
# Build a Spark DataFrame from the pandas DataFrame loaded above.
loan_df = spark.createDataFrame(loan_pandas_df)

In [0]:
# Check the number of partitions of the pandas-derived DataFrame.
# It is now 4 (the default, equal to the number of cores in the selected Spark cluster).
loan_df.rdd.getNumPartitions()

In [0]:
# Default number of partitions for the current cluster.
spark.sparkContext.defaultParallelism

In [0]:
# Fetch the data of all partitions as a single nested (multidimensional) list:
# one inner list per partition. Note that this brings every row to the driver.
loan_df.rdd.glom().collect()

In [0]:
# The collected result is a plain Python list (the outer list of per-partition lists).
type(loan_df.rdd.glom().collect())

In [0]:
# The outer list holds one inner list per partition, so its length is 4 here.
# Note: this collects all rows again just to count the partitions.
len(loan_df.rdd.glom().collect())

In [0]:
# Back to the original Spark DataFrame, which was loaded into a single partition.
# This action creates 1 job that runs in 1 partition.
loan_data.where(loan_data.Degree == "Graduate").display()

ID,Default,Loan_type,Gender,Age,Degree,Income,Credit_score,Loan_length,Signers,Citizenship
6,0,Car,Female,57,Graduate,119627,624,3,2,Non-citizen
24,1,Home,Female,53,Graduate,118848,701,8,1,Citizen
27,0,Car,Female,44,Graduate,122276,709,3,1,Non-citizen
32,0,Car,Female,55,Graduate,120783,585,3,1,Citizen
80,0,Car,Female,46,Graduate,87658,688,2,2,Citizen
89,0,Car,Female,42,Graduate,118537,623,2,2,Citizen
105,0,Car,Male,54,Graduate,118278,471,3,2,Citizen
125,0,Car,Male,52,Graduate,101998,636,3,2,Non-citizen
144,0,Home,Male,38,Graduate,107839,682,12,2,Citizen
190,0,Car,Male,49,Graduate,126511,716,3,2,Citizen


In [0]:
# Same filter, this time on the Spark DataFrame created from pandas (4 partitions).
# Here the 1st job has 1 task / 1 partition and the 2nd job has 3 tasks / 3 partitions.
loan_df.where(loan_df.Degree == "Graduate").display()

ID,Default,Loan_type,Gender,Age,Degree,Income,Credit_score,Loan_length,Signers,Citizenship
6,0,Car,Female,57,Graduate,119627,624,3,2,Non-citizen
24,1,Home,Female,53,Graduate,118848,701,8,1,Citizen
27,0,Car,Female,44,Graduate,122276,709,3,1,Non-citizen
32,0,Car,Female,55,Graduate,120783,585,3,1,Citizen
80,0,Car,Female,46,Graduate,87658,688,2,2,Citizen
89,0,Car,Female,42,Graduate,118537,623,2,2,Citizen
105,0,Car,Male,54,Graduate,118278,471,3,2,Citizen
125,0,Car,Male,52,Graduate,101998,636,3,2,Non-citizen
144,0,Home,Male,38,Graduate,107839,682,12,2,Citizen
190,0,Car,Male,49,Graduate,126511,716,3,2,Citizen


In [0]:
# Maximum size of a single partition when reading files, in bytes
# (approx. 134 million bytes, i.e. the 128 MB default).
spark.conf.get("spark.sql.files.maxPartitionBytes")

In [0]:
# Lower the maximum partition size manually to 10000 bytes so that the CSV file read
# in the following cells is split across many partitions.
# NOTE(review): the setting is not restored afterwards, so later file reads in this
# session use it as well.
spark.conf.set("spark.sql.files.maxPartitionBytes", 10000)

In [0]:
# Re-read the same CSV file with the reduced maximum partition size in effect,
# then check how many partitions the new DataFrame has.
loan_data_with_partitions = (
    spark.read.format("csv")
    .options(inferSchema=True, header=True, sep=",")
    .load("/FileStore/datasets/loan_data.csv")
)
# returns 25 partitions
loan_data_with_partitions.rdd.getNumPartitions()

In [0]:
# The same filter now invokes 3 jobs (1 + 4 + 20 = 25 tasks, one per partition).
loan_data_with_partitions.where(
    loan_data_with_partitions.Degree == "Graduate"
).display()

ID,Default,Loan_type,Gender,Age,Degree,Income,Credit_score,Loan_length,Signers,Citizenship
6,0,Car,Female,57,Graduate,119627,624,3,2,Non-citizen
24,1,Home,Female,53,Graduate,118848,701,8,1,Citizen
27,0,Car,Female,44,Graduate,122276,709,3,1,Non-citizen
32,0,Car,Female,55,Graduate,120783,585,3,1,Citizen
80,0,Car,Female,46,Graduate,87658,688,2,2,Citizen
89,0,Car,Female,42,Graduate,118537,623,2,2,Citizen
105,0,Car,Male,54,Graduate,118278,471,3,2,Citizen
125,0,Car,Male,52,Graduate,101998,636,3,2,Non-citizen
144,0,Home,Male,38,Graduate,107839,682,12,2,Citizen
190,0,Car,Male,49,Graduate,126511,716,3,2,Citizen


In [0]:
# Repartitioning: start from the pandas-derived DataFrame.
# Its current number of partitions is 4.
loan_df.rdd.getNumPartitions()

In [0]:
# Redistribute the rows over 8 partitions and confirm the new count (8).
repartitioned_loan_df = loan_df.repartition(numPartitions=8)
repartitioned_loan_df.rdd.getNumPartitions()

In [0]:
# Another way to get the number of partitions: count the per-partition lists.
# Note: unlike getNumPartitions(), this collects all rows to the driver.
len(repartitioned_loan_df.rdd.glom().collect())

In [0]:
# To partition manually by column value, adaptive query execution has to be turned off first
# (presumably so that it does not adjust the resulting partition count - TODO confirm).

spark.conf.set("spark.sql.adaptive.enabled", False)

# Repartitioning by 'Degree' shuffles the rows by column value. The count below is 200 even
# though the column holds just 3 distinct values, because the shuffle uses the default
# number of shuffle partitions (spark.sql.shuffle.partitions).
repartitioned_col_loan_df =loan_df.repartition('Degree')
repartitioned_col_loan_df.rdd.getNumPartitions()

In [0]:
# Number of partitions used when shuffling; compare it with the partition count
# produced by the column-based repartition.
spark.conf.get("spark.sql.shuffle.partitions")

In [0]:
# Show how many rows landed in each partition instead of dumping every row:
# glom() turns each partition into a list and map(len) reduces it to its row count
# on the executors, so only one small integer per partition reaches the driver.
# Almost all entries are 0; only the partitions that received a 'Degree' value
# (at most 3, one per distinct value) have a non-zero count.
repartitioned_col_loan_df.rdd.glom().map(len).collect()

In [0]:
# Shrink the DataFrame to 2 partitions with coalesce() and confirm the new count.
coalesced_loan_df = loan_df.coalesce(numPartitions=2)
coalesced_loan_df.rdd.getNumPartitions()

In [0]:
# coalesce() cannot increase the number of partitions: asking for 8 leaves the
# DataFrame with the same number of partitions it already had.
coalesced_loan_df = loan_df.coalesce(numPartitions=8)
coalesced_loan_df.rdd.getNumPartitions()