In [0]:
%python
df = spark.read.csv('dbfs:/FileStore/tables/SalesTeam-6.csv',header=True,inferSchema=True)

In [0]:
%python
df.printSchema()

root
 |-- Sales Rep: string (nullable = true)
 |-- Sales Rep Id: string (nullable = true)
 |-- Company Name: string (nullable = true)
 |-- Company Id: string (nullable = true)



In [0]:
%python
df.describe().show()

+-------+---------------+----------------+----------------+----------------+
|summary|      Sales Rep|    Sales Rep Id|    Company Name|      Company Id|
+-------+---------------+----------------+----------------+----------------+
|  count|           4725|            4725|            4725|            4725|
|   mean|           null|            null|            null|            null|
| stddev|           null|            null|            null|            null|
|    min|     Ada Pappas|05USHZ7BLRS3JV37|       '48 Wills|007RF0BCBVMXTL80|
|    max|Yolanda Decarlo|ZWT68Z5JHHDHQKRJ|Zodiacal Roberts|ZZYB9JLJKXRH7U6O|
+-------+---------------+----------------+----------------+----------------+



In [0]:
# Replace the space in the two company column names with an underscore.
df = df.withColumnRenamed("Company Name", "Company_Name")
df = df.withColumnRenamed("Company Id", "Company_Id")

In [0]:
# Preview df: the first 20 rows with long values truncated (the defaults, spelled out).
df.show(n=20, truncate=True)

+-----------------+----------------+--------------------+----------------+
|        Sales Rep|    Sales Rep Id|        Company_Name|      Company_Id|
+-----------------+----------------+--------------------+----------------+
|Jessie Mcallister|97UNNAT790E0WM4N|Chimera-Chasing C...|LJKS5NK6788CYMUU|
|Jessie Mcallister|97UNNAT790E0WM4N|    Tangential Sheds|36MFTZOYMTAJP1RK|
|Jessie Mcallister|97UNNAT790E0WM4N|       Two-Mile Grab|H3JRC7XX7WJAD4ZO|
|Jessie Mcallister|97UNNAT790E0WM4N|Three-Men-And-A-H...|HB25MDZR0MGCQUGX|
|Jessie Mcallister|97UNNAT790E0WM4N|Biophysical Battl...|7RVA8TIVBLBXMNO4|
|Jessie Mcallister|97UNNAT790E0WM4N|    Verbal Greenwich|KKM6EZRN9W5NYXP6|
|Jessie Mcallister|97UNNAT790E0WM4N|Unpremeditated He...|NBJDWUB8J2DHLH29|
|Jessie Mcallister|97UNNAT790E0WM4N|    Excitatory Joint|D3Q0HECTK80RRGGO|
|Jessie Mcallister|97UNNAT790E0WM4N|          Dandy Fist|KGJBNJQHPF0WZ6SI|
|Jessie Mcallister|97UNNAT790E0WM4N|      Various Caesar|7FLAHMERFCITB6RT|
|Jessie Mcallister|97UNNA

In [0]:
# Single-level partitioning: write df as Parquet, partitioned on the company name.
# An earlier cell renamed "Company Name" to "Company_Name", so the renamed column
# has to be used here; the old name no longer exists on df and makes the write
# fail because the partition column cannot be found.
# Creating the partitions may take a long time.
# NOTE(review): the original comments said "company id" while the code partitioned
# on the company name — switch to "Company_Id" if the id was what was intended.
# NOTE(review): the remaining data columns ("Sales Rep", "Sales Rep Id") still
# contain spaces; older Spark versions reject such names in Parquet — confirm
# against the cluster's runtime.
(
    df
    .write
    .partitionBy("Company_Name")  # partition column, using its renamed form
    .mode("overwrite")  # replace any output left by a previous run
    .parquet('/FileStore/tables/partitioned_df2')  # output location in the FileStore
)

In [0]:
# Reload the CSV into df. This replaces the renamed DataFrame, so the columns
# carry their original names (with spaces) again from here on.
df = spark.read.csv(
    'dbfs:/FileStore/tables/SalesTeam-6.csv',
    header=True,
    inferSchema=True,
)

In [0]:
# Multi-level partitioning: partition the Parquet output on the sales rep and
# then on the company name, replacing any previous output at the target path.
df.write.partitionBy("Sales Rep", "Company Name").mode("overwrite").parquet(
    '/FileStore/tables/multilevel_partition_df'
)

repartition/coalesce with partitionBy to control the number of partitions


In [0]:
# Repartition df into 3 partitions first, then write it as Parquet partitioned
# on the sales rep, replacing any previous output at the target path.
df.repartition(3).write.partitionBy("Sales Rep").mode("overwrite").parquet(
    "/FileStore/tables/repartition_df"
)

In [0]:
# Coalesce df down to at most 3 partitions first, then write it as Parquet
# partitioned on the sales rep, replacing any previous output at the target path.
df.coalesce(3).write.partitionBy("Sales Rep").mode("overwrite").parquet(
    "/FileStore/tables/coalesce_df"
)

spark.sql.files.maxPartitionBytes

In [0]:
# Read the CSV under the session's current spark.sql.files.maxPartitionBytes and
# report how many partitions the resulting DataFrame has.
df_default = spark.read.csv(
    "dbfs:/FileStore/tables/SalesTeam-6.csv",
    header=True,
    inferSchema=True,
)
default_partitions = df_default.rdd.getNumPartitions()
print(f"Number of partitions with default maxPartitionBytes: {default_partitions}")

Number of partitions with default maxPartitionBytes: 1


In [0]:
# Cap each file-read partition at 1000 bytes (roughly 1 KB), then read the same
# CSV again and report how many partitions the resulting DataFrame has.
# NOTE(review): this session setting is not reset anywhere in the visible cells,
# so it also applies to any file reads performed afterwards.
spark.conf.set("spark.sql.files.maxPartitionBytes", "1000")
df_modified = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("dbfs:/FileStore/tables/SalesTeam-6.csv")
)
modified_partitions = df_modified.rdd.getNumPartitions()
print(f"Number of partitions with modified maxPartitionBytes: {modified_partitions}")

Number of partitions with modified maxPartitionBytes: 322
