**CHECK DEFAULT PARAMETERS**

In [0]:
# Default parallelism of the SparkContext (8 in the recorded output); the
# locally created DataFrames further down report the same partition count.
sc.defaultParallelism

8

In [0]:
# Upper bound on the bytes packed into one partition when reading files.
# The recorded value, 134217728 bytes, is 128 MiB.
spark.conf.get("spark.sql.files.maxPartitionBytes")

'134217728b'

**GENERATING DATA WITHIN SPARK ENVIRONMENT**

In [0]:
from pyspark.sql.types import IntegerType

# Build a tiny single-column DataFrame (values 0..9) from local Python data,
# then check how many partitions Spark spread it across.
df = spark.createDataFrame(range(10), schema=IntegerType())

df.rdd.getNumPartitions()

8

**DISPLAY THE DATA**

In [0]:
# glom() turns each partition into a single list of its rows, so collect()
# returns one list per partition and shows how the rows are laid out.
# collect() pulls everything to the driver, which is fine for 10 rows.
df.rdd.glom().collect()

[[Row(value=0)],
 [Row(value=1)],
 [Row(value=2)],
 [Row(value=3), Row(value=4)],
 [Row(value=5)],
 [Row(value=6)],
 [Row(value=7)],
 [Row(value=8), Row(value=9)]]

**READ EXTERNAL FILES INTO THE SPARK ENVIRONMENT**

In [0]:
# List the input directory to see how many files it holds and their sizes
# in bytes; both drive the partition counts observed when reading it.
dbutils.fs.ls("/FileStore/tables/test_data/")

[FileInfo(path='dbfs:/FileStore/tables/test_data/Lead_Lag.csv', name='Lead_Lag.csv', size=1160, modificationTime=1743390074000),
 FileInfo(path='dbfs:/FileStore/tables/test_data/States.csv', name='States.csv', size=791, modificationTime=1743390096000)]

In [0]:
# Load every CSV under the directory into one DataFrame and count the
# resulting partitions (2 in the recorded output, for the 2 small files).
# NOTE(review): the two files presumably have different columns, so the
# combined DataFrame is only meaningful for this partition demo - confirm.
df = (
    spark.read.format("csv")
    .options(inferschema=True, header=True, sep=",")
    .load("/FileStore/tables/test_data/")
)

df.rdd.getNumPartitions()

2

**CHANGE THE MAX PARTITION SIZE (maxPartitionBytes) TO SUIT THE FILE SIZES**

In [0]:
# Shrink the per-partition byte limit to 250 bytes so that even the tiny
# sample files get split into several partitions.
# WARNING: this is a session-level setting and nothing later in the visible
# notebook restores the default, so every subsequent file read is affected.
max_partition_bytes_key = "spark.sql.files.maxPartitionBytes"
spark.conf.set(max_partition_bytes_key, 250)
spark.conf.get(max_partition_bytes_key)

'250'

In [0]:
# Re-read the same directory now that maxPartitionBytes is 250. With files
# of 1160 and 791 bytes, 250-byte splits give 5 + 4 = 9 partitions, which
# matches the recorded output.
df = (
    spark.read.format("csv")
    .options(inferschema=True, header=True, sep=",")
    .load("/FileStore/tables/test_data/")
)

df.rdd.getNumPartitions()

9

**REPARTITION**

In [0]:
from pyspark.sql.types import IntegerType

# `df` was rebound to the CSV data above, so rebuild the 10-row integer
# DataFrame that the repartition/coalesce examples operate on.
df = spark.createDataFrame(range(10), schema=IntegerType())

df.rdd.getNumPartitions()

8

In [0]:
# Baseline layout before repartitioning: one list of rows per partition.
df.rdd.glom().collect()

[[Row(value=0)],
 [Row(value=1)],
 [Row(value=2)],
 [Row(value=3), Row(value=4)],
 [Row(value=5)],
 [Row(value=6)],
 [Row(value=7)],
 [Row(value=8), Row(value=9)]]

**INCREASE THE NUMBER OF PARTITIONS**

In [0]:
# Ask for more partitions (20) than there are rows (10); the partition
# count goes up, and the recorded layout below shows many of them empty.
df1 = df.repartition(numPartitions=20)
df1.rdd.getNumPartitions()

20

In [0]:
# Per-partition view after repartition(20): rows are redistributed and, in
# the recorded output, many partitions are empty.
df1.rdd.glom().collect()

[[],
 [Row(value=8)],
 [Row(value=9)],
 [],
 [Row(value=1)],
 [],
 [Row(value=6)],
 [],
 [Row(value=3)],
 [Row(value=0), Row(value=2), Row(value=4)],
 [],
 [Row(value=7)],
 [],
 [],
 [Row(value=5)],
 [],
 [],
 [],
 [],
 []]

In [0]:
# repartition() can also shrink the partition count: here from the original
# DataFrame's partitions down to 2. `df1` is rebound to the new result.
df1 = df.repartition(numPartitions=2)
df1.rdd.getNumPartitions()

2

In [0]:
# Per-partition view after repartition(2): in the recorded output the rows
# are no longer in their original order within the two partitions.
df1.rdd.glom().collect()

[[Row(value=2),
  Row(value=3),
  Row(value=5),
  Row(value=6),
  Row(value=7),
  Row(value=9)],
 [Row(value=0), Row(value=1), Row(value=4), Row(value=8)]]

**COALESCE (TO REDUCE THE NUMBER OF PARTITIONS)**

In [0]:
# coalesce() lowers the partition count by merging existing partitions
# rather than performing a full shuffle; the recorded layout keeps the rows
# in their original contiguous order.
df2 = df.coalesce(3)

# Only a cell's last expression is displayed automatically, so the count
# must be printed explicitly; as a bare expression it was silently dropped.
print(df2.rdd.getNumPartitions())

# Per-partition view of the coalesced DataFrame.
df2.rdd.glom().collect()

[[Row(value=0), Row(value=1)],
 [Row(value=2), Row(value=3), Row(value=4), Row(value=5)],
 [Row(value=6), Row(value=7), Row(value=8), Row(value=9)]]