### Optimize huge file read

In [None]:
# Create Spark Session

from pyspark.sql import SparkSession

# Get or create the session for this notebook, attached to the master at
# spark://spark-master:7077.
spark = (
    SparkSession.builder
    .appName("Optimize huge file reads")
    .master("spark://spark-master:7077")
    .getOrCreate()
)

# Bare expression so the notebook cell displays the session object.
spark

In [None]:
# Look up the current max bytes per file partition. The "b" is removed from the
# configured value so it can be parsed as an integer byte count below.
partition_size = (
    spark.conf
    .get("spark.sql.files.maxPartitionBytes")
    .replace("b", "")
)
print(f"Partition Size: {partition_size} in bytes and {int(partition_size) / 1024 / 1024} in MB")

In [None]:
# Report the default parallelism exposed by the session's Spark context.
print("Parallelism :", spark.sparkContext.defaultParallelism)

In [None]:
# File size that we are going to import
import os
file_size = os.path.getsize('dataset/sales_combined_2.csv')
# Report the size in bytes, MB and GB. The newlines and trailing spaces of the
# original layout are spelled out explicitly so they are visible in the source.
print(
    "Data File Size: \n"
    f"            {file_size} in bytes \n"
    f"            {file_size / 1024 / 1024} in MB\n"
    f"            {file_size / 1024 / 1024 / 1024} in GB"
)

In [None]:
# Let's read the file and write it in noop format for performance benchmarking.
# NOTE(review): get_time is not defined in the visible cells of this file --
# presumably a timing decorator; confirm it is defined before this cell runs.
@get_time
def x():
    """Load the sales CSV, print its partition count, and write it to the noop format."""
    csv_reader = spark.read.format("csv").option("header", True)
    sales_df = csv_reader.load("dataset/sales_combined_2.csv")
    partition_count = sales_df.rdd.getNumPartitions()
    print(f"Number of Partition -> {partition_count}")
    noop_writer = sales_df.write.format("noop").mode("overwrite")
    noop_writer.save()

In [None]:
# Triple the partition size (3 x 128 MB, expressed in bytes with a "b" suffix)
# to decrease the number of partitions.
spark.conf.set("spark.sql.files.maxPartitionBytes", f"{128 * 3 * 1024 * 1024}b")

# Read the setting back to verify it; the "b" is removed so the value can be
# parsed as an integer byte count.
partition_size = (
    spark.conf
    .get("spark.sql.files.maxPartitionBytes")
    .replace("b", "")
)
print(f"Partition Size: {partition_size} in bytes and {int(partition_size) / 1024 / 1024} in MB")

In [None]:
# Let's read the file again with the new partition size and write it in noop
# format for performance benchmarking.
# NOTE(review): get_time is not defined in the visible cells of this file --
# presumably a timing decorator; confirm it is defined before this cell runs.
@get_time
def x():
    """Load the sales CSV, print its partition count, and write it to the noop format."""
    csv_reader = spark.read.format("csv").option("header", True)
    sales_df = csv_reader.load("dataset/sales_combined_2.csv")
    partition_count = sales_df.rdd.getNumPartitions()
    print(f"Number of Partition -> {partition_count}")
    noop_writer = sales_df.write.format("noop").mode("overwrite")
    noop_writer.save()

In [None]:
# Set the partition size to 160 MB (expressed in bytes with a "b" suffix)
# to decrease the number of partitions.
spark.conf.set("spark.sql.files.maxPartitionBytes", f"{160 * 1024 * 1024}b")

# Read the setting back to verify it; the "b" is removed so the value can be
# parsed as an integer byte count.
partition_size = (
    spark.conf
    .get("spark.sql.files.maxPartitionBytes")
    .replace("b", "")
)
print(f"Partition Size: {partition_size} in bytes and {int(partition_size) / 1024 / 1024} in MB")

In [None]:
# Let's read the file again with the new partition size and write it in noop
# format for performance benchmarking.
# NOTE(review): get_time is not defined in the visible cells of this file --
# presumably a timing decorator; confirm it is defined before this cell runs.
@get_time
def x():
    """Load the sales CSV, print its partition count, and write it to the noop format."""
    csv_reader = spark.read.format("csv").option("header", True)
    sales_df = csv_reader.load("dataset/sales_combined_2.csv")
    partition_count = sales_df.rdd.getNumPartitions()
    print(f"Number of Partition -> {partition_count}")
    noop_writer = sales_df.write.format("noop").mode("overwrite")
    noop_writer.save()

In [None]:
# Stop the Spark session created at the top of the notebook.
spark.stop()