### Optimize huge file read

In [1]:
# Create Spark Session

from pyspark.sql import SparkSession

# Build (or reuse) the session, pointing it at the standalone cluster master
spark = (
    SparkSession.builder
    .appName("Optimize huge file reads")
    .master("spark://spark-master:7077")
    .getOrCreate()
)

spark

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/11/06 19:38:00 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
# Inspect the current max partition size; the "b" unit marker is stripped so the value parses as an int
raw_partition_bytes = spark.conf.get("spark.sql.files.maxPartitionBytes")
partition_size = raw_partition_bytes.replace("b", "")
partition_size_mb = int(partition_size) / 1024 / 1024
print(f"Partition Size: {partition_size} in bytes and {partition_size_mb} in MB")

Partition Size: 134217728 in bytes and 128.0 in MB


In [3]:
# Report the SparkContext's default parallelism
default_parallelism = spark.sparkContext.defaultParallelism
print(f"Parallelism : {default_parallelism}")

Parallelism : 8


In [4]:
# Report the size of the local copy of the input file in bytes, MB and GB
import os

file_size = os.path.getsize('/home/jovyan/data/employee_records.csv')
size_in_mb = int(file_size) / 1024 / 1024
size_in_gb = int(file_size) / 1024 / 1024 / 1024
indent = " " * 12  # same left padding the original multi-line string carried
print(
    "Data File Size: \n"
    f"{indent}{file_size} in bytes \n"
    f"{indent}{size_in_mb} in MB\n"
    f"{indent}{size_in_gb} in GB"
)

Data File Size: 
            97427001 in bytes 
            92.91362857818604 in MB
            0.0907359654083848 in GB


In [5]:
# Baseline: read the CSV and write it with the noop format for performance benchmarking

csv_reader = spark.read.format("csv").option("header", True)
df = csv_reader.load("hdfs://namenode:9000/input/data/employee_records.csv")
print(f"Number of Partition -> {df.rdd.getNumPartitions()}")
noop_writer = df.write.format("noop").mode("overwrite")
noop_writer.save()

                                                                                

Number of Partition -> 8


                                                                                

In [6]:
# Triple the 128 MB partition size, with the aim of decreasing the number of partitions
tripled_partition_bytes = 128 * 3 * 1024 * 1024
spark.conf.set("spark.sql.files.maxPartitionBytes", f"{tripled_partition_bytes}b")

# Read the setting back to confirm the new value
partition_size = spark.conf.get("spark.sql.files.maxPartitionBytes").replace("b", "")
partition_size_mb = int(partition_size) / 1024 / 1024
print(f"Partition Size: {partition_size} in bytes and {partition_size_mb} in MB")

Partition Size: 402653184 in bytes and 384.0 in MB


In [7]:
# Repeat the read under the new partition size and run the same noop write for benchmarking

df = (
    spark.read
    .format("csv")
    .option("header", True)
    .load("hdfs://namenode:9000/input/data/employee_records.csv")
)
num_partitions = df.rdd.getNumPartitions()
print(f"Number of Partition -> {num_partitions}")
df.write.format("noop").mode("overwrite").save()

Number of Partition -> 8


                                                                                

In [8]:
# Change the max partition size to 2 MB (2 * 1024 * 1024 bytes).
# A smaller limit splits the file into more partitions, not fewer.
spark.conf.set("spark.sql.files.maxPartitionBytes", str(2 * 1024 * 1024)+"b")

# Verify the partition size
partition_size = spark.conf.get("spark.sql.files.maxPartitionBytes").replace("b","")
print(f"Partition Size: {partition_size} in bytes and {int(partition_size) / 1024 / 1024} in MB")

Partition Size: 2097152 in bytes and 2.0 in MB


In [9]:
# Read the file once more under the new partition size, then benchmark with a noop write

source_path = "hdfs://namenode:9000/input/data/employee_records.csv"
df = spark.read.format("csv").option("header", True).load(source_path)
partition_count = df.rdd.getNumPartitions()
print(f"Number of Partition -> {partition_count}")
(
    df.write
    .format("noop")
    .mode("overwrite")
    .save()
)

Number of Partition -> 47


                                                                                

In [11]:
# How to estimate the size in memory of a dataframe

def get_size_in_megabytes(df):
    """Estimate the size of a DataFrame in megabytes.

    The DataFrame is cached and counted, then ``sizeInBytes`` is read from
    the statistics of its optimized plan and converted to megabytes
    (bytes / 1024**2).

    The temporary cache is always released, even if counting or the
    statistics lookup fails. A DataFrame that the caller had already
    persisted is left persisted, so calling this helper does not evict a
    cache it did not create.

    Args:
        df: The Spark DataFrame to measure.

    Returns:
        The size reported by the plan statistics, in megabytes.
    """
    # Detect a pre-existing persist so we only undo caching that we started.
    level = df.storageLevel
    already_persisted = bool(level.useMemory or level.useDisk)

    if not already_persisted:
        df.cache()
    try:
        # count() is the action that triggers evaluation of the DataFrame.
        df.count()
        size_in_bytes = df._jdf.queryExecution().optimizedPlan().stats().sizeInBytes()
    finally:
        if not already_persisted:
            df.unpersist(blocking=True)
    return size_in_bytes/1024**2

# Display the estimated size, in megabytes, of the DataFrame `df`
get_size_in_megabytes(df)

                                                                                

92.91362857818604

In [12]:
# If the full dataframe is very big, you can approximate its size from a sample

sample_perc = 0.1
# Fixed seed so the sampled rows, and therefore the estimate, are repeatable
# for the same input and partitioning.
sample_seed = 42
sample_size_in_megabytes = get_size_in_megabytes(
    df.sample(fraction=sample_perc, seed=sample_seed)
)
# Scale the sample's size back up by the sampling fraction
approx_size_in_megabytes = sample_size_in_megabytes / sample_perc
approx_size_in_megabytes

                                                                                

113.69709014892578

In [14]:
# Estimate the optimal partition count: DataFrame size divided by the max partition size, rounded up
import math

max_partition_bytes = int(spark.conf.get("spark.sql.files.maxPartitionBytes").replace("b", ""))
max_partition_size = max_partition_bytes / 1024**2
df_size_in_megabytes = get_size_in_megabytes(df)
optimal_partition_number = math.ceil(df_size_in_megabytes / max_partition_size)
optimal_partition_number

                                                                                

47

In [15]:
# Stop the Spark session created at the top of the notebook
spark.stop()