# 1. Performance Testing Notebook

This notebook is designed for conducting performance tests and analyzing the execution time of different operations (both in Pandas and PySpark).

In [9]:
import pandas as pd
import utils
import test_functions as test

from pyspark.sql import SparkSession

## Load Data

### Load Pandas Dataframes 
execution time: 37 seconds

In [10]:
# Load the pandas test frames, one CSV file per row count, in ascending order
# of the row count encoded in the file name.

# small step data: 10,000 to 100,000 rows in steps of 10,000
small_step_pd = [
    pd.read_csv(f"data/small/pandas_test_{row_count}_rows.csv")
    for row_count in range(10_000, 100_001, 10_000)
]

# large step data: 50,000 to 1,000,000 rows in steps of 50,000
large_step_pd = [
    pd.read_csv(f"data/large/pandas_test_{row_count}_rows.csv")
    for row_count in range(50_000, 1_000_001, 50_000)
]

### Load Spark Dataframes 
execution time: 28 seconds

In [11]:
# Build the Spark session used by all Spark cells of this notebook
# (getOrCreate returns an already running session if there is one).
spark = (
    SparkSession.builder
    .appName("Performance Analysis")
    .getOrCreate()
)

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/07/13 12:33:51 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [12]:
# Load the same CSV files as Spark DataFrames. Both size groups share one set
# of reader options so they are parsed identically: first line is the header,
# column types are inferred, fields are comma separated.
csv_options = {"header": True, "inferSchema": True, "sep": ","}

# small step data: 10,000 to 100,000 rows in steps of 10,000
small_step_spark = [
    spark.read.csv(f"data/small/pandas_test_{row_count}_rows.csv", **csv_options)
    for row_count in range(10_000, 100_001, 10_000)
]

# large step data: 50,000 to 1,000,000 rows in steps of 50,000
large_step_spark = [
    spark.read.csv(f"data/large/pandas_test_{row_count}_rows.csv", **csv_options)
    for row_count in range(50_000, 1_000_001, 50_000)
]

                                                                                

## Run Tests

#### Test 1: Write Dataframe to CSV

In [15]:
# read the current timing table of the small-step datasets
small_stats_csv = "data/time_statistics_small.csv"
time_statistics_small = pd.read_csv(small_stats_csv)

# run test.write_data through utils.test_run; "write_pd" / "write_spark" are
# presumably the result columns and 5 the repetitions per dataset (confirm in utils.test_run)
time_statistics_small = utils.test_run(
    test.write_data,
    "write_pd",
    "write_spark",
    time_statistics_small,
    small_step_pd,
    small_step_spark,
    spark,
    5,
)

# write the returned table back to the same file, without the index column
time_statistics_small.to_csv(small_stats_csv, index=False)

(10000, 20)


                                                                                

(20000, 20)
(30000, 20)
(40000, 20)
(50000, 20)
(60000, 20)
(70000, 20)
(80000, 20)


                                                                                

(90000, 20)


                                                                                

(100000, 20)


23/07/13 12:36:56 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 95,00% for 8 writers
23/07/13 12:36:59 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 95,00% for 8 writers
23/07/13 12:37:02 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 95,00% for 8 writers
23/07/13 12:37:05 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 95,00% for 8 writers
23/07/13 12:37:08 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 95,00% for 8 writers


In [62]:
# load time statistics
time_statistics_large = pd.read_csv("data/time_statistics_large.csv")

# The large datasets are processed in small batches because of memory
# allocation issues: adjust the bounds below and re-run the cell for the next
# batch. The bounds are positions in large_step_pd / large_step_spark; slicing
# keeps the loop safe even if a bound lies beyond the end of the lists.
batch_start, batch_stop = 0, 3

batch = zip(large_step_pd[batch_start:batch_stop], large_step_spark[batch_start:batch_stop])
for df_pd, df_spark in batch:
    print(df_pd.shape)

    # the second argument (5) is presumably the number of repetitions that are
    # averaged - confirm in utils.iterations; the last two return values are unused
    average_pandas_time, average_pyspark_time, _, _ = utils.iterations(test.write_data, 5, spark, df_pd, df_spark)

    # select the statistics row with the same row and column count as this dataset
    row_mask = (time_statistics_large["row_count"] == df_pd.shape[0]) & (time_statistics_large["column_count"] == df_pd.shape[1])
    if not row_mask.any():
        # without a matching row the assignment below would silently drop the measurement
        print(f"no statistics row for shape {df_pd.shape} - timings not stored")
        continue
    time_statistics_large.loc[row_mask, ["write_pd", "write_spark"]] = [average_pandas_time, average_pyspark_time]

    # save after every dataset so finished measurements survive a failure
    # on one of the following (larger) datasets
    time_statistics_large.to_csv("data/time_statistics_large.csv", index=False)

(850000, 20)
pandas_df to csv


23/07/03 22:47:44 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 95,00% for 8 writers
23/07/03 22:47:44 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 84,44% for 9 writers
23/07/03 22:47:44 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 76,00% for 10 writers
23/07/03 22:47:44 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 69,09% for 11 writers
23/07/03 22:47:44 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 63,33% for 12 writers
23/07/03 22:47:44 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 58,46% for 13 writers
23/07/03 22:47:44 WARN MemoryManager: Total allocation exceeds 95,

spark_df to csv
pandas_df to csv


23/07/03 22:48:05 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 95,00% for 8 writers
23/07/03 22:48:05 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 84,44% for 9 writers
23/07/03 22:48:05 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 76,00% for 10 writers
23/07/03 22:48:05 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 69,09% for 11 writers
23/07/03 22:48:05 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 63,33% for 12 writers
23/07/03 22:48:05 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 58,46% for 13 writers
23/07/03 22:48:05 WARN MemoryManager: Total allocation exceeds 95,

spark_df to csv
pandas_df to csv


23/07/03 22:48:25 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 95,00% for 8 writers
23/07/03 22:48:25 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 84,44% for 9 writers
23/07/03 22:48:25 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 76,00% for 10 writers
23/07/03 22:48:25 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 69,09% for 11 writers
23/07/03 22:48:25 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 63,33% for 12 writers
23/07/03 22:48:25 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 58,46% for 13 writers
23/07/03 22:48:25 WARN MemoryManager: Total allocation exceeds 95,

spark_df to csv
pandas_df to csv


23/07/03 22:48:46 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 95,00% for 8 writers
23/07/03 22:48:46 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 84,44% for 9 writers
23/07/03 22:48:46 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 76,00% for 10 writers
23/07/03 22:48:46 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 69,09% for 11 writers
23/07/03 22:48:46 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 63,33% for 12 writers
23/07/03 22:48:46 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 58,46% for 13 writers
23/07/03 22:48:46 WARN MemoryManager: Total allocation exceeds 95,

spark_df to csv
pandas_df to csv


23/07/03 22:49:07 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 95,00% for 8 writers
23/07/03 22:49:07 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 84,44% for 9 writers
23/07/03 22:49:07 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 76,00% for 10 writers
23/07/03 22:49:07 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 69,09% for 11 writers
23/07/03 22:49:07 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 63,33% for 12 writers
23/07/03 22:49:07 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 58,46% for 13 writers
23/07/03 22:49:07 WARN MemoryManager: Total allocation exceeds 95,

spark_df to csv
(900000, 20)
pandas_df to csv


23/07/03 22:49:28 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 95,00% for 8 writers
23/07/03 22:49:28 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 84,44% for 9 writers
23/07/03 22:49:28 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 76,00% for 10 writers
23/07/03 22:49:28 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 69,09% for 11 writers
23/07/03 22:49:28 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 63,33% for 12 writers
23/07/03 22:49:28 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 58,46% for 13 writers
23/07/03 22:49:28 WARN MemoryManager: Total allocation exceeds 95,

spark_df to csv
pandas_df to csv


23/07/03 22:49:49 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 95,00% for 8 writers
23/07/03 22:49:49 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 84,44% for 9 writers
23/07/03 22:49:49 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 76,00% for 10 writers
23/07/03 22:49:49 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 69,09% for 11 writers
23/07/03 22:49:50 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 63,33% for 12 writers
23/07/03 22:49:50 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 58,46% for 13 writers
23/07/03 22:49:50 WARN MemoryManager: Total allocation exceeds 95,

spark_df to csv
pandas_df to csv


23/07/03 22:50:10 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 95,00% for 8 writers
23/07/03 22:50:10 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 84,44% for 9 writers
23/07/03 22:50:10 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 76,00% for 10 writers
23/07/03 22:50:10 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 69,09% for 11 writers
23/07/03 22:50:10 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 63,33% for 12 writers
23/07/03 22:50:10 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 58,46% for 13 writers
23/07/03 22:50:10 WARN MemoryManager: Total allocation exceeds 95,

spark_df to csv
pandas_df to csv


23/07/03 22:50:31 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 95,00% for 8 writers
23/07/03 22:50:31 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 84,44% for 9 writers
23/07/03 22:50:31 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 76,00% for 10 writers
23/07/03 22:50:31 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 69,09% for 11 writers
23/07/03 22:50:31 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 63,33% for 12 writers
23/07/03 22:50:31 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 58,46% for 13 writers
23/07/03 22:50:31 WARN MemoryManager: Total allocation exceeds 95,

spark_df to csv
pandas_df to csv


23/07/03 22:50:53 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 95,00% for 8 writers
23/07/03 22:50:53 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 84,44% for 9 writers
23/07/03 22:50:53 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 76,00% for 10 writers
23/07/03 22:50:53 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 69,09% for 11 writers
23/07/03 22:50:53 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 63,33% for 12 writers
23/07/03 22:50:53 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 58,46% for 13 writers
23/07/03 22:50:53 WARN MemoryManager: Total allocation exceeds 95,

spark_df to csv
(950000, 20)
pandas_df to csv


23/07/03 22:51:16 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 95,00% for 8 writers
23/07/03 22:51:16 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 84,44% for 9 writers
23/07/03 22:51:16 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 76,00% for 10 writers
23/07/03 22:51:16 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 69,09% for 11 writers
23/07/03 22:51:16 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 63,33% for 12 writers
23/07/03 22:51:16 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 58,46% for 13 writers
23/07/03 22:51:16 WARN MemoryManager: Total allocation exceeds 95,

spark_df to csv
pandas_df to csv


23/07/03 22:51:38 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 95,00% for 8 writers
23/07/03 22:51:38 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 84,44% for 9 writers
23/07/03 22:51:38 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 76,00% for 10 writers
23/07/03 22:51:38 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 69,09% for 11 writers
23/07/03 22:51:38 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 63,33% for 12 writers
23/07/03 22:51:38 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 58,46% for 13 writers
23/07/03 22:51:38 WARN MemoryManager: Total allocation exceeds 95,

spark_df to csv
pandas_df to csv


23/07/03 22:52:01 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 95,00% for 8 writers
23/07/03 22:52:01 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 84,44% for 9 writers
23/07/03 22:52:01 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 76,00% for 10 writers
23/07/03 22:52:01 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 69,09% for 11 writers
23/07/03 22:52:01 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 63,33% for 12 writers
23/07/03 22:52:01 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 58,46% for 13 writers
23/07/03 22:52:01 WARN MemoryManager: Total allocation exceeds 95,

spark_df to csv
pandas_df to csv


23/07/03 22:52:24 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 95,00% for 8 writers
23/07/03 22:52:24 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 84,44% for 9 writers
23/07/03 22:52:24 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 76,00% for 10 writers
23/07/03 22:52:24 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 69,09% for 11 writers
23/07/03 22:52:24 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 63,33% for 12 writers
23/07/03 22:52:24 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 58,46% for 13 writers
23/07/03 22:52:24 WARN MemoryManager: Total allocation exceeds 95,

spark_df to csv
pandas_df to csv


23/07/03 22:52:46 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 95,00% for 8 writers
23/07/03 22:52:46 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 84,44% for 9 writers
23/07/03 22:52:46 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 76,00% for 10 writers
23/07/03 22:52:46 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 69,09% for 11 writers
23/07/03 22:52:46 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 63,33% for 12 writers
23/07/03 22:52:46 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 58,46% for 13 writers
23/07/03 22:52:46 WARN MemoryManager: Total allocation exceeds 95,

spark_df to csv
(1000000, 20)
pandas_df to csv


23/07/03 22:53:09 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 95,00% for 8 writers
23/07/03 22:53:09 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 84,44% for 9 writers
23/07/03 22:53:09 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 76,00% for 10 writers
23/07/03 22:53:09 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 69,09% for 11 writers
23/07/03 22:53:09 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 63,33% for 12 writers
23/07/03 22:53:09 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 58,46% for 13 writers
23/07/03 22:53:09 WARN MemoryManager: Total allocation exceeds 95,

spark_df to csv
pandas_df to csv


23/07/03 22:53:32 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 95,00% for 8 writers
23/07/03 22:53:32 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 84,44% for 9 writers
23/07/03 22:53:32 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 76,00% for 10 writers
23/07/03 22:53:32 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 69,09% for 11 writers
23/07/03 22:53:32 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 63,33% for 12 writers
23/07/03 22:53:32 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 58,46% for 13 writers
23/07/03 22:53:32 WARN MemoryManager: Total allocation exceeds 95,

spark_df to csv
pandas_df to csv


23/07/03 22:53:55 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 95,00% for 8 writers
23/07/03 22:53:55 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 84,44% for 9 writers
23/07/03 22:53:55 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 76,00% for 10 writers
23/07/03 22:53:55 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 69,09% for 11 writers
23/07/03 22:53:55 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 63,33% for 12 writers
23/07/03 22:53:55 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 58,46% for 13 writers
23/07/03 22:53:55 WARN MemoryManager: Total allocation exceeds 95,

spark_df to csv
pandas_df to csv


23/07/03 22:54:17 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 95,00% for 8 writers
23/07/03 22:54:17 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 84,44% for 9 writers
23/07/03 22:54:17 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 76,00% for 10 writers
23/07/03 22:54:17 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 69,09% for 11 writers
23/07/03 22:54:17 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 63,33% for 12 writers
23/07/03 22:54:17 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 58,46% for 13 writers
23/07/03 22:54:17 WARN MemoryManager: Total allocation exceeds 95,

spark_df to csv
pandas_df to csv


23/07/03 22:54:40 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 95,00% for 8 writers
23/07/03 22:54:40 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 84,44% for 9 writers
23/07/03 22:54:40 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 76,00% for 10 writers
23/07/03 22:54:40 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 69,09% for 11 writers
23/07/03 22:54:40 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 63,33% for 12 writers
23/07/03 22:54:40 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 58,46% for 13 writers
23/07/03 22:54:40 WARN MemoryManager: Total allocation exceeds 95,

spark_df to csv


23/07/03 22:54:42 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 54,29% for 14 writers
23/07/03 22:54:42 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 58,46% for 13 writers
23/07/03 22:54:42 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 63,33% for 12 writers
23/07/03 22:54:42 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 69,09% for 11 writers
23/07/03 22:54:42 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 76,00% for 10 writers
23/07/03 22:54:42 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 84,44% for 9 writers
23/07/03 22:54:42 WARN MemoryManager: Total allocation exceeds 95

#### Test 2: Load Dataframe from CSV

In [17]:
# read the current timing table of the small-step datasets
small_stats_csv = "data/time_statistics_small.csv"
time_statistics_small = pd.read_csv(small_stats_csv)

# run test.load_data through utils.test_run; "read_pd" / "read_spark" are
# presumably the result columns and 5 the repetitions per dataset (confirm in utils.test_run)
time_statistics_small = utils.test_run(
    test.load_data,
    "read_pd",
    "read_spark",
    time_statistics_small,
    small_step_pd,
    small_step_spark,
    spark,
    5,
)

# write the returned table back to the same file, without the index column
time_statistics_small.to_csv(small_stats_csv, index=False)

(10000, 20)
(20000, 20)
(30000, 20)
(40000, 20)
(50000, 20)
(60000, 20)
(70000, 20)
(80000, 20)
(90000, 20)
(100000, 20)


In [19]:
# read the current timing table of the large-step datasets
large_stats_csv = "data/time_statistics_large.csv"
time_statistics_large = pd.read_csv(large_stats_csv)

# run test.load_data through utils.test_run; "read_pd" / "read_spark" are
# presumably the result columns and 5 the repetitions per dataset (confirm in utils.test_run)
time_statistics_large = utils.test_run(
    test.load_data,
    "read_pd",
    "read_spark",
    time_statistics_large,
    large_step_pd,
    large_step_spark,
    spark,
    5,
)

# write the returned table back to the same file, without the index column
time_statistics_large.to_csv(large_stats_csv, index=False)

(50000, 20)
(100000, 20)
(150000, 20)
(200000, 20)
(250000, 20)
(300000, 20)
(350000, 20)
(400000, 20)
(450000, 20)
(500000, 20)


                                                                                

(550000, 20)


                                                                                

(600000, 20)


                                                                                

(650000, 20)


                                                                                

(700000, 20)


                                                                                

(750000, 20)


                                                                                

(800000, 20)


                                                                                

(850000, 20)


                                                                                

(900000, 20)


                                                                                

(950000, 20)


                                                                                

(1000000, 20)


                                                                                

#### Test 3: Drop NaN Values

In [6]:
# read the current timing table of the small-step datasets
small_stats_csv = "data/time_statistics_small.csv"
time_statistics_small = pd.read_csv(small_stats_csv)

# run test.drop_nan through utils.test_run; "drop_na_pd" / "drop_na_spark" are
# presumably the result columns and 100 the repetitions per dataset (confirm in utils.test_run)
time_statistics_small = utils.test_run(
    test.drop_nan,
    "drop_na_pd",
    "drop_na_spark",
    time_statistics_small,
    small_step_pd,
    small_step_spark,
    spark,
    100,
)

# write the returned table back to the same file, without the index column
time_statistics_small.to_csv(small_stats_csv, index=False)

(10000, 20)
(20000, 20)
(30000, 20)
(40000, 20)
(50000, 20)
(60000, 20)
(70000, 20)
(80000, 20)
(90000, 20)
(100000, 20)


In [7]:
# read the current timing table of the large-step datasets
large_stats_csv = "data/time_statistics_large.csv"
time_statistics_large = pd.read_csv(large_stats_csv)

# run test.drop_nan through utils.test_run; "drop_na_pd" / "drop_na_spark" are
# presumably the result columns and 100 the repetitions per dataset (confirm in utils.test_run)
time_statistics_large = utils.test_run(
    test.drop_nan,
    "drop_na_pd",
    "drop_na_spark",
    time_statistics_large,
    large_step_pd,
    large_step_spark,
    spark,
    100,
)

# write the returned table back to the same file, without the index column
time_statistics_large.to_csv(large_stats_csv, index=False)

(50000, 20)
(100000, 20)
(150000, 20)
(200000, 20)
(250000, 20)
(300000, 20)
(350000, 20)
(400000, 20)
(450000, 20)
(500000, 20)
(550000, 20)
(600000, 20)
(650000, 20)
(700000, 20)
(750000, 20)
(800000, 20)
(850000, 20)
(900000, 20)
(950000, 20)
(1000000, 20)


#### Test 4: Fill NaN Values

In [8]:
# read the current timing table of the small-step datasets
small_stats_csv = "data/time_statistics_small.csv"
time_statistics_small = pd.read_csv(small_stats_csv)

# run test.fill_nan through utils.test_run; "fill_na_pd" / "fill_na_spark" are
# presumably the result columns and 100 the repetitions per dataset (confirm in utils.test_run)
time_statistics_small = utils.test_run(
    test.fill_nan,
    "fill_na_pd",
    "fill_na_spark",
    time_statistics_small,
    small_step_pd,
    small_step_spark,
    spark,
    100,
)

# write the returned table back to the same file, without the index column
time_statistics_small.to_csv(small_stats_csv, index=False)

(10000, 20)
(20000, 20)
(30000, 20)
(40000, 20)
(50000, 20)
(60000, 20)
(70000, 20)
(80000, 20)
(90000, 20)
(100000, 20)


In [None]:
# read the current timing table of the large-step datasets
large_stats_csv = "data/time_statistics_large.csv"
time_statistics_large = pd.read_csv(large_stats_csv)

# run test.fill_nan through utils.test_run; "fill_na_pd" / "fill_na_spark" are
# presumably the result columns and 50 the repetitions per dataset (confirm in utils.test_run)
time_statistics_large = utils.test_run(
    test.fill_nan,
    "fill_na_pd",
    "fill_na_spark",
    time_statistics_large,
    large_step_pd,
    large_step_spark,
    spark,
    50,
)

# write the returned table back to the same file, without the index column
time_statistics_large.to_csv(large_stats_csv, index=False)

(50000, 20)
(100000, 20)
(150000, 20)
(200000, 20)
(250000, 20)
(300000, 20)
(350000, 20)
(400000, 20)
(450000, 20)
(500000, 20)
(550000, 20)
(600000, 20)
(650000, 20)
(700000, 20)
(750000, 20)
(800000, 20)
(850000, 20)
(900000, 20)
(950000, 20)
(1000000, 20)


#### Test 5: Groupby

In [8]:
# read the current timing table of the small-step datasets
small_stats_csv = "data/time_statistics_small.csv"
time_statistics_small = pd.read_csv(small_stats_csv)

# run test.group_df through utils.test_run; "group_pd" / "group_spark" are
# presumably the result columns and 100 the repetitions per dataset (confirm in utils.test_run)
time_statistics_small = utils.test_run(
    test.group_df,
    "group_pd",
    "group_spark",
    time_statistics_small,
    small_step_pd,
    small_step_spark,
    spark,
    100,
)

# write the returned table back to the same file, without the index column
time_statistics_small.to_csv(small_stats_csv, index=False)

(10000, 20)
(20000, 20)
(30000, 20)
(40000, 20)
(50000, 20)
(60000, 20)
(70000, 20)
(80000, 20)
(90000, 20)
(100000, 20)


In [9]:
# read the current timing table of the large-step datasets
large_stats_csv = "data/time_statistics_large.csv"
time_statistics_large = pd.read_csv(large_stats_csv)

# run test.group_df through utils.test_run; "group_pd" / "group_spark" are
# presumably the result columns and 100 the repetitions per dataset (confirm in utils.test_run)
time_statistics_large = utils.test_run(
    test.group_df,
    "group_pd",
    "group_spark",
    time_statistics_large,
    large_step_pd,
    large_step_spark,
    spark,
    100,
)

# write the returned table back to the same file, without the index column
time_statistics_large.to_csv(large_stats_csv, index=False)

(50000, 20)
(100000, 20)
(150000, 20)
(200000, 20)
(250000, 20)
(300000, 20)
(350000, 20)
(400000, 20)
(450000, 20)
(500000, 20)
(550000, 20)
(600000, 20)
(650000, 20)
(700000, 20)
(750000, 20)
(800000, 20)
(850000, 20)
(900000, 20)
(950000, 20)
(1000000, 20)


#### Test 6: GroupBy and Sum

In [8]:
# read the current timing table of the small-step datasets
small_stats_csv = "data/time_statistics_small.csv"
time_statistics_small = pd.read_csv(small_stats_csv)

# run test.group_sum_df through utils.test_run; "group_sum_pd" / "group_sum_spark" are
# presumably the result columns and 50 the repetitions per dataset (confirm in utils.test_run)
time_statistics_small = utils.test_run(
    test.group_sum_df,
    "group_sum_pd",
    "group_sum_spark",
    time_statistics_small,
    small_step_pd,
    small_step_spark,
    spark,
    50,
)

# write the returned table back to the same file, without the index column
time_statistics_small.to_csv(small_stats_csv, index=False)

(10000, 20)
(20000, 20)
(30000, 20)
(40000, 20)
(50000, 20)
(60000, 20)
(70000, 20)
(80000, 20)
(90000, 20)
(100000, 20)


In [9]:
# read the current timing table of the large-step datasets
large_stats_csv = "data/time_statistics_large.csv"
time_statistics_large = pd.read_csv(large_stats_csv)

# run test.group_sum_df through utils.test_run; "group_sum_pd" / "group_sum_spark" are
# presumably the result columns and 30 the repetitions per dataset (confirm in utils.test_run)
time_statistics_large = utils.test_run(
    test.group_sum_df,
    "group_sum_pd",
    "group_sum_spark",
    time_statistics_large,
    large_step_pd,
    large_step_spark,
    spark,
    30,
)

# write the returned table back to the same file, without the index column
time_statistics_large.to_csv(large_stats_csv, index=False)

(50000, 20)
(100000, 20)
(150000, 20)
(200000, 20)
(250000, 20)
(300000, 20)
(350000, 20)
(400000, 20)
(450000, 20)
(500000, 20)
(550000, 20)
(600000, 20)
(650000, 20)
(700000, 20)
(750000, 20)
(800000, 20)
(850000, 20)
(900000, 20)
(950000, 20)
(1000000, 20)


#### Test 7: GroupBy and Count

In [6]:
# Test 7, small step data: read the timing table collected so far
time_statistics_small = pd.read_csv("data/time_statistics_small.csv")

# hand test.group_count_df plus the pandas and Spark frames to the test runner;
# the returned table replaces the one loaded above
# NOTE(review): the trailing 50 is presumably the repetition count -- confirm in utils.test_run
time_statistics_small = utils.test_run(
    test.group_count_df,
    "group_count_pd",
    "group_count_spark",
    time_statistics_small,
    small_step_pd,
    small_step_spark,
    spark,
    50,
)

# write the updated table back to the same CSV (index column omitted)
time_statistics_small.to_csv("data/time_statistics_small.csv", index=False)

(10000, 20)
(20000, 20)
(30000, 20)
(40000, 20)
(50000, 20)
(60000, 20)
(70000, 20)
(80000, 20)
(90000, 20)
(100000, 20)


In [9]:
# Test 7, large step data: read the timing table collected so far
time_statistics_large = pd.read_csv("data/time_statistics_large.csv")

# hand test.group_count_df plus the pandas and Spark frames to the test runner;
# the returned table replaces the one loaded above
# NOTE(review): the trailing 30 is presumably the repetition count -- confirm in utils.test_run
time_statistics_large = utils.test_run(
    test.group_count_df,
    "group_count_pd",
    "group_count_spark",
    time_statistics_large,
    large_step_pd,
    large_step_spark,
    spark,
    30,
)

# write the updated table back to the same CSV (index column omitted)
time_statistics_large.to_csv("data/time_statistics_large.csv", index=False)

(50000, 20)
(100000, 20)
(150000, 20)
(200000, 20)
(250000, 20)
(300000, 20)
(350000, 20)
(400000, 20)
(450000, 20)
(500000, 20)
(550000, 20)
(600000, 20)
(650000, 20)
(700000, 20)
(750000, 20)
(800000, 20)
(850000, 20)
(900000, 20)
(950000, 20)
(1000000, 20)


#### Test 8: Filter by Column Value (under 0)

In [8]:
# Test 8, small step data: read the timing table collected so far
time_statistics_small = pd.read_csv("data/time_statistics_small.csv")

# hand test.filter_less_0 plus the pandas and Spark frames to the test runner;
# the returned table replaces the one loaded above
# NOTE(review): the trailing 100 is presumably the repetition count -- confirm in utils.test_run
time_statistics_small = utils.test_run(
    test.filter_less_0,
    "filter_less_0_pd",
    "filter_less_0_spark",
    time_statistics_small,
    small_step_pd,
    small_step_spark,
    spark,
    100,
)

# write the updated table back to the same CSV (index column omitted)
time_statistics_small.to_csv("data/time_statistics_small.csv", index=False)

(10000, 20)
(20000, 20)
(30000, 20)
(40000, 20)
(50000, 20)
(60000, 20)
(70000, 20)
(80000, 20)
(90000, 20)
(100000, 20)


In [10]:
# Test 8, large step data: read the timing table collected so far
time_statistics_large = pd.read_csv("data/time_statistics_large.csv")

# hand test.filter_less_0 plus the pandas and Spark frames to the test runner;
# the returned table replaces the one loaded above
# NOTE(review): the trailing 50 is presumably the repetition count -- confirm in utils.test_run
time_statistics_large = utils.test_run(
    test.filter_less_0,
    "filter_less_0_pd",
    "filter_less_0_spark",
    time_statistics_large,
    large_step_pd,
    large_step_spark,
    spark,
    50,
)

# write the updated table back to the same CSV (index column omitted)
time_statistics_large.to_csv("data/time_statistics_large.csv", index=False)

(50000, 20)
(100000, 20)
(150000, 20)
(200000, 20)
(250000, 20)
(300000, 20)
(350000, 20)
(400000, 20)
(450000, 20)
(500000, 20)
(550000, 20)
(600000, 20)
(650000, 20)
(700000, 20)
(750000, 20)
(800000, 20)
(850000, 20)
(900000, 20)
(950000, 20)
(1000000, 20)


#### Test 9: Filter by Column Value (under 10)

In [9]:
# Test 9, small step data: read the timing table collected so far
time_statistics_small = pd.read_csv("data/time_statistics_small.csv")

# hand test.filter_less_10 plus the pandas and Spark frames to the test runner;
# the returned table replaces the one loaded above
# NOTE(review): the trailing 100 is presumably the repetition count -- confirm in utils.test_run
time_statistics_small = utils.test_run(
    test.filter_less_10,
    "filter_less_10_pd",
    "filter_less_10_spark",
    time_statistics_small,
    small_step_pd,
    small_step_spark,
    spark,
    100,
)

# write the updated table back to the same CSV (index column omitted)
time_statistics_small.to_csv("data/time_statistics_small.csv", index=False)

(10000, 20)
(20000, 20)
(30000, 20)
(40000, 20)
(50000, 20)
(60000, 20)
(70000, 20)
(80000, 20)
(90000, 20)
(100000, 20)


In [None]:
# Test 9, large step data: read the timing table collected so far
time_statistics_large = pd.read_csv("data/time_statistics_large.csv")

# hand test.filter_less_10 plus the pandas and Spark frames to the test runner;
# the returned table replaces the one loaded above
# NOTE(review): the trailing 50 is presumably the repetition count -- confirm in utils.test_run
time_statistics_large = utils.test_run(
    test.filter_less_10,
    "filter_less_10_pd",
    "filter_less_10_spark",
    time_statistics_large,
    large_step_pd,
    large_step_spark,
    spark,
    50,
)

# write the updated table back to the same CSV (index column omitted)
time_statistics_large.to_csv("data/time_statistics_large.csv", index=False)

(50000, 20)
(100000, 20)
(150000, 20)
(200000, 20)
(250000, 20)
(300000, 20)
(350000, 20)
(400000, 20)
(450000, 20)
(500000, 20)
(550000, 20)
(600000, 20)
(650000, 20)
(700000, 20)
(750000, 20)
(800000, 20)
(850000, 20)
(900000, 20)
(950000, 20)
(1000000, 20)


#### Test 10: Join Dataframes

In [5]:
# Test 10, small step data: read the timing table collected so far
time_statistics_small = pd.read_csv("data/time_statistics_small.csv")

# hand test.join_df plus the pandas and Spark frames to the test runner;
# the returned table replaces the one loaded above
# NOTE(review): the trailing 5 is presumably the repetition count -- confirm in utils.test_run
time_statistics_small = utils.test_run(
    test.join_df,
    "join_pd",
    "join_spark",
    time_statistics_small,
    small_step_pd,
    small_step_spark,
    spark,
    5,
)

# write the updated table back to the same CSV (index column omitted)
time_statistics_small.to_csv("data/time_statistics_small.csv", index=False)


(10000, 20)
(20000, 20)
(30000, 20)
(40000, 20)
(50000, 20)


In [None]:
# Test 10, large step data: read the timing table collected so far
time_statistics_large = pd.read_csv("data/time_statistics_large.csv")

# hand test.join_df plus the pandas and Spark frames to the test runner;
# the returned table replaces the one loaded above
# NOTE(review): the trailing 3 is presumably the repetition count -- confirm in utils.test_run
time_statistics_large = utils.test_run(
    test.join_df,
    "join_pd",
    "join_spark",
    time_statistics_large,
    large_step_pd,
    large_step_spark,
    spark,
    3,
)

# write the updated table back to the same CSV (index column omitted)
time_statistics_large.to_csv("data/time_statistics_large.csv", index=False)

#### Test 11: Multiplication (Built-In)

In [11]:
# Test 11, small step data: read the timing table collected so far
time_statistics_small = pd.read_csv("data/time_statistics_small.csv")

# hand test.multiply_build_in plus the pandas and Spark frames to the test runner.
# The Spark session is passed before the trailing count, matching every other
# utils.test_run call in this notebook; without it, 100000 lands in the
# position where the other calls pass `spark`.
# NOTE(review): 100000 is presumably the repetition count -- confirm in utils.test_run
time_statistics_small = utils.test_run(
    test.multiply_build_in,
    "mul_build_pd",
    "mul_build_spark",
    time_statistics_small,
    small_step_pd,
    small_step_spark,
    spark,
    100000,
)

# write the updated table back to the same CSV (index column omitted)
time_statistics_small.to_csv("data/time_statistics_small.csv", index=False)

(10000, 20)
(20000, 20)
(30000, 20)
(40000, 20)
(50000, 20)
(60000, 20)
(70000, 20)
(80000, 20)
(90000, 20)
(100000, 20)


In [12]:
# Test 11, large step data: read the timing table collected so far
time_statistics_large = pd.read_csv("data/time_statistics_large.csv")

# hand test.multiply_build_in plus the pandas and Spark frames to the test runner.
# The Spark session is passed before the trailing count, matching every other
# utils.test_run call in this notebook; without it, 100000 lands in the
# position where the other calls pass `spark`.
# NOTE(review): 100000 is presumably the repetition count -- confirm in utils.test_run
time_statistics_large = utils.test_run(
    test.multiply_build_in,
    "mul_build_pd",
    "mul_build_spark",
    time_statistics_large,
    large_step_pd,
    large_step_spark,
    spark,
    100000,
)

# write the updated table back to the same CSV (index column omitted)
time_statistics_large.to_csv("data/time_statistics_large.csv", index=False)

(50000, 20)
(100000, 20)
(150000, 20)
(200000, 20)
(250000, 20)
(300000, 20)
(350000, 20)
(400000, 20)
(450000, 20)
(500000, 20)
(550000, 20)
(600000, 20)
(650000, 20)
(700000, 20)
(750000, 20)
(800000, 20)
(850000, 20)
(900000, 20)
(950000, 20)
(1000000, 20)


#### Test 12: Multiplication (Column)

In [16]:
# Test 12, small step data: read the timing table collected so far
time_statistics_small = pd.read_csv("data/time_statistics_small.csv")

# hand test.multiply_by_selection plus the pandas and Spark frames to the test runner;
# the returned table replaces the one loaded above
# NOTE(review): the trailing 1000 is presumably the repetition count -- confirm in utils.test_run
time_statistics_small = utils.test_run(
    test.multiply_by_selection,
    "mul_col_pd",
    "mul_col_spark",
    time_statistics_small,
    small_step_pd,
    small_step_spark,
    spark,
    1000,
)

# write the updated table back to the same CSV (index column omitted)
time_statistics_small.to_csv("data/time_statistics_small.csv", index=False)

(10000, 20)
(20000, 20)
(30000, 20)
(40000, 20)
(50000, 20)
(60000, 20)
(70000, 20)
(80000, 20)
(90000, 20)
(100000, 20)


In [17]:
# Test 12, large step data: read the timing table collected so far
time_statistics_large = pd.read_csv("data/time_statistics_large.csv")

# hand test.multiply_by_selection plus the pandas and Spark frames to the test runner;
# the returned table replaces the one loaded above
# NOTE(review): the trailing 500 is presumably the repetition count -- confirm in utils.test_run
time_statistics_large = utils.test_run(
    test.multiply_by_selection,
    "mul_col_pd",
    "mul_col_spark",
    time_statistics_large,
    large_step_pd,
    large_step_spark,
    spark,
    500,
)

# write the updated table back to the same CSV (index column omitted)
time_statistics_large.to_csv("data/time_statistics_large.csv", index=False)

(50000, 20)
(100000, 20)
(150000, 20)
(200000, 20)
(250000, 20)
(300000, 20)
(350000, 20)
(400000, 20)
(450000, 20)
(500000, 20)
(550000, 20)
(600000, 20)
(650000, 20)
(700000, 20)
(750000, 20)
(800000, 20)
(850000, 20)
(900000, 20)
(950000, 20)
(1000000, 20)


#### Test 13: Convert Dataframe

In [18]:
# Test 13, small step data: read the timing table collected so far
time_statistics_small = pd.read_csv("data/time_statistics_small.csv")

# hand test.convert_df plus the pandas and Spark frames to the test runner;
# the returned table replaces the one loaded above
# NOTE(review): the trailing 5 is presumably the repetition count -- confirm in utils.test_run
time_statistics_small = utils.test_run(
    test.convert_df,
    "pd_to_spark",
    "pyspark_to_pd",
    time_statistics_small,
    small_step_pd,
    small_step_spark,
    spark,
    5,
)

# write the updated table back to the same CSV (index column omitted)
time_statistics_small.to_csv("data/time_statistics_small.csv", index=False)

(10000, 20)
(20000, 20)
(30000, 20)
(40000, 20)
(50000, 20)
(60000, 20)
(70000, 20)
(80000, 20)
(90000, 20)
(100000, 20)


In [19]:
# Test 13, large step data: read the timing table collected so far
time_statistics_large = pd.read_csv("data/time_statistics_large.csv")

# hand test.convert_df plus the pandas and Spark frames to the test runner;
# the returned table replaces the one loaded above
# NOTE(review): the trailing 3 is presumably the repetition count -- confirm in utils.test_run
time_statistics_large = utils.test_run(
    test.convert_df,
    "pd_to_spark",
    "pyspark_to_pd",
    time_statistics_large,
    large_step_pd,
    large_step_spark,
    spark,
    3,
)

# write the updated table back to the same CSV (index column omitted)
time_statistics_large.to_csv("data/time_statistics_large.csv", index=False)

(50000, 20)
(100000, 20)
(150000, 20)


KeyboardInterrupt: 

### HELPER

In [16]:
# Stop the running session, then get or create one under the same app name
spark.stop()

spark = (
    SparkSession.builder
    .appName("Performance Analysis")
    .getOrCreate()
)

# Re-read the small step CSVs (10,000 to 100,000 rows in steps of 10,000)
small_step_spark = [
    spark.read.csv(f"data/small/pandas_test_{n_rows}_rows.csv", header=True, inferSchema=True, sep=",")
    for n_rows in range(10_000, 100_001, 10_000)
]

# Re-read the large step CSVs (50,000 to 1,000,000 rows in steps of 50,000)
large_step_spark = [
    spark.read.csv(f"data/large/pandas_test_{n_rows}_rows.csv", header=True, inferSchema=True)
    for n_rows in range(50_000, 1_000_001, 50_000)
]

                                                                                

To Do:
- run the write test (Test 1) on the large step data
- run the convert test (Test 13) on the large step data