# 1. Performance Testing Notebook

This notebook runs performance tests and analyzes the execution time of different operations in both pandas and PySpark.

## Imports

In [9]:
import pandas as pd
import utils
import test_functions as test

from pyspark.sql import SparkSession

## Load Data

### Load Pandas Dataframes 
execution time: 37 seconds

In [10]:
# "Small step" files: row counts 10,000 .. 100,000 in increments of 10,000.
small_step_pd = [
    pd.read_csv(f"data/small/pandas_test_{n_rows}_rows.csv")
    for n_rows in range(10_000, 100_001, 10_000)
]

# "Large step" files: row counts 50,000 .. 1,000,000 in increments of 50,000.
large_step_pd = [
    pd.read_csv(f"data/large/pandas_test_{n_rows}_rows.csv")
    for n_rows in range(50_000, 1_000_001, 50_000)
]

### Load Spark Dataframes 
execution time: 28 seconds

In [None]:
# Obtain a SparkSession named "Performance Analysis"; getOrCreate() hands back
# an already-running session when there is one instead of starting another.
spark = (
    SparkSession.builder
    .appName("Performance Analysis")
    .getOrCreate()
)

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/07/13 11:01:56 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [None]:
# Read every "small step" CSV as a Spark DataFrame (first line is the header,
# column types are inferred, comma-separated).
small_step_spark = [
    spark.read.csv(
        f"data/small/pandas_test_{n_rows}_rows.csv",
        header=True,
        inferSchema=True,
        sep=",",
    )
    for n_rows in range(10_000, 100_001, 10_000)
]

# Read every "large step" CSV the same way (header row, inferred schema).
large_step_spark = [
    spark.read.csv(
        f"data/large/pandas_test_{n_rows}_rows.csv",
        header=True,
        inferSchema=True,
    )
    for n_rows in range(50_000, 1_000_001, 50_000)
]

                                                                                

## Run Tests

#### Test 1: Write Dataframe to CSV

In [None]:
# Read the stored timing table for the small-step data.
time_statistics_small = pd.read_csv("data/time_statistics_small.csv")

# Run the write test through utils.test_run; its return value replaces the table.
# NOTE(review): presumably the two strings name the result columns and 5 is the
# number of repetitions per dataframe -- confirm against utils.test_run.
time_statistics_small = utils.test_run(
    test.write_data,
    "write_pd",
    "write_spark",
    time_statistics_small,
    small_step_pd,
    small_step_spark,
    spark,
    5,
)

# Write the table back to the same CSV, leaving out the index column.
time_statistics_small.to_csv("data/time_statistics_small.csv", index=False)

(10000, 20)
pandas_df to csv
spark_df to csv
pandas_df to csv
spark_df to csv
pandas_df to csv
spark_df to csv
pandas_df to csv
spark_df to csv
pandas_df to csv
spark_df to csv
(20000, 20)
pandas_df to csv
spark_df to csv
pandas_df to csv
spark_df to csv
pandas_df to csv
spark_df to csv
pandas_df to csv
spark_df to csv
pandas_df to csv
spark_df to csv
(30000, 20)
pandas_df to csv
spark_df to csv
pandas_df to csv
spark_df to csv
pandas_df to csv
spark_df to csv
pandas_df to csv
spark_df to csv
pandas_df to csv
spark_df to csv
(40000, 20)
pandas_df to csv
spark_df to csv
pandas_df to csv
spark_df to csv
pandas_df to csv
spark_df to csv
pandas_df to csv
spark_df to csv
pandas_df to csv
spark_df to csv
(50000, 20)
pandas_df to csv
spark_df to csv
pandas_df to csv
spark_df to csv
pandas_df to csv
spark_df to csv
pandas_df to csv
spark_df to csv
pandas_df to csv
spark_df to csv
(60000, 20)
pandas_df to csv
spark_df to csv
pandas_df to csv
spark_df to csv
pandas_df to csv
spark_df to csv
pand

23/07/03 22:07:53 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 95,00% for 8 writers


spark_df to csv
pandas_df to csv


23/07/03 22:07:56 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 95,00% for 8 writers


spark_df to csv
pandas_df to csv


23/07/03 22:07:59 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 95,00% for 8 writers


spark_df to csv
pandas_df to csv


23/07/03 22:08:02 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 95,00% for 8 writers


spark_df to csv
pandas_df to csv


23/07/03 22:08:05 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 95,00% for 8 writers


spark_df to csv


In [62]:
# Read the stored timing table for the large-step data.
time_statistics_large = pd.read_csv("data/time_statistics_large.csv")

# Only list positions 0, 1 and 2 are processed here; the author's note says the
# frames were handled in steps because of memory allocation issues.
# NOTE(review): the range was presumably edited by hand between runs to reach the
# remaining frames -- confirm before relying on Restart & Run All.
for idx in range(3):
    df_pd = large_step_pd[idx]
    df_spark = large_step_spark[idx]
    print(df_pd.shape)

    # utils.iterations yields four values; only the first two are kept.
    average_pandas_time, average_pyspark_time, _, _ = utils.iterations(
        test.write_data, 5, spark, df_pd, df_spark
    )

    # Pick the statistics row(s) whose row/column counts match this frame
    # and store both averages there.
    n_rows, n_cols = df_pd.shape
    matching_rows = (
        (time_statistics_large["row_count"] == n_rows)
        & (time_statistics_large["column_count"] == n_cols)
    )
    time_statistics_large.loc[matching_rows, ["write_pd", "write_spark"]] = [
        average_pandas_time,
        average_pyspark_time,
    ]

# Write the table back to the same CSV, leaving out the index column.
time_statistics_large.to_csv("data/time_statistics_large.csv", index=False)

(850000, 20)
pandas_df to csv


23/07/03 22:47:44 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 95,00% for 8 writers
23/07/03 22:47:44 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 84,44% for 9 writers
23/07/03 22:47:44 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 76,00% for 10 writers
23/07/03 22:47:44 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 69,09% for 11 writers
23/07/03 22:47:44 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 63,33% for 12 writers
23/07/03 22:47:44 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 58,46% for 13 writers
23/07/03 22:47:44 WARN MemoryManager: Total allocation exceeds 95,

spark_df to csv
pandas_df to csv


23/07/03 22:48:05 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 95,00% for 8 writers
23/07/03 22:48:05 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 84,44% for 9 writers
23/07/03 22:48:05 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 76,00% for 10 writers
23/07/03 22:48:05 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 69,09% for 11 writers
23/07/03 22:48:05 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 63,33% for 12 writers
23/07/03 22:48:05 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 58,46% for 13 writers
23/07/03 22:48:05 WARN MemoryManager: Total allocation exceeds 95,

spark_df to csv
pandas_df to csv


23/07/03 22:48:25 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 95,00% for 8 writers
23/07/03 22:48:25 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 84,44% for 9 writers
23/07/03 22:48:25 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 76,00% for 10 writers
23/07/03 22:48:25 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 69,09% for 11 writers
23/07/03 22:48:25 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 63,33% for 12 writers
23/07/03 22:48:25 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 58,46% for 13 writers
23/07/03 22:48:25 WARN MemoryManager: Total allocation exceeds 95,

spark_df to csv
pandas_df to csv


23/07/03 22:48:46 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 95,00% for 8 writers
23/07/03 22:48:46 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 84,44% for 9 writers
23/07/03 22:48:46 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 76,00% for 10 writers
23/07/03 22:48:46 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 69,09% for 11 writers
23/07/03 22:48:46 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 63,33% for 12 writers
23/07/03 22:48:46 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 58,46% for 13 writers
23/07/03 22:48:46 WARN MemoryManager: Total allocation exceeds 95,

spark_df to csv
pandas_df to csv


23/07/03 22:49:07 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 95,00% for 8 writers
23/07/03 22:49:07 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 84,44% for 9 writers
23/07/03 22:49:07 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 76,00% for 10 writers
23/07/03 22:49:07 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 69,09% for 11 writers
23/07/03 22:49:07 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 63,33% for 12 writers
23/07/03 22:49:07 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 58,46% for 13 writers
23/07/03 22:49:07 WARN MemoryManager: Total allocation exceeds 95,

spark_df to csv
(900000, 20)
pandas_df to csv


23/07/03 22:49:28 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 95,00% for 8 writers
23/07/03 22:49:28 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 84,44% for 9 writers
23/07/03 22:49:28 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 76,00% for 10 writers
23/07/03 22:49:28 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 69,09% for 11 writers
23/07/03 22:49:28 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 63,33% for 12 writers
23/07/03 22:49:28 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 58,46% for 13 writers
23/07/03 22:49:28 WARN MemoryManager: Total allocation exceeds 95,

spark_df to csv
pandas_df to csv


23/07/03 22:49:49 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 95,00% for 8 writers
23/07/03 22:49:49 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 84,44% for 9 writers
23/07/03 22:49:49 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 76,00% for 10 writers
23/07/03 22:49:49 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 69,09% for 11 writers
23/07/03 22:49:50 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 63,33% for 12 writers
23/07/03 22:49:50 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 58,46% for 13 writers
23/07/03 22:49:50 WARN MemoryManager: Total allocation exceeds 95,

spark_df to csv
pandas_df to csv


23/07/03 22:50:10 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 95,00% for 8 writers
23/07/03 22:50:10 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 84,44% for 9 writers
23/07/03 22:50:10 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 76,00% for 10 writers
23/07/03 22:50:10 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 69,09% for 11 writers
23/07/03 22:50:10 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 63,33% for 12 writers
23/07/03 22:50:10 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 58,46% for 13 writers
23/07/03 22:50:10 WARN MemoryManager: Total allocation exceeds 95,

spark_df to csv
pandas_df to csv


23/07/03 22:50:31 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 95,00% for 8 writers
23/07/03 22:50:31 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 84,44% for 9 writers
23/07/03 22:50:31 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 76,00% for 10 writers
23/07/03 22:50:31 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 69,09% for 11 writers
23/07/03 22:50:31 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 63,33% for 12 writers
23/07/03 22:50:31 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 58,46% for 13 writers
23/07/03 22:50:31 WARN MemoryManager: Total allocation exceeds 95,

spark_df to csv
pandas_df to csv


23/07/03 22:50:53 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 95,00% for 8 writers
23/07/03 22:50:53 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 84,44% for 9 writers
23/07/03 22:50:53 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 76,00% for 10 writers
23/07/03 22:50:53 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 69,09% for 11 writers
23/07/03 22:50:53 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 63,33% for 12 writers
23/07/03 22:50:53 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 58,46% for 13 writers
23/07/03 22:50:53 WARN MemoryManager: Total allocation exceeds 95,

spark_df to csv
(950000, 20)
pandas_df to csv


23/07/03 22:51:16 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 95,00% for 8 writers
23/07/03 22:51:16 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 84,44% for 9 writers
23/07/03 22:51:16 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 76,00% for 10 writers
23/07/03 22:51:16 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 69,09% for 11 writers
23/07/03 22:51:16 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 63,33% for 12 writers
23/07/03 22:51:16 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 58,46% for 13 writers
23/07/03 22:51:16 WARN MemoryManager: Total allocation exceeds 95,

spark_df to csv
pandas_df to csv


23/07/03 22:51:38 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 95,00% for 8 writers
23/07/03 22:51:38 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 84,44% for 9 writers
23/07/03 22:51:38 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 76,00% for 10 writers
23/07/03 22:51:38 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 69,09% for 11 writers
23/07/03 22:51:38 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 63,33% for 12 writers
23/07/03 22:51:38 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 58,46% for 13 writers
23/07/03 22:51:38 WARN MemoryManager: Total allocation exceeds 95,

spark_df to csv
pandas_df to csv


23/07/03 22:52:01 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 95,00% for 8 writers
23/07/03 22:52:01 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 84,44% for 9 writers
23/07/03 22:52:01 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 76,00% for 10 writers
23/07/03 22:52:01 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 69,09% for 11 writers
23/07/03 22:52:01 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 63,33% for 12 writers
23/07/03 22:52:01 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 58,46% for 13 writers
23/07/03 22:52:01 WARN MemoryManager: Total allocation exceeds 95,

spark_df to csv
pandas_df to csv


23/07/03 22:52:24 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 95,00% for 8 writers
23/07/03 22:52:24 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 84,44% for 9 writers
23/07/03 22:52:24 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 76,00% for 10 writers
23/07/03 22:52:24 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 69,09% for 11 writers
23/07/03 22:52:24 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 63,33% for 12 writers
23/07/03 22:52:24 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 58,46% for 13 writers
23/07/03 22:52:24 WARN MemoryManager: Total allocation exceeds 95,

spark_df to csv
pandas_df to csv


23/07/03 22:52:46 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 95,00% for 8 writers
23/07/03 22:52:46 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 84,44% for 9 writers
23/07/03 22:52:46 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 76,00% for 10 writers
23/07/03 22:52:46 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 69,09% for 11 writers
23/07/03 22:52:46 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 63,33% for 12 writers
23/07/03 22:52:46 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 58,46% for 13 writers
23/07/03 22:52:46 WARN MemoryManager: Total allocation exceeds 95,

spark_df to csv
(1000000, 20)
pandas_df to csv


23/07/03 22:53:09 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 95,00% for 8 writers
23/07/03 22:53:09 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 84,44% for 9 writers
23/07/03 22:53:09 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 76,00% for 10 writers
23/07/03 22:53:09 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 69,09% for 11 writers
23/07/03 22:53:09 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 63,33% for 12 writers
23/07/03 22:53:09 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 58,46% for 13 writers
23/07/03 22:53:09 WARN MemoryManager: Total allocation exceeds 95,

spark_df to csv
pandas_df to csv


23/07/03 22:53:32 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 95,00% for 8 writers
23/07/03 22:53:32 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 84,44% for 9 writers
23/07/03 22:53:32 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 76,00% for 10 writers
23/07/03 22:53:32 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 69,09% for 11 writers
23/07/03 22:53:32 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 63,33% for 12 writers
23/07/03 22:53:32 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 58,46% for 13 writers
23/07/03 22:53:32 WARN MemoryManager: Total allocation exceeds 95,

spark_df to csv
pandas_df to csv


23/07/03 22:53:55 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 95,00% for 8 writers
23/07/03 22:53:55 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 84,44% for 9 writers
23/07/03 22:53:55 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 76,00% for 10 writers
23/07/03 22:53:55 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 69,09% for 11 writers
23/07/03 22:53:55 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 63,33% for 12 writers
23/07/03 22:53:55 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 58,46% for 13 writers
23/07/03 22:53:55 WARN MemoryManager: Total allocation exceeds 95,

spark_df to csv
pandas_df to csv


23/07/03 22:54:17 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 95,00% for 8 writers
23/07/03 22:54:17 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 84,44% for 9 writers
23/07/03 22:54:17 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 76,00% for 10 writers
23/07/03 22:54:17 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 69,09% for 11 writers
23/07/03 22:54:17 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 63,33% for 12 writers
23/07/03 22:54:17 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 58,46% for 13 writers
23/07/03 22:54:17 WARN MemoryManager: Total allocation exceeds 95,

spark_df to csv
pandas_df to csv


23/07/03 22:54:40 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 95,00% for 8 writers
23/07/03 22:54:40 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 84,44% for 9 writers
23/07/03 22:54:40 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 76,00% for 10 writers
23/07/03 22:54:40 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 69,09% for 11 writers
23/07/03 22:54:40 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 63,33% for 12 writers
23/07/03 22:54:40 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 58,46% for 13 writers
23/07/03 22:54:40 WARN MemoryManager: Total allocation exceeds 95,

spark_df to csv


23/07/03 22:54:42 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 54,29% for 14 writers
23/07/03 22:54:42 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 58,46% for 13 writers
23/07/03 22:54:42 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 63,33% for 12 writers
23/07/03 22:54:42 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 69,09% for 11 writers
23/07/03 22:54:42 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 76,00% for 10 writers
23/07/03 22:54:42 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 84,44% for 9 writers
23/07/03 22:54:42 WARN MemoryManager: Total allocation exceeds 95

#### Test 2: Load Dataframe from CSV

In [6]:
# Read the stored timing table for the small-step data.
time_statistics_small = pd.read_csv("data/time_statistics_small.csv")

# Walk through the small-step pandas frames together with the Spark frame
# at the same list position.
for idx, df_pd in enumerate(small_step_pd):
    df_spark = small_step_spark[idx]
    print(df_pd.shape)

    # utils.iterations yields four values; only the first two are kept.
    average_pandas_time, average_pyspark_time, _, _ = utils.iterations(
        test.load_data, 5, spark, df_pd, df_spark
    )

    # Pick the statistics row(s) whose row/column counts match this frame
    # and store both averages there.
    n_rows, n_cols = df_pd.shape
    matching_rows = (
        (time_statistics_small["row_count"] == n_rows)
        & (time_statistics_small["column_count"] == n_cols)
    )
    time_statistics_small.loc[matching_rows, ["read_pd", "read_spark"]] = [
        average_pandas_time,
        average_pyspark_time,
    ]

# Write the table back to the same CSV, leaving out the index column.
time_statistics_small.to_csv("data/time_statistics_small.csv", index=False)

(10000, 20)
pandas_df to csv
spark_df to csv
pandas_df to csv
spark_df to csv
pandas_df to csv
spark_df to csv
pandas_df to csv
spark_df to csv
pandas_df to csv
spark_df to csv
(20000, 20)
pandas_df to csv
spark_df to csv
pandas_df to csv
spark_df to csv
pandas_df to csv
spark_df to csv
pandas_df to csv
spark_df to csv
pandas_df to csv
spark_df to csv
(30000, 20)
pandas_df to csv
spark_df to csv
pandas_df to csv
spark_df to csv
pandas_df to csv
spark_df to csv
pandas_df to csv
spark_df to csv
pandas_df to csv
spark_df to csv
(40000, 20)
pandas_df to csv
spark_df to csv
pandas_df to csv
spark_df to csv
pandas_df to csv
spark_df to csv
pandas_df to csv
spark_df to csv
pandas_df to csv
spark_df to csv
(50000, 20)
pandas_df to csv
spark_df to csv
pandas_df to csv
spark_df to csv
pandas_df to csv
spark_df to csv
pandas_df to csv
spark_df to csv
pandas_df to csv
spark_df to csv
(60000, 20)
pandas_df to csv
spark_df to csv
pandas_df to csv
spark_df to csv
pandas_df to csv
spark_df to csv
pand

In [8]:
# Read the stored timing table for the large-step data.
time_statistics_large = pd.read_csv("data/time_statistics_large.csv")

# Walk through every large-step pandas frame together with the Spark frame
# at the same list position.
# NOTE(review): an earlier comment here mentioned iterating in steps because of
# memory allocation issues, but this loop covers the whole list -- confirm that a
# full run fits in memory.
for idx, df_pd in enumerate(large_step_pd):
    df_spark = large_step_spark[idx]
    print(df_pd.shape)

    # utils.iterations yields four values; only the first two are kept.
    average_pandas_time, average_pyspark_time, _, _ = utils.iterations(
        test.load_data, 5, spark, df_pd, df_spark
    )

    # Pick the statistics row(s) whose row/column counts match this frame
    # and store both averages there.
    n_rows, n_cols = df_pd.shape
    matching_rows = (
        (time_statistics_large["row_count"] == n_rows)
        & (time_statistics_large["column_count"] == n_cols)
    )
    time_statistics_large.loc[matching_rows, ["read_pd", "read_spark"]] = [
        average_pandas_time,
        average_pyspark_time,
    ]

# Write the table back to the same CSV, leaving out the index column.
time_statistics_large.to_csv("data/time_statistics_large.csv", index=False)

(50000, 20)
pandas_df to csv
spark_df to csv
pandas_df to csv
spark_df to csv
pandas_df to csv
spark_df to csv
pandas_df to csv
spark_df to csv
pandas_df to csv
spark_df to csv
(100000, 20)
pandas_df to csv
spark_df to csv
pandas_df to csv
spark_df to csv
pandas_df to csv
spark_df to csv
pandas_df to csv
spark_df to csv
pandas_df to csv
spark_df to csv
(150000, 20)
pandas_df to csv
spark_df to csv
pandas_df to csv
spark_df to csv
pandas_df to csv
spark_df to csv
pandas_df to csv
spark_df to csv
pandas_df to csv
spark_df to csv
(200000, 20)
pandas_df to csv
spark_df to csv
pandas_df to csv
spark_df to csv
pandas_df to csv
spark_df to csv
pandas_df to csv
spark_df to csv
pandas_df to csv
spark_df to csv
(250000, 20)
pandas_df to csv
spark_df to csv
pandas_df to csv
spark_df to csv
pandas_df to csv
spark_df to csv
pandas_df to csv
spark_df to csv
pandas_df to csv
spark_df to csv
(300000, 20)
pandas_df to csv
spark_df to csv
pandas_df to csv
spark_df to csv
pandas_df to csv
spark_df to csv

                                                                                

spark_df to csv
pandas_df to csv


                                                                                

spark_df to csv
(500000, 20)
pandas_df to csv
spark_df to csv
pandas_df to csv
spark_df to csv
pandas_df to csv


                                                                                

spark_df to csv
pandas_df to csv
spark_df to csv
pandas_df to csv


                                                                                

spark_df to csv
(550000, 20)
pandas_df to csv


                                                                                

spark_df to csv
pandas_df to csv


                                                                                

spark_df to csv
pandas_df to csv


                                                                                

spark_df to csv
pandas_df to csv
spark_df to csv
pandas_df to csv
spark_df to csv
(600000, 20)
pandas_df to csv
spark_df to csv
pandas_df to csv


                                                                                

spark_df to csv
pandas_df to csv


                                                                                

spark_df to csv
pandas_df to csv


                                                                                

spark_df to csv
pandas_df to csv


                                                                                

spark_df to csv
(650000, 20)
pandas_df to csv


                                                                                

spark_df to csv
pandas_df to csv


                                                                                

spark_df to csv
pandas_df to csv


                                                                                

spark_df to csv
pandas_df to csv


                                                                                

spark_df to csv
pandas_df to csv


                                                                                

spark_df to csv
(700000, 20)
pandas_df to csv


                                                                                

spark_df to csv
pandas_df to csv


                                                                                

spark_df to csv
pandas_df to csv


                                                                                

spark_df to csv
pandas_df to csv


                                                                                

spark_df to csv
pandas_df to csv


                                                                                

spark_df to csv
(750000, 20)
pandas_df to csv


                                                                                

spark_df to csv
pandas_df to csv


                                                                                

spark_df to csv
pandas_df to csv


                                                                                

spark_df to csv
pandas_df to csv


                                                                                

spark_df to csv
pandas_df to csv


                                                                                

spark_df to csv
(800000, 20)
pandas_df to csv


                                                                                

spark_df to csv
pandas_df to csv


                                                                                

spark_df to csv
pandas_df to csv


                                                                                

spark_df to csv
pandas_df to csv


                                                                                

spark_df to csv
pandas_df to csv


                                                                                

spark_df to csv
(850000, 20)
pandas_df to csv


                                                                                

spark_df to csv
pandas_df to csv


                                                                                

spark_df to csv
pandas_df to csv


                                                                                

spark_df to csv
pandas_df to csv


                                                                                

spark_df to csv
pandas_df to csv


                                                                                

spark_df to csv
(900000, 20)
pandas_df to csv


                                                                                

spark_df to csv
pandas_df to csv


                                                                                

spark_df to csv
pandas_df to csv


                                                                                

spark_df to csv
pandas_df to csv


                                                                                

spark_df to csv
pandas_df to csv


                                                                                

spark_df to csv
(950000, 20)
pandas_df to csv


                                                                                

spark_df to csv
pandas_df to csv


                                                                                

spark_df to csv
pandas_df to csv


                                                                                

spark_df to csv
pandas_df to csv


                                                                                

spark_df to csv
pandas_df to csv


                                                                                

spark_df to csv
(1000000, 20)
pandas_df to csv


                                                                                

spark_df to csv
pandas_df to csv


                                                                                

spark_df to csv
pandas_df to csv


                                                                                

spark_df to csv
pandas_df to csv


                                                                                

spark_df to csv
pandas_df to csv


[Stage 361:>                                                      (0 + 16) / 16]

spark_df to csv


                                                                                

#### Test 3: Drop NaN Values

In [6]:
# Pick up the timings recorded so far for the small-step datasets.
time_statistics_small = pd.read_csv("data/time_statistics_small.csv")

# Run the drop-NaN test over every small-step frame, pandas vs. Spark.
# NOTE(review): the final argument (100) looks like a per-dataset repetition
# count — confirm against utils.test_run.
time_statistics_small = utils.test_run(
    test.drop_nan,
    "drop_na_pd",
    "drop_na_spark",
    time_statistics_small,
    small_step_pd,
    small_step_spark,
    spark,
    100,
)

# Write the updated table back to the same file, without the DataFrame index.
time_statistics_small.to_csv("data/time_statistics_small.csv", index=False)

(10000, 20)
(20000, 20)
(30000, 20)
(40000, 20)
(50000, 20)
(60000, 20)
(70000, 20)
(80000, 20)
(90000, 20)
(100000, 20)


In [7]:
# Pick up the timings recorded so far for the large-step datasets.
time_statistics_large = pd.read_csv("data/time_statistics_large.csv")

# Run the drop-NaN test over every large-step frame, pandas vs. Spark.
# NOTE(review): the final argument (100) looks like a per-dataset repetition
# count — confirm against utils.test_run.
time_statistics_large = utils.test_run(
    test.drop_nan,
    "drop_na_pd",
    "drop_na_spark",
    time_statistics_large,
    large_step_pd,
    large_step_spark,
    spark,
    100,
)

# Write the updated table back to the same file, without the DataFrame index.
time_statistics_large.to_csv("data/time_statistics_large.csv", index=False)

(50000, 20)
(100000, 20)
(150000, 20)
(200000, 20)
(250000, 20)
(300000, 20)
(350000, 20)
(400000, 20)
(450000, 20)
(500000, 20)
(550000, 20)
(600000, 20)
(650000, 20)
(700000, 20)
(750000, 20)
(800000, 20)
(850000, 20)
(900000, 20)
(950000, 20)
(1000000, 20)


#### Test 4: Fill NaN Values

In [8]:
# Pick up the timings recorded so far for the small-step datasets.
time_statistics_small = pd.read_csv("data/time_statistics_small.csv")

# Run the fill-NaN test over every small-step frame, pandas vs. Spark.
# NOTE(review): the final argument (100) looks like a per-dataset repetition
# count — confirm against utils.test_run.
time_statistics_small = utils.test_run(
    test.fill_nan,
    "fill_na_pd",
    "fill_na_spark",
    time_statistics_small,
    small_step_pd,
    small_step_spark,
    spark,
    100,
)

# Write the updated table back to the same file, without the DataFrame index.
time_statistics_small.to_csv("data/time_statistics_small.csv", index=False)

(10000, 20)
(20000, 20)
(30000, 20)
(40000, 20)
(50000, 20)
(60000, 20)
(70000, 20)
(80000, 20)
(90000, 20)
(100000, 20)


In [None]:
# Pick up the timings recorded so far for the large-step datasets.
time_statistics_large = pd.read_csv("data/time_statistics_large.csv")

# Run the fill-NaN test over every large-step frame, pandas vs. Spark.
# NOTE(review): the final argument (50) looks like a per-dataset repetition
# count — confirm against utils.test_run.
time_statistics_large = utils.test_run(
    test.fill_nan,
    "fill_na_pd",
    "fill_na_spark",
    time_statistics_large,
    large_step_pd,
    large_step_spark,
    spark,
    50,
)

# Write the updated table back to the same file, without the DataFrame index.
time_statistics_large.to_csv("data/time_statistics_large.csv", index=False)

(50000, 20)
(100000, 20)
(150000, 20)
(200000, 20)
(250000, 20)
(300000, 20)
(350000, 20)
(400000, 20)
(450000, 20)
(500000, 20)
(550000, 20)
(600000, 20)
(650000, 20)
(700000, 20)
(750000, 20)
(800000, 20)
(850000, 20)
(900000, 20)
(950000, 20)
(1000000, 20)


#### Test 5: Groupby

In [8]:
# Pick up the timings recorded so far for the small-step datasets.
time_statistics_small = pd.read_csv("data/time_statistics_small.csv")

# Run the group-by test over every small-step frame, pandas vs. Spark.
# NOTE(review): the final argument (100) looks like a per-dataset repetition
# count — confirm against utils.test_run.
time_statistics_small = utils.test_run(
    test.group_df,
    "group_pd",
    "group_spark",
    time_statistics_small,
    small_step_pd,
    small_step_spark,
    spark,
    100,
)

# Write the updated table back to the same file, without the DataFrame index.
time_statistics_small.to_csv("data/time_statistics_small.csv", index=False)

(10000, 20)
(20000, 20)
(30000, 20)
(40000, 20)
(50000, 20)
(60000, 20)
(70000, 20)
(80000, 20)
(90000, 20)
(100000, 20)


In [9]:
# Pick up the timings recorded so far for the large-step datasets.
time_statistics_large = pd.read_csv("data/time_statistics_large.csv")

# Run the group-by test over every large-step frame, pandas vs. Spark.
# NOTE(review): the final argument (100) looks like a per-dataset repetition
# count — confirm against utils.test_run.
time_statistics_large = utils.test_run(
    test.group_df,
    "group_pd",
    "group_spark",
    time_statistics_large,
    large_step_pd,
    large_step_spark,
    spark,
    100,
)

# Write the updated table back to the same file, without the DataFrame index.
time_statistics_large.to_csv("data/time_statistics_large.csv", index=False)

(50000, 20)
(100000, 20)
(150000, 20)
(200000, 20)
(250000, 20)
(300000, 20)
(350000, 20)
(400000, 20)
(450000, 20)
(500000, 20)
(550000, 20)
(600000, 20)
(650000, 20)
(700000, 20)
(750000, 20)
(800000, 20)
(850000, 20)
(900000, 20)
(950000, 20)
(1000000, 20)


#### Test 6: GroupBy and Sum

In [8]:
# Pick up the timings recorded so far for the small-step datasets.
time_statistics_small = pd.read_csv("data/time_statistics_small.csv")

# Run the group-by-and-sum test over every small-step frame, pandas vs. Spark.
# NOTE(review): the final argument (50) looks like a per-dataset repetition
# count — confirm against utils.test_run.
time_statistics_small = utils.test_run(
    test.group_sum_df,
    "group_sum_pd",
    "group_sum_spark",
    time_statistics_small,
    small_step_pd,
    small_step_spark,
    spark,
    50,
)

# Write the updated table back to the same file, without the DataFrame index.
time_statistics_small.to_csv("data/time_statistics_small.csv", index=False)

(10000, 20)
(20000, 20)
(30000, 20)
(40000, 20)
(50000, 20)
(60000, 20)
(70000, 20)
(80000, 20)
(90000, 20)
(100000, 20)


In [9]:
# Pick up the timings recorded so far for the large-step datasets.
time_statistics_large = pd.read_csv("data/time_statistics_large.csv")

# Run the group-by-and-sum test over every large-step frame, pandas vs. Spark.
# NOTE(review): the final argument (30) looks like a per-dataset repetition
# count — confirm against utils.test_run.
time_statistics_large = utils.test_run(
    test.group_sum_df,
    "group_sum_pd",
    "group_sum_spark",
    time_statistics_large,
    large_step_pd,
    large_step_spark,
    spark,
    30,
)

# Write the updated table back to the same file, without the DataFrame index.
time_statistics_large.to_csv("data/time_statistics_large.csv", index=False)

(50000, 20)
(100000, 20)
(150000, 20)
(200000, 20)
(250000, 20)
(300000, 20)
(350000, 20)
(400000, 20)
(450000, 20)
(500000, 20)
(550000, 20)
(600000, 20)
(650000, 20)
(700000, 20)
(750000, 20)
(800000, 20)
(850000, 20)
(900000, 20)
(950000, 20)
(1000000, 20)


#### Test 7: GroupBy and Count

In [6]:
# Pick up the timings recorded so far for the small-step datasets.
time_statistics_small = pd.read_csv("data/time_statistics_small.csv")

# Run the group-by-and-count test over every small-step frame, pandas vs. Spark.
# NOTE(review): the final argument (50) looks like a per-dataset repetition
# count — confirm against utils.test_run.
time_statistics_small = utils.test_run(
    test.group_count_df,
    "group_count_pd",
    "group_count_spark",
    time_statistics_small,
    small_step_pd,
    small_step_spark,
    spark,
    50,
)

# Write the updated table back to the same file, without the DataFrame index.
time_statistics_small.to_csv("data/time_statistics_small.csv", index=False)

(10000, 20)
(20000, 20)
(30000, 20)
(40000, 20)
(50000, 20)
(60000, 20)
(70000, 20)
(80000, 20)
(90000, 20)
(100000, 20)


In [9]:
# Pick up the timings recorded so far for the large-step datasets.
time_statistics_large = pd.read_csv("data/time_statistics_large.csv")

# Run the group-by-and-count test over every large-step frame, pandas vs. Spark.
# NOTE(review): the final argument (30) looks like a per-dataset repetition
# count — confirm against utils.test_run.
time_statistics_large = utils.test_run(
    test.group_count_df,
    "group_count_pd",
    "group_count_spark",
    time_statistics_large,
    large_step_pd,
    large_step_spark,
    spark,
    30,
)

# Write the updated table back to the same file, without the DataFrame index.
time_statistics_large.to_csv("data/time_statistics_large.csv", index=False)

(50000, 20)
(100000, 20)
(150000, 20)
(200000, 20)
(250000, 20)
(300000, 20)
(350000, 20)
(400000, 20)
(450000, 20)
(500000, 20)
(550000, 20)
(600000, 20)
(650000, 20)
(700000, 20)
(750000, 20)
(800000, 20)
(850000, 20)
(900000, 20)
(950000, 20)
(1000000, 20)


#### Test 8: Filter by Column Value (under 0)

In [8]:
# Pick up the timings recorded so far for the small-step datasets.
time_statistics_small = pd.read_csv("data/time_statistics_small.csv")

# Run the "filter values under 0" test over every small-step frame, pandas vs. Spark.
# NOTE(review): the final argument (100) looks like a per-dataset repetition
# count — confirm against utils.test_run.
time_statistics_small = utils.test_run(
    test.filter_less_0,
    "filter_less_0_pd",
    "filter_less_0_spark",
    time_statistics_small,
    small_step_pd,
    small_step_spark,
    spark,
    100,
)

# Write the updated table back to the same file, without the DataFrame index.
time_statistics_small.to_csv("data/time_statistics_small.csv", index=False)

(10000, 20)
(20000, 20)
(30000, 20)
(40000, 20)
(50000, 20)
(60000, 20)
(70000, 20)
(80000, 20)
(90000, 20)
(100000, 20)


In [10]:
# Pick up the timings recorded so far for the large-step datasets.
time_statistics_large = pd.read_csv("data/time_statistics_large.csv")

# Run the "filter values under 0" test over every large-step frame, pandas vs. Spark.
# NOTE(review): the final argument (50) looks like a per-dataset repetition
# count — confirm against utils.test_run.
time_statistics_large = utils.test_run(
    test.filter_less_0,
    "filter_less_0_pd",
    "filter_less_0_spark",
    time_statistics_large,
    large_step_pd,
    large_step_spark,
    spark,
    50,
)

# Write the updated table back to the same file, without the DataFrame index.
time_statistics_large.to_csv("data/time_statistics_large.csv", index=False)

(50000, 20)
(100000, 20)
(150000, 20)
(200000, 20)
(250000, 20)
(300000, 20)
(350000, 20)
(400000, 20)
(450000, 20)
(500000, 20)
(550000, 20)
(600000, 20)
(650000, 20)
(700000, 20)
(750000, 20)
(800000, 20)
(850000, 20)
(900000, 20)
(950000, 20)
(1000000, 20)


#### Test 9: Filter by Column Value (under 10)

In [9]:
# Pick up the timings recorded so far for the small-step datasets.
time_statistics_small = pd.read_csv("data/time_statistics_small.csv")

# Run the "filter values under 10" test over every small-step frame, pandas vs. Spark.
# NOTE(review): the final argument (100) looks like a per-dataset repetition
# count — confirm against utils.test_run.
time_statistics_small = utils.test_run(
    test.filter_less_10,
    "filter_less_10_pd",
    "filter_less_10_spark",
    time_statistics_small,
    small_step_pd,
    small_step_spark,
    spark,
    100,
)

# Write the updated table back to the same file, without the DataFrame index.
time_statistics_small.to_csv("data/time_statistics_small.csv", index=False)

(10000, 20)
(20000, 20)
(30000, 20)
(40000, 20)
(50000, 20)
(60000, 20)
(70000, 20)
(80000, 20)
(90000, 20)
(100000, 20)


In [None]:
# Pick up the timings recorded so far for the large-step datasets.
time_statistics_large = pd.read_csv("data/time_statistics_large.csv")

# Run the "filter values under 10" test over every large-step frame, pandas vs. Spark.
# NOTE(review): the final argument (50) looks like a per-dataset repetition
# count — confirm against utils.test_run.
time_statistics_large = utils.test_run(
    test.filter_less_10,
    "filter_less_10_pd",
    "filter_less_10_spark",
    time_statistics_large,
    large_step_pd,
    large_step_spark,
    spark,
    50,
)

# Write the updated table back to the same file, without the DataFrame index.
time_statistics_large.to_csv("data/time_statistics_large.csv", index=False)

(50000, 20)
(100000, 20)
(150000, 20)
(200000, 20)
(250000, 20)
(300000, 20)
(350000, 20)
(400000, 20)
(450000, 20)
(500000, 20)
(550000, 20)
(600000, 20)
(650000, 20)
(700000, 20)
(750000, 20)
(800000, 20)
(850000, 20)
(900000, 20)
(950000, 20)
(1000000, 20)


#### Test 10: Join Dataframes

In [5]:
# Pick up the timings recorded so far for the small-step datasets.
time_statistics_small = pd.read_csv("data/time_statistics_small.csv")

# Run the join test over every small-step frame, pandas vs. Spark.
# NOTE(review): the final argument (5) looks like a per-dataset repetition
# count — confirm against utils.test_run.
time_statistics_small = utils.test_run(
    test.join_df,
    "join_pd",
    "join_spark",
    time_statistics_small,
    small_step_pd,
    small_step_spark,
    spark,
    5,
)

# Write the updated table back to the same file, without the DataFrame index.
time_statistics_small.to_csv("data/time_statistics_small.csv", index=False)


(10000, 20)
(20000, 20)
(30000, 20)
(40000, 20)
(50000, 20)


In [None]:
# Pick up the timings recorded so far for the large-step datasets.
time_statistics_large = pd.read_csv("data/time_statistics_large.csv")

# Run the join test over every large-step frame, pandas vs. Spark.
# NOTE(review): the final argument (3) looks like a per-dataset repetition
# count — confirm against utils.test_run.
time_statistics_large = utils.test_run(
    test.join_df,
    "join_pd",
    "join_spark",
    time_statistics_large,
    large_step_pd,
    large_step_spark,
    spark,
    3,
)

# Write the updated table back to the same file, without the DataFrame index.
time_statistics_large.to_csv("data/time_statistics_large.csv", index=False)

#### Test 11: Multiplication (Built-In)

In [11]:
# Load the timings recorded so far for the small-step datasets.
time_statistics_small = pd.read_csv("data/time_statistics_small.csv")

# Run the built-in multiplication test over every small-step frame, pandas vs. Spark.
# The Spark session is passed in the slot right before the trailing number, as in
# every other utils.test_run call in this notebook; without it, 100000 ends up in
# the session slot.
# NOTE(review): the final argument (100000) looks like a per-dataset repetition
# count — confirm against utils.test_run.
time_statistics_small = utils.test_run(
    test.multiply_build_in,
    "mul_build_pd",
    "mul_build_spark",
    time_statistics_small,
    small_step_pd,
    small_step_spark,
    spark,
    100000,
)

# Write the updated table back to the same file, without the DataFrame index.
time_statistics_small.to_csv("data/time_statistics_small.csv", index=False)

(10000, 20)
(20000, 20)
(30000, 20)
(40000, 20)
(50000, 20)
(60000, 20)
(70000, 20)
(80000, 20)
(90000, 20)
(100000, 20)


In [12]:
# Load the timings recorded so far for the large-step datasets.
time_statistics_large = pd.read_csv("data/time_statistics_large.csv")

# Run the built-in multiplication test over every large-step frame, pandas vs. Spark.
# The Spark session is passed in the slot right before the trailing number, as in
# every other utils.test_run call in this notebook; without it, 100000 ends up in
# the session slot.
# NOTE(review): the final argument (100000) looks like a per-dataset repetition
# count — confirm against utils.test_run.
time_statistics_large = utils.test_run(
    test.multiply_build_in,
    "mul_build_pd",
    "mul_build_spark",
    time_statistics_large,
    large_step_pd,
    large_step_spark,
    spark,
    100000,
)

# Write the updated table back to the same file, without the DataFrame index.
time_statistics_large.to_csv("data/time_statistics_large.csv", index=False)

(50000, 20)
(100000, 20)
(150000, 20)
(200000, 20)
(250000, 20)
(300000, 20)
(350000, 20)
(400000, 20)
(450000, 20)
(500000, 20)
(550000, 20)
(600000, 20)
(650000, 20)
(700000, 20)
(750000, 20)
(800000, 20)
(850000, 20)
(900000, 20)
(950000, 20)
(1000000, 20)


#### Test 12: Multiplication (Column)

In [16]:
# Pick up the timings recorded so far for the small-step datasets.
time_statistics_small = pd.read_csv("data/time_statistics_small.csv")

# Run the column-multiplication test over every small-step frame, pandas vs. Spark.
# NOTE(review): the final argument (1000) looks like a per-dataset repetition
# count — confirm against utils.test_run.
time_statistics_small = utils.test_run(
    test.multiply_by_selection,
    "mul_col_pd",
    "mul_col_spark",
    time_statistics_small,
    small_step_pd,
    small_step_spark,
    spark,
    1000,
)

# Write the updated table back to the same file, without the DataFrame index.
time_statistics_small.to_csv("data/time_statistics_small.csv", index=False)

(10000, 20)
(20000, 20)
(30000, 20)
(40000, 20)
(50000, 20)
(60000, 20)
(70000, 20)
(80000, 20)
(90000, 20)
(100000, 20)


In [17]:
# Pick up the timings recorded so far for the large-step datasets.
time_statistics_large = pd.read_csv("data/time_statistics_large.csv")

# Run the column-multiplication test over every large-step frame, pandas vs. Spark.
# NOTE(review): the final argument (500) looks like a per-dataset repetition
# count — confirm against utils.test_run.
time_statistics_large = utils.test_run(
    test.multiply_by_selection,
    "mul_col_pd",
    "mul_col_spark",
    time_statistics_large,
    large_step_pd,
    large_step_spark,
    spark,
    500,
)

# Write the updated table back to the same file, without the DataFrame index.
time_statistics_large.to_csv("data/time_statistics_large.csv", index=False)

(50000, 20)
(100000, 20)
(150000, 20)
(200000, 20)
(250000, 20)
(300000, 20)
(350000, 20)
(400000, 20)
(450000, 20)
(500000, 20)
(550000, 20)
(600000, 20)
(650000, 20)
(700000, 20)
(750000, 20)
(800000, 20)
(850000, 20)
(900000, 20)
(950000, 20)
(1000000, 20)


#### Test 13: Convert Dataframe

In [18]:
# Load the timings recorded so far for the small-step datasets.
time_statistics_small = pd.read_csv("data/time_statistics_small.csv")

# Run the DataFrame conversion test over every small-step frame.
# The first label is "pd_to_pyspark" (not "pd_to_spark") so it matches the column
# already present in data/time_statistics_small.csv and pairs with "pyspark_to_pd".
# NOTE(review): assumes utils.test_run files results under these labels — confirm.
# NOTE(review): the final argument (5) looks like a per-dataset repetition
# count — confirm against utils.test_run.
time_statistics_small = utils.test_run(
    test.convert_df,
    "pd_to_pyspark",
    "pyspark_to_pd",
    time_statistics_small,
    small_step_pd,
    small_step_spark,
    spark,
    5,
)

# Write the updated table back to the same file, without the DataFrame index.
time_statistics_small.to_csv("data/time_statistics_small.csv", index=False)

(10000, 20)
(20000, 20)
(30000, 20)
(40000, 20)
(50000, 20)
(60000, 20)
(70000, 20)
(80000, 20)
(90000, 20)
(100000, 20)


KeyboardInterrupt: 

In [19]:
# Load the timings recorded so far for the large-step datasets.
time_statistics_large = pd.read_csv("data/time_statistics_large.csv")

# Run the DataFrame conversion test over every large-step frame.
# The first label is "pd_to_pyspark" (not "pd_to_spark") so it pairs with
# "pyspark_to_pd" and matches the column name used in the small-step table.
# NOTE(review): assumes the large-step table uses the same column names as the
# small-step one and that utils.test_run files results under these labels — confirm.
# NOTE(review): the final argument (1) looks like a per-dataset repetition
# count — confirm against utils.test_run.
time_statistics_large = utils.test_run(
    test.convert_df,
    "pd_to_pyspark",
    "pyspark_to_pd",
    time_statistics_large,
    large_step_pd,
    large_step_spark,
    spark,
    1,
)

# Write the updated table back to the same file, without the DataFrame index.
time_statistics_large.to_csv("data/time_statistics_large.csv", index=False)

(50000, 20)
(100000, 20)
(150000, 20)


KeyboardInterrupt: 

### HELPER

In [None]:
# Stop the current Spark session and build a new one under the same app name.
spark.stop()

# Create a SparkSession
spark = (
    SparkSession.builder
    .appName("Performance Analysis")
    .getOrCreate()
)

# DataFrames created through the stopped session cannot be used any more, so BOTH
# lists are rebuilt against the new session. Reloading only the large-step list
# would leave small_step_spark pointing at the stopped session.

# load small step data (10k..100k rows in 10k steps)
small_step_spark = []
for i in range(10_000, 100_001, 10_000):
    df = spark.read.csv(f"data/small/pandas_test_{i}_rows.csv", header=True, inferSchema=True, sep=",")
    small_step_spark.append(df)

# load large step data (50k..1M rows in 50k steps)
large_step_spark = []
for i in range(50_000, 1_000_001, 50_000):
    df = spark.read.csv(f"data/large/pandas_test_{i}_rows.csv", header=True, inferSchema=True)
    large_step_spark.append(df)

                                                                                

In [4]:
# Reload the small-step timing table from disk for manual clean-up.
time_statistics_small = pd.read_csv(filepath_or_buffer="data/time_statistics_small.csv")


In [6]:
# Remove the write/read result columns (pandas and Spark variants) from the
# small-step table. errors="ignore" keeps this cell re-runnable: once the columns
# are gone, running it again is a no-op instead of raising KeyError.
time_statistics_small = time_statistics_small.drop(
    columns=["write_pd", "write_spark", "read_pd", "read_spark"],
    errors="ignore",
)

In [7]:
# Display the small-step table for inspection before it is written back to disk.
time_statistics_small

Unnamed: 0,row_count,column_count,drop_na_pd,drop_na_spark,fill_na_pd,fill_na_spark,group_pd,group_spark,group_sum_pd,group_sum_spark,...,filter_less_10_pd,filter_less_10_spark,join_pd,join_spark,mul_build_pd,mul_build_spark,mul_col_pd,mul_col_spark,pd_to_pyspark,pyspark_to_pd
0,10000,20,2273045.35,6127367.31,930211.45,11401758.28,100231.27,2546892.11,7201648.04,6930286.58,...,369474.87,1626980.56,667079400.0,10302926.4,494447.2,3904174.0,294690.746,2660812.946,1350945000.0,566775800.0
1,20000,20,3412051.71,6261533.34,1725123.9,8619587.99,107764.62,2516342.15,15881935.84,7038621.18,...,360388.72,1455366.93,2757284000.0,7913321.8,400458.2,3669802.8,355983.59,2923234.584,2320653000.0,425219600.0
2,30000,20,2987325.82,5147554.41,2597924.96,8275803.16,82897.48,2087834.89,19820452.32,7044692.7,...,319890.13,1307907.29,6277202000.0,7275263.0,727355.6,8246784.8,353742.482,2728613.477,3372815000.0,498583100.0
3,40000,20,3717455.12,5243979.08,3450599.02,7734008.96,112366.11,2559911.11,25808129.34,7087528.32,...,422855.56,1365134.79,11784900000.0,7722078.2,516536.4,4488748.6,410364.53,3007573.881,4287349000.0,553549200.0
4,50000,20,4669801.2,5245320.74,4365054.36,8012003.86,114761.06,2757335.8,33218271.86,7100801.98,...,297103.62,1034083.87,25401350000.0,24746986.8,424197.4,8857106.2,412987.833,2774710.565,5893922000.0,670083000.0
5,60000,20,5296804.84,5731909.75,4578323.31,8047922.16,96016.01,2224244.43,38010596.3,6692867.72,...,295778.19,1390522.96,36907610000.0,36499633.2,381444.4,4538409.8,400499.1,2608809.617,5962499000.0,685936200.0
6,70000,20,5076857.11,4683326.08,5794457.36,8014643.13,78869.55,2020267.91,40556381.84,6953347.44,...,311670.39,1062952.36,51181990000.0,39077798.8,486232.8,2917980.6,542694.18,3038352.58,7880144000.0,763618400.0
7,80000,20,6817160.31,5749304.94,6753651.55,8162628.19,87658.71,2095853.4,50931398.32,7887529.06,...,472910.05,1392389.67,67899000000.0,33089260.6,515857.0,3233960.0,449705.629,2523107.487,8692141000.0,872458500.0
8,90000,20,6786482.0,5381216.38,7470260.19,7676873.22,92142.78,2321904.72,52956586.8,7199329.08,...,567072.46,1792351.27,89490090000.0,39015719.0,692149.4,3786643.2,469702.901,2507444.073,9777973000.0,1071158000.0
9,100000,20,7762776.87,5912820.19,8208674.65,7851123.79,94323.88,2243208.52,61432439.1,7763827.66,...,523807.84,1664374.66,115635900000.0,82607634.6,624393.2,3220083.4,557645.332,2829119.802,11339900000.0,1030080000.0


In [8]:
# Write the small-step table back to its CSV file, without the DataFrame index.
time_statistics_small.to_csv(path_or_buf="data/time_statistics_small.csv", index=False)