# Pandas File Format Benchmarking

## Set up and collect benchmark results

In [None]:
import numpy as np
import pandas as pd

number_of_repeats = 20  # Number of times each benchmark is repeated

# Generate data
# Seed NumPy's global RNG by *calling* seed(); assigning to `np.random.seed`
# would replace the function with an int and leave the data unseeded.
np.random.seed(42)
DF_SIZE = 1_000_000  # Number of rows in the benchmark DataFrame

# Five float columns ('a'..'e'), each filled with DF_SIZE draws from
# np.random.rand; columns are generated in alphabetical order.
DF = pd.DataFrame({
    column: np.random.rand(DF_SIZE)
    for column in 'abcde'
})

In [None]:
# Run the benchmark on DF and gather the measurements into a DataFrame
from format_benchmark_tool.format_benchmark_tool import FormatBenchmarkTool

benchmark_tool = FormatBenchmarkTool(DF, number_of_repeats=number_of_repeats)
results = pd.DataFrame(benchmark_tool.get_results())
# results  # uncomment to display the raw results table in the notebook

## Analyze results and draw plots

In [None]:
# Setup: plotting libraries and a shared visual theme for the plots
import seaborn as sns
import matplotlib.pyplot as plt
# Apply the seaborn theme using its "colorblind" colour palette
sns.set_theme(palette="colorblind")

In [None]:
# Prepare time data: reshape `results` into long form with one row per
# (format, measurement) pair so seaborn can group the bars by measurement type.
# Melting directly avoids concatenating onto an empty object-dtype frame, which
# pandas deprecates and which can leave the 'time' column non-numeric.
time_data = results.melt(
    id_vars=['format'],
    value_vars=['write_time', 'read_time'],
    var_name='type',
    value_name='time',
)
# Replace the source column names with the labels shown in the legend
time_data['type'] = time_data['type'].map({
    'write_time': 'Write time',
    'read_time': 'Read time',
})
# Keep the same column order as before: format, time, type
time_data = time_data[['format', 'time', 'type']]

# Plot minimum write and read times (with confidence intervals) per file format as barplot
ax = sns.barplot(data=time_data, x="time", y="format", hue='type', estimator='min', errorbar="ci")
ax.set_title(f"Minimum write and read times per file format with confidence intervals; {number_of_repeats} trials")
ax.xaxis.set_label_text("Minimum time [s]")
ax.yaxis.set_label_text("File format")
plt.legend(title='Method')

# Label each bar with its value. Iterating the bar containers (one per hue
# level) labels only real bars, whereas `ax.patches` may also hold non-bar
# patches, and formatting via `fmt` does not depend on the width being a
# NumPy scalar with a `.round` method.
for bar_group in ax.containers:
    ax.bar_label(bar_group, fmt='%.5f', padding=10, fontsize=10)

plt.show()

In [None]:
# Plot file sizes: one point per file format, with no line joining the points
# NOTE(review): `join` appears to be deprecated in newer seaborn releases in
# favour of linestyle="none" — confirm against the installed version before switching.
ax = sns.pointplot(data=results, x='file_size', y='format', errorbar='ci', join=False)
ax.set_title("Size of output file per file format")
ax.xaxis.set_label_text("Output file size [Bytes]")
ax.yaxis.set_label_text("File format")

# Render explicitly, as the timing plot cell does, so the notebook does not
# echo the Text object returned by the last label call.
plt.show()