# 2. Data Plotting Notebook

This notebook is dedicated to visualizing the data obtained from performance tests. It provides an interactive and visual representation of the execution time, enabling in-depth analysis and comparison of different scenarios.

In [1]:
import pandas as pd
import plotly.graph_objects as go
import utils

Read the statistics DataFrames from the CSV files:

In [2]:
# Load the "small" and "large" timing-statistics CSVs, in that order.
time_statistics_small, time_statistics_large = map(
    pd.read_csv,
    ("data/time_statistics_small.csv", "data/time_statistics_large.csv"),
)

#### Plots for the Analysis:

In [3]:
# "Write to CSV" timings for the small and the large statistics frame.
# The figures were previously bound to `small_read` / `large_read`, although
# both the title and the "write" key refer to the write benchmark; the names
# now match what is actually plotted.
small_write, large_write = (
    utils.plot_statistic("Write to CSV", stats, "write")
    for stats in (time_statistics_small, time_statistics_large)
)

small_write.show()
large_write.show()

In [4]:
# "Drop NaN Values" timings: build one figure per statistics frame
# (small first, then large) and display them in the same order.
small_drop, large_drop = (
    utils.plot_statistic("Drop NaN Values", stats, "drop_na")
    for stats in (time_statistics_small, time_statistics_large)
)

small_drop.show()
large_drop.show()

In [5]:
# "Fill NaN Values" timings: build one figure per statistics frame
# (small first, then large) and display them in the same order.
small_fillna, large_fillna = (
    utils.plot_statistic("Fill NaN Values", stats, "fill_na")
    for stats in (time_statistics_small, time_statistics_large)
)

small_fillna.show()
large_fillna.show()

In [6]:
# "GroupBy" timings: build one figure per statistics frame
# (small first, then large) and display them in the same order.
small_group, large_group = (
    utils.plot_statistic("GroupBy", stats, "group")
    for stats in (time_statistics_small, time_statistics_large)
)

small_group.show()
large_group.show()

In [7]:
# "GroupBy and Sum" timings: build one figure per statistics frame
# (small first, then large) and display them in the same order.
small_group_sum, large_group_sum = (
    utils.plot_statistic("GroupBy and Sum", stats, "group_sum")
    for stats in (time_statistics_small, time_statistics_large)
)

small_group_sum.show()
large_group_sum.show()

In [8]:
# "GroupBy and Count" timings: build one figure per statistics frame
# (small first, then large) and display them in the same order.
small_group_count, large_group_count = (
    utils.plot_statistic("GroupBy and Count", stats, "group_count")
    for stats in (time_statistics_small, time_statistics_large)
)

small_group_count.show()
large_group_count.show()

In [9]:
# "Filter by Value (less than 0)" timings: build one figure per statistics
# frame (small first, then large) and display them in the same order.
small_filter_0, large_filter_0 = (
    utils.plot_statistic("Filter by Value (less than 0)", stats, "filter_less_0")
    for stats in (time_statistics_small, time_statistics_large)
)

small_filter_0.show()
large_filter_0.show()

In [10]:
# "Filter by Value (less than 10)" timings: build one figure per statistics
# frame (small first, then large) and display them in the same order.
small_filter_10, large_filter_10 = (
    utils.plot_statistic("Filter by Value (less than 10)", stats, "filter_less_10")
    for stats in (time_statistics_small, time_statistics_large)
)

small_filter_10.show()
large_filter_10.show()

In [11]:
# "Join" timings: build one figure per statistics frame
# (small first, then large) and display them in the same order.
small_join, large_join = (
    utils.plot_statistic("Join", stats, "join")
    for stats in (time_statistics_small, time_statistics_large)
)

small_join.show()
large_join.show()

In [12]:
# "Multiplication (Build-In Functions)" timings: build one figure per
# statistics frame (small first, then large) and display them in the same order.
small_mul_build_in, large_mul_build_in = (
    utils.plot_statistic("Multiplication (Build-In Functions)", stats, "mul_build")
    for stats in (time_statistics_small, time_statistics_large)
)

small_mul_build_in.show()
large_mul_build_in.show()

In [13]:
# "Multiplication (Column Selection)" timings: build one figure per
# statistics frame (small first, then large) and display them in the same order.
small_mul_col, large_mul_col = (
    utils.plot_statistic("Multiplication (Column Selection)", stats, "mul_col")
    for stats in (time_statistics_small, time_statistics_large)
)

small_mul_col.show()
large_mul_col.show()

The conversion timings below are plotted with a different implementation, because their column names (`pd_to_pyspark`, `pyspark_to_pd`) do not follow the schema of the other statistics.

In [14]:
def _conversion_figure(stats):
    """Build the DataFrame-conversion time comparison figure for one statistics frame.

    Args:
        stats: DataFrame providing the columns "row_count", "pd_to_pyspark"
            and "pyspark_to_pd".

    Returns:
        A plotly ``go.Figure`` with one line per conversion direction,
        number of rows on the x-axis and the converted timings on the y-axis.
    """
    fig = go.Figure()

    # One trace per conversion direction: (column in `stats`, legend label).
    for column, label in (
        ("pd_to_pyspark", "Pandas to PySpark"),
        ("pyspark_to_pd", "PySpark to Pandas"),
    ):
        fig.add_trace(
            go.Scatter(
                x=stats["row_count"],
                # presumably converts the stored timing value into seconds
                # (the axis is labelled "Seconds") - verify in utils
                y=stats[column].apply(utils.calculate_seconds),
                mode="lines",
                name=label,
                hovertemplate="Rows: %{x}<br>Seconds: %{y}",
            )
        )

    fig.update_layout(
        title="Convert Dataframe - Time Comparison",
        xaxis_title="Number of Rows",
        yaxis_title="Seconds",
    )
    return fig


# Same chart for both statistics frames: small first, then large.
fig1 = _conversion_figure(time_statistics_small)
fig2 = _conversion_figure(time_statistics_large)

fig1.show()
fig2.show()