In [3]:
from pathlib import Path
from typing import List
import pandas as pd
from pandas import DataFrame
import seaborn as sns
import load_workload_data as ld
import load_benchmark_data as be
import matplotlib.pyplot as plt

In [4]:
# Init
sns.set()
sns.set(rc={'figure.figsize':(30,14)})
sns.set(font_scale=6.5)
plt.tight_layout()
workloads: Path = Path("../data/workloads/")

def plot_stacked(data: DataFrame, x: str, y: str, hue: str, hue_order: List[str]=None, order: List[str]=None, color=None):
    df_plottable: DataFrame = data.groupby([x, hue])[y].sum().reset_index().pivot(columns=hue, index=x, values=y)
    if hue_order:
        assert sorted(hue_order) == sorted(list(df_plottable))
        df_plottable = df_plottable[hue_order]
    if order:
        df_plottable = df_plottable.reindex(order)
    df_plottable.plot(kind="bar", stacked=True, ax=plt.gca(), rot=0)
    plt.ylabel(y)

def plot_workload(workload_folder: Path):
    df = ld.get_workload_data(workload_folder)
    x_order: List[str] = sorted(df[ld.DATA_TYPE].drop_duplicates(), reverse=True)
    order: List[str] = ["TABLE_SCAN", "JOIN", "AGGREGATE", "PROJECTION"]
    order_filtered = [operator for operator in order if operator in list(df[ld.OPERATOR_TYPE])]
    plot_stacked(data=df, x=ld.DATA_TYPE, y=ld.RUNTIME_S, hue=ld.OPERATOR_TYPE, hue_order=order_filtered, order=x_order)

<Figure size 2160x1008 with 0 Axes>

In [5]:
import sys
from enum import Enum
from pathlib import Path
from typing import List, Tuple

import pandas as pd
from pandas import DataFrame

class Operator(str, Enum):
    SCAN = "table_scans",
    PROJECTION = "projections",
    AGGREGATE = "aggregates",
    JOIN = "joins"

RUNTIME_S = "Runtime (in s)"
COLUMN_TYPE = "Data Access"
QUERY_HASH = "QUERY_HASH"
OPERATOR_HASH = "OPERATOR_HASH"
DATA_TYPE = "Data Type"
TABLE_NAME = "TABLE_NAME"
COLUMN_NAME = "COLUMN_NAME"
OPERATOR_TYPE = "Operator"
WORKLOAD = "WORKLOAD"

BENCHMARKS: List[str] = ["CH-benCHmark", "Join Order Benchmark", "TPC-C", "TPC-DS", "TPC-H"]

def get_with_column_data_type(table: DataFrame, metadata: DataFrame) -> DataFrame:
    table[TABLE_NAME] = table[TABLE_NAME].astype(object)
    table[COLUMN_NAME] = table[COLUMN_NAME].astype(object)
    table_with_data_types = table.merge(metadata, how="left", on=[TABLE_NAME, COLUMN_NAME])
    return table_with_data_types

def get_grouped_by_operator_hash(table: DataFrame) -> DataFrame:
    grouped_by_operator_hash: DataFrame = table.groupby([QUERY_HASH, OPERATOR_HASH], as_index=False)[RUNTIME_S] \
        .agg(["count", "mean"])
    grouped_by_operator_hash[RUNTIME_S] = [mean / count for mean, count in zip(grouped_by_operator_hash["mean"], grouped_by_operator_hash["count"])]
    grouped_by_operator_hash = grouped_by_operator_hash.reset_index()

    table = table.drop(RUNTIME_S, axis=1)
    table = table.merge(grouped_by_operator_hash, on=[QUERY_HASH, OPERATOR_HASH])
    return table

def get_workload_data(workload_directory: Path) -> DataFrame:
    # Initialize
    metadata = pd.read_csv(workload_directory / "column_meta_data.csv", delimiter="|")
    metadata = metadata.rename(columns={"DATA_TYPE": DATA_TYPE})
    workload_name: str = workload_directory.name
    aggregated_data = []
    for operator in list(Operator):
        # print(f"Processing {operator}")

        # Get Dataframe
        table: DataFrame = pd.read_csv(workload_directory / f"{operator}.csv", delimiter="|")
        #print(table)
        table["RUNTIME_NS"] = [runtime / 1e9 for runtime in table["RUNTIME_NS"]]
        table = table.rename(columns={"RUNTIME_NS" : RUNTIME_S, "DATA_TYPE": DATA_TYPE,
                                      "COLUMN_TYPE": COLUMN_TYPE, "OPERATOR_TYPE": OPERATOR_TYPE})
        table[WORKLOAD] = [workload_name for i in range(len(table))]

        # Preprocess in case that we have a join (since both columns that we join on have the same type, we
        # select the left column as the "true" column)
        if operator is Operator.JOIN:
            table = table.rename({f"LEFT_COLUMN_NAME": COLUMN_NAME, f"LEFT_COLUMN_TYPE":COLUMN_TYPE,
                                 f"LEFT_TABLE_NAME":TABLE_NAME}, axis="columns")

        # Groupby to avoid having missleading results
        if operator is not Operator.SCAN:
            table = get_grouped_by_operator_hash(table)
        table = get_with_column_data_type(table, metadata)

        # Calculate Information
        #grouped_by_column_type: DataFrame = table.groupby([COLUMN_TYPE, DATA_TYPE, OPERATOR_TYPE, WORKLOAD], as_index=False)[RUNTIME_S].sum().reset_index()
        # 0: DATA, 1: REFERENCE
        aggregated_data.append((str(operator), table.groupby(COLUMN_TYPE)[COLUMN_TYPE].count()[0], table.groupby(COLUMN_TYPE)[COLUMN_TYPE].count()[1]))
    return aggregated_data



In [6]:
relative = []
for b in ["TPC-H", "TPC-DS", "CH-benCHmark", "TPC-C", "Join Order Benchmark"]:
    d = get_workload_data(workloads / b)
    data = sum([v[1] for v in d])
    ref = sum([v[2] for v in d])
    print(b, data, ref)
    relative.append(data / (data + ref))

TPC-H 14654 16564
TPC-DS 44150 82227
CH-benCHmark 45779 925270
TPC-C 2264 40729


IndexError: index 1 is out of bounds for axis 0 with size 1

In [7]:
sum([1-r for r in relative]) / 4

0.770359046216605

In [13]:
relative = []
for b in ["TPC-H", "TPC-DS", "CH-benCHmark", "TPC-C", "Join Order Benchmark"]:
    table: DataFrame = pd.read_csv(workloads / b / f"table_scans.csv", delimiter="|")
    #print(table)
    table["selectivity"] = table["OUTPUT_ROW_COUNT"] / table["INPUT_ROW_COUNT"]
    #break
    relative.append(table["selectivity"].mean())
sum(relative) / len(relative)

0.3116443658189082

[0.4073986681907011,
 0.23868022680140927,
 0.10919891562679623,
 0.1727336854886215,
 0.6302103329870131]