In [1]:
import json

import pandas as pd
from IPython.display import HTML, display

In [54]:
class AlgorithmStep:
    """Record of one step of an algorithm run.

    Attributes:
        indexes: The indexes reported for this step (stored as given).
        number_of_indexes: How many indexes the step reports.
        step: Position of this step within the run.
        memory_consumption: Memory used by the indexes; divided by 1e9 for
            display, so presumably given in bytes (confirm against the CSV).
        workload_processing_cost: Total workload cost at this step.
    """

    def __init__(self, indexes, number_of_indexes, step, memory_consumption, workload_processing_cost):
        self.step = step
        self.indexes = indexes
        self.number_of_indexes = number_of_indexes
        self.memory_consumption = memory_consumption
        self.workload_processing_cost = workload_processing_cost

    def memory_consumption_gb(self):
        """Return the memory consumption divided by 1e9 as a two-decimal string."""
        gigabytes = self.memory_consumption / 1e9
        return format(gigabytes, ".2f")

    def __repr__(self):
        """Return a three-line summary: step header, cost and indexes."""
        header = f"At step: {self.step} with {self.number_of_indexes} indexes ({self.memory_consumption_gb()} GB):"
        cost_line = f"Cost: {self.workload_processing_cost}"
        index_line = f"Indexes: {self.indexes}"
        return "\n".join((header, cost_line, index_line))
        
def calculate_cost(columns, row):
    """Sum the 'Cost' values of every query column in a result row.

    A column counts as a query column when its label is a string starting
    with 'q'. Each such cell must hold a JSON object with a 'Cost' key.

    Args:
        columns: Iterable of column labels to inspect.
        row: Mapping (e.g. a pandas Series or dict) from label to cell value.

    Returns:
        The summed cost; 0 when no query column is present.

    Raises:
        KeyError: If a query column is missing from row or its JSON has no 'Cost'.
        json.JSONDecodeError: If a query cell is not valid JSON.
    """
    cost = 0
    for column in columns:
        # Type-checked startswith instead of column[0]: empty or non-string
        # labels are skipped rather than raising IndexError/TypeError.
        if isinstance(column, str) and column.startswith('q'):
            cost += json.loads(row[column])['Cost']

    return cost

def shorten_tbl_name(index_name):
    """Abbreviate the table-name prefixes inside an index name.

    Every occurrence of a listed '<table>.' substring is replaced by its short
    form; the replacements run in the listed order. Text that contains none of
    the listed substrings is returned unchanged.
    """
    abbreviations = (
        ('nation.', 'n.'),
        ('customer.', 'c.'),
        ('lineitem.', 'l.'),
        ('partsupp.', 'ps.'),
        ('orders.', 'o.'),
        ('supplier.', 's.'),
    )
    shortened = index_name
    for long_prefix, short_prefix in abbreviations:
        shortened = shortened.replace(long_prefix, short_prefix)
    return shortened

def cut_index_string(indexes):
    """Split a combined index string into a list of shortened index names.

    The first and last character of the input are dropped, the remainder is
    split on 'I(' and each piece is cut at its first ')'. Empty pieces are
    skipped. Every 'C ' is removed from a piece before it is wrapped in
    square brackets and passed through shorten_tbl_name.

    Returns:
        A list of strings such as '[l.l_partkey,l.l_suppkey]', in input order.
    """
    inner = indexes[1:-1]
    # Text between an 'I(' and the next ')' is the column list of one index.
    column_lists = (chunk.split(')')[0] for chunk in inner.split('I('))
    return [
        shorten_tbl_name(f"[{columns.replace('C ', '')}]")
        for columns in column_lists
        if columns != ''
    ]



In [55]:
# Read the semicolon-separated "no_index" result file and total the query
# costs of its first row; kept in initial_cost as the reference value.
df = pd.read_csv('tpch/results_no_index_22_queries.csv', sep=';')
initial_cost = calculate_cost(df.columns, df.iloc[0])

In [56]:
# Algorithms whose result files are loaded, in alphabetical order.
algorithm_names = sorted(['dexter', 'epic', 'microsoft', 'ibm', 'drop_heuristic'])

# Maps algorithm name -> list of AlgorithmStep, one per row of its result file.
algorithm_steps = {}

for algorithm_name in algorithm_names:
    path = f'tpch/results_{algorithm_name}_22_queries.csv'
    df = pd.read_csv(path, sep=';')
    # The row label from iterrows() doubles as the step number.
    algorithm_steps[algorithm_name] = [
        AlgorithmStep(
            indexes=row['indexed columns'],
            number_of_indexes=row['#indexes'],
            step=step,
            memory_consumption=row['memory consumption'],
            workload_processing_cost=calculate_cost(df.columns, row),
        )
        for step, row in df.iterrows()
    ]

#     print(algorithm_name)
#     for step in algorithm_steps[algorithm_name]:
#         print(step)
#         print(f'Cost relative to no indexes: {step.workload_processing_cost / initial_cost * 100}%')
#         print()
#     print()
#     print()
#     print()

In [71]:
# Build the per-step comparison table: for every step there is one row per
# metric, and one column per algorithm. '-' marks "no value" (the algorithm
# has fewer steps, or the step added no new index).
pd.set_option('display.max_colwidth', 500)

metrics = ['New Indexes', 'Workload Cost', 'Size (GB)']
table_columns = ['Step #', 'Metric'] + algorithm_names
most_steps = max((len(steps) for steps in algorithm_steps.values()), default=0)
# Indexes already reported per algorithm, so each step lists only new ones.
already_seen_indexes_per_algo = {algorithm_name: set() for algorithm_name in algorithm_names}

# Collect plain rows and create the frame once; DataFrame.append was removed
# in pandas 2.0 and growing a frame row by row is quadratic.
table_rows = []
for step in range(most_steps):
    cells_by_metric = {metric: {} for metric in metrics}
    for algorithm_name in algorithm_names:
        steps_of_algorithm = algorithm_steps[algorithm_name]
        if step >= len(steps_of_algorithm):
            continue  # this algorithm finished earlier; its cells stay '-'
        algorithm_step = steps_of_algorithm[step]

        indexes = set(cut_index_string(algorithm_step.indexes))
        new_indexes = indexes - already_seen_indexes_per_algo[algorithm_name]
        already_seen_indexes_per_algo[algorithm_name] |= new_indexes

        # sorted() makes the listing deterministic; set order varies per run.
        cells_by_metric['New Indexes'][algorithm_name] = '\n'.join(sorted(new_indexes)) if new_indexes else '-'
        cells_by_metric['Workload Cost'][algorithm_name] = algorithm_step.workload_processing_cost
        cells_by_metric['Size (GB)'][algorithm_name] = algorithm_step.memory_consumption_gb()

    for idx, metric in enumerate(metrics):
        # Only the first row of a step carries the 1-based step number.
        step_label = step + 1 if idx == 0 else ''
        values = [cells_by_metric[metric].get(algorithm_name, '-') for algorithm_name in algorithm_names]
        table_rows.append([step_label, metric] + values)

df = pd.DataFrame(table_rows, columns=table_columns)
# dexter is left out of the displayed table (its steps still count towards
# most_steps above).
df = df.drop(columns=['dexter'])


def pretty_print(df):
    """Display df as an HTML table with embedded newlines shown as <br>.

    to_html() emits the newline inside a cell as the two characters
    backslash + 'n', which is what gets replaced here. display and HTML are
    the IPython.display helpers. Returns whatever display() returns.
    """
    return display(HTML(df.to_html().replace("\\n", "<br>")))


# pretty_print renders the table as a side effect; new_df only holds
# display()'s return value.
new_df = pretty_print(df)

Unnamed: 0,Step #,Metric,drop_heuristic,epic,ibm,microsoft
0,1.0,New Indexes,[l.l_partkey],-,"[s.s_suppkey,s.s_comment,s.s_nationkey] [n.n_nationkey,n.n_name,n.n_regionkey] [n.n_nationkey,n.n_regionkey] [s.s_nationkey,s.s_suppkey] [s.s_suppkey,s.s_name,s.s_nationkey] [c.c_custkey] [s.s_name] [part.p_size] [c.c_nationkey,c.c_custkey] [s.s_suppkey] [n.n_nationkey,n.n_name] [n.n_name,n.n_nationkey,n.n_regionkey] [s.s_suppkey,s.s_nationkey]","[l.l_partkey,l.l_suppkey]"
1,,Workload Cost,1.09947e+09,1.85989e+13,1.8589e+13,9.94156e+07
2,,Size (GB),1.57,0.00,0.19,2.51
3,2.0,New Indexes,[ps.ps_partkey],[ps.ps_suppkey],"[c.c_mktsegment,c.c_custkey] [part.p_partkey,part.p_brand,part.p_container] [c.c_custkey,c.c_nationkey]","[ps.ps_suppkey,ps.ps_partkey,ps.ps_supplycost]"
4,,Workload Cost,7.80037e+07,7.5342e+11,1.85933e+13,3.395e+07
5,,Size (GB),1.78,0.21,0.39,3.07
6,3.0,New Indexes,[l.l_suppkey] [ps.ps_suppkey],-,"[part.p_brand,part.p_partkey,part.p_container] [part.p_type,part.p_partkey] [c.c_acctbal,c.c_custkey,c.c_phone]","[l.l_suppkey,l.l_orderkey,l.l_tax]"
7,,Workload Cost,4.03288e+07,7.4411e+11,1.863e+13,3.30442e+07
8,,Size (GB),3.34,0.21,0.57,7.15
9,4.0,New Indexes,[l.l_orderkey],-,"[l.l_partkey,l.l_quantity,l.l_extendedprice] [l.l_partkey,l.l_extendedprice,l.l_quantity]","[l.l_orderkey,l.l_returnflag] [l.l_orderkey,l.l_quantity]"


In [74]:
# Render the table to HTML without the index column, strip real newline
# characters, and pass the result to html2markdown.convert; the bare last
# expression shows the returned string as the cell output.
import html2markdown
html2markdown.convert(df.to_html(index=False).replace('\n', ''))

'<table border="1" class="dataframe"> <thead> <tr style="text-align: right;"> <th>Step #</th> <th>Metric</th> <th>drop_heuristic</th> <th>epic</th> <th>ibm</th> <th>microsoft</th> </tr> </thead> <tbody> <tr> <td>1</td> <td>New Indexes</td> <td>[l.l_partkey]</td> <td>-</td> <td>[s.s_suppkey,s.s_comment,s.s_nationkey]\\n[n.n_nationkey,n.n_name,n.n_regionkey]\\n[n.n_nationkey,n.n_regionkey]\\n[s.s_nationkey,s.s_suppkey]\\n[s.s_suppkey,s.s_name,s.s_nationkey]\\n[c.c_custkey]\\n[s.s_name]\\n[part.p_size]\\n[c.c_nationkey,c.c_custkey]\\n[s.s_suppkey]\\n[n.n_nationkey,n.n_name]\\n[n.n_name,n.n_nationkey,n.n_regionkey]\\n[s.s_suppkey,s.s_nationkey]</td> <td>[l.l_partkey,l.l_suppkey]</td> </tr> <tr> <td></td> <td>Workload Cost</td> <td>1.09947e+09</td> <td>1.85989e+13</td> <td>1.8589e+13</td> <td>9.94156e+07</td> </tr> <tr> <td></td> <td>Size (GB)</td> <td>1.57</td> <td>0.00</td> <td>0.19</td> <td>2.51</td> </tr> <tr> <td>2</td> <td>New Indexes</td> <td>[ps.ps_partkey]</td> <td>[ps.ps_suppkey]<

In [73]:
# Print the table as LaTeX source, without the index column and with LaTeX
# special characters escaped (e.g. '#' and '_' in the header).
print(df.to_latex(index=False,escape=True))

\begin{tabular}{llllll}
\toprule
Step \# &         Metric &                                                          drop\_heuristic &                                                                                                                            epic &                                                                                                                                                                                                                                                                                                                                                                     ibm &                                                                                                                                                                           microsoft \\
\midrule
     1 &    New Indexes &                                                           [l.l\_partkey] &                                                                                 