In [39]:
from typing import List, Dict
import pandas as pd
from pathlib import Path
from pandas import DataFrame
import matplotlib.pyplot as plt

In [82]:
# Directory that holds the size dumps loaded below; the path is relative,
# so it is resolved against the kernel's current working directory.
data_path = Path("../data/sizes")

def _split_row(line: str) -> List[str]:
    """Split one ``|``-delimited table line into whitespace-trimmed cells."""
    row = line.strip()
    # Drop only the outer border pipes so that empty cells in the middle of a
    # row are kept and the remaining cells stay aligned with their columns.
    if row.startswith("|"):
        row = row[1:]
    if row.endswith("|"):
        row = row[:-1]
    return [cell.strip() for cell in row.split("|")]


def load_table(path: Path) -> DataFrame:
    """Parse a ``|``-delimited text table into a typed DataFrame.

    The file is read line by line:

    * lines without ``|`` are ignored until the data section has started;
    * the first line containing ``|`` supplies the column names;
    * the second such line supplies one dtype name per column, which is
      passed to ``DataFrame.astype``;
    * further ``|`` lines are skipped until one contains ``<ValueS>``; that
      marker line itself is skipped as well;
    * every following ``|`` line is a data row (lines containing
      ``<ValueS>`` are always skipped);
    * the first line without ``|`` after the marker ends the table, so
      anything after it is not read.

    Cells are trimmed of surrounding whitespace only: spaces inside a value
    and empty cells are preserved.

    Args:
        path: Location of the text file to parse.

    Returns:
        A DataFrame with one row per data line. Columns are cast to the
        dtypes named in the type row; if that row is missing or shorter than
        the header, the remaining columns keep their string values.
    """
    column_names: List[str] = []
    types: List[str] = []
    all_entries: List[List[str]] = []
    table_starts: bool = False

    with path.open() as file:
        for line in file:
            if "|" not in line:
                if table_starts:
                    # First non-table line after the data rows: table is over.
                    break
                continue
            if not column_names:
                column_names = _split_row(line)
                continue
            if not types:
                types = _split_row(line)
                continue
            if "<ValueS>" in line:
                # Marker row that precedes the data rows.
                table_starts = True
                continue
            if not table_starts:
                # Additional header rows between the type row and the marker.
                continue
            all_entries.append(_split_row(line))

    # zip() pairs names and dtypes without indexing, so a missing or short
    # type row no longer raises IndexError.
    as_type: Dict[str, str] = dict(zip(column_names, types))
    return DataFrame(data=all_entries, columns=column_names).astype(as_type)

# Load one size table per dump file, in a fixed order that matches the
# names on the left-hand side.
simdcai, dictionary, turbopfor, bitpacking = (
    load_table(data_path / file_name)
    for file_name in (
        "sizes_SIMDCAI.txt",
        "sizes_Dictionary.txt",
        "sizes_turboPFOR.txt",
        "sizes_bitpacking_turbopfor.txt",
    )
)

In [89]:
# Sum size_in_bytes over all rows of the dictionary table that share the same
# table name, column name and data type.
grouped_by_column: DataFrame = (
    dictionary
    .groupby(
        ["table_name", "column_name", "column_data_type"],
        as_index=False,
    )["size_in_bytes"]
    .sum()
)

# Number of grouped columns per data type.
columns_per_data_type: DataFrame = (
    grouped_by_column
    .groupby(["column_data_type"], as_index=False)["column_name"]
    .count()
)

# Total bytes per data type.
size_per_data_type: DataFrame = (
    grouped_by_column
    .groupby(["column_data_type"], as_index=False)["size_in_bytes"]
    .sum()
)

columns_per_data_type

Unnamed: 0,column_data_type,column_name
0,float,9
1,int,19
2,string,33


In [43]:
# Boolean mask / selection: rows of `simdcai` whose encoding_type contains "SIMDCAI".
cols_with_simdcai_flags = simdcai["encoding_type"].str.match(".*SIMDCAI.*")
cols_with_simdcai = simdcai.loc[cols_with_simdcai_flags]

# Boolean mask / selection: rows of `turbopfor` whose encoding_type contains "Turbo".
cols_with_tp_flags = turbopfor["encoding_type"].str.match(".*Turbo.*")
cols_with_tp = turbopfor.loc[cols_with_tp_flags]


Unnamed: 0,table_name,chunk_id,column_id,column_name,column_data_type,distinct_value_count,encoding_type,vector_compression_type,size_in_bytes,point_accesses,sequential_accesses,monotonic_accesses,random_accesses,dictionary_accesses
0,nation,0,0,n_nationkey,int,25,SIMDCAI,,193,400,9175,3000,800,0
2,nation,0,2,n_regionkey,int,5,SIMDCAI,,177,0,10075,0,0,0
4,part,0,0,p_partkey,int,65535,SIMDCAI,,131233,0,6750105,2482034,258395,0
9,part,0,5,p_size,int,50,SIMDCAI,,49313,72,6750105,10049237,825280,0
13,part,1,0,p_partkey,int,65535,SIMDCAI,,131233,0,6750105,2481940,258689,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1799,customer,1,0,c_custkey,int,65535,SIMDCAI,,131233,0,25820790,4503907,80162965,0
1802,customer,1,3,c_nationkey,int,25,SIMDCAI,,41121,0,19857105,523839,7965826,0
1807,customer,2,0,c_custkey,int,18930,SIMDCAI,,35665,0,7458420,1300282,23140502,0
1810,customer,2,3,c_nationkey,int,25,SIMDCAI,,12001,0,5735790,150846,2303309,0


In [50]:
# Select rows of the dictionary table with the masks that were computed on
# `simdcai` and `turbopfor`.
# NOTE(review): this presumes all three tables list the same rows in the same
# order (identical index) -- confirm against the dump files.
cols_with_simdcai_dict_encoded = dictionary.loc[cols_with_simdcai_flags]
cols_with_tp_dict_encoded = dictionary.loc[cols_with_tp_flags]

cols_with_simdcai_dict_encoded

Unnamed: 0,table_name,chunk_id,column_id,column_name,column_data_type,distinct_value_count,encoding_type,vector_compression_type,size_in_bytes,point_accesses,sequential_accesses,monotonic_accesses,random_accesses,dictionary_accesses
0,nation,0,0,n_nationkey,int,25,Dictionary,FixedSize1ByteAligned,221,400,9125,3000,800,12525
2,nation,0,2,n_regionkey,int,5,Dictionary,FixedSize1ByteAligned,141,0,10025,0,0,10025
4,part,0,0,p_partkey,int,65535,Dictionary,FixedSize2ByteAligned,393306,0,6619035,2480795,255023,9099830
9,part,0,5,p_size,int,50,Dictionary,FixedSize1ByteAligned,65831,61,6619035,10048311,825429,10044683
13,part,1,0,p_partkey,int,65535,Dictionary,FixedSize2ByteAligned,393306,0,6619035,2482728,264350,9101763
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1799,customer,1,0,c_custkey,int,65535,Dictionary,FixedSize2ByteAligned,393306,0,26279535,4503123,80182496,30782658
1802,customer,1,3,c_nationkey,int,25,Dictionary,FixedSize1ByteAligned,65731,0,19726035,524141,7960469,20250176
1807,customer,2,0,c_custkey,int,18930,Dictionary,FixedSize2ByteAligned,113676,0,7590930,1301100,23145473,8892030
1810,customer,2,3,c_nationkey,int,25,Dictionary,FixedSize1ByteAligned,19126,0,5697930,151100,2300494,5849030


In [45]:
# Total bytes of each selection.
size1 = cols_with_simdcai["size_in_bytes"].sum()               # rows selected from simdcai
size2 = cols_with_simdcai_dict_encoded["size_in_bytes"].sum()  # dictionary rows picked by the SIMDCAI mask
size3 = cols_with_tp["size_in_bytes"].sum()                    # rows selected from turbopfor

# Both ratios use size2 as the denominator.
# NOTE(review): size3 is divided by the SIMDCAI-selected dictionary rows, not by
# cols_with_tp_dict_encoded -- confirm that both masks select the same rows.
print(size1 / size2, size3 / size2, sep="\n")

56486271
109498396
56637286
0.5158639127462652
0.5172430653687383


In [46]:
# Rows of the dictionary table whose data type label contains "int".
int_cols = dictionary.loc[dictionary["column_data_type"].str.match(".*int.*")]

# Share of the dictionary table's total bytes taken by those rows.
(int_cols["size_in_bytes"].sum() / dictionary["size_in_bytes"].sum())

0.11351794355486276

In [47]:
# Distinct values of column_data_type in the dictionary table.
dictionary["column_data_type"].unique()

<StringArray>
['             int', '          string', '           float']
Length: 3, dtype: string

In [48]:
# Total size_in_bytes of `simdcai` divided by that of `dictionary`, over all rows.
simdcai["size_in_bytes"].sum() / dictionary["size_in_bytes"].sum()

0.9450418669742584

In [49]:
# Total size_in_bytes of `turbopfor` divided by that of `dictionary`, over all rows.
turbopfor["size_in_bytes"].sum() / dictionary["size_in_bytes"].sum()

0.9451984255438098