In [63]:
import itertools
import json
import os
import subprocess

import pandas as pd
from IPython.core.display import HTML
from matplotlib import pyplot as plt

In [44]:
# Relative path to the dataset file that is handed to the `cargo run`
# invocation further down in this notebook.
# NOTE(review): the file name suggests 200M uint64 OSM cell IDs from the SOSD
# benchmark data directory -- confirm.
DATA_PATH = "../SOSD/data/osm_cellids_200M_uint64"

In [45]:
# Layer types, grouped by the role they play when the configuration grid is
# built: "top only" and "specialty" layers are only ever used as the first
# layer, while "anywhere" layers are used as both first and second layer.
top_only_layers = ["radix", "bradix"]
anywhere_layers = ["linear", "cubic"]
specialty_top_layers = ["histogram", "loglinear", "normal", "lognormal"]

# Powers of two from 2**7 (128) through 2**19 (524288).
branching_factors = [1 << exponent for exponent in range(7, 20)]

In [46]:
# Every layer type tried as a top layer in the main grid: the top-only layers
# first, followed by the layers that can go anywhere.
all_top_layers = [*top_only_layers, *anywhere_layers]

In [47]:
# Stage 1: a coarse grid over the most likely configurations -- every general
# top layer paired with every bottom layer, at every other branching factor.
configs = []
for top, bot, bf in itertools.product(
    all_top_layers, anywhere_layers, branching_factors[::2]
):
    configs.append({"layers": f"{top},{bot}", "branching factor": bf})

# Stage 2: a few probes to see whether a specialty top layer would help.
# Histogram tops are only tried with three small branching factors; the other
# specialty tops sample every third branching factor of the main sweep.
for top in specialty_top_layers:
    probe_factors = [64, 128, 256] if top == "histogram" else branching_factors[::3]
    for bot, bf in itertools.product(anywhere_layers, probe_factors):
        configs.append({"layers": f"{top},{bot}", "branching factor": bf})



In [48]:
# Report the grid size, then persist the grid as step1.json, the file the
# cargo invocation receives through --param-grid.
print(f"Testing {len(configs)} initial configurations.")
with open("step1.json", "w") as grid_file:
    json.dump({"configs": configs}, fp=grid_file)
    

Testing 78 initial configurations.


In [50]:
# Run the cargo project over the step-1 parameter grid.
#
# The command is passed as an argument list (no shell), so DATA_PATH is handed
# to cargo verbatim rather than being re-parsed by a shell.  check=True makes a
# non-zero exit raise subprocess.CalledProcessError, so a failed run stops the
# notebook here instead of letting later cells read a stale results.json.
cmd = ["cargo", "run", "--release", DATA_PATH, "--param-grid", "step1.json"]

# Same Rust diagnostics settings as before, layered over the current environment.
rust_env = {**os.environ, "RUST_BACKTRACE": "1", "RUST_LOG": "trace"}

# Last expression of the cell: shows the exit status (0 on success).
subprocess.run(cmd, env=rust_env, check=True).returncode

0

In [53]:
# Load results.json (presumably written by the cargo run above -- confirm)
# straight into a DataFrame for analysis.
with open("results.json") as results_file:
    step1_results = pd.DataFrame(json.load(results_file))

In [65]:
# Render the complete step-1 results frame as an HTML table (to_html() emits
# every row, unlike the truncated default DataFrame repr).
results_table = HTML(step1_results.to_html())
display(results_table)

Unnamed: 0,layers,branching factor,average error,average error %,max error,max error %,size binary search,size linear search
0,"radix,linear",128,16777220.0,24.181279,48362559,24.181279,17536,16512
1,"radix,linear",512,2915224.0,4.406203,8812405,4.406203,69760,65664
2,"radix,linear",2048,1248769.0,5.211855,10423711,5.211855,278656,262272
3,"radix,linear",8192,267710.3,2.301554,4603109,2.301554,1114240,1048704
4,"radix,linear",32768,113133.0,1.234042,2468084,1.234042,4456576,4194432
5,"radix,linear",131072,49939.54,0.989075,1978150,0.989075,17825920,16777344
6,"radix,cubic",128,8388608.0,14.061114,28122228,14.061114,33920,32896
7,"radix,cubic",512,2336881.0,3.883072,7766143,3.883072,135296,131200
8,"radix,cubic",2048,1048576.0,3.054265,6108530,3.054265,540800,524416
9,"radix,cubic",8192,364816.2,1.783331,3566662,1.783331,2162816,2097280


In [59]:
# find Pareto efficient RMIs
def pareto_efficient_mask(results, objectives=("size linear search", "max error")):
    """Return a boolean array marking the Pareto-efficient rows of ``results``.

    All objectives are minimised.  A row is dominated when some other row is
    no worse on every objective AND strictly better on at least one.  Rows
    with identical objective values therefore do not eliminate each other
    (with a plain ``<=`` test on every objective, exact ties would knock each
    other out and both disappear from the front).

    Parameters
    ----------
    results : pandas.DataFrame
        Frame containing one column per objective.
    objectives : sequence of str
        Names of the columns to minimise.

    Returns
    -------
    numpy.ndarray of bool, one entry per row of ``results`` (positional, so a
    non-unique index is handled correctly).  True means "not dominated".
    """
    costs = results[list(objectives)].to_numpy()
    # Axis 0 = row being judged (i), axis 1 = potential dominator (j),
    # axis 2 = objective.  Broadcasting compares every pair of rows at once.
    judged = costs[:, None, :]
    rival = costs[None, :, :]
    no_worse = (rival <= judged).all(axis=2)
    strictly_better = (rival < judged).any(axis=2)
    # A row never dominates itself: it is not strictly better than itself.
    dominated = (no_worse & strictly_better).any(axis=1)
    return ~dominated


mask = pareto_efficient_mask(step1_results)
pareto = step1_results[mask]
pareto

Unnamed: 0,layers,branching factor,average error,average error %,max error,max error %,size binary search,size linear search
24,"linear,linear",128,2339913.0,5.746346,11492692,5.746346,17536,16512
25,"linear,linear",512,708401.6,1.965939,3931879,1.965939,69760,65664
56,"loglinear,linear",8192,262144.0,0.75799,1515980,0.75799,1114240,1048704
61,"loglinear,cubic",65536,70022.74,0.224099,448197,0.224099,17301632,16777344
62,"normal,linear",128,3185062.0,3.375293,6750586,3.375293,17600,16576
64,"normal,linear",8192,163349.1,0.475484,950969,0.475484,1114304,1048768
65,"normal,linear",65536,41358.16,0.308672,617344,0.308672,8913088,8388800
67,"normal,cubic",1024,734111.6,0.969581,1939162,0.969581,270528,262336
68,"normal,cubic",8192,146967.2,0.423521,847043,0.423521,2162880,2097344
69,"normal,cubic",65536,33400.31,0.133224,266449,0.133224,17301696,16777408
