In [6]:
import itertools
import json
import os

import numpy as np
import pandas as pd
from IPython.display import display, Markdown, Latex

# Maps each optimizer-output CSV file name to the dataset name that is used
# both as the namespace prefix of the generated configs and as the stem of
# the JSON spec file. Insertion order is the order the datasets are processed.
DATASETS = dict(
    [
        ("fb.csv", "fb_200M_uint64"),
        ("wiki.csv", "wiki_ts_200M_uint64"),
        ("osm.csv", "osm_cellids_200M_uint64"),
        ("osm400.csv", "osm_cellids_400M_uint64"),
        ("osm600.csv", "osm_cellids_600M_uint64"),
        ("osm800.csv", "osm_cellids_800M_uint64"),
        ("books.csv", "books_200M_uint64"),
        ("books400.csv", "books_400M_uint64"),
        ("books600.csv", "books_600M_uint64"),
        ("books800.csv", "books_800M_uint64"),
    ]
)

In [17]:

def _is_redundant(candidate, other, error_tol=0.001, size_tol=1000):
    """Tell whether `other` makes `candidate` redundant on the front.

    Both arguments are row dicts. `candidate` is redundant when `other` is a
    different configuration (different layers or branching factor) and either
    has (almost) the same log2 error while being strictly smaller, or has
    (almost) the same size while having strictly lower log2 error.
    """
    error_col = "average log2 error"
    size_col = "size binary search"

    # A configuration never competes against itself.
    if (candidate["layers"] == other["layers"]
            and candidate["branching factor"] == other["branching factor"]):
        return False

    same_error = abs(candidate[error_col] - other[error_col]) < error_tol
    if same_error and candidate[size_col] > other[size_col]:
        return True

    same_size = abs(candidate[size_col] - other[size_col]) < size_tol
    if same_size and candidate[error_col] > other[error_col]:
        return True

    return False


def produce_front(opt_out_fn, debug=False, max_log2_error=15,
                  max_size=2_000_000_000, num_configs=10):
    """Pick a fixed number of configurations from an optimizer output CSV.

    Steps:
      1. Read the CSV and keep rows whose "average log2 error" is below
         `max_log2_error` and whose "size binary search" is below `max_size`.
      2. Keep only rows whose boolean "front" column is set.
      3. Drop rows made redundant by another row (see `_is_redundant`).
      4. Order the survivors by size (largest first) and compute, for each
         row, the size difference to the next-larger row ("diff"). The
         largest row gets a huge negative sentinel so it is always selected.
      5. Select the `num_configs` rows with the most negative "diff", i.e.
         the ones separated from their larger neighbour by the biggest gap.
      6. If fewer than `num_configs` rows are available, repeat the smallest
         selected row until there are exactly `num_configs`.

    Parameters
    ----------
    opt_out_fn : str or path-like
        CSV to read. Its base name must be a key of `DATASETS`.
    debug : bool
        When true, display the intermediate tables.
    max_log2_error : float
        Exclusive upper bound on "average log2 error" (default 15).
    max_size : int
        Exclusive upper bound on "size binary search" (default 2000000000).
    num_configs : int
        Number of configurations to return (default 10).

    Returns
    -------
    dict
        `{"configs": [...]}` where each entry has the keys "layers",
        "branching factor", "namespace" and "binary". Namespaces are
        `<dataset name>_<position>`.

    Raises
    ------
    KeyError
        If the base name of `opt_out_fn` is not in `DATASETS`.
    ValueError
        If `num_configs` is smaller than 1, or no configuration survives
        filtering and pruning.
    """
    error_col = "average log2 error"
    size_col = "size binary search"

    if num_configs < 1:
        raise ValueError(f"num_configs must be at least 1, got {num_configs}")

    # Resolve the namespace prefix up front so an unknown file fails before
    # any work is done; the base name is used so full paths are accepted too.
    dataset_name = DATASETS[os.path.basename(opt_out_fn)]

    df = pd.read_csv(opt_out_fn)
    within_limits = (df[error_col] < max_log2_error) & (df[size_col] < max_size)
    to_search = df[within_limits].sort_values(error_col).copy()
    if debug:
        display(Markdown(f"# Top 20 overall `{opt_out_fn}`"))
        display(to_search.iloc[0:20])

    # `to_search` already satisfies the error/size limits, so only the
    # "front" flag needs checking here.
    front = to_search[to_search["front"]].copy()
    if debug:
        display(Markdown(f"# Pareto front `{opt_out_fn}`"))
        display(front)
    front_records = front.to_dict("records")

    retained_records = [
        candidate for candidate in front_records
        if not any(_is_redundant(candidate, other) for other in front_records)
    ]
    if not retained_records:
        raise ValueError(
            f"no configuration in {opt_out_fn!r} satisfies the error/size limits "
            "and is flagged as part of the front"
        )

    retained = pd.DataFrame(retained_records).sort_values(error_col)
    if debug:
        display(Markdown(f"# Pruned `{opt_out_fn}`"))
        display(retained)

    # The gaps must be attached to the rows in the same (size-descending)
    # order in which they were computed, otherwise they end up on the wrong
    # rows whenever the error order and the size order disagree.
    by_size = retained.sort_values(size_col, ascending=False)
    by_size["diff"] = [-1_000_000_000_000] + list(np.diff(by_size[size_col].to_numpy()))

    selected = by_size.sort_values("diff", ascending=True).iloc[:num_configs]
    selected = selected.sort_values(size_col, ascending=False)
    display(Markdown(f"# Final configs `{opt_out_fn}`"))

    # Pad with copies of the smallest selected row. Slicing with a list keeps
    # it a one-row DataFrame so the column dtypes survive the concatenation.
    missing = num_configs - len(selected)
    padding = [selected.iloc[[-1]]] * max(missing, 0)
    configs = pd.concat([selected] + padding, ignore_index=True).reset_index()
    display(configs)

    configs["namespace"] = dataset_name + "_" + configs["index"].astype(str)
    spec = {
        "configs": configs[["layers", "branching factor", "namespace", "binary"]].to_dict("records")
    }

    return spec

In [18]:
# Directory that receives one `<dataset name>.json` spec per entry of DATASETS.
# Set the RMI_SPEC_DIR environment variable to write somewhere else; the
# default is the location this notebook has always written to.
RMI_SPEC_DIR = os.environ.get("RMI_SPEC_DIR", "/home/ryan/mcgraw/SOSD-private/scripts/rmi_specs")
# Create the directory up front so a bad location fails before any CSV is processed.
os.makedirs(RMI_SPEC_DIR, exist_ok=True)

for csv_name, dataset_name in DATASETS.items():
    spec = produce_front(csv_name)
    # produce_front pads its result, so anything other than ten entries is a bug.
    assert len(spec["configs"]) == 10, f"{csv_name}: expected 10 configs, got {len(spec['configs'])}"
    with open(os.path.join(RMI_SPEC_DIR, f"{dataset_name}.json"), "w") as spec_file:
        json.dump(spec, spec_file)

# Final configs `fb.csv`

Unnamed: 0,index,layers,branching factor,average error,average error %,average l2 error,average log2 error,max error,max error %,size binary search,size linear search,namespace,binary,star,front,diff
0,0,"linear_spline,linear",2097152,127.140537,0.001914,42267170.0,7.016945,3828,0.001914,50331664,33554448,nm77680171b4d11b70aa2f,True,False,True,-1000000000000
1,1,"linear_spline,linear",1048576,183.943797,0.00255,91238600.0,7.723291,5099,0.00255,25165840,16777232,nm9d7314ecfb0606fea357,True,False,True,-25165824
2,2,"linear_spline,linear",524288,266.117007,0.002589,208736300.0,8.428844,5178,0.002589,12582928,8388624,nm14697a215f79906b7b7d,True,False,True,-12582912
3,3,"linear_spline,linear",262144,383.549167,0.002908,503768900.0,9.120617,5816,0.002908,6291472,4194320,nmd985905d60efa476204b,True,False,True,-6291456
4,4,"linear,linear",131072,554.474509,0.005482,1338033000.0,9.793821,10964,0.005482,3145744,2097168,nm4ba04214fe8ae1c6a0ef,True,False,True,-3145728
5,5,"linear_spline,linear",65536,799.587735,0.003387,3846815000.0,10.430103,6774,0.003387,1572880,1048592,nm6d2f6a0dda092b70154a,True,False,True,-1572864
6,6,"linear_spline,linear",32768,1151.26079,0.004347,12281710000.0,11.030974,8695,0.004347,786448,524304,nm238bd4f690b492c7d5e8,True,False,True,-786432
7,7,"linear_spline,linear",16384,1664.141578,0.004609,43463240000.0,11.609133,9218,0.004609,393232,262160,nmc64eb7f62650bbe7dfff,True,False,True,-393216
8,8,"linear_spline,linear",8192,2413.632627,0.004276,166637700000.0,12.167599,8551,0.004276,196624,131088,nmaac4a789ceba69493dce,True,False,True,-196608
9,9,"linear_spline,linear",4096,3499.589229,0.004722,668854100000.0,12.716431,9443,0.004722,98320,65552,nm6502100e724c28edaaf0,True,False,True,-98304


# Final configs `wiki.csv`

Unnamed: 0,index,layers,branching factor,average error,average error %,average l2 error,average log2 error,max error,max error %,size binary search,size linear search,namespace,binary,star,front,diff
0,0,"cubic,linear",2097152,6.843935,0.000971,8338808.0,3.858576,1941,0.000971,50331680,33554464,nm64b6dda5b888f4561122,True,False,True,-1000000000000
1,1,"cubic,linear",1048576,12.569999,0.007824,1693544000.0,4.331367,15648,0.007824,25165856,16777248,nm736bee190651c4a0cd4d,True,False,True,-25165824
2,2,"cubic,linear",524288,17.34654,0.007824,1693564000.0,4.8498,15647,0.007824,12582944,8388640,nm0ef962d126b549515ff0,True,False,True,-12582912
3,3,"cubic,linear",262144,37.36073,0.0397,43689400000.0,5.437125,79401,0.0397,6291488,4194336,nm09179faa51abc737f7e4,True,False,True,-6291456
4,4,"cubic,linear",131072,56.880235,0.052127,75550890000.0,6.110941,104254,0.052127,3145760,2097184,nm9d512a66510b9ac033ad,True,False,True,-3145728
5,5,"cubic,linear",65536,86.800784,0.051725,74800060000.0,6.897752,103450,0.051725,1572896,1048608,nm5977238fee1957ddfb0c,True,False,True,-1572864
6,6,"cubic,linear",32768,144.203666,0.028193,22770540000.0,7.83872,56387,0.028193,786464,524320,nmdf834f1da40372581102,True,False,True,-786432
7,7,"cubic,linear",16384,322.582491,0.016004,10417200000.0,9.042062,32008,0.016004,393248,262176,nm716f02c1677fda4778b1,True,False,True,-393216
8,8,"cubic,linear",8192,935.394713,0.013853,51124170000.0,10.570055,27707,0.013853,196640,131104,nm3dfcd5b0ee81140c1161,True,False,True,-196608
9,9,"cubic,linear",4096,3196.648491,0.013594,967567900000.0,12.388285,27188,0.013594,98336,65568,nm2f6cb73bf7227f2a68cf,True,False,True,-98304


# Final configs `osm.csv`

Unnamed: 0,index,layers,branching factor,average error,average error %,average l2 error,average log2 error,max error,max error %,size binary search,size linear search,namespace,binary,star,front,diff
0,0,"radix26,linear",2097152,153.784532,0.009473,742654500.0,6.628143,18945,0.009473,318767108,301989892,nm5224d02427c12afc4e35,True,False,True,-1000000000000
1,1,"radix26,linear",1048576,162.449067,0.009473,755846300.0,6.970636,18945,0.009473,293601284,285212676,nm1051edb8eb3c4850071e,True,False,True,-25165824
2,2,"radix26,linear",524288,183.048636,0.010434,860972100.0,7.461723,20867,0.010434,281018372,276824068,nmcecb82faae80a9205fdb,True,False,True,-12582912
3,3,"cubic,linear",2097152,1798.456309,79.680933,1628254000000.0,9.675247,159361867,79.680933,50331680,33554464,nm64b6dda5b888f4561122,True,False,True,-230686692
4,4,"cubic,linear",1048576,3339.049197,79.680933,13001150000000.0,10.5604,159361867,79.680933,25165856,16777248,nm736bee190651c4a0cd4d,True,False,True,-25165824
5,5,"cubic,linear",524288,5697.569922,79.680933,54691060000000.0,11.438515,159361867,79.680933,12582944,8388640,nm0ef962d126b549515ff0,True,False,True,-12582912
6,6,"cubic,linear",262144,9615.863456,79.680933,169680600000000.0,12.334018,159361867,79.680933,6291488,4194336,nm09179faa51abc737f7e4,True,False,True,-6291456
7,7,"cubic,linear",131072,15171.124057,79.680933,366298400000000.0,13.22498,159361867,79.680933,3145760,2097184,nm9d512a66510b9ac033ad,True,False,True,-3145728
8,8,"linear,linear",65536,50721.556197,0.657125,1.382877e+17,14.125928,1314250,0.657125,1572880,1048592,nm953abf6dfb01eeb62cbc,True,False,True,-1572880
9,9,"cubic,linear",32768,43002.230702,0.283509,4289466000000000.0,14.998977,567019,0.283509,786464,524320,nmdf834f1da40372581102,True,False,True,-786416


# Final configs `osm400.csv`

Unnamed: 0,index,layers,branching factor,average error,average error %,average l2 error,average log2 error,max error,max error %,size binary search,size linear search,namespace,binary,star,front,diff
0,0,"radix26,linear",2097152,311.8696,0.009486,6178304000.0,7.614878,37944,0.009486,318767108,301989892,nm5224d02427c12afc4e35,True,False,True,-1000000000000
1,1,"radix26,linear",1048576,328.742425,0.009486,6233748000.0,7.962549,37944,0.009486,293601284,285212676,nm1051edb8eb3c4850071e,True,False,True,-25165824
2,2,"radix26,linear",524288,372.998499,0.025125,11559860000.0,8.457498,100499,0.025125,281018372,276824068,nmcecb82faae80a9205fdb,True,False,True,-12582912
3,3,"cubic,linear",2097152,3715.009926,79.680933,17249990000000.0,10.68262,318723734,79.680933,50331680,33554464,nm64b6dda5b888f4561122,True,False,True,-230686692
4,4,"cubic,linear",1048576,6628.78116,79.680933,140634800000000.0,11.558896,318723734,79.680933,25165856,16777248,nm736bee190651c4a0cd4d,True,False,True,-25165824
5,5,"cubic,linear",524288,11760.514002,79.680933,498775600000000.0,12.443793,318723734,79.680933,12582944,8388640,nm0ef962d126b549515ff0,True,False,True,-12582912
6,6,"cubic,linear",262144,20306.353201,79.680933,1502829000000000.0,13.355001,318723734,79.680933,6291488,4194336,nm09179faa51abc737f7e4,True,False,True,-6291456
7,7,"cubic,linear",131072,32727.901212,79.680933,3886726000000000.0,14.226932,318723734,79.680933,3145760,2097184,nm9d512a66510b9ac033ad,True,False,True,-3145728
8,8,"cubic,linear",131072,32727.901212,79.680933,3886726000000000.0,14.226932,318723734,79.680933,3145760,2097184,nm9d512a66510b9ac033ad,True,False,True,-3145728
9,9,"cubic,linear",131072,32727.901212,79.680933,3886726000000000.0,14.226932,318723734,79.680933,3145760,2097184,nm9d512a66510b9ac033ad,True,False,True,-3145728


# Final configs `osm600.csv`

Unnamed: 0,index,layers,branching factor,average error,average error %,average l2 error,average log2 error,max error,max error %,size binary search,size linear search,namespace,binary,star,front,diff
0,0,"radix26,linear",2097152,470.37225,0.016849,21453380000.0,8.196358,101095,0.016849,318767108,301989892,nm5224d02427c12afc4e35,True,False,True,-1000000000000
1,1,"radix26,linear",1048576,495.386924,0.016849,21606410000.0,8.545297,101095,0.016849,293601284,285212676,nm1051edb8eb3c4850071e,True,False,True,-25165824
2,2,"radix26,linear",524288,564.304626,0.039066,56927680000.0,9.041247,234397,0.039066,281018372,276824068,nmcecb82faae80a9205fdb,True,False,True,-12582912
3,3,"cubic,linear",2097152,6045.428862,79.684178,135152100000000.0,11.276821,478105068,79.684178,50331680,33554464,nm64b6dda5b888f4561122,True,False,True,-230686692
4,4,"cubic,linear",1048576,10093.068201,79.684178,279391600000000.0,12.16729,478105068,79.684178,25165856,16777248,nm736bee190651c4a0cd4d,True,False,True,-25165824
5,5,"cubic,linear",524288,17338.067488,79.684178,1212404000000000.0,13.041993,478105068,79.684178,12582944,8388640,nm0ef962d126b549515ff0,True,False,True,-12582912
6,6,"cubic,linear",262144,30928.794324,79.684178,5228119000000000.0,13.955464,478105068,79.684178,6291488,4194336,nm09179faa51abc737f7e4,True,False,True,-6291456
7,7,"linear,linear",131072,122057.918829,0.657112,3.70575e+18,14.821816,3942670,0.657112,3145744,2097168,nm4ba04214fe8ae1c6a0ef,True,False,True,-3145744
8,8,"linear,linear",131072,122057.918829,0.657112,3.70575e+18,14.821816,3942670,0.657112,3145744,2097168,nm4ba04214fe8ae1c6a0ef,True,False,True,-3145744
9,9,"linear,linear",131072,122057.918829,0.657112,3.70575e+18,14.821816,3942670,0.657112,3145744,2097168,nm4ba04214fe8ae1c6a0ef,True,False,True,-3145744


# Final configs `osm800.csv`

Unnamed: 0,index,layers,branching factor,average error,average error %,average l2 error,average log2 error,max error,max error %,size binary search,size linear search,namespace,binary,star,front,diff
0,0,"radix26,linear",2097152,630.774092,0.017681,53331630000.0,8.610785,141451,0.017681,318767108,301989892,nm5224d02427c12afc4e35,True,False,True,-1000000000000
1,1,"radix26,linear",1048576,664.059818,0.017681,53640340000.0,8.960148,141451,0.017681,293601284,285212676,nm1051edb8eb3c4850071e,True,False,True,-25165824
2,2,"radix26,linear",524288,755.553261,0.041689,142192300000.0,9.456396,333514,0.041689,281018372,276824068,nmcecb82faae80a9205fdb,True,False,True,-12582912
3,3,"cubic,linear",2097152,7512.142828,0.078147,131898400000000.0,11.692654,625179,0.078147,50331680,33554464,nm64b6dda5b888f4561122,True,False,True,-230686692
4,4,"cubic,linear",1048576,13842.767716,0.115044,1039291000000000.0,12.572016,920355,0.115044,25165856,16777248,nm736bee190651c4a0cd4d,True,False,True,-25165824
5,5,"cubic,linear",524288,24180.036309,0.30261,6099367000000000.0,13.453065,2420883,0.30261,12582944,8388640,nm0ef962d126b549515ff0,True,False,True,-12582912
6,6,"cubic,linear",262144,39996.013979,0.269547,1.066883e+16,14.35855,2156378,0.269547,6291488,4194336,nm09179faa51abc737f7e4,True,False,True,-6291456
7,7,"cubic,linear",262144,39996.013979,0.269547,1.066883e+16,14.35855,2156378,0.269547,6291488,4194336,nm09179faa51abc737f7e4,True,False,True,-6291456
8,8,"cubic,linear",262144,39996.013979,0.269547,1.066883e+16,14.35855,2156378,0.269547,6291488,4194336,nm09179faa51abc737f7e4,True,False,True,-6291456
9,9,"cubic,linear",262144,39996.013979,0.269547,1.066883e+16,14.35855,2156378,0.269547,6291488,4194336,nm09179faa51abc737f7e4,True,False,True,-6291456


# Final configs `books.csv`

Unnamed: 0,index,layers,branching factor,average error,average error %,average l2 error,average log2 error,max error,max error %,size binary search,size linear search,namespace,binary,star,front,diff
0,0,"linear_spline,linear",2097152,13.562024,7.2e-05,34972.33,4.740375,143,7.2e-05,50331664,33554448,nm77680171b4d11b70aa2f,True,False,True,-1000000000000
1,1,"linear_spline,linear",1048576,19.640306,0.000101,139047.6,5.216807,203,0.000101,25165840,16777232,nm9d7314ecfb0606fea357,True,False,True,-25165824
2,2,"linear_spline,linear",524288,28.293764,0.000105,556250.0,5.694511,209,0.000105,12582928,8388624,nm14697a215f79906b7b7d,True,False,True,-12582912
3,3,"linear_spline,linear",262144,40.659039,0.000128,2250520.0,6.175874,256,0.000128,6291472,4194320,nmd985905d60efa476204b,True,False,True,-6291456
4,4,"cubic,linear",131072,64.775024,0.000166,14667100.0,6.646657,332,0.000166,3145760,2097184,nm9d512a66510b9ac033ad,True,False,True,-3145712
5,5,"cubic,linear",65536,92.707136,0.000249,60618020.0,7.123086,497,0.000249,1572896,1048608,nm5977238fee1957ddfb0c,True,False,True,-1572864
6,6,"cubic,linear",32768,131.80285,0.000345,246039700.0,7.601833,690,0.000345,786464,524320,nmdf834f1da40372581102,True,False,True,-786432
7,7,"cubic,linear",16384,187.616965,0.000384,1003650000.0,8.087064,768,0.000384,393248,262176,nm716f02c1677fda4778b1,True,False,True,-393216
8,8,"cubic,linear",8192,263.273023,0.000543,3952284000.0,8.563083,1086,0.000543,196640,131104,nm3dfcd5b0ee81140c1161,True,False,True,-196608
9,9,"cubic,linear",4096,361.341534,0.000616,14652270000.0,9.033416,1232,0.000616,98336,65568,nm2f6cb73bf7227f2a68cf,True,False,True,-98304


# Final configs `books400.csv`

Unnamed: 0,index,layers,branching factor,average error,average error %,average l2 error,average log2 error,max error,max error %,size binary search,size linear search,namespace,binary,star,front,diff
0,0,"cubic,linear",2097152,29.291543,0.000118,342136.4,5.720408,474,0.000118,50331680,33554464,nm64b6dda5b888f4561122,True,False,True,-1000000000000
1,1,"cubic,linear",1048576,42.05714,0.000128,1355858.0,6.179188,513,0.000128,25165856,16777248,nm736bee190651c4a0cd4d,True,False,True,-25165824
2,2,"cubic,linear",524288,60.239218,0.000146,5469022.0,6.636412,586,0.000146,12582944,8388640,nm0ef962d126b549515ff0,True,False,True,-12582912
3,3,"cubic,linear",262144,86.224495,0.000157,22385640.0,7.097978,629,0.000157,6291488,4194336,nm09179faa51abc737f7e4,True,False,True,-6291456
4,4,"cubic,linear",131072,122.971473,0.000171,91428890.0,7.563637,685,0.000171,3145760,2097184,nm9d512a66510b9ac033ad,True,False,True,-3145728
5,5,"cubic,linear",65536,175.257718,0.000243,374358900.0,8.036209,973,0.000243,1572896,1048608,nm5977238fee1957ddfb0c,True,False,True,-1572864
6,6,"cubic,linear",32768,249.68368,0.000292,1531848000.0,8.515894,1169,0.000292,786464,524320,nmdf834f1da40372581102,True,False,True,-786432
7,7,"cubic,linear",16384,354.914587,0.00041,6228043000.0,9.000914,1639,0.00041,393248,262176,nm716f02c1677fda4778b1,True,False,True,-393216
8,8,"cubic,linear",8192,498.088377,0.000499,24359380000.0,9.481321,1998,0.000499,196640,131104,nm3dfcd5b0ee81140c1161,True,False,True,-196608
9,9,"cubic,linear",4096,682.463988,0.000595,90742140000.0,9.939651,2380,0.000595,98336,65568,nm2f6cb73bf7227f2a68cf,True,False,True,-98304


# Final configs `books600.csv`

Unnamed: 0,index,layers,branching factor,average error,average error %,average l2 error,average log2 error,max error,max error %,size binary search,size linear search,namespace,binary,star,front,diff
0,0,"cubic,linear",2097152,43.008532,8.4e-05,1059960.0,6.255798,507,8.4e-05,50331680,33554464,nm64b6dda5b888f4561122,True,False,True,-1000000000000
1,1,"cubic,linear",1048576,61.584883,0.000152,4126926.0,6.718883,910,0.000152,25165856,16777248,nm736bee190651c4a0cd4d,True,False,True,-25165824
2,2,"cubic,linear",524288,88.118567,0.000126,16530840.0,7.178909,755,0.000126,12582944,8388640,nm0ef962d126b549515ff0,True,False,True,-12582912
3,3,"cubic,linear",262144,125.940628,0.000167,67256400.0,7.641206,1000,0.000167,6291488,4194336,nm09179faa51abc737f7e4,True,False,True,-6291456
4,4,"cubic,linear",131072,179.474206,0.000182,273905700.0,8.107491,1094,0.000182,3145760,2097184,nm9d512a66510b9ac033ad,True,False,True,-3145728
5,5,"cubic,linear",65536,255.931976,0.000218,1123779000.0,8.580681,1307,0.000218,1572896,1048608,nm5977238fee1957ddfb0c,True,False,True,-1572864
6,6,"cubic,linear",32768,364.503408,0.000323,4589197000.0,9.061426,1937,0.000323,786464,524320,nmdf834f1da40372581102,True,False,True,-786432
7,7,"cubic,linear",16384,516.901953,0.000387,18488830000.0,9.545637,2321,0.000387,393248,262176,nm716f02c1677fda4778b1,True,False,True,-393216
8,8,"cubic,linear",8192,725.790676,0.000449,72831940000.0,10.023516,2695,0.000449,196640,131104,nm3dfcd5b0ee81140c1161,True,False,True,-196608
9,9,"cubic,linear",4096,1005.115865,0.000578,275491800000.0,10.497749,3467,0.000578,98336,65568,nm2f6cb73bf7227f2a68cf,True,False,True,-98304


# Final configs `books800.csv`

Unnamed: 0,index,layers,branching factor,average error,average error %,average l2 error,average log2 error,max error,max error %,size binary search,size linear search,namespace,binary,star,front,diff
0,0,"cubic,linear",2097152,59.359151,0.000124,2850875.0,6.69897,989,0.000124,50331680,33554464,nm64b6dda5b888f4561122,True,False,True,-1000000000000
1,1,"cubic,linear",1048576,84.769484,0.00013,11074930.0,7.163128,1042,0.00013,25165856,16777248,nm736bee190651c4a0cd4d,True,False,True,-25165824
2,2,"cubic,linear",524288,120.9823,0.00015,44185380.0,7.623692,1203,0.00015,12582944,8388640,nm0ef962d126b549515ff0,True,False,True,-12582912
3,3,"cubic,linear",262144,172.803398,0.000158,179823700.0,8.087635,1260,0.000158,6291488,4194336,nm09179faa51abc737f7e4,True,False,True,-6291456
4,4,"cubic,linear",131072,246.193558,0.000172,732717200.0,8.555401,1372,0.000172,3145760,2097184,nm9d512a66510b9ac033ad,True,False,True,-3145728
5,5,"cubic,linear",65536,350.691477,0.000243,2997339000.0,9.029714,1946,0.000243,1572896,1048608,nm5977238fee1957ddfb0c,True,False,True,-1572864
6,6,"cubic,linear",32768,499.486937,0.000292,12259110000.0,9.510875,2337,0.000292,786464,524320,nmdf834f1da40372581102,True,False,True,-786432
7,7,"cubic,linear",16384,709.896403,0.00041,49831160000.0,9.997131,3282,0.00041,393248,262176,nm716f02c1677fda4778b1,True,False,True,-393216
8,8,"cubic,linear",8192,996.226846,0.000499,194886600000.0,10.478534,3996,0.000499,196640,131104,nm3dfcd5b0ee81140c1161,True,False,True,-196608
9,9,"cubic,linear",4096,1364.977123,0.000596,725982700000.0,10.937593,4771,0.000596,98336,65568,nm2f6cb73bf7227f2a68cf,True,False,True,-98304
