In [1]:
import pandas as pd
import itertools
import numpy as np
from IPython.display import display, Markdown, Latex
import json

# Optimizer-output CSV file -> dataset name.
# The CSV name is what gets passed to produce_front(); the dataset name is
# used as the namespace prefix of each config and as the stem of the JSON
# spec file written for that dataset.
DATASETS = dict(
    [
        ("fb.csv", "fb_200M_uint64"),
        ("wiki.csv", "wiki_ts_200M_uint64"),
        ("osm.csv", "osm_cellids_200M_uint64"),
        ("osm400.csv", "osm_cellids_400M_uint64"),
        ("osm600.csv", "osm_cellids_600M_uint64"),
        ("osm800.csv", "osm_cellids_800M_uint64"),
        ("books.csv", "books_200M_uint64"),
        ("books400.csv", "books_400M_uint64"),
        ("books600.csv", "books_600M_uint64"),
        ("books800.csv", "books_800M_uint64"),
        ("norm32.csv", "normal_200M_uint32"),
        ("books32.csv", "books_200M_uint32"),
        ("fb32.csv", "fb_200M_uint32"),
    ]
)

In [2]:

# Column names read from the optimizer-output CSV.
_ERROR_COL = "average log2 error"
_SIZE_COL = "size binary search"
_REQUIRED_COLUMNS = ("layers", "branching factor", _ERROR_COL, _SIZE_COL, "front", "binary")

# Exclusive upper bounds a configuration must satisfy to be considered at all.
_MAX_AVG_LOG2_ERROR = 15
_MAX_SIZE = 2_000_000_000

# Two configurations count as tied on error / size when they differ by less than this.
_ERROR_TOLERANCE = 0.001
_SIZE_TOLERANCE = 1000

# Number of configurations every returned spec contains.
_NUM_CONFIGS = 10


def _within_limits(frame):
    """Boolean mask of the rows of `frame` below both the error and the size bound."""
    return (frame[_ERROR_COL] < _MAX_AVG_LOG2_ERROR) & (frame[_SIZE_COL] < _MAX_SIZE)


def _prune_near_duplicates(front):
    """Return the entries of `front` (a list of row dicts) not superseded by another entry.

    An entry is superseded when a *different* configuration (different layers
    or branching factor) is tied with it on error but smaller, or tied with it
    on size but more accurate.
    """
    def is_superseded(rmi, other):
        same_config = (rmi["layers"] == other["layers"]
                       and rmi["branching factor"] == other["branching factor"])
        if same_config:
            return False
        tied_on_error = abs(rmi[_ERROR_COL] - other[_ERROR_COL]) < _ERROR_TOLERANCE
        if tied_on_error and rmi[_SIZE_COL] > other[_SIZE_COL]:
            return True
        tied_on_size = abs(rmi[_SIZE_COL] - other[_SIZE_COL]) < _SIZE_TOLERANCE
        return tied_on_size and rmi[_ERROR_COL] > other[_ERROR_COL]

    return [rmi for rmi in front
            if not any(is_superseded(rmi, other) for other in front)]


def produce_front(opt_out_fn, debug=False):
    """Build a spec of exactly 10 configurations from an optimizer-output CSV.

    Steps:
      1. Read `opt_out_fn` and keep rows below the error and size bounds.
      2. Keep the rows flagged in the `front` column.
      3. Prune entries that are nearly tied with a better entry.
      4. Order the survivors by decreasing size and keep the (up to) 10 that
         are separated from their larger neighbour by the biggest size gap;
         the largest configuration is always kept.
      5. If fewer than 10 remain, repeat the smallest one to pad to 10.

    Parameters:
        opt_out_fn: path of the CSV to read; must also be a key of `DATASETS`,
            whose value becomes the namespace prefix.
        debug: when true, also display the intermediate tables.

    Returns:
        {"configs": [...]} where each entry has the keys "layers",
        "branching factor", "namespace" and "binary", ordered by decreasing
        size.

    Raises:
        KeyError: if the CSV lacks a required column, or `opt_out_fn` is not
            in `DATASETS`.
        ValueError: if no flagged configuration lies within the bounds.
    """
    df = pd.read_csv(opt_out_fn)

    # Fail early with an actionable message instead of a bare KeyError on the
    # first column access.
    missing = [col for col in _REQUIRED_COLUMNS if col not in df.columns]
    if missing:
        raise KeyError(
            f"{opt_out_fn} is missing required column(s) {missing}; "
            f"columns found: {list(df.columns)}"
        )

    to_search = df[_within_limits(df)].sort_values(_ERROR_COL).copy()
    if debug:
        display(Markdown(f"# Top 20 overall `{opt_out_fn}`"))
        display(to_search.sort_values(_ERROR_COL).iloc[0:20])

    # Every row of to_search already satisfies the limits, so this effectively
    # selects the rows flagged as being on the front.
    front = to_search[to_search["front"] & _within_limits(to_search)].copy()
    if debug:
        display(Markdown(f"# Pareto front `{opt_out_fn}`"))
        display(front.sort_values(_ERROR_COL))

    # "records" is the full orient name; abbreviations are rejected by pandas >= 2.0.
    retained = _prune_near_duplicates(front.to_dict("records"))
    if not retained:
        raise ValueError(
            f"{opt_out_fn} has no front configuration with "
            f"{_ERROR_COL!r} < {_MAX_AVG_LOG2_ERROR} and {_SIZE_COL!r} < {_MAX_SIZE}"
        )

    retained = pd.DataFrame(retained).sort_values(_ERROR_COL)
    if debug:
        display(Markdown(f"# Pruned `{opt_out_fn}`"))
        display(retained)

    # The size gaps are computed in size-descending order, so the frame they
    # are attached to must be in that same order for each gap to land on its
    # own row. The huge negative sentinel makes the largest configuration
    # always rank first.
    by_size = retained.sort_values(_SIZE_COL, ascending=False).copy()
    by_size["diff"] = [-1_000_000_000_000] + list(np.diff(by_size[_SIZE_COL].to_numpy()))
    selected = by_size.sort_values("diff", ascending=True)[0:_NUM_CONFIGS]
    selected = selected.sort_values(_SIZE_COL, ascending=False)
    display(Markdown(f"# Final configs `{opt_out_fn}`"))

    # Pad to _NUM_CONFIGS rows by repeating the last (smallest) configuration.
    # Positions past the end are clamped to the last row, which preserves the
    # column dtypes and avoids growing the frame row by row.
    pad_positions = np.minimum(np.arange(_NUM_CONFIGS), len(selected) - 1)
    tmp = selected.iloc[pad_positions].reset_index(drop=True).reset_index()
    display(tmp)

    prefix = DATASETS[opt_out_fn]
    tmp["namespace"] = [f"{prefix}_{position}" for position in tmp["index"]]
    spec = {"configs": tmp[["layers", "branching factor", "namespace", "binary"]].to_dict("records")}

    return spec

In [4]:
# Build a spec for every entry of DATASETS and write it to "<dataset name>.json".
# k is the optimizer-output CSV passed to produce_front; v is the dataset name.
for k, v in DATASETS.items():
    print(k)  # progress marker: the CSV currently being processed
    # Despite its name, `front` holds the spec dict ({"configs": [...]}) returned by produce_front.
    front = produce_front(k)
    # produce_front pads its selection, so every spec is expected to carry exactly 10 configs.
    assert len(front["configs"]) == 10
    # NOTE(review): absolute, machine-specific output directory — consider a configurable path.
    with open(f"/home/ryan/mcgraw/SOSD-private/scripts/rmi_specs/{v}.json", "w") as f:
        json.dump(front, f)

fb.csv


KeyError: 'average log2 error'