In [1]:
import json
from collections.abc import Container
from copy import deepcopy
from functools import reduce
from pathlib import Path
from pprint import pprint

import ipywidgets as widgets
import numpy as np
from IPython.display import display

In [2]:
class HasEverything:
    """Container for which every membership test succeeds."""

    def __contains__(self, item: object) -> bool:
        """Report any ``item`` as present, whatever it is."""
        return True


# Shared match-all instance, usable wherever a container of allowed keys is expected.
everything = HasEverything()


def filter_results(
        results: dict,
        *,
        measures: Container[str],
        storages: Container[str],
        catalogs: Container[str],
        col_types: Container[str],
) -> dict:
    """Return a deep copy of ``results`` keeping only the allowed keys.

    ``results`` is traversed as suffix -> measure -> storage -> catalog ->
    column type -> record. Suffix keys are never filtered; at each of the four
    levels below, a key is dropped (with its whole subtree) when it is not in
    the corresponding container. A parent whose children were all dropped
    stays in the result as an empty dict. The input is left unmodified.
    """
    # Allowed keys per nesting level beneath the suffix, outermost first.
    allowed_by_depth = (measures, storages, catalogs, col_types)
    last_depth = len(allowed_by_depth) - 1

    def prune(node: dict, depth: int) -> None:
        allowed = allowed_by_depth[depth]
        # Snapshot the keys so entries can be deleted while walking them.
        for key in list(node):
            if key not in allowed:
                del node[key]
            elif depth < last_depth:
                prune(node[key], depth + 1)

    pruned = deepcopy(results)
    for per_suffix in pruned.values():
        prune(per_suffix, 0)
    return pruned


def transpose(results: dict) -> dict:
    """Regroup ``results`` so the suffix becomes the innermost key.

    The input is nested as suffix -> measure -> storage -> catalog ->
    column type -> record. The output maps each
    ``(measure, storage, catalog, col_type)`` tuple to a ``{suffix: record}``
    dict. Records are not copied: the output holds the same objects as the
    input.
    """
    by_experiment = {}
    for suffix, per_measure in results.items():
        # Flatten the four inner levels into (key tuple, record) pairs.
        flattened = (
            ((measure, storage, catalog, col_type), record)
            for measure, per_storage in per_measure.items()
            for storage, per_catalog in per_storage.items()
            for catalog, per_col_type in per_catalog.items()
            for col_type, record in per_col_type.items()
        )
        for experiment, record in flattened:
            by_experiment.setdefault(experiment, {})[suffix] = record
    return by_experiment


def add_more_stats(results: dict) -> dict:
    """Return a deep copy of ``results`` with ``"score"`` and ``"rank"`` added to every record.

    Records are compared across suffixes within the same
    ``(measure, storage, catalog, col_type)`` experiment, using each record's
    ``"time"`` value:

    * ``"score"`` is ``best_time / time``, so the fastest suffix gets 1.0.
    * ``"rank"`` is the 0-based position of the record when the experiment's
      times are sorted ascending (0 is the fastest). Equal times keep their
      original relative order and therefore get consecutive ranks.

    The input is left unmodified.
    """
    results = deepcopy(results)
    for key, per_suffix in transpose(results).items():
        suffixes = list(per_suffix.keys())
        times = [record["time"] for record in per_suffix.values()]
        best_time = min(times)

        # `order[r]` is the index of the r-th fastest entry (an argsort).
        # The rank of entry i is the inverse of that permutation, so invert it
        # rather than using `order[i]` directly.
        order = sorted(range(len(times)), key=lambda i: times[i])
        ranks = [0] * len(times)
        for rank, index in enumerate(order):
            ranks[index] = rank

        for i, suffix in enumerate(suffixes):
            # Walk down the nested dict along the experiment key to reach the record.
            record = reduce(lambda r, k: r[k], key, results[suffix])
            elapsed = times[i]
            # The best entry scores exactly 1.0; comparing first also avoids 0/0
            # when the best recorded time is zero.
            record["score"] = 1.0 if elapsed == best_time else best_time / elapsed
            record["rank"] = ranks[i]
    return results


def _get_label_values_recursive(d: dict, label: str, values: list) -> list:
    """Append to ``values`` every ``d[...][label]`` found depth-first, and return ``values``.

    A dict that contains ``label`` contributes that single value and is not
    searched any deeper; otherwise each of its values is searched in
    insertion order.
    """
    if label not in d:
        for child in d.values():
            _get_label_values_recursive(child, label, values)
    else:
        values.append(d[label])
    return values


def get_label_values(d: dict, label: str) -> list:
    """Collect, in depth-first order, the value stored under ``label`` in each nested dict of ``d``."""
    collected: list = []
    return _get_label_values_recursive(d, label, collected)


def leaderboard(results: dict, *, label: str, small_good: bool, weighted: bool) -> dict[str, float]:
    """Average the ``label`` values of every top-level entry and order the entries by that average.

    Args:
        results: Mapping from a top-level key to a nested dict of records.
        label: Record field to average (collected with ``get_label_values``).
        small_good: If true, smaller averages come first; otherwise larger ones do.
        weighted: If true, weight each value by the ``"weight"`` values
            collected from the same subtree; otherwise every value counts equally.

    Returns:
        A dict from top-level key to its average, in leaderboard order. Entries
        with equal averages stay in alphabetical key order. Keys that have no
        values for ``label``, or whose weights sum to zero, have no defined
        average and are omitted.
    """
    averages = {}
    for prefix in sorted(results.keys()):
        values = get_label_values(results[prefix], label)
        if weighted:
            weights = get_label_values(results[prefix], "weight")
        else:
            weights = [1] * len(values)
        total_weight = sum(weights)
        if not values or total_weight == 0:
            # Filtering can leave a subtree with no records at all; skip it
            # instead of dividing by zero.
            continue
        averages[prefix] = sum(v * w for v, w in zip(values, weights)) / total_weight
    # The sort is stable, so ties keep the alphabetical order established above.
    return dict(sorted(averages.items(), key=lambda item: item[1], reverse=not small_good))


def has_single_element(results: dict) -> bool:
    """Tell whether every value of ``results`` holds exactly one ``"time"`` key.

    Each value is serialized to JSON and the occurrences of the ``"time"`` key
    marker are counted; the answer is true only if that count is 1 for every
    value (and trivially true when ``results`` is empty).
    """
    time_key_marker = '"time": '
    for suffix_result in results.values():
        serialized = json.dumps(suffix_result)
        if serialized.count(time_key_marker) != 1:
            return False
    return True


def rename_suffix(suffix: str) -> str:
    """Turn a file-name suffix into a human-readable label.

    The empty suffix is labelled ``"Original"``. Any other suffix must split
    on ``'-'`` into exactly three parts ``(ignored, name, value)``; the label
    depends on ``name``.

    Raises:
        ValueError: If the suffix does not split into three parts or its
            ``name`` part is not one of the known ones.
    """
    if suffix == "":
        return "Original"

    _, name, value = suffix.split('-')
    if name == "_healpix_29":
        return f"healpix29 {value}"
    if name == "healpix":
        return f"4^{value} subtiles"
    if name == "id":
        return f"ID {value}"
    if name == "filter":
        return f"Filter {value}"
    raise ValueError(f'Unknown suffix "{suffix}"')


def get_sizes(prefixes) -> dict[str, dict[str, int]]:
    root = Path("../data")
    results = {}
    for prefix in prefixes:
        paths = sorted(root.glob(f"{prefix}*.parquet"))
        suffixes = [p.stem.removeprefix(prefix) for p in paths]
        sizes = [p.stat().st_size for p in paths]
        unordered = dict(zip(suffixes, sizes))
        results[prefix] = dict(sorted(unordered.items(), key=lambda item: item[1]))
    return results

In [3]:
# Report the on-disk size of every stored variant of the two catalogs, smallest first.
sizes = get_sizes(['gaia_dr3-2-0', 'ztf_dr22-6-21554'])

print('File Sizes')
print('==========')
for prefix, prefix_sizes in sizes.items():
    # Sub-heading: the prefix underlined with dashes of the same length.
    print(prefix)
    print('-' * len(prefix))
    new_sizes = []
    for suffix, size in prefix_sizes.items():
        mebibytes = size // (1 << 20)
        new_sizes.append(f"{mebibytes:4d} MiB: {rename_suffix(suffix)}")
    pprint(new_sizes)

File Sizes
gaia_dr3-2-0
------------
[' 407 MiB: Filter 1048576',
 ' 411 MiB: ID 1048576',
 ' 411 MiB: healpix29 1048576',
 ' 411 MiB: Original',
 ' 443 MiB: Filter 262144',
 ' 450 MiB: healpix29 262144',
 ' 450 MiB: ID 262144',
 ' 468 MiB: 4^1 subtiles',
 ' 475 MiB: Filter 16384',
 ' 476 MiB: Filter 4096',
 ' 477 MiB: Filter 65536',
 ' 481 MiB: healpix29 16384',
 ' 481 MiB: ID 16384',
 ' 483 MiB: healpix29 4096',
 ' 483 MiB: ID 4096',
 ' 483 MiB: ID 65536',
 ' 483 MiB: healpix29 65536',
 ' 484 MiB: 4^3 subtiles',
 ' 486 MiB: 4^2 subtiles',
 ' 488 MiB: 4^4 subtiles',
 ' 489 MiB: Filter 1024',
 ' 498 MiB: ID 1024',
 ' 498 MiB: healpix29 1024',
 ' 510 MiB: 4^5 subtiles',
 ' 601 MiB: 4^6 subtiles',
 ' 950 MiB: 4^7 subtiles',
 '2313 MiB: 4^8 subtiles']
ztf_dr22-6-21554
----------------
[' 492 MiB: ID 16384',
 ' 496 MiB: ID 65536',
 ' 504 MiB: ID 1048576',
 ' 504 MiB: ID 262144',
 ' 510 MiB: 4^1 subtiles',
 ' 513 MiB: Filter 16384',
 ' 516 MiB: healpix29 65536',
 ' 516 MiB: 4^2 subtiles',
 

In [4]:
# Values offered by each filter list.
measurers_options = [
    'Full read',
    'Small cone',
    'Large cone',
    'Select by ID',
    'Filter column with few result rows',
    'Filter column with many result rows',
]
storages_options = ["local", "remote"]
catalog_options = ["gaia", "ztf"]
col_type_options = ["required", "default", "all"]


def _select_all(description, options):
    """Build a multi-select list tall enough for all ``options``, with every option pre-selected."""
    return widgets.SelectMultiple(
        options=options,
        description=description,
        value=options,
        rows=len(options),
    )


# Filter controls.
measurers = _select_all('Measures', measurers_options)
storages = _select_all('Storages', storages_options)
catalogs = _select_all('Catalogs', catalog_options)
col_types = _select_all('Column Types', col_type_options)
weighted_checkbox = widgets.Checkbox(value=True, description='Weighted?')

# One output panel per leaderboard.
score_output = widgets.Output()
rank_output = widgets.Output()
times_output = widgets.Output()

run_button = widgets.Button(description="Run")


def run_analysis(_button=None):
    """Recompute the leaderboards for the current widget selection and render them.

    Reads ``../results/results.json``, filters it by the selected measures,
    storages, catalogs and column types, adds score/rank statistics, and
    prints the score and rank leaderboards into their output panels. The
    absolute-time panel is filled only when the selection narrows every suffix
    down to a single experiment; otherwise it is cleared.

    Args:
        _button: Ignored. Present so the function can be used as a button
            click handler as well as called directly.
    """
    # An empty selection means "no restriction": fall back to the match-all container.
    selected_measures = frozenset(measurers.value) or everything
    selected_storages = frozenset(storages.value) or everything
    selected_catalogs = frozenset(catalogs.value) or everything
    selected_col_types = frozenset(col_types.value) or everything
    weighted = weighted_checkbox.value

    with open("../results/results.json", "r") as f:
        results = json.load(f)

    filtered_results = filter_results(
        results,
        measures=selected_measures,
        storages=selected_storages,
        catalogs=selected_catalogs,
        col_types=selected_col_types,
    )

    enhanced_results = add_more_stats(filtered_results)

    scores_leaderboard = leaderboard(enhanced_results, label="score", small_good=False, weighted=weighted)
    new_scores_leaderboard = [f"{v:.3f}: {rename_suffix(k)}" for k, v in scores_leaderboard.items()]
    with score_output:
        score_output.clear_output(wait=True)
        print("\nScore Leaderboard:")
        print("------------------")
        pprint(new_scores_leaderboard, sort_dicts=False)

    rank_leaderboard = leaderboard(enhanced_results, label="rank", small_good=True, weighted=weighted)
    # Round the average ranks to integers, then renumber the distinct values
    # as 1, 2, 3, ... so that suffixes with the same rounded rank share a place.
    integer_ranks = np.round(list(rank_leaderboard.values()))
    new_ranks = np.unique(integer_ranks, return_inverse=True)[1] + 1
    new_rank_leaderboard = {}
    for r, m in zip(new_ranks.tolist(), rank_leaderboard.keys()):
        new_rank_leaderboard.setdefault(r, []).append(rename_suffix(m))
    # Show a lone suffix bare and tied suffixes as a list.
    new_rank_leaderboard = [f"{k}: {v if len(v) > 1 else v[0]}" for k, v in new_rank_leaderboard.items()]
    with rank_output:
        rank_output.clear_output(wait=True)
        print("\nRank Leaderboard:")
        print("-----------------")
        pprint(new_rank_leaderboard, sort_dicts=False)

    if not has_single_element(enhanced_results):
        # Absolute times are only shown for a single experiment. Drop whatever
        # an earlier run left in the panel so it cannot be mistaken for the
        # current selection.
        times_output.clear_output()
        return

    # weighted doesn't matter, because we have single experiment
    times_leaderboard = leaderboard(enhanced_results, label="time", small_good=True, weighted=False)
    new_times_leaderboard = [f"{v:8.3F}: {rename_suffix(k)}" for k, v in times_leaderboard.items()]
    with times_output:
        times_output.clear_output(wait=True)
        print()
        print("Time, s:")
        print("--------")
        pprint(new_times_leaderboard, sort_dicts=False)

# Recompute the leaderboards every time the button is pressed.
run_button.on_click(run_analysis)

# Filter controls stacked in one column, followed by one panel per leaderboard.
controls = widgets.VBox(
    children=[measurers, storages, catalogs, col_types, weighted_checkbox, run_button],
)
app_layout = widgets.HBox(
    children=[controls, times_output, score_output, rank_output],
)

display(app_layout)
# Fill the panels once up front, without waiting for a click.
run_analysis()

HBox(children=(VBox(children=(SelectMultiple(description='Measures', index=(0, 1, 2, 3, 4, 5), options=('Full …