In [1]:
%load_ext lab_black
%load_ext autotime
import pandas as pd
import numpy as np

time: 253 ms (started: 2022-09-17 17:16:38 -07:00)


In [2]:
import drnb.neighbors as nbnbrs


def hubness_datasets(names, n_neighbors):
    """Build a per-dataset summary table of neighbor-occurrence (hubness) counts.

    For each name, ``hubness_data`` supplies the per-item counts and
    ``describe`` turns them into summary statistics; one row is produced per
    dataset, indexed by its name.

    Parameters
    ----------
    names : iterable
        Dataset names passed one by one to ``hubness_data``.
    n_neighbors : int
        Number of neighbors passed to ``hubness_data``; also the divisor used
        for the normalized ``nmax`` and ``nmedian`` columns.

    Returns
    -------
    pandas.DataFrame
        Columns ``count``, ``min``, ``25%``, ``50%``, ``75%``, ``max``,
        ``median`` and ``#0`` as int32, plus ``nmax``, ``nmedian`` and ``#0%``
        as strings formatted to two decimals. ``mean`` and ``std`` are dropped.

    Raises
    ------
    ValueError
        If no dataset produced a summary (``names`` is empty or every dataset
        was skipped).
    """
    summaries = []
    for name in names:
        try:
            h = hubness_data(name, n_neighbors)
        except ValueError:
            # Deliberate best-effort: a dataset for which hubness_data raises
            # ValueError is left out of the table rather than aborting the run.
            continue
        hdesc = describe(h)
        # The Series name becomes the column label in the concat below and
        # therefore the row label after transposing.
        hdesc.name = name
        summaries.append(hdesc)

    if not summaries:
        # pd.concat([]) would fail with an unrelated "No objects to
        # concatenate" message; report the actual problem instead.
        raise ValueError(
            f"No hubness data could be computed for n_neighbors={n_neighbors}: "
            "no dataset names were given or every dataset was skipped"
        )

    res = pd.concat(summaries, axis=1).T

    # Normalize by n_neighbors before the integer cast below so that the
    # fractional part of the median is not lost in nmedian.
    for col_to_norm in ["max", "median"]:
        res[f"n{col_to_norm}"] = res[col_to_norm].astype(float) / n_neighbors
    res["#0%"] = 100.0 * res["#0"] / res["count"]

    int_cols = [
        "count",
        "min",
        "25%",
        "50%",
        "75%",
        "max",
        "median",
        "#0",
    ]
    # The cast truncates, so a fractional quantile/median (e.g. 14.5) is shown
    # as 14.
    res[int_cols] = res[int_cols].astype(np.int32)

    # Series.map is used per column instead of DataFrame.applymap, which is
    # deprecated in recent pandas; the formatted strings are the same.
    for col_to_format in ["nmax", "nmedian", "#0%"]:
        res[col_to_format] = res[col_to_format].map("{0:.2f}".format)

    return res.drop(columns=["mean", "std"])


def hubness_data(name, n_neighbors):
    """Return per-item neighbor-occurrence counts for the dataset ``name``.

    Neighbor data is read through ``nbnbrs.read_neighbors`` and its ``idx``
    attribute is handed to ``hubness_idx``.

    Raises
    ------
    ValueError
        If ``read_neighbors`` returns ``None``.
    """
    # One more neighbor than requested is read because hubness_idx, with its
    # default include_self=False, discards the first column of the index array.
    # NOTE(review): exact=True presumably asks for exact rather than
    # approximate neighbors -- confirm against drnb.neighbors.
    neighbor_data = nbnbrs.read_neighbors(
        name,
        n_neighbors=n_neighbors + 1,
        exact=True,
    )
    if neighbor_data is not None:
        return hubness_idx(neighbor_data.idx, n_neighbors=n_neighbors)
    raise ValueError(f"Couldn't get {n_neighbors} for {name}")


def hubness_idx(idx, n_neighbors=None, include_self=False):
    """Count how often each item appears in the neighbor lists of ``idx``.

    Parameters
    ----------
    idx : array-like, 2-D
        Row ``i`` holds the integer indices of the neighbors of item ``i``.
    n_neighbors : int, optional
        Number of leading columns (after any self-column removal) to count.
        Defaults to all remaining columns.
    include_self : bool, default False
        When False the first column is dropped before counting.
        NOTE(review): this assumes the first column is the item itself --
        confirm for data containing duplicate points.

    Returns
    -------
    numpy.ndarray
        int32 array of length ``idx.shape[0]``; element ``j`` is the number of
        times index ``j`` occurs in the counted columns.

    Raises
    ------
    ValueError
        If ``n_neighbors`` is negative or exceeds the number of available
        columns.
    """
    # Accept nested lists as well as ndarrays.
    idx = np.asarray(idx)
    if not include_self:
        idx = idx[:, 1:]
    if n_neighbors is None:
        n_neighbors = idx.shape[1]
    if n_neighbors > idx.shape[1]:
        raise ValueError(f"{n_neighbors} > {idx.shape[1]}")
    if n_neighbors < 0:
        # A negative value would otherwise be read as "all but the last k
        # columns" by the slice below.
        raise ValueError(f"n_neighbors must be non-negative, got {n_neighbors}")

    result = np.zeros(dtype=np.int32, shape=idx.shape[0])

    # Unbuffered in-place add: repeated indices each contribute 1, exactly as
    # an explicit loop over every (row, neighbor) pair would, without the
    # per-element Python overhead.
    np.add.at(result, idx[:, :n_neighbors], 1)

    return result


# https://stackoverflow.com/a/38547818/4096483
def describe(df):
    """Extend pandas' ``describe()`` summary with a median and a zero count.

    A numpy array is wrapped in a ``pandas.Series`` first. The result is the
    ``describe()`` output followed by two extra entries: ``"median"`` and
    ``"#0"``, the number of values equal to zero.
    """
    values = pd.Series(df) if isinstance(df, np.ndarray) else df

    base_stats = values.describe()
    median_stat = values.agg(["median"])
    # Keep only the entries equal to zero, then count them.
    zero_count = pd.Series(values[values == 0].count(), index=["#0"])

    return pd.concat([base_stats, median_stat, zero_count])

time: 3.77 s (started: 2022-09-17 17:16:39 -07:00)


In [3]:
# hubness15 = hubness_datasets(["iris", "s1k"], 15)
# hubness15

time: 581 µs (started: 2022-09-17 17:16:42 -07:00)


In [5]:
# NOTE(review): this import sits mid-notebook and the cell's execution count
# (In [5]) does not follow In [3]; consider moving the import to the first
# cell and re-running from a fresh kernel.
from drnb.io.dataset import get_available_data_info

# Summarize hubness at 15 neighbors for every entry in the index returned by
# get_available_data_info(); names for which hubness_data raises ValueError
# are skipped by hubness_datasets.
# NOTE(review): presumably that index holds dataset names -- confirm against
# drnb.io.dataset.
hubness15 = hubness_datasets(get_available_data_info().index, 15)
hubness15

Unnamed: 0,count,min,25%,50%,75%,max,median,#0,nmax,nmedian,#0%
avonet,11009,0,10,15,20,44,15,39,2.93,1.0,0.35
cifar10,60000,0,0,2,10,2005,2,18689,133.67,0.13,31.15
cifar10act,60000,0,9,14,20,67,14,321,4.47,0.93,0.54
coil100,7200,1,11,14,18,67,14,0,4.47,0.93,0.0
coil20,1440,2,11,14,18,52,14,0,3.47,0.93,0.0
fashion,70000,0,3,10,21,267,10,6728,17.8,0.67,9.61
frey,1965,0,9,14,20,45,14,6,3.0,0.93,0.31
iris,150,0,9,15,20,30,15,1,2.0,1.03,0.67
isofaces,698,2,12,15,18,30,15,0,2.0,1.0,0.0
isoswiss,20000,3,13,15,17,27,15,0,1.8,1.0,0.0


time: 23.2 s (started: 2022-09-17 17:17:55 -07:00)
