In [1]:
import os
import shutil
import sys
import urllib.request

import numpy as np

In [2]:
sys.path.append("..")
from benchmark import datasets

In [6]:
# Official ground-truth files, as published on https://big-ann-benchmarks.com/
# Keys are the dataset names used to index datasets.DATASETS; values are the
# download URLs. Insertion order is the order in which the cells below
# process the datasets.
new_gt = {
    "bigann-1B": "https://comp21storage.blob.core.windows.net/publiccontainer/comp21/bigann/public_query_gt100.bin",
    "ssnpp-1B": "https://dl.fbaipublicfiles.com/billion-scale-ann-benchmarks/FB_ssnpp_public_queries_GT.rangeres",
    "msturing-1B": "https://comp21storage.blob.core.windows.net/publiccontainer/comp21/MSFT-TURING-ANNS/query_gt100.bin",
    "msspacev-1B": "https://comp21storage.blob.core.windows.net/publiccontainer/comp21/spacev1b/public_query_gt100.bin",
    "deep-1B": "https://storage.yandexcloud.net/yandex-research/ann-datasets/deep_new_groundtruth.public.10K.bin",
    "text2image-1B": "https://storage.yandexcloud.net/yandex-research/ann-datasets/t2i_new_groundtruth.public.100K.bin",
}


In [22]:
# Download the official GT file of every dataset to /tmp/new_GT/<dataset name>

# open() does not create missing parent directories, so make sure the
# destination exists before the first download
os.makedirs("/tmp/new_GT", exist_ok=True)

for dsname in new_gt:
    ds = datasets.DATASETS[dsname]()
    print(ds)

    # The context managers close the HTTP response and flush + close the
    # output file before the next cell reads it back. copyfileobj streams the
    # payload in chunks instead of holding the whole file in memory.
    with urllib.request.urlopen(new_gt[dsname]) as response, \
            open(f"/tmp/new_GT/{dsname}", "wb") as fout:
        shutil.copyfileobj(response, fout)
    
    

Dataset BigANNDataset in dimension 128, with distance euclidean, search_type knn, size: Q 10000 B 1000000000
Dataset SSNPPDataset in dimension 256, with distance euclidean, search_type range, size: Q 100000 B 1000000000
Dataset MSTuringANNS in dimension 100, with distance euclidean, search_type knn, size: Q 100000 B 1000000000
Dataset MSSPACEV1B in dimension 100, with distance euclidean, search_type knn, size: Q 29316 B 1000000000
Dataset Deep1BDataset in dimension 96, with distance euclidean, search_type knn, size: Q 10000 B 1000000000
Dataset Text2Image1B in dimension 200, with distance ip, search_type knn, size: Q 100000 B 1000000000


In [109]:
def count_diff_1_result(Dref, Iref, Dnew, Inew, eps):
    """Count mismatching ids between two result lists of a single query.

    Entries are grouped into runs of consecutive equal distances. Within a
    run the ids are compared as sets, so the order in which ties are listed
    does not matter.

    Dref, Iref: reference distances and ids
    Dnew, Inew: distances and ids to check, indexed like the reference
    eps:        tolerance on distances. When the two distance arrays are not
                exactly equal they must agree to within eps (asserted), and
                the runs are then built from the element-wise minimum,
                rounded down to a multiple of eps.

    Returns the summed size of the symmetric differences between reference
    and new id sets, over all runs except the last one.

    NOTE(review): the last run is accumulated but never compared. Presumably
    this is intended, because ties at the end of a truncated result list can
    be cut differently -- confirm.
    """
    exact_match = np.all(Dref == Dnew)
    if not exact_match:
        assert np.abs(Dref - Dnew).max() < eps
        # snap both tables onto a common grid so that nearby distances merge
        Dref = np.floor(np.minimum(Dref, Dnew) / eps) * eps

    total = 0
    run_dist = -1e10      # sentinel: no run opened yet
    ids_ref, ids_new = set(), set()
    for pos, ref_id in enumerate(Iref):
        dist = Dref[pos]
        if dist != run_dist:
            # distance changed: settle the run that just ended, open a new one
            total += len(ids_ref.symmetric_difference(ids_new))
            ids_ref, ids_new = set(), set()
            run_dist = dist
        ids_ref.add(ref_id)
        ids_new.add(Inew[pos])
    return total

def compare_knn_res(Dref, Iref, Dnew, Inew):
    """Count id mismatches between two knn result tables, tolerating ties.

    Dref, Iref: reference distances and ids, one row per query
    Dnew, Inew: distances and ids to check, laid out like the reference

    Rows whose ids are identical are skipped. Every other row is handed to
    count_diff_1_result with a tolerance of 1e-5 times the largest reference
    distance. Returns the sum of the per-row counts.
    """
    eps = Dref.max() * 1e-5
    return sum(
        count_diff_1_result(Dref[q], Iref[q], Dnew[q], Inew[q], eps)
        for q in range(len(Iref))
        if not np.all(Iref[q] == Inew[q])   # identical rows need no analysis
    )

In [110]:
# Compare the downloaded official GT with the GT I computed
new_basedir = "/checkpoint/matthijs/billion-scale-ann-benchmarks/GT_1B/"

for dsname in new_gt:
    ds = datasets.DATASETS[dsname]()
    print(dsname, ds)
    if ds.search_type() != "knn":
        # range search: only load both files so that the shapes can be shown.
        # It does not make much sense to verify the contents, they are
        # computed simultaneously.
        nres_ref, Iref, Dref = datasets.range_result_read(f"/tmp/new_GT/{dsname}")
        nres_new, Inew, Dnew = datasets.range_result_read(f"{new_basedir}/{dsname}")
    else:
        Iref, Dref = datasets.knn_result_read(f"/tmp/new_GT/{dsname}")
        Inew, Dnew = datasets.knn_result_read(f"{new_basedir}/{dsname}")
        # raw_ndiff: positions where the two id tables disagree
        # ndiff: disagreements left after tolerating ties and tiny distance
        # variations
        raw_ndiff = (Iref != Inew).sum()
        ndiff = compare_knn_res(Dref, Iref, Dnew, Inew)
        print(f"raw_diff={100 * raw_ndiff/ Iref.size} % diff={100 * ndiff/ Iref.size} %")

    print(Iref.shape, Inew.shape)
    
    
    

bigann-1B Dataset BigANNDataset in dimension 128, with distance euclidean, search_type knn, size: Q 10000 B 1000000000
raw_diff=0.9899 % diff=0.0 %
(10000, 100) (10000, 100)
ssnpp-1B Dataset SSNPPDataset in dimension 256, with distance euclidean, search_type range, size: Q 100000 B 1000000000
(7706752,) (7706752,)
msturing-1B Dataset MSTuringANNS in dimension 100, with distance euclidean, search_type knn, size: Q 100000 B 1000000000
raw_diff=0.0195 % diff=0.00024 %
(100000, 100) (100000, 100)
msspacev-1B Dataset MSSPACEV1B in dimension 100, with distance euclidean, search_type knn, size: Q 29316 B 1000000000
raw_diff=24.181163869559285 % diff=0.0 %
(29316, 100) (29316, 100)
deep-1B Dataset Deep1BDataset in dimension 96, with distance euclidean, search_type knn, size: Q 10000 B 1000000000
raw_diff=0.1864 % diff=0.0002 %
(10000, 100) (10000, 100)
text2image-1B Dataset Text2Image1B in dimension 200, with distance ip, search_type knn, size: Q 100000 B 1000000000
raw_diff=0.04773 % diff=0.0

# Check subsets -- range

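Make sure the 10M and 100M range-search results are a subset of the 1B results: for every query, the 1B result ids below 10M (resp. 100M) must be exactly the ids found in the 10M (resp. 100M) ground truth.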

In [91]:
# Dataset whose range-search ground truth is loaded and checked in the cells below.
dsname = "ssnpp-1B"

In [92]:
# Directory holding the 1B ground truth that was computed locally.
new_basedir = "/checkpoint/matthijs/billion-scale-ann-benchmarks/GT_1B/"

# Load the downloaded official file (ref) and the locally computed one (new).
# nres_* holds the number of results per query and I_* the flat array of
# result ids (this is how the subset check below slices them);
# D_* presumably holds the matching distances -- TODO confirm.
nres_ref, Iref, Dref = datasets.range_result_read(f"/tmp/new_GT/{dsname}")
nres_new, Inew, Dnew = datasets.range_result_read(f"{new_basedir}/{dsname}")

In [98]:
# For each subset size, the 1B range results restricted to ids below nb must
# be exactly the results stored in the subset GT, query by query.
for nb, ss in [(10 ** 7, "10M"), (10 ** 8, "100M")]:
    ds_sub = dsname.replace("1B", ss)
    nres_sub, Isub, Dsub = datasets.range_result_read(f"/checkpoint/matthijs/billion-scale-ann-benchmarks/GT_{ss}/{ds_sub}")

    nq = len(nres_ref)
    assert len(nres_sub) == nq

    # running start offsets of the current query in the two flat id arrays
    ref_pos = sub_pos = 0
    for q in range(nq):
        ref_end = ref_pos + nres_ref[q]
        sub_end = sub_pos + nres_sub[q]

        # keep only the 1B results that fall inside the subset
        ref_ids = Iref[ref_pos:ref_end]
        ref_ids_in_subset = ref_ids[ref_ids < nb]
        assert set(ref_ids_in_subset) == set(Isub[sub_pos:sub_end])

        ref_pos, sub_pos = ref_end, sub_end
    

# Check subsets -- knn

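Make sure the 10M and 100M results are a subset of the 1B results in the k-NN sense: for every query, the 1B neighbors with ids below 10M (resp. 100M) must match the first neighbors of the 10M (resp. 100M) ground truth, up to the ordering of ties and small distance differences.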

In [118]:
# Common prefix of the GT directories; the subset size is appended (GT_1B, GT_10M, GT_100M)
basedir = "/checkpoint/matthijs/billion-scale-ann-benchmarks/GT"

for dsname in new_gt:
    if dsname == "ssnpp-1B":
        # range-search dataset, handled by the range check
        continue
    print(dsname)
    I1B, D1B = datasets.knn_result_read(f"{basedir}_1B/{dsname}")
    nq = len(I1B)
    eps = D1B.max() * 1e-5

    for nb, ss in [(10 ** 7, "10M"), (10 ** 8, "100M")]:
        ds_sub = dsname.replace("1B", ss)
        Iss, Dss = datasets.knn_result_read(f"{basedir}_{ss}/{ds_sub}")
        ndiff = 0    # mismatches found for this subset size
        ltot = 0     # number of 1B results that could be verified

        for q in range(nq):
            # the 1B neighbors that also belong to the subset ...
            in_subset = I1B[q] < nb
            ref_I = I1B[q][in_subset]
            ref_D = D1B[q][in_subset]

            # ... are compared with the same number of leading subset neighbors
            n_kept = len(ref_I)
            ndiff += count_diff_1_result(ref_D, ref_I, Dss[q, :n_kept], Iss[q, :n_kept], eps)
            ltot += n_kept

        print(f"{ss} diff={100 * ndiff / ltot} % (verif on {ltot} / {I1B.size} = 1/{I1B.size/ltot:.1f})")
 

bigann-1B
10M diff=0.0 % (verif on 10175 / 1000000 = 1/98.3)
100M diff=0.0 % (verif on 99455 / 1000000 = 1/10.1)
msturing-1B
10M diff=0.0 % (verif on 99896 / 10000000 = 1/100.1)
100M diff=0.0 % (verif on 1000758 / 10000000 = 1/10.0)
msspacev-1B
10M diff=0.0 % (verif on 30801 / 2931600 = 1/95.2)
100M diff=0.0 % (verif on 293540 / 2931600 = 1/10.0)
deep-1B
10M diff=0.0 % (verif on 10285 / 1000000 = 1/97.2)
100M diff=0.0 % (verif on 100663 / 1000000 = 1/9.9)
text2image-1B
10M diff=0.0 % (verif on 99944 / 10000000 = 1/100.1)
100M diff=0.0 % (verif on 999862 / 10000000 = 1/10.0)
