In [1]:
# Relative path to competition data export file
CSV = "data_export_m2-medium.csv"

# Relative path to competition directory
COMPETITION_DIR = ".." 

# Competition dataset names
DATASETS = []

# Competition recall/ap threshold
RECALL_AP_THRESHOLD = 0.9

# Markdown file
MARKDOWN = "../latitude-m2-medium.md"

In [2]:
import pandas as pd
import os
import glob
import math
import re
import helper
import importlib
from IPython.core.display import HTML

In [3]:
# read CSV
df = pd.read_csv( CSV )


In [4]:
# get track/dataset groups
grps = df.groupby(["track","dataset"])

In [5]:
dfs = [] # accumulate individual track dataframes

# iterate groups
for name, group in grps:

    # extract groupby track and dataset
    track = name[0]
    dataset = name[1]
   
    # produce track ranking
    ranking_df = group[ group["recall/ap"]>=RECALL_AP_THRESHOLD ]\
        .groupby(["algorithm"]) \
        .max("qps") \
        .sort_values("qps", ascending=False) \
        [["qps","recall/ap"]] 
    
    # set 'status' column as 'qualified' for all surviving rows
    ranking_df['status'] = 'ok'

    # return the algorithm index as a column
    ranking_df.reset_index(inplace=True)
    #print(ranking_df)
    #print(list(ranking_df.index))

    # retrieve all participating track algorithm names via track algo subdirectory 
    track_dir = os.path.join( COMPETITION_DIR, "%s/*/Dockerfile" % track )
    algos_participating = [os.path.basename(os.path.dirname(p)) for p in glob.glob( track_dir ) ]
  
    # compute difference of algo lists of track subdirs and algos in results - these did not qualify
    algos_did_not_qualify = list( set(algos_participating) - set(list(ranking_df["algorithm"])) )

    # append not-qualified algos to dataframe
    for algo in algos_did_not_qualify:
        ranking_df = pd.concat([ranking_df, pd.DataFrame([{'algorithm':algo,'status':'error'}])], ignore_index = True)

    # create a rank numeric column
    ranking_df['rank'] = ranking_df.apply( lambda row: int(row.name)+1, axis=1)
    ranking_df['rank'] = ranking_df['rank'].astype('Int64')

    # return the track and dataset as column
    ranking_df['track'] = track
    ranking_df['dataset'] = dataset
    #print(ranking_df)

    dfs.append(ranking_df)


In [6]:
# combine all track dataframes
master_df = pd.concat(dfs)
# master_df

In [7]:
# pivot via 'rank' so that track rankings are parallel across columns
pivot_df =  master_df.pivot_table(index=['rank'], 
                                  columns=['track'], 
                                  values=['algorithm','qps','status'],
                                  aggfunc='first')
#pivot_df


In [8]:
# group specific track columns
reorder_df = pivot_df[ [ ('algorithm','filter'), ( 'qps','filter' ), ( 'status','filter' ), \
            ('algorithm','sparse'), ( 'qps','sparse' ), ( 'status','sparse' ),
            ('algorithm','ood'),    ( 'qps','ood' ), ( 'status','ood' ) ] ]
#reorder_df

In [9]:
# swap hierarchical index for columns
swap_df = reorder_df.swaplevel(0,1, axis=1)
#swap_df

In [10]:
# add per track 'rank' column ensuring no rank to algorithms that did not qualify
for track in master_df['track'].unique():
    qps = list( swap_df[ (track,'qps') ] )
    track_ranking = map(lambda el: float('nan') if math.isnan(qps[el-1]) else el,
                  range(1,len(qps)+1) )
    #print(track_ranking, list(ranking))
    col_to_find = (track, 'algorithm')
    col_idx = list(swap_df.columns).index(col_to_find)

    swap_df.insert(col_idx, (track,'rank'), list(track_ranking) )
    swap_df[(track,'rank')] = swap_df[(track,'rank')].astype('Int64')
#swap_df

In [15]:
# style the dataframe (getting ready for html export)

display_df_styled = swap_df.style
display_df_styled

track,filter,filter,filter,filter,sparse,sparse,sparse,sparse,ood,ood,ood,ood
Unnamed: 0_level_1,rank,algorithm,qps,status,rank,algorithm,qps,status,rank,algorithm,qps,status
rank,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
1,1.0,pinecone,146721.2418,ok,1.0,pyanns,26858.518647,ok,1.0,pinecone-ood,76865.422871,ok
2,2.0,puck,62257.694628,ok,2.0,pinecone_smips,12003.182367,ok,2.0,pyanns,55504.351628,ok
3,3.0,parlayivf,55033.197938,ok,3.0,shnsw,8247.277083,ok,3.0,sustech-ood,28458.262883,ok
4,4.0,wm_filter,20874.856888,ok,4.0,nle,2945.626091,ok,4.0,mysteryann-dif,27946.097391,ok
5,5.0,pyanns,8988.021683,ok,5.0,cufe,84.975738,ok,5.0,mysteryann,26560.866584,ok
6,6.0,faissplus,8493.264038,ok,6.0,linscan,60.756008,ok,6.0,vamana,19965.516521,ok
7,7.0,faiss,7327.855358,ok,,zilliz,,error,7.0,puck,18960.224161,ok
8,8.0,cufe,6318.461801,ok,,sustech-whu,,error,8.0,ngt,11921.398938,ok
9,,zilliz,,error,,spmat,,error,9.0,epsearch,7663.219028,ok
10,,dhq,,error,,,,,10.0,diskann,6406.820738,ok


In [19]:
# convert styled table to html string

importlib.reload(helper) # need to reload if editing simultaneuosly

# hide row index and export to html
html = display_df_styled.hide(axis=0).to_html()

# remove style tag (markdown doesn't use it anyway)
new_html = helper.remove_style_prefix(html)

# replace all "nan"s with empty string
new_html = new_html.replace("nan","")

# insert links for specific use cases (ie, errors)
new_html = helper.replace_table_with_links(new_html, swap_df)

# display in-line
HTML(new_html)

filter pinecone
sparse pyanns
ood pinecone-ood
filter puck
sparse pinecone_smips
ood pyanns
filter parlayivf
sparse shnsw
ood sustech-ood
filter wm_filter
sparse nle
ood mysteryann-dif
filter pyanns
sparse cufe
ood mysteryann
filter faissplus
sparse linscan
ood vamana
filter faiss
sparse zilliz
ood puck
filter cufe
sparse sustech-whu
ood ngt
filter zilliz
sparse spmat
ood epsearch
filter dhq
sparse 
ood diskann
filter fdufilterdiskann
sparse 
ood cufe
filter hwtl_sdu_anns_filter
sparse 
ood zilliz
filter 
sparse 
ood puck-fizz
filter 
sparse 
ood scann


filter,filter,filter,filter,sparse,sparse,sparse,sparse,ood,ood,ood,ood
rank,algorithm,qps,status,rank,algorithm,qps,status,rank,algorithm,qps,status
1.0,pinecone,146721.2418,ok,1.0,pyanns,26858.518647,ok,1.0,pinecone-ood,76865.422871,ok
2.0,puck,62257.694628,ok,2.0,pinecone_smips,12003.182367,ok,2.0,pyanns,55504.351628,ok
3.0,parlayivf,55033.197938,ok,3.0,shnsw,8247.277083,ok,3.0,sustech-ood,28458.262883,ok
4.0,wm_filter,20874.856888,ok,4.0,nle,2945.626091,ok,4.0,mysteryann-dif,27946.097391,ok
5.0,pyanns,8988.021683,ok,5.0,cufe,84.975738,ok,5.0,mysteryann,26560.866584,ok
6.0,faissplus,8493.264038,ok,6.0,linscan,60.756008,ok,6.0,vamana,19965.516521,ok
7.0,faiss,7327.855358,ok,,zilliz,,error,7.0,puck,18960.224161,ok
8.0,cufe,6318.461801,ok,,sustech-whu,,error,8.0,ngt,11921.398938,ok
,zilliz,,error,,spmat,,error,9.0,epsearch,7663.219028,ok
,dhq,,error,,,,,10.0,diskann,6406.820738,ok


In [None]:
# insert html into the markdown

with open(MARKDOWN,'r') as md:
    contents = md.read()
    
# locate insertion point via regex
regexpr = "(## Results\n)"
matches = re.search(regexpr, contents, re.M)

# insert table html
new_contents = contents[0:matches.span(0)[1]] + new_html +" \n" + contents[matches.span(0)[1]:]
#print(new_contents[0:1000])

# update the markdown
with open(MARKDOWN,'w') as md:
    contents = md.write(new_contents)