In [138]:
import polars as pl

# text embedding models
from sentence_transformers import SentenceTransformer, util

# DistanceMetric shows similar user query is to each video
from sklearn.metrics import DistanceMetric
import numpy as np

import matplotlib.pyplot as plt

In [139]:
#Load the DATA
#df0 = pl.read_parquet('data/video-transcripts.parquet')
df = pl.read_csv('data/video-transcripts.csv')
df_eval = pl.read_csv('data/eval-raw-cleaned.csv')

df = df.with_columns(pl.col('datetime').cast(pl.Datetime))
df.head()


video_id,datetime,title,transcript
str,datetime[μs],str,str
"""wwSzpaTHyS8""",2024-01-30 15:00:03,"""Did The Future Already Happen?…","""Do your past, present and futu…"
"""dFCbJmgeHmA""",2021-06-15 14:09:03,"""The Day the Dinosaurs Died – M…","""one of the greatest Illusions …"
"""1AElONvi9WQ""",2020-03-01 13:30:00,"""Why Blue Whales Don't Get Canc…","""Cancer is a creepy and mysteri…"
"""Hug0rfFC_L8""",2014-12-18 14:02:27,"""The Ultimate Conspiracy Debunk…","""The Internet is like a breedin…"
"""lXfEK8G8CUI""",2021-08-10 13:59:41,"""How The Immune System ACTUALLY…","""the human immune system is the…"


In [140]:
# parameters

column_to_embed_list = ["title", "transcript"]
model_name_list = ['all-MiniLM-L6-v2', "multi-qa-distilbert-cos-v1","multi-qa-mpnet-base-dot-v1"]

In [141]:
# generate embeddings for 6 possible combinations of video-transcript data

text_embedding_dict = {}

for model_name in model_name_list:
    
    model = SentenceTransformer(model_name)
    #print(model)  2 models are normalized with euclidian normalization so that all vectors are equal in "strength" (length = 1)

    for column_name in column_to_embed_list: # either title or transcript
        key_name = model_name + "_" + column_name 
        print(key_name)

        # generate embeddings for text under column_name (both transcript or title)
        %time embedding_arr = model.encode(df[column_name].to_list())
        print('')

        text_embedding_dict[key_name] = embedding_arr # type: ignore 

all-MiniLM-L6-v2_title
CPU times: user 65.6 ms, sys: 496 ms, total: 562 ms
Wall time: 3.07 s

all-MiniLM-L6-v2_transcript
CPU times: user 1.36 s, sys: 380 ms, total: 1.74 s
Wall time: 1.04 s

multi-qa-distilbert-cos-v1_title
CPU times: user 55.3 ms, sys: 40.3 ms, total: 95.5 ms
Wall time: 192 ms

multi-qa-distilbert-cos-v1_transcript
CPU times: user 1.17 s, sys: 344 ms, total: 1.52 s
Wall time: 3.93 s

multi-qa-mpnet-base-dot-v1_title
CPU times: user 119 ms, sys: 97.6 ms, total: 217 ms
Wall time: 391 ms

multi-qa-mpnet-base-dot-v1_transcript
CPU times: user 1.29 s, sys: 347 ms, total: 1.63 s
Wall time: 8.95 s



In [142]:
embedding_arr.shape

(217, 768)

In [143]:
text_embedding_dict
"""
Each video ends up with the same number of dimensions (columns) in its embedding vector, 
regardless of how long the title or transcript is.

1. The string is tokenized into smaller pieces (words, subwords).

2. These tokens go through a Transformer model, which captures the meaning and context.

3. The model then reduces all this information into a single fixed-size vector.

+ Easy comparison between any two texts (e.g. title vs. query, transcript vs. query).

+ Efficient semantic search, clustering, classification, etc.

"""

'\nEach video ends up with the same number of dimensions (columns) in its embedding vector, \nregardless of how long the title or transcript is.\n\n1. The string is tokenized into smaller pieces (words, subwords).\n\n2. These tokens go through a Transformer model, which captures the meaning and context.\n\n3. The model then reduces all this information into a single fixed-size vector.\n\n+ Easy comparison between any two texts (e.g. title vs. query, transcript vs. query).\n\n+ Efficient semantic search, clustering, classification, etc.\n\n'

In [144]:
# make the embedding for evaulation data
query_embedding_dict = {}

for model_name in model_name_list:

    model= SentenceTransformer(model_name)
    print(model_name)

    # embed query text
    %time embedding_arr = model.encode(df_eval['query'].to_list())
    print('')

    # store it into embedding dict
    query_embedding_dict[model_name] = embedding_arr # type: ignore

all-MiniLM-L6-v2
CPU times: user 38.5 ms, sys: 26.5 ms, total: 65 ms
Wall time: 76.5 ms

multi-qa-distilbert-cos-v1
CPU times: user 45.7 ms, sys: 23.1 ms, total: 68.8 ms
Wall time: 98.9 ms

multi-qa-mpnet-base-dot-v1
CPU times: user 82.1 ms, sys: 174 ms, total: 256 ms
Wall time: 328 ms



In [145]:
# 1 column for all rows
print(len(embedding_arr[:,1]))


# 1 row for all columns
print(embedding_arr[0])

embedding_arr.shape


137
[ 1.78847209e-01 -9.87320393e-03 -4.14049625e-01 -6.45881593e-02
  1.45963863e-01 -3.64585370e-02  3.81362587e-02  1.10301912e-01
  3.24565060e-02  1.68996394e-01 -1.05586849e-01  1.42462164e-01
 -3.25847745e-01  8.29601660e-04 -1.60027668e-01 -2.09816173e-03
  2.09358454e-01 -7.29408860e-02  3.50981988e-02 -2.18159929e-01
 -2.83594251e-01 -2.47421369e-01 -1.37302876e-01  5.51879331e-02
 -2.94683456e-01 -4.87244688e-03  2.37264428e-02  3.75497490e-01
 -3.50297429e-03 -1.12414649e-02 -5.26690483e-02 -1.51533782e-02
  1.83607131e-01  2.49425501e-01 -9.00616578e-05 -2.04561532e-01
 -1.59090176e-01  2.05659956e-01 -2.43804865e-02  2.54055858e-02
 -2.06916064e-01  7.60148168e-02 -1.55353382e-01  1.04587175e-01
 -1.79002080e-02  7.10462853e-02  4.38751616e-02 -7.00071901e-02
  3.68219435e-01 -1.07797034e-01  4.58051741e-01 -1.46145061e-01
 -8.49771798e-02 -6.38186261e-02 -1.85329527e-01  2.31639460e-01
  3.39643285e-02 -1.29878536e-01  5.63105419e-02  2.43548770e-02
  2.69304872e-01  1.3

(137, 768)

In [146]:
print(query_embedding_dict['all-MiniLM-L6-v2'])

[[ 0.01795088  0.05634568 -0.04357697 ... -0.09881878  0.03439683
  -0.02163488]
 [-0.01571924  0.08833206 -0.03494741 ... -0.07740968 -0.05103115
  -0.007313  ]
 [ 0.07857412  0.01176622  0.05849238 ...  0.02323207 -0.0396411
   0.01533789]
 ...
 [-0.01782705 -0.00337126  0.03826381 ...  0.02836143  0.08206018
   0.01634091]
 [-0.01833171 -0.04946456  0.00775069 ... -0.02741444 -0.04327692
   0.10167596]
 [ 0.02797536 -0.035756    0.03497365 ... -0.06601121 -0.01123385
  -0.02856237]]


In [147]:
def returnVideoID_index(df: pl.dataframe.frame.DataFrame, df_eval: pl.dataframe.frame.DataFrame,query_n:int) -> int:
    # to get the index in the original dataset where one of the 137 eval videos is 
    return [i for i in range(len(df)) if df['video_id'][i] == df_eval['video_id'][query_n]][0]

In [148]:
def evalTrueRankings(dist_arr_isorted:np.ndarray, df:pl.dataframe.frame.DataFrame, df_eval: pl.dataframe.frame.DataFrame) -> np.ndarray:
    # intialize array to store rankings of "correct" search result
    # shape of 1 row and number of queries columns - 137
    true_rank_arr = np.empty((1, dist_arr_isorted.shape[1]))

    for query_n in range(dist_arr_isorted.shape[1]): #0-137

        # we get the index of the eval video in original data set
        video_id_idx = returnVideoID_index(df, df_eval, query_n)
        # take the entire column query_n (0-137 columns), and look at each row, 
        # until you get the original video that matches the video_id_idx
        # this index tells us its ranking
        # because rows are rankings and column is one query - so we are looking for video_id_idx in that column
        # good algorithm will rank it in top 3 as that is the ground truth value for query
        # meaning it is a query with same video_id as the video from df (og dataframe)

        true_rank = np.argwhere(dist_arr_isorted[:, query_n] == video_id_idx)[0][0]
        #dist_arr_isorted[:, query_n] == video_id_idx creates a boolean array
        # [False, False, False, ..., True, ..., False]
        # np.argwhere returns [[True_idx]], so we need [0][0] to get that idx

        # 1 row with 137 columns - true_rank_arr = [[0, 4, 12, 1, 5, ...]]
        # true_rank_arr[0, 0] = 0 → for query 0, the correct video was ranked 1st

        # true_rank_arr[0, 1] = 4 → for query 1, the correct video was ranked 5th

        # true_rank_arr[0, 2] = 12 → for query 2, correct video ranked 13th

        # true_rank_arr[0, 3] = 1 → 2nd best match

        # true_rank_arr[0, 4] = 5 → 6th best match

        true_rank_arr[0,query_n] = true_rank


    return true_rank_arr




In [149]:
dist_name_list = ['euclidean', 'manhattan', 'chebyshev']
sim_name_list = ['cos_sim', 'dot_score']

In [150]:
type(query_embedding_dict[model_name])

numpy.ndarray

In [159]:
# evaluate all possible combinations of model, columns to embed and distance metrics

eval_results = []
# loop through all the models
for model_name in model_name_list:
    # since models are keys, we get the values of embeddings for queries (Eval data)


    # we know what value represents what index because model.encode()
    # returns the embeddings in the same order 0-217
    query_embedding = query_embedding_dict[model_name]

    # loop through text columns 
    for column_name in column_to_embed_list:
        
        # generate column embedding for each of 6 versions
        embedding_arr = text_embedding_dict[model_name + "_" + column_name]

        # comparing the distance between query and video text(transcript or title)
        for dist_name in dist_name_list:
            dist = DistanceMetric.get_metric(dist_name)
            #compare video text (transcript or title embeddings) to eval data (queries)
            dist_arr = dist.pairwise(embedding_arr, query_embedding)

            # sort the indicies of distance array
            dist_arr_isorted = np.argsort(dist_arr,axis=0)

            # define lable for search method
            method_name = "_".join([model_name, column_name, dist_name])

            # evalute the ranking based on the ground truth (queries)
            true_rank_arr = evalTrueRankings(dist_arr_isorted, df, df_eval)
            

            eval_list = [method_name] + true_rank_arr.tolist()[0]
            eval_results.append(eval_list)


        
# print(true_rank_arr)
      
# distance between 0th video and all queries
# print(dist_arr[0])

# distance between 0th video and 0th query
# print(dist_arr[0][0])

# print(dist_arr_isorted[:,0]) # grabs all the rows with their's first column
# print(dist_arr_isorted[0]) # grabs the entire row with all its columns

# very important - this is rank vs query matrix, so in entire dist_arr_isorted we get 217 rows and 137 columns
# each first row value is video index for that query for given rank (rank being the row)
# print(dist_arr_isorted)
# print(dist_arr_isorted[0])
# print(dist_arr_isorted[1])
# print(dist_arr_isorted[2])


        for sim_name in sim_name_list:
            # apply similarity score from sbert
            # - of similarty score, bcs similarty score will be large unliike for distance score
            # so reverse the order
            cmd = "dist_arr = -util." + sim_name + "(embedding_arr, query_embedding)"
            exec(cmd)
    
            # sort indexes of distance array (notice minus sign in front of cosine similarity)
            dist_arr_isorted = np.argsort(dist_arr, axis=0)
    
            # define label for search method
            method_name = "_".join([model_name, column_name, sim_name.replace("_","-")])
    
            # evaluate the ranking of the ground truth
            true_rank_arr = evalTrueRankings(dist_arr_isorted, df, df_eval)
    
            # store results
            eval_list = [method_name] + true_rank_arr.tolist()[0]
            eval_results.append(eval_list)

In [160]:
# eval result contains all the rankings for each model, where each sublist is eval list containg ranking of ground truth for each
# query, we only care about what we thought ranks first vs what really ranks first - bcs dist_arr_isorted is ranked by indecies so we know
# how much we are off


###  Explanation of `dist_arr_isorted` Matrix (Shape: [217, 137])

After computing the pairwise distance between all video embeddings  
(from the full dataset, shape [217, 768]) and all query embeddings  
(from the eval dataset, shape [137, 768]), we get a distance matrix:

    dist_arr.shape = (217 videos, 137 queries)

Each value `dist_arr[i, j]` represents the distance between video `i`  
and query `j` — lower means more similar.

We then sort these distances along axis=0 (i.e., for each query)  
to get the ranked list of videos for every query:

    dist_arr_isorted = np.argsort(dist_arr, axis=0)

The result:  
- Shape: [217, 137]  
- Each COLUMN `j` corresponds to a single query `j`  
- Each column contains 217 video indices from the dataset  
- These indices are sorted in order of increasing distance  
  → i.e., the most relevant (closest) video is at the top of each column

So: `dist_arr_isorted[:, j]` gives you the ranked list of video indices  
for query `j` — from best match to worst.

This sorted array is then used to determine how well each method performed  
by checking where the correct video (the one that query `j` originally came from)  
appears in this ranking — done using `evalTrueRankings()`.

And now we want to find where that query is ranked in original dataset.  
Thus, we can go back to each column and search for that index  
to get how well it ranked the ground truth (itself).


####

The `dist_arr_isorted` matrix has shape `(num_videos, num_eval_queries)`, e.g. `(217, 137)`.  
Each **column** corresponds to one evaluation query from `df_eval`.  
Each **row** corresponds to a **rank position** (`0 = best match`, `1 = second-best`, ..., `216 = worst`).

---

- `dist_arr_isorted[r, c]` → index of the video (from `df`) ranked at position `r` for eval query `c`.

So:

- `dist_arr_isorted[0, c]` → best match (most similar video) for eval query `c`  
- `dist_arr_isorted[1, c]` → 2nd best match for eval query `c`  
- `dist_arr_isorted[:, c]` → full ranked list of video indices for query `c`  

This allows us to **evaluate the model's performance** by checking where the **ground-truth video** (from `df_eval`) appears in this ranked list.

---

#### Visualization:

| Rank | `query_0` | `query_1` | `query_2` |
|------|-----------|-----------|-----------|
| 0    | 12        | 5         | 9         |
| 1    | 34        | 12        | 3         |
| 2    | 98        | 34        | 21        |
| ...  | ...       | ...       | ...       |

Each **cell** holds a video index from the original dataset (`df`) indicating its position in the similarity ranking.



In [161]:
# Embed both title and transcripts and combine results
for model_name in model_name_list:
    # generate embeddings (extracting)
    embedding_arr1 = text_embedding_dict[model_name + "_transcript"]
    embedding_arr2 = text_embedding_dict[model_name + "_title"]
    query_embedding = query_embedding_dict[model_name]

    # loop through distance methods
    for dist_name in dist_name_list:
        # compute distance between text and query
        dist = DistanceMetric.get_metric(dist_name)
        dist_arr = dist.pairwise(embedding_arr1,query_embedding) + dist.pairwise(embedding_arr2,query_embedding)

        dist_arr_isorted = np.argsort(dist_arr, axis=0)

        # define label for search method
        method_name = '_'.join([model_name,'title-transcript',dist_name])

        # evaluate the true ranking of ground truth
        true_rank_arr = evalTrueRankings(dist_arr_isorted, df, df_eval)

        eval_list = [method_name] + true_rank_arr.tolist()[0]
        eval_results.append(eval_list)

    for sim_name in sim_name_list:

        cmd = 'dist_arr = -util.' + sim_name + '(embedding_arr1, query_embedding) - util.' + sim_name + "(embedding_arr2, query_embedding)"
        exec(cmd)

        dist_arr_isorted = np.argsort(dist_arr,axis=0)

        method_name = "_".join([model_name,"title-transcript",sim_name.replace('_',"-")])

        true_rank_arr = evalTrueRankings(dist_arr_isorted, df,df_eval)

        eval_list = [method_name] + true_rank_arr.tolist()[0]
        eval_results.append(eval_list)

In [162]:
len(eval_results)

# do this # Embed both title and transcripts and combine results

45

In [163]:
schema_dict = {'method_name': str }
for i in range(len(eval_results[0])-1):
    schema_dict['rank_query-'+str(i)] = float
print(schema_dict)

df_results = pl.DataFrame(eval_results, schema=schema_dict)
df_results.head()

{'method_name': <class 'str'>, 'rank_query-0': <class 'float'>, 'rank_query-1': <class 'float'>, 'rank_query-2': <class 'float'>, 'rank_query-3': <class 'float'>, 'rank_query-4': <class 'float'>, 'rank_query-5': <class 'float'>, 'rank_query-6': <class 'float'>, 'rank_query-7': <class 'float'>, 'rank_query-8': <class 'float'>, 'rank_query-9': <class 'float'>, 'rank_query-10': <class 'float'>, 'rank_query-11': <class 'float'>, 'rank_query-12': <class 'float'>, 'rank_query-13': <class 'float'>, 'rank_query-14': <class 'float'>, 'rank_query-15': <class 'float'>, 'rank_query-16': <class 'float'>, 'rank_query-17': <class 'float'>, 'rank_query-18': <class 'float'>, 'rank_query-19': <class 'float'>, 'rank_query-20': <class 'float'>, 'rank_query-21': <class 'float'>, 'rank_query-22': <class 'float'>, 'rank_query-23': <class 'float'>, 'rank_query-24': <class 'float'>, 'rank_query-25': <class 'float'>, 'rank_query-26': <class 'float'>, 'rank_query-27': <class 'float'>, 'rank_query-28': <class 'fl

  return dispatch(args[0].__class__)(*args, **kw)


method_name,rank_query-0,rank_query-1,rank_query-2,rank_query-3,rank_query-4,rank_query-5,rank_query-6,rank_query-7,rank_query-8,rank_query-9,rank_query-10,rank_query-11,rank_query-12,rank_query-13,rank_query-14,rank_query-15,rank_query-16,rank_query-17,rank_query-18,rank_query-19,rank_query-20,rank_query-21,rank_query-22,rank_query-23,rank_query-24,rank_query-25,rank_query-26,rank_query-27,rank_query-28,rank_query-29,rank_query-30,rank_query-31,rank_query-32,rank_query-33,rank_query-34,rank_query-35,…,rank_query-100,rank_query-101,rank_query-102,rank_query-103,rank_query-104,rank_query-105,rank_query-106,rank_query-107,rank_query-108,rank_query-109,rank_query-110,rank_query-111,rank_query-112,rank_query-113,rank_query-114,rank_query-115,rank_query-116,rank_query-117,rank_query-118,rank_query-119,rank_query-120,rank_query-121,rank_query-122,rank_query-123,rank_query-124,rank_query-125,rank_query-126,rank_query-127,rank_query-128,rank_query-129,rank_query-130,rank_query-131,rank_query-132,rank_query-133,rank_query-134,rank_query-135,rank_query-136
str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,…,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""all-MiniLM-L6-v2_title_euclide…",0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,…,126.0,0.0,0.0,0.0,0.0,0.0,2.0,1.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"""all-MiniLM-L6-v2_title_manhatt…",0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,…,131.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"""all-MiniLM-L6-v2_title_chebysh…",0.0,27.0,0.0,0.0,0.0,0.0,12.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,38.0,0.0,0.0,0.0,0.0,0.0,137.0,…,76.0,0.0,0.0,0.0,0.0,103.0,9.0,6.0,0.0,8.0,0.0,0.0,0.0,1.0,0.0,3.0,0.0,51.0,0.0,3.0,9.0,0.0,0.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,5.0
"""all-MiniLM-L6-v2_title_cos-sim""",0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,…,126.0,0.0,0.0,0.0,0.0,0.0,2.0,1.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"""all-MiniLM-L6-v2_title_dot-sco…",0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,…,126.0,0.0,0.0,0.0,0.0,0.0,2.0,1.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [164]:
# compute mean rankings of ground truth search result
df_results = df_results.with_columns(new_col=pl.mean_horizontal(df_results.columns[1:])).rename({"new_col": "rank_query-mean"})

In [165]:
# compute number of ground truth results which appear in top 3
for i in [1,3]:
    df_results = df_results.with_columns(new_col=pl.sum_horizontal(df_results[:,1:-1]<i)).rename({"new_col": "num_in_top-"+str(i)})

In [166]:
df_summary = df_results[['method_name', "rank_query-mean", "num_in_top-1", "num_in_top-3"]]

In [167]:
df_summary.sort('rank_query-mean').head()

method_name,rank_query-mean,num_in_top-1,num_in_top-3
str,f64,u32,u32
"""multi-qa-mpnet-base-dot-v1_tit…",3.80292,117,127
"""multi-qa-mpnet-base-dot-v1_tit…",4.072993,114,126
"""multi-qa-distilbert-cos-v1_tit…",4.372263,114,126
"""multi-qa-distilbert-cos-v1_tit…",4.437956,115,126
"""multi-qa-distilbert-cos-v1_tit…",4.459854,114,124


In [168]:
df_summary.sort('rank_query-mean').head()[0,0]

'multi-qa-mpnet-base-dot-v1_title-transcript_dot-score'

In [169]:
print(df_summary.sort("num_in_top-1", descending=True).head())

shape: (5, 4)
┌─────────────────────────────────┬─────────────────┬──────────────┬──────────────┐
│ method_name                     ┆ rank_query-mean ┆ num_in_top-1 ┆ num_in_top-3 │
│ ---                             ┆ ---             ┆ ---          ┆ ---          │
│ str                             ┆ f64             ┆ u32          ┆ u32          │
╞═════════════════════════════════╪═════════════════╪══════════════╪══════════════╡
│ multi-qa-distilbert-cos-v1_tit… ┆ 4.613139        ┆ 117          ┆ 123          │
│ all-MiniLM-L6-v2_title-transcr… ┆ 4.481752        ┆ 117          ┆ 124          │
│ multi-qa-mpnet-base-dot-v1_tit… ┆ 3.80292         ┆ 117          ┆ 127          │
│ all-MiniLM-L6-v2_title_manhatt… ┆ 5.291971        ┆ 116          ┆ 125          │
│ multi-qa-mpnet-base-dot-v1_tit… ┆ 4.80292         ┆ 116          ┆ 123          │
└─────────────────────────────────┴─────────────────┴──────────────┴──────────────┘


In [170]:
df_summary.sort("num_in_top-1", descending=True).head()[0,0]

'multi-qa-distilbert-cos-v1_title_manhattan'

In [171]:
print(df_summary.sort("num_in_top-3", descending=True).head())

shape: (5, 4)
┌─────────────────────────────────┬─────────────────┬──────────────┬──────────────┐
│ method_name                     ┆ rank_query-mean ┆ num_in_top-1 ┆ num_in_top-3 │
│ ---                             ┆ ---             ┆ ---          ┆ ---          │
│ str                             ┆ f64             ┆ u32          ┆ u32          │
╞═════════════════════════════════╪═════════════════╪══════════════╪══════════════╡
│ multi-qa-mpnet-base-dot-v1_tit… ┆ 5.072993        ┆ 114          ┆ 127          │
│ multi-qa-mpnet-base-dot-v1_tit… ┆ 3.80292         ┆ 117          ┆ 127          │
│ all-MiniLM-L6-v2_title_euclide… ┆ 5.124088        ┆ 115          ┆ 126          │
│ all-MiniLM-L6-v2_title_cos-sim  ┆ 5.124088        ┆ 115          ┆ 126          │
│ all-MiniLM-L6-v2_title_dot-sco… ┆ 5.124088        ┆ 115          ┆ 126          │
└─────────────────────────────────┴─────────────────┴──────────────┴──────────────┘


In [172]:
df_summary.sort("num_in_top-3", descending=True).head()[0,0]

'multi-qa-mpnet-base-dot-v1_title-transcript_manhattan'

In [173]:
for i in range(4):
    print(df_summary.sort("num_in_top-3", descending=True)['method_name'][i])

multi-qa-mpnet-base-dot-v1_title-transcript_manhattan
multi-qa-mpnet-base-dot-v1_title-transcript_dot-score
all-MiniLM-L6-v2_title_euclidean
all-MiniLM-L6-v2_title_cos-sim
