In [8]:
import polars as pl
from sentence_transformers import SentenceTransformer
from sklearn.metrics import DistanceMetric
import numpy as np
import gradio as gr

In [9]:
# scan_csv only builds a lazy query (a LazyFrame); the file contents are read
# later, when a cell calls .collect() — hence the microsecond timing below.
%time df = pl.scan_csv('../data/video-index.csv')

CPU times: user 86 μs, sys: 110 μs, total: 196 μs
Wall time: 199 μs


In [10]:
# Sentence-Transformers checkpoint used to embed the search query.
# NOTE(review): presumably the same checkpoint produced the embeddings stored
# in the index CSV — confirm before changing model_name.
model_name = 'multi-qa-mpnet-base-dot-v1'
model = SentenceTransformer(model_name)

In [21]:
# Distance metric used by returnSearchResults to compare the query embedding
# with the embeddings stored in the index (smaller distance = closer match).
dist_name = 'euclidean'
dist = DistanceMetric.get_metric(dist_name)

In [22]:
def returnSearchResults(
    query: str,
    index: pl.LazyFrame,
    threshold: float = 40,
    top_k: int = 5,
) -> np.ndarray:
    """
    Return the row indexes of the videos closest to a text query.

    The query is embedded with the notebook-level `model`; its distance
    (notebook-level `dist`) to each row's first embedding block and second
    embedding block is computed and the two distances are summed.

    Parameters
    ----------
    query : str
        Free-text search query.
    index : pl.LazyFrame
        Video index. Assumes columns 4..771 hold the title embedding and
        columns 772.. hold the transcript embedding, each with the same
        dimensionality as the query embedding — TODO confirm against the
        code that builds the CSV.
    threshold : float, default 40
        Rows whose summed distance is not strictly below this value are
        discarded. The default was chosen by eye for the summed distance.
    top_k : int, default 5
        Maximum number of row indexes to return.

    Returns
    -------
    np.ndarray
        1-D integer array of at most `top_k` row positions, ordered from
        closest to farthest. Empty when nothing is below `threshold`.
    """
    # positional layout of the index columns (see `index` in the docstring)
    first_embedding_col = 4
    n_title_dims = 772 - first_embedding_col

    # embed query as a single-row matrix so it can be used with pairwise()
    query_embedding = model.encode(query).reshape(1, -1)

    # collect_schema() yields the column names without running the query
    column_names = index.collect_schema().names()

    # materialise every embedding column in ONE pass over the source,
    # instead of collecting the lazy frame once per embedding block
    embeddings = (
        index.select(column_names[first_embedding_col:]).collect().to_numpy()
    )

    # summed distance between the query and each row's title/transcript embedding
    dist_arr = (
        dist.pairwise(embeddings[:, :n_title_dims], query_embedding)
        + dist.pairwise(embeddings[:, n_title_dims:], query_embedding)
    ).flatten()

    # keep only rows that are close enough to the query
    idx_below_threshold = np.argwhere(dist_arr < threshold).flatten()

    # order the surviving rows from closest to farthest
    idx_sorted = np.argsort(dist_arr[idx_below_threshold])

    # map back to positions in the full index and keep the best top_k
    return idx_below_threshold[idx_sorted][:top_k]

In [None]:
# def returnSearchResults(
#         query: str,
#         index: pl.LazyFrame,
#         top_k: int = 5,
#         title_prefix: str = "title_embedding-",
#         transcript_prefix: str = "transcript_embedding-",
# ) -> np.ndarray:
#     """
#     Returns the row‑indices of the top‑k videos most relevant to *query*
#     using dot‑product similarity on title + transcript embeddings.
#     """

#     # -------------------------------------------------
#     # 1. embed / normalise the query text
#     # -------------------------------------------------
#     # "dot" models such as multi-qa-mpnet-base-dot-v1 do NOT return unit-length
#     # embeddings by default, so normalisation is requested explicitly here:
#     query_vec = model.encode(query,
#                              normalize_embeddings=True).reshape(-1)  # (768,)

#     # -------------------------------------------------
#     # 2. pull the two 768‑dim matrices from Polars
#     # -------------------------------------------------
#     col_names = index.collect_schema().names()
#     title_cols      = [c for c in col_names if c.startswith(title_prefix)]
#     transcript_cols = [c for c in col_names if c.startswith(transcript_prefix)]

#     title_mat      = index.select(title_cols).collect().to_numpy()      # (N,768)
#     transcript_mat = index.select(transcript_cols).collect().to_numpy() # (N,768)

#     # -------------------------------------------------
#     # 3. dot‑score ‑> bigger is better
#     # -------------------------------------------------
#     sim_scores = (title_mat @ query_vec) + (transcript_mat @ query_vec)  # (N,)

#     # -------------------------------------------------
#     # 4. take the *k* highest scores
#     # -------------------------------------------------
#     top_idx = np.argsort(-sim_scores)[:top_k]   # descending order
#     return top_idx


In [24]:
# Smoke test: run one sample query, then display the id and title of the
# rows whose positions were returned.
idx_result = returnSearchResults("bomb", df.lazy())
df.select(['video_id', 'title']).collect()[idx_result]


video_id,title
str,str
"""5iPH-br_eJQ""","""What if We Nuke a City?"""
"""JyECrGp-Sw8""","""What If We Detonated All Nucle…"
"""E55uSCO5D2w""","""The Most Insane Weapon You Nev…"
"""qEfPBt9dU60""","""What if We Nuke the Moon?"""
"""9tbxDgcv74c""","""What If You Detonated a Nuclea…"


In [25]:
# Same rows as a plain dict of lists — the structure pseudoSearchAPI returns.
df.select(['title', 'video_id']).collect()[idx_result].to_dict(as_series=False)

{'title': ['What if We Nuke a City?',
  'What If We Detonated All Nuclear Bombs at Once?',
  'The Most Insane Weapon You Never Heard About',
  'What if We Nuke the Moon?',
  'What If You Detonated a Nuclear Bomb In The Marianas Trench? (Science not Fantasy)'],
 'video_id': ['5iPH-br_eJQ',
  'JyECrGp-Sw8',
  'E55uSCO5D2w',
  'qEfPBt9dU60',
  '9tbxDgcv74c']}

In [26]:
def pseudoSearchAPI(query: str):
    """
    Stand-in for a search endpoint.

    Looks up the rows matching `query` in the notebook-level `df` and
    returns their titles and video ids as a dict of two parallel lists,
    keyed 'title' and 'video_id'.
    """
    # positions of the matching rows, as ranked by returnSearchResults
    hit_rows = returnSearchResults(query, df)

    # only the two columns the UI needs are materialised
    catalog = df.select(['title', 'video_id']).collect()

    return catalog[hit_rows].to_dict(as_series=False)

In [27]:
def formatResultText(title: str, video_id: str) -> str:
    """
    Build the Markdown shown next to one search result.

    Parameters
    ----------
    title : str
        Video title, rendered as a level-1 heading.
    video_id : str
        YouTube video id, used to build the https://youtu.be/ link.

    Returns
    -------
    str
        Markdown text: two leading line breaks, the heading, and a link
        to the video.
    """
    # The template lines are deliberately flush-left: leading spaces would
    # become part of the string and change how the Markdown renders.
    return f"""<br> <br>
# {title}<br>

🔗 [Video Link](https://youtu.be/{video_id})"""

In [28]:
def formatVideoEmbed(video_id: str):
    """
    Return the HTML for an embedded YouTube player (576x324) for `video_id`.

    An alternative that was tried instead of the player is a clickable
    thumbnail: an <img> pointing at
    https://img.youtube.com/vi/<video_id>/0.jpg wrapped in a link to the video.
    """
    player_url = f"https://www.youtube.com/embed/{video_id}"
    return f'<iframe width="576" height="324" src="{player_url}"></iframe>'

In [29]:
def searchResults(query):
    """
    Gradio callback: turn a query into updates for the result slots.

    Returns a flat list of components in (HTML, Markdown) pairs. Each search
    result produces one visible pair (embedded player + title/link text).
    The list is then padded up to five pairs with hidden components. When
    there are no results at all, the first padding pair carries a visible
    "No results" message instead of being fully hidden.
    """
    # pseudo API call
    response = pseudoSearchAPI(query)
    titles = response['title']
    video_ids = response['video_id']

    slots = []

    # one visible (player, text) pair per result
    for video_id, title in zip(video_ids, titles):
        slots += [
            gr.HTML(value = formatVideoEmbed(video_id), visible=True),
            gr.Markdown(value = formatResultText(title, video_id), visible=True),
        ]

    # number of the five slots that have no result to show
    unused_slots = 5 - len(titles)

    # nothing found: the first slot shows a hint rather than staying blank
    if unused_slots == 5:
        slots += [
            gr.HTML(visible=False),
            gr.Markdown(value = "No results. Try rephrasing your query.", visible=True),
        ]
        unused_slots -= 1

    # hide whatever slots remain
    for _ in range(unused_slots):
        slots += [gr.HTML(visible=False), gr.Markdown(visible=False)]

    return slots

In [30]:
# demo
# Flat list of the result components, in the order searchResults returns
# them: (HTML, Markdown) for slot 1, then slot 2, ... up to slot 5.
output_list = []

with gr.Blocks() as demo:
    gr.Markdown("# YouTube Search")

    # query box and search button share one row
    with gr.Row():
        inp = gr.Textbox(placeholder="What are you looking for?", label="Query", scale=3)
        btn = gr.Button("Search")

    # five result slots, one row each: embedded player + title/link text
    for _ in range(5):
        with gr.Row():
            output_list.append(gr.HTML())
            output_list.append(gr.Markdown())

    # Register both triggers only AFTER all output components exist.
    # Wiring the button while output_list is still empty makes the click
    # handler depend on Gradio reading the list lazily, so the button may
    # end up bound to zero outputs.
    btn.click(fn=searchResults, inputs=inp, outputs=output_list)
    inp.submit(fn=searchResults, inputs=inp, outputs=output_list)

demo.launch()

* Running on local URL:  http://127.0.0.1:7861
* To create a public link, set `share=True` in `launch()`.


