In [33]:
# Check that the test split is present under /kaggle/input/nlp-resources
!ls /kaggle/input/nlp-resources/test_data.parquet


/kaggle/input/nlp-resources/test_data.parquet


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [34]:
!pip install hnswlib


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [35]:
import pandas as pd

# Parquet splits stored under the /kaggle/input/nlp-resources input folder.
train_data = pd.read_parquet("/kaggle/input/nlp-resources/tuning_data.parquet")
test_data = pd.read_parquet("/kaggle/input/nlp-resources/test_data.parquet")

# Stack both splits, training rows first; row labels are carried over unchanged.
all_data = pd.concat([train_data, test_data])

In [36]:
# Report the shape of each split and of their concatenation.
for label, frame in (
    ("Train data shape:", train_data),
    ("Test data shape: ", test_data),
    ("All data shape:  ", all_data),
):
    print(label, frame.shape)


Train data shape: (7655, 3)
Test data shape:  (1641, 3)
All data shape:   (9296, 3)


Let's load a sentence transformer (bi-encoder) to compute the embeddings and a cross-encoder model for re-ranking.

In [37]:
from sentence_transformers import SentenceTransformer, CrossEncoder, util

# Sentence-embedding model used to embed queries and documents
semb_model = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1')
# Cross-encoder used for re-ranking: it tends to give better results, but nothing can be
# pre-computed, so every new query has to be scored against each candidate again
xenc_model = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

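Let's now get the context embeddings we will use for answering. Computing them is slow, so here they are loaded from a file saved in a previous run (the original `encode` call is kept as a comment).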

In [38]:
import torch

# Every context of the training split, in row order
contexts = train_data['context'].tolist()

# The embeddings were computed once with the call below and cached as a Kaggle
# input, because encoding the whole corpus is slow:
# corpus_embeddings = semb_model.encode(contexts, convert_to_tensor=True, show_progress_bar=True)

# map_location='cpu' lets the cached tensor load even on a session without a GPU;
# a later cell moves it to the selected device.
corpus_embeddings = torch.load(
    '/kaggle/input/corpusembeddings/corpus_embeddings.pt',
    map_location='cpu',
)


Let's now use some made-up sentences, one for each of the clusters identified previously: we will use cosine similarity to find the most similar contexts and check whether they belong to the same category.

In [39]:
# One hand-written probe sentence per topic; the trailing comment names the topic it targets
queries = [
    "She sprinkled fresh basil over the steaming bowl of tomato bisque.",        # Food/Culinary
    "The old philosopher pondered the meaning of existence under a fading sunset.",  # Literature/Philosophy
    "They unlocked the door to their seaside cottage just as the morning tide rolled in.",  # Real Estate/Travel
    "The student stayed late in the lab, determined to perfect her experiment.",    # Education/Research
    "The new streaming series had everyone talking about its unexpected plot twists.",  # Entertainment/Media
    "The council passed the ordinance after a heated debate on public safety.",     # Government/Legal
    "A sleek wind turbine spun silently against the clear blue sky.",               # Energy/Utilities
    "He practiced yoga each morning to center his mind and body.",                 # Healthcare/Wellness
    "The underdog team scored the winning goal in the final seconds of the match.",   # Sports/Athletics
    "Volunteers planted trees along the boulevard to brighten the city streets.",   # Urban Planning/Public Works
    "She sketched a minimalist poster that captured the essence of summer."         # Art/Design
]


Let's compute the embeddings of these sentences, too.

In [40]:
# Encode all probe sentences in one call; the result is a tensor holding one embedding per query
queries_embeddings = semb_model.encode(queries, convert_to_tensor=True,show_progress_bar=True )

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Let's find the most similar contexts with cosine similarity and see what category they belong to and if they are actually relevant.

In [41]:
import torch
import textwrap  # built-in module for wrapping/shortening text (used by the alternative below)

# 1) Pick the compute device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# 2) Move both embedding tensors onto that device so they can be compared
corpus_embeddings = corpus_embeddings.to(device)
queries_embeddings = queries_embeddings.to(device)

# 3) Display parameters
MAX_CHARS = 150
top_k = min(5, len(contexts))

# 4) Score each query against the whole corpus and print its best matches
for query, query_embedding in zip(queries, queries_embeddings):
    # query_embedding already lives on the right device
    cos_scores = util.cos_sim(query_embedding, corpus_embeddings)[0]
    top_results = torch.topk(cos_scores, k=top_k)

    print(f"Query: \"{query}\"")
    # Report the actual number of results rather than a hard-coded "5"
    print(f"Top {top_k} most similar contexts in corpus:")
    print("---------------------------------------")
    # .tolist() converts the tensors to plain Python floats/ints, so the
    # indices can be used directly on the `contexts` list
    for score, idx in zip(top_results.values.tolist(), top_results.indices.tolist()):
        full_ctx = contexts[idx]
        # Truncate so that long documents do not flood the output
        short_ctx = full_ctx[:MAX_CHARS] + ("..." if len(full_ctx) > MAX_CHARS else "")
        # Alternative using textwrap:
        # short_ctx = textwrap.shorten(full_ctx, width=MAX_CHARS, placeholder="...")

        print(f"Score: {score:.2f} - Document: \"{short_ctx}\"")
    print("\n")


Query: "She sprinkled fresh basil over the steaming bowl of tomato bisque."
Top 5 most similar contexts in corpus:
---------------------------------------
Score: 0.50 - Document: "- 2 Tbsp. Extra Virgin Olive Oil
- 1 Cup Onions-diced
- ½ Cup Carrots-diced
- 1 Tbsp. Garlic-sliced
- 4 Cups Chicken Broth-low sodium
- 1 Cup Green an..."
Score: 0.47 - Document: "Intro
It takes a lot for me to say something is perfect. Perfection isn’t really worth striving for but for us this chicken is as close as it gets.
Th..."
Score: 0.45 - Document: "Saturday in West Texas was beautiful. It was the kind of day that should be spent entirely outdoors, from breakfast to dinner. So what did I do in thi..."
Score: 0.42 - Document: "A great vegan recipe to share with a group of friends, and as a big carnivore I can safely say that the meat will not be missed with this one. The oat..."
Score: 0.41 - Document: "Recipe: Chickpea and Grape Tomato Salad ::.
With the days getting longer and warmer, I’m noticing mo

We can also use the built-in function for semantic search

In [42]:
# Encode the same probe sentences again (this name is reused by the timing cell further down)
query_embeddings =  semb_model.encode(queries, convert_to_tensor=True)

# Built-in helper: for every query it returns a list of {'corpus_id', 'score'} hits, best first
util.semantic_search(query_embeddings, corpus_embeddings, score_function=util.cos_sim)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[[{'corpus_id': 5537, 'score': 0.4992058277130127},
  {'corpus_id': 4486, 'score': 0.46568188071250916},
  {'corpus_id': 353, 'score': 0.4497196078300476},
  {'corpus_id': 4674, 'score': 0.4196283519268036},
  {'corpus_id': 6654, 'score': 0.40968459844589233},
  {'corpus_id': 3018, 'score': 0.4025222063064575},
  {'corpus_id': 5525, 'score': 0.3997200131416321},
  {'corpus_id': 4262, 'score': 0.3945382833480835},
  {'corpus_id': 3912, 'score': 0.3939746916294098},
  {'corpus_id': 4884, 'score': 0.3938068747520447}],
 [{'corpus_id': 3409, 'score': 0.4137376546859741},
  {'corpus_id': 7300, 'score': 0.4062492847442627},
  {'corpus_id': 7582, 'score': 0.40325719118118286},
  {'corpus_id': 4034, 'score': 0.3982100486755371},
  {'corpus_id': 6061, 'score': 0.387061208486557},
  {'corpus_id': 5846, 'score': 0.3861527740955353},
  {'corpus_id': 89, 'score': 0.38355791568756104},
  {'corpus_id': 5009, 'score': 0.3745168149471283},
  {'corpus_id': 2563, 'score': 0.36679935455322266},
  {'corpus

We can also speed up this process by pre-normalising the embeddings and computing the dot product directly instead of the cosine similarity (for unit-length vectors the two are the same).

In [43]:
# Presumably how the cached tensors below were produced (left disabled; note the
# different variable names, normalised_* here vs norm_* below):
#normalised_docs_embeddings = util.normalize_embeddings(semb_model.encode(contexts, convert_to_tensor=True))
#normalised_query_embeddings = util.normalize_embeddings(semb_model.encode(queries, convert_to_tensor=True))
# Load the pre-normalised document and query embeddings from the Kaggle input folders
norm_docs_embeddings = torch.load('/kaggle/input/normdocsembedding/norm_docs_embeddings.pt')
norm_query_embeddings = torch.load('/kaggle/input/normqueryemb/norm_query_embeddings.pt')

In [44]:
# Same search as before, but scored with a plain dot product on the pre-normalised embeddings
hits = util.semantic_search(norm_query_embeddings, norm_docs_embeddings, score_function=util.dot_score)
# Display the full result (one list of hits per query)
hits

[[{'corpus_id': 5537, 'score': 0.4992058277130127},
  {'corpus_id': 4486, 'score': 0.4656819701194763},
  {'corpus_id': 353, 'score': 0.44971972703933716},
  {'corpus_id': 4674, 'score': 0.41962817311286926},
  {'corpus_id': 6654, 'score': 0.4096849262714386},
  {'corpus_id': 3018, 'score': 0.4025224447250366},
  {'corpus_id': 5525, 'score': 0.39972010254859924},
  {'corpus_id': 4262, 'score': 0.3945384621620178},
  {'corpus_id': 3912, 'score': 0.3939747214317322},
  {'corpus_id': 4884, 'score': 0.393807053565979}],
 [{'corpus_id': 3409, 'score': 0.413737952709198},
  {'corpus_id': 7300, 'score': 0.40624937415122986},
  {'corpus_id': 7582, 'score': 0.4032573699951172},
  {'corpus_id': 4034, 'score': 0.39820975065231323},
  {'corpus_id': 6061, 'score': 0.3870611786842346},
  {'corpus_id': 5846, 'score': 0.3861527144908905},
  {'corpus_id': 89, 'score': 0.383558064699173},
  {'corpus_id': 5009, 'score': 0.3745168447494507},
  {'corpus_id': 2563, 'score': 0.36679941415786743},
  {'corpus_

Since computing them took a lot of time, let's save the embeddings we previously computed.

In [45]:
# Disabled on purpose: the save calls are parked inside a string literal, so this
# cell only echoes the text. The normalised_* names refer to the commented-out
# computation, not to the norm_* tensors loaded from disk.
'''torch.save(corpus_embeddings, 'corpus_embeddings.pt')
torch.save(normalised_docs_embeddings, 'norm_docs_embeddings.pt')
torch.save(normalised_query_embeddings, 'norm_query_embeddings.pt')'''

"torch.save(corpus_embeddings, 'corpus_embeddings.pt')\ntorch.save(normalised_docs_embeddings, 'norm_docs_embeddings.pt')\ntorch.save(normalised_query_embeddings, 'norm_query_embeddings.pt')"

To load them again next time, we can do the following:

In [46]:
# Disabled on purpose (kept inside a string literal). The file names are the ones
# written by the save snippet: 'norm_docs_embeddings.pt' and
# 'norm_query_embeddings.pt' (the previous 'normalised_*.pt' names did not exist).
'''corpus_embeddings = torch.load('corpus_embeddings.pt')
norm_docs_embeddings = torch.load('norm_docs_embeddings.pt')
norm_query_embeddings = torch.load('norm_query_embeddings.pt')'''

"corpus_embeddings = torch.load('corpus_embeddings.pt')\nnorm_docs_embeddings = torch.load('normalised_docs_embeddings.pt')\nnorm_query_embeddings = torch.load('normalised_query_embeddings.pt')"

Let's now try and do the same thing with the cross encoder. Basically this model will be used to compute the similarity instead of using cosine similarity.

In [47]:
import textwrap  # only needed for the textwrap.shorten alternative below

import numpy as np  # np.argsort is used below; imported here so the cell runs on a fresh kernel

MAX_CHARS = 150
TOP_K = 5

for query in queries:
    # Pair the query with every context: the cross-encoder scores each pair jointly
    xenc_model_inputs = [[query, context] for context in contexts]
    # One relevance score per (query, context) pair
    scores = xenc_model.predict(xenc_model_inputs)

    # Indices of the TOP_K highest scores, best first
    top_idxs = np.argsort(-scores)[:TOP_K]

    print(f"Query: \"{query}\"")
    # Report TOP_K rather than a hard-coded "5"
    print(f"Top {TOP_K} most similar contexts:")
    print("---------------------------------------")
    for idx in top_idxs:
        full_ctx = contexts[idx]
        # Truncate the context for display
        short_ctx = full_ctx[:MAX_CHARS] + ("..." if len(full_ctx) > MAX_CHARS else "")
        # or, to cut on word boundaries instead:
        # short_ctx = textwrap.shorten(full_ctx, width=MAX_CHARS, placeholder="...")

        print(f"Score: {scores[idx]:.4f} – \"{short_ctx}\"")
    print("\n")

Batches:   0%|          | 0/240 [00:00<?, ?it/s]

Query: "She sprinkled fresh basil over the steaming bowl of tomato bisque."
Top 5 most similar contexts:
---------------------------------------
Score: -6.6098 – "HUNTSVILLE, Alabama -- Try this football season recipe from chef James Boyce:
Grilled Italian Sausage and Peppers (Gyro-Style)
1 tablespoon olive oil
..."
Score: -6.6682 – "By V. Sheree Williams
As restaurants prepare for the change of seasons, many are thinking light and fresh for the summer. Following the trend is inter..."
Score: -6.8855 – "Signor Sassi is a London institution that has been around since 1984, from 2007 the ownership belonged to the San Carlo Group. It ruffles up the sort ..."
Score: -6.8900 – "This is my famous Panang curry recipe with kale, which has a slightly different twist and lots of veggies – resulting in a huge pot of steaming, nutri..."
Score: -7.1186 – "A great vegan recipe to share with a group of friends, and as a big carnivore I can safely say that the meat will not be missed with this one. Th

Batches:   0%|          | 0/240 [00:00<?, ?it/s]

Query: "The old philosopher pondered the meaning of existence under a fading sunset."
Top 5 most similar contexts:
---------------------------------------
Score: -7.8170 – "Progress ?
What is progress?
I do not mean just what is the definition of progress
But what is it to actually make some progress towards what is the u..."
Score: -7.8535 – "A Summary of Postmodern PhilosophyQUESTION: A Summary of Postmodern Philosophy
ANSWER:.1 He writes, “Nietzsche, the patron saint of postmodernity, pro..."
Score: -8.1478 – "One.
Meillassoux has proposed that dichotomy theist/deist and atheist is a false dilemma (see fallacy of black and white, etc.). He calls this dilemma..."
Score: -8.2940 – "THE WELLSPRING OF HOPE
Where can we find Hope, and How do we then sustain it ? 'The whole of creation relates to God, as air to the sun' (St.Thomas Aq..."
Score: -8.4759 – "FreeThe Divine Revelation of the New Jerusalem This work contains a summary of the doctrines of the Catholic and Protesant churches, an

Batches:   0%|          | 0/240 [00:00<?, ?it/s]

Query: "They unlocked the door to their seaside cottage just as the morning tide rolled in."
Top 5 most similar contexts:
---------------------------------------
Score: -7.7588 – "Chapter 1 : Prologue
Background: Font color:
Hermione sighed, turned over in bed and pulled the covers over her head. The bangs and crashes from down ..."
Score: -7.8111 – "Several wonderful must-experience things are happening at this pair of Airbnbs. First, there are the adorable llamas and alpacas roaming the grounds. ..."
Score: -8.0180 – "Despite, how slow, I thought my two weeks of R and R would pass at the beginning,
my time in Napier, raced past, out and about exploring the surroundi..."
Score: -8.4063 – "The Heart's Content Cottage is the most requested private accommodation with a 2 person Jacuzzi tub/shower.
Breakfast is served direct to your door wi..."
Score: -8.5521 – "THE HUNGRY sea has been gnawing away again at Pagham beach and trying to fight its way over the seawalls into farmland at Fishbo

Batches:   0%|          | 0/240 [00:00<?, ?it/s]

Query: "The student stayed late in the lab, determined to perfect her experiment."
Top 5 most similar contexts:
---------------------------------------
Score: -2.7661 – "Home > Get Involved > Volunteer Groups > Laboratory Health and Safety Committee > Azobisisobutyronitrile Fire Azobisisobutyronitrile Fire Page Content..."
Score: -5.2988 – "Reviewed by Dennis Schwartz
An early work directed by Anthony Mann ("Raw Deal")
that leaves a bad impression, giving few clues to his
future greatness..."
Score: -5.7056 – "EE115C Final Solution Leaked
Have final exam for the class by himself, so one of his graduate students, Aida Varzaghani, was in charge of proctoring t..."
Score: -6.0894 – "Students admitted to YSPA will begin their studies for the program two weeks before the residential program, when they will receive our project guide ..."
Score: -6.9446 – "This is the third and final action in the Strong Marriage Experiment that is concentrated on Dating. Yes, that means one seventh of the ch

Batches:   0%|          | 0/240 [00:00<?, ?it/s]

Query: "The new streaming series had everyone talking about its unexpected plot twists."
Top 5 most similar contexts:
---------------------------------------
Score: -0.6372 – "Netflix Caper Series 'Lupin' Returns For A Second Season This Summer
Lupin was the first surprise streaming hit of 2021, with its first five slick, su..."
Score: -4.3717 – "Oh, brilliant! The Time Treadmill Ron Miles 12th Doctor13th DoctorBill PottsNardole Aug 06 2018 I did it! As of about 7:30 am this morning I completed..."
Score: -4.4813 – ""
One.
"Disappointed")
danmc
"Fun Listen"
Follett weaves another fine yarn!
This book is written from the perspective of a handful of different charac..."
Score: -4.8703 – "Title: The Bad Beginning
Series: A Series of Unfortunate Events
Author: Lemony Snicket
Publisher: Scholastic, Inc
My Rating:
> was totally caught by s..."
Score: -5.2063 – "Big […]
Utopia is my favorite new series so far. Granted, it’s way too early to judge, especially considering there are so many new 

Batches:   0%|          | 0/240 [00:00<?, ?it/s]

Query: "The council passed the ordinance after a heated debate on public safety."
Top 5 most similar contexts:
---------------------------------------
Score: -0.2028 – "Whatcom County Council passes the introduction of a 5th interim moratorium on facilities at Cherry Point during their July 24th, 2018 meeting.
Council..."
Score: -2.2542 – "By Susan Beam
For the Daily Record/Sunday News
West York Councilman Tim Berkheimer -- cited for sounding an air horn during a council meeting in Novem..."
Score: -3.6017 – "COLUMBIA — Within two weeks, Columbia citizens will get a chance to voice their opinions about the role of the Citizens Police Review Board.
At a meet..."
Score: -4.0235 – "CMP could have trouble rerouting corridor if Maine pulls approvals
Oct. 19—A heated debate over potential alternative routes for Central Maine Power C..."
Score: -4.6649 – "Quebec’s National Assembly voted Wednesday to pass the Liberal government’s Bill 62 which will ban face coverings while giving or receiving

Batches:   0%|          | 0/240 [00:00<?, ?it/s]

Query: "A sleek wind turbine spun silently against the clear blue sky."
Top 5 most similar contexts:
---------------------------------------
Score: -7.2023 – "The cold rain was falling outside, and the windows rattled with the force of the wind. Tiny shards of rainwater slid silently down the clear glass. Th..."
Score: -7.5677 – "The Petite Marseille Crossbody Wallet with thousands of boats docked in its clear blue waters. Not only that, but the beautiful beaches, ancient build..."
Score: -8.0541 – "Viking Wind has two current projects in the order book for fall 2018. Both projects are already at an advanced stage.
Viking Wind has two current proj..."
Score: -8.1801 – "This definitely is not about the movie; it is something much more than that. It is about an event I witnessed that happened right in front of me. It i..."
Score: -8.1974 – "Supplier Homepage Product 3 Tempered Glass Building Tempered Glass Customized Size Tempered Glass with Fine Polish
Customized Size Tempered Glass wit

Batches:   0%|          | 0/240 [00:00<?, ?it/s]

Query: "He practiced yoga each morning to center his mind and body."
Top 5 most similar contexts:
---------------------------------------
Score: -2.9264 – "10 Important Things my Yoga Practice Taught Me.
Reflections from the yoga mat.
I was inspired by Kate Bartolotta’s recent post, So What Do Gomukhasana..."
Score: -6.4410 – "Here at Amara, we have been brainstorming on how we can be of service as we launch this phase of our healing center.
Our focus this spring has been in..."
Score: -6.7114 – "The [...]
August 2010
Three New Abdominal ExercisesAugust 30, 2010 { [...] [...]
Menu Plan – Week of 8/23 (late!)August 23, 2010 { 1 comment }
Apologi..."
Score: -6.7903 – "3 Daily Habits To Help You Stay Young In Mind And Body
This question originally appeared on Quora. Answer by Rachmartika Astarini.
Most people won’t b..."
Score: -7.0712 – "Great 9.8/10
Total of people that favorited this listing
Explore and unfold your relationships through yoga and other various techniques at this relax..

Batches:   0%|          | 0/240 [00:00<?, ?it/s]

Query: "The underdog team scored the winning goal in the final seconds of the match."
Top 5 most similar contexts:
---------------------------------------
Score: 0.7172 – "Hockey: Rochdale Mens Seconds 6 - 4 West Derby
Date published: 05 March 2019
Photo: Rachel Jackson
Peter Ransome (far right) scored a hat trick
Saturd..."
Score: 0.5367 – "August 18, 2013 - The 2013 Hublot Polo Gold Cup fiercer fight between two incredible teams, Team Gstaad Palace and Team Hublot. The players and ponies..."
Score: 0.4829 – "Jazz Ousts Nuggets, Avoids Making History
Article excerpt
The Utah Jazz, desperate to avoid one of the most embarrassing defeats in National Basketbal..."
Score: -0.2624 – "This post was written by Hatchet reporter Luis Puno.
With nine seconds left in sudden-death overtime, sophomore Paul Deasey netted the game-winning go..."
Score: -0.5334 – "GRAND FORKS, N.D. – After 17-consecutive games without a win, the University of North Dakota women’s soccer team picked up a huge win over

Batches:   0%|          | 0/240 [00:00<?, ?it/s]

Query: "Volunteers planted trees along the boulevard to brighten the city streets."
Top 5 most similar contexts:
---------------------------------------
Score: -5.5501 – "Hello everyone! I missed Amy's Five on Friday last week and so this Friday I'm sharing some photos of plants and flowers that I've seen in the last te..."
Score: -6.0828 – "The NYPD has declared war on crime in Greenwich Village, flooding the neighborhood with extra patrols, mounted cops and a mobile command center and li..."
Score: -6.7882 – "Our first location will be in the high elevation cloud forest in the Talamanca Mountain Range in a virgin oak forest that is part of the largest prote..."
Score: -6.8055 – "Without a doubt, Lytton Park is one of the most exclusive neighbourhoods in the city. Affluent families and young professionals are drawn to this Nort..."
Score: -7.4421 – "Mayor lauds business community’s recovery
The newly built Hoo’s Q barbecue restaurant was built on the lot that previously housed Mike & 

Batches:   0%|          | 0/240 [00:00<?, ?it/s]

Query: "She sketched a minimalist poster that captured the essence of summer."
Top 5 most similar contexts:
---------------------------------------
Score: -3.4496 – "Baby, It’s Cold In Here… Baby, It’s Hot In Here…
Wylde is a hyper-cool online magazine and an open way towards all new, trending and inspiring. A cult..."
Score: -6.6112 – "Utah artist Colleen Howe is an accomplished pastelist and workshop instructor who is widely known for her sensitive and colorful landscapes of the Wes..."
Score: -6.7545 – "SUMMER ROMANCE
Summer is here and love is in the air. If it’s a first date, five year anniversary or guy-spying at a late afternoon wedding, romantic ..."
Score: -7.2181 – "SALA Featured Artist - Louise HASELTON
Adelaide-based Louise Haselton has established an art practice in which no materials or concepts are off limits..."
Score: -7.4886 – "LEOPARD STRING BIKINI BOTTOMS FEATURES:
super soft, luxe fabric
fully lined, no see-through swimwear
sleek cut for minimal tan lines
a flexibl

When we use an embedding model with cosine similarity, we can pre-compute the embeddings of our dataset and index them to speed up the search. Approximate Nearest Neighbor (ANN) techniques build an index over the embedding space (for example by clustering it or, as HNSW does, by linking nearby vectors in a navigable graph) so that a query only needs to be compared against a small part of the corpus. Let's do the indexing with HNSWLIB.

In [48]:
import hnswlib

# Empty index over the cosine space; dim is taken from the second dimension of corpus_embeddings
index = hnswlib.Index(space='cosine', dim=corpus_embeddings.size(1))

Now we can index our data. The index we compute can be saved and loaded, so we first check whether it is already available and, if so, load it (this saves time).

In [49]:
import os

# Location of the on-disk copy of the HNSWLIB index
index_path = "./hnswlib.index"

if not os.path.exists(index_path):
    # No saved index yet: build it from the corpus embeddings
    print("Start creating HNSWLIB index")
    index.init_index(max_elements=corpus_embeddings.size(0), ef_construction=400, M=64)
    # Insert every embedding, labelled by its row position (this may take a while)
    index.add_items(corpus_embeddings.cpu(), list(range(len(corpus_embeddings))))
    # Persist the index so that later runs can skip the build
    print("Saving index to:", index_path)
    index.save_index(index_path)
else:
    # Reuse the index saved by a previous run
    print("Loading index...")
    index.load_index(index_path)

Loading index...


Let's see if it's actually faster by measuring the running time

In [50]:
from datetime import datetime

# Both searches use the same query (the first probe sentence) so that the two
# timings are comparable and do not depend on a variable left over from an
# earlier loop. Each search runs once, so the numbers are only indicative.

# Search using index
t_start = datetime.now()
_ = index.knn_query(query_embeddings[0].cpu(), k=128)
t_stop = datetime.now()
print(f"Search time with index: {t_stop - t_start}")

# Search without index (exhaustive cosine similarity over the whole corpus)
t_start = datetime.now()
_ = util.semantic_search(query_embeddings[0], corpus_embeddings, score_function=util.cos_sim, top_k=128)
t_stop = datetime.now()
print(f"Search time without index: {t_stop - t_start}")

Search time with index: 0:00:00.000889
Search time without index: 0:00:00.001169


Reranking
Since the cross-encoder gives better results but is slower than cosine similarity, we can take advantage of both: we first run a search with the bi-encoder model and then re-rank the top-k results with the cross-encoder. This approach is called retrieve and re-rank.

Let's define a new random query (we could have used one of those generated before)

In [51]:
# A single free-text question, embedded with the same sentence-embedding model as the corpus
query = "Who is the president of the United States?"
query_embedding = semb_model.encode(query, convert_to_tensor=True)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Search using ANN index

In [52]:
# Ask the ANN index for the three nearest contexts
corpus_ids, distances = index.knn_query(query_embedding.cpu(), k=3)
# The index was built with space='cosine', so turn its distances back into similarities
scores = 1 - distances

print(
    "Cosine similarity model search results",
    f"Query: \"{query}\"",
    "---------------------------------------",
    sep="\n",
)
# Row 0 holds the results for our single query
for idx, score in zip(corpus_ids[0], scores[0]):
    print(f"Score: {score:.4f}\nDocument: \"{contexts[idx]}\"\n\n")

Cosine similarity model search results
Query: "Who is the president of the United States?"
---------------------------------------
Score: 0.4102
Document: "JAKARTA, KOMPAS.com – Indonesian President Joko ‘Jokowi’ Widodo is set to give his maiden speech to the United Nations General Assembly, more than five years after his first term as the country’s head of state.
Unlike other heads of state who made their way through New York's Manhattan traffic to the UN headquarters to deliver their speech, Jokowi’s oration is determined by the Covid-19 pandemic ravaging Indonesia and the rest of the world.
“[The Presidential Secretariat] will be tapping [sic] Jokowi’s speech beforehand, as it will be delivered virtually in line with Covid-19 [health protocols],” said Presidential Secretariat Head Heru Budi Hartono on September 22.
Also read: Indonesia Calls for Greater ASEAN-US Cooperation to Combat Covid-19
“It will be played on the UN General Assembly at 8.30pm on September 22 in New York, or 7.3

Let's add the re-ranking

In [53]:
import numpy as np  # np.argsort is used below; imported here so the cell runs on a fresh kernel

# Stage 1 (retrieve): fetch a generous pool of candidates from the ANN index
corpus_ids, _ = index.knn_query(query_embedding.cpu(), k=128)

# Stage 2 (re-rank): score every (query, candidate) pair with the cross-encoder
model_inputs = [(query, contexts[idx]) for idx in corpus_ids[0]]
cross_scores = xenc_model.predict(model_inputs)

print("Cross-encoder model re-ranking results")
print(f"Query: \"{query}\"")
print("---------------------------------------")
# idx is a position inside the candidate pool, so it is mapped back to a corpus
# row through corpus_ids[0] before looking up the context
for idx in np.argsort(-cross_scores)[:3]:
    print(f"Score: {cross_scores[idx]:.4f}\nDocument: \"{contexts[corpus_ids[0][idx]]}\"\n\n")

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Cross-encoder model re-ranking results
Query: "Who is the president of the United States?"
---------------------------------------
Score: 0.7632
Document: "Why It Makes A Difference!
The President of the United States is a liar. Not a tiny weenie white liar, but a Sociopathic Liar. His word is not worth the air he is allowed to use to say it. Why Does It Make A Difference? It is because the leader of the Free World MUST be Trusted. When the President of the United States is willing to repeatedly Testify to lies, then who should our Children look up to? Sports Figures? When everyone within the Obama Administration is willing to SAY and do anything regardless of TRUTH, then why would anyone in the WORLD believe in America or Her Freedoms.
The recent admission to IRS intimidation against “Tea Party” shows the level of tyrannical Leadership that Mr. Obama is willing to stoop. With the passage of the Obama Care TAX legislation, imagine the CONTROL that our Massive “Progressive” Government w

Finally, let's see if our dataset is big enough to give correct answers to generic questions just by finding the most similar question in the dataset and giving its associated answer. It should work kind of like a chatbot

Let's first compute the answers' embeddings

In [54]:
# Answers of the training split, in row order (answers[i] belongs to row i)
answers = train_data['answer'].tolist()
# NOTE(review): answers_embeddings is not referenced by any later cell shown here — confirm it is still needed
answers_embeddings = semb_model.encode(answers, convert_to_tensor=True, show_progress_bar=True)

Batches:   0%|          | 0/240 [00:00<?, ?it/s]

In [60]:
# Questions of the training split, in the same row order as `answers`
questions = train_data['question'].tolist()
# These embeddings are what the question index below is built from
questions_embeddings = semb_model.encode(questions, convert_to_tensor=True, show_progress_bar=True)

Batches:   0%|          | 0/240 [00:00<?, ?it/s]

In [68]:
# Fresh, empty cosine index sized to the question-embedding dimension
hnswlib_index = hnswlib.Index(space='cosine', dim=questions_embeddings.size(1))

# Where the question index is cached on disk
index_path = "./emp_dialogue_hnswlib.index"

if not os.path.exists(index_path):
    # No saved index yet: build it from the question embeddings
    print("Start creating HNSWLIB index")
    hnswlib_index.init_index(max_elements=questions_embeddings.size(0), ef_construction=400, M=64)
    # Insert every embedding, labelled by its row position (this may take a while)
    hnswlib_index.add_items(questions_embeddings.cpu(), list(range(len(questions_embeddings))))
    # Persist the index so that later runs can skip the build
    print("Saving index to:", index_path)
    hnswlib_index.save_index(index_path)
else:
    # Reuse the index saved by a previous run
    print("Loading index...")
    hnswlib_index.load_index(index_path)

Loading index...


In [69]:
import numpy as np


def get_response(
    message: str,
    questions_embeddings,     # embeddings of the indexed questions (kept for API compatibility; not read here)
    answers: list[str],       # answers[i] is the answer paired with the i-th indexed question
    index,                    # hnswlib.Index already populated with questions_embeddings
    re_ranking_model=None,    # optional cross-encoder used to re-rank the candidates
    top_k: int = 32
) -> str:
    """Return the stored answer whose question best matches ``message``.

    The message is embedded with the notebook-level ``semb_model``, the
    ``top_k`` nearest questions are fetched from ``index``, and the answer of
    the best candidate is returned. Without a re-ranking model the best
    candidate is the first one returned by the index; with one, each
    ``(message, answer)`` pair is scored and the highest score wins.

    Args:
        message: The user's text.
        questions_embeddings: Unused; accepted so existing calls keep working.
        answers: Answers aligned by position with the labels stored in ``index``.
        index: Object exposing ``knn_query(vector, k=...)`` that returns
            ``(labels, distances)`` with labels of shape ``(1, k)``.
        re_ranking_model: Optional object exposing ``predict(pairs)``.
        top_k: Number of candidates to retrieve; capped at ``len(answers)``.

    Returns:
        The selected answer string.

    Raises:
        ValueError: If ``answers`` is empty or ``top_k`` is smaller than 1.
    """
    if not answers:
        raise ValueError("answers must not be empty")
    if top_k < 1:
        raise ValueError("top_k must be at least 1")

    # Never ask for more neighbours than there are answers: the index cannot
    # return more items than it holds.
    k = min(top_k, len(answers))

    # 1) Embed the message (semb_model is defined at notebook level)
    message_emb = semb_model.encode(message, convert_to_tensor=True).cpu().numpy()

    # 2) Retrieve the k most similar questions; row 0 holds the labels for our single query
    corpus_ids, _ = index.knn_query(message_emb, k=k)
    candidate_idxs = corpus_ids[0]

    if re_ranking_model is None:
        # 3a) No re-ranker: take the first candidate returned by the index
        best_idx = candidate_idxs[0]
    else:
        # 3b) Score every (message, candidate answer) pair and keep the best one
        model_inputs = [(message, answers[i]) for i in candidate_idxs]
        cross_scores = re_ranking_model.predict(model_inputs)
        best_idx = candidate_idxs[np.argmax(cross_scores)]

    # 4) Return the answer paired with the winning question
    return answers[best_idx]

In [72]:
chatbot_response = get_response(
    "who is einstein?",  # 1) message
    questions_embeddings,                 # 2) question embeddings
    answers,                              # 3) list of answers
    hnswlib_index,                        # 4) the HNSW index
    re_ranking_model=xenc_model,          # 5) (keyword) re-ranking model, if used
    top_k=32                              # 6) optional: how many candidates to retrieve
)

print("Chatbot says:", chatbot_response)


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Chatbot says: Albert Einstein's most famous paper was the Special Theory of Relativity. It revealed that the speed of light is a constant against which even time and space lose their absolute meaning.
