In [2]:
!pip install hnswlib


Collecting hnswlib
  Downloading hnswlib-0.8.0.tar.gz (36 kB)
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: hnswlib
  Building wheel for hnswlib (pyproject.toml) ... [?25l[?25hdone
  Created wheel for hnswlib: filename=hnswlib-0.8.0-cp311-cp311-linux_x86_64.whl size=2389210 sha256=4d9f50f5028f7b188854ce9b8b79627479562e2df975ffa5c5449a13aac00b54
  Stored in directory: /root/.cache/pip/wheels/ea/4e/27/39aebca9958719776e36fada290845a7ef10f053ad70e22ceb
Successfully built hnswlib
Installing collected packages: hnswlib
Successfully installed hnswlib-0.8.0


In [None]:

!ls /kaggle/input/nlp-resources/test_data.parquet


/kaggle/input/nlp-resources/test_data.parquet


In [15]:
import pandas as pd


all_data=pd.read_parquet("/kaggle/input/nlp-resources/cleaned_data.parquet")
labels = pd.read_parquet("/kaggle/input/labeled/df_supervised.parquet")

In [19]:

lab = labels['cluster_label'].tolist()

text             Caption: Tasmanian berry grower Nic Hansen sho...
cluster_id                                                       4
cluster_label                                 Finance & Investment
Name: 0, dtype: object


Let's load a sentence transformer to compute the embeddings and a cross model encoder for reranking

In [20]:
from sentence_transformers import SentenceTransformer, CrossEncoder, util

semb_model = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1')
xenc_model = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2') #they work better but they need the compute the embedding every time we have a new query

Let's now compute the context embedding we will use for answering

In [21]:
import torch
contexts = all_data['context'].tolist()
corpus_embeddings = semb_model.encode(contexts, convert_to_tensor=True, show_progress_bar=True)
#corpus_embeddings = torch.load('//kaggle/input/corpusembeddings/corpus_embeddings.pt')


Batches:   0%|          | 0/342 [00:00<?, ?it/s]

Let's now use some random sentence, one for each cluster previously identified: we will use cosine similarity to see what is the most similar context and if it belongs to the same category

In [22]:
queries = [
    "She uploaded the latest app update to the cloud-based platform without a hitch.",           # Digital/Online Services
    "The director revealed the teaser trailer at the midnight screening.",                      # Film & Television
    "The assembly line hummed as robotic arms welded the steel frame with precision.",           # Industrial & Machinery
    "He laced up his cleats just before kick-off under the stadium lights.",                    # Sports & Athletics
    "She diversified her portfolio by investing in emerging market bonds last quarter.",        # Finance & Investment
    "They sold out tickets to the championship gaming tournament in under an hour.",             # Gaming & Events
    "Guests strolled through the resort’s marble lobby, overlooking the sun-drenched terrace.",  # Hospitality & Real Estate
    "He spent the afternoon lost in a novel about existentialism and the human condition.",      # Literature & Philosophy
    "The clinic introduced a cutting-edge diagnostic tool to improve patient outcomes.",         # Healthcare & Medicine
    "She curated a capsule wardrobe to streamline her daily style routine.",                     # Lifestyle & Personal Interests
    "Lawmakers convened to draft the new regulations on data privacy and trade compliance.",    # Legal & Government
    "He savored the flaky croissant while reading the recipe for boulangerie perfection.",      # Food & Culinary
    "The candidate outlined her vision for education reform during the televised debate.",       # Politics & Public Affairs
    "Volunteers conducted a beach cleanup to protect the fragile coastal ecosystem.",           # Environment & Ecology
    "She blended acrylics to capture the vibrant hues of the urban skyline at dusk.",            # Art & Design
    "Researchers presented their groundbreaking findings on renewable energy sources.",         # Education & Research
    "The DJ mixed vintage vinyl records to curate the perfect summer soundtrack.",              # Music & Audio
    "He practiced mindfulness meditation to maintain balance amid his hectic schedule."         # Wellness & Personal Care
]


Let's compute the embeddings of these sentences, too.

In [23]:
queries_embeddings = semb_model.encode(queries, convert_to_tensor=True,show_progress_bar=True )

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Let's find the most similar contexts with cosine similarity and see what category they belong to and if they are actually relevant.

We can also use the built-in function for semantic search

In [24]:
query_embeddings =  semb_model.encode(queries, convert_to_tensor=True)

util.semantic_search(query_embeddings, corpus_embeddings, score_function=util.cos_sim)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[[{'corpus_id': 603, 'score': 0.40528178215026855},
  {'corpus_id': 642, 'score': 0.3988167345523834},
  {'corpus_id': 9173, 'score': 0.39425790309906006},
  {'corpus_id': 6954, 'score': 0.39246395230293274},
  {'corpus_id': 9056, 'score': 0.3865756690502167},
  {'corpus_id': 273, 'score': 0.380735844373703},
  {'corpus_id': 113, 'score': 0.36934608221054077},
  {'corpus_id': 6911, 'score': 0.3677324950695038},
  {'corpus_id': 851, 'score': 0.3670920133590698},
  {'corpus_id': 5821, 'score': 0.3656845688819885}],
 [{'corpus_id': 1497, 'score': 0.4757058322429657},
  {'corpus_id': 317, 'score': 0.46287935972213745},
  {'corpus_id': 6703, 'score': 0.42867356538772583},
  {'corpus_id': 8269, 'score': 0.4266166090965271},
  {'corpus_id': 10420, 'score': 0.4123404920101166},
  {'corpus_id': 1819, 'score': 0.40546783804893494},
  {'corpus_id': 2713, 'score': 0.39723342657089233},
  {'corpus_id': 10865, 'score': 0.3966255187988281},
  {'corpus_id': 10168, 'score': 0.3938640356063843},
  {'cor

We can also speed up this process by pre-normalising the embeddings and compute directly the dot-product instead of cosine similarity.

In [25]:
normalised_docs_embeddings = util.normalize_embeddings(semb_model.encode(contexts, convert_to_tensor=True))
normalised_query_embeddings = util.normalize_embeddings(semb_model.encode(queries, convert_to_tensor=True))
#norm_docs_embeddings = torch.load('/kaggle/input/normdocsembedding/norm_docs_embeddings.pt')
#norm_query_embeddings = torch.load('/kaggle/input/normqueryemb/norm_query_embeddings.pt')

Batches:   0%|          | 0/342 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [26]:
hits = util.semantic_search(normalised_query_embeddings, normalised_docs_embeddings, score_function=util.dot_score)
hits

[[{'corpus_id': 603, 'score': 0.40528178215026855},
  {'corpus_id': 642, 'score': 0.3988167345523834},
  {'corpus_id': 9173, 'score': 0.39425790309906006},
  {'corpus_id': 6954, 'score': 0.39246395230293274},
  {'corpus_id': 9056, 'score': 0.3865756690502167},
  {'corpus_id': 273, 'score': 0.380735844373703},
  {'corpus_id': 113, 'score': 0.36934608221054077},
  {'corpus_id': 6911, 'score': 0.3677324950695038},
  {'corpus_id': 851, 'score': 0.3670920133590698},
  {'corpus_id': 5821, 'score': 0.3656845688819885}],
 [{'corpus_id': 1497, 'score': 0.4757058322429657},
  {'corpus_id': 317, 'score': 0.46287935972213745},
  {'corpus_id': 6703, 'score': 0.42867356538772583},
  {'corpus_id': 8269, 'score': 0.4266166090965271},
  {'corpus_id': 10420, 'score': 0.4123404920101166},
  {'corpus_id': 1819, 'score': 0.40546783804893494},
  {'corpus_id': 2713, 'score': 0.39723342657089233},
  {'corpus_id': 10865, 'score': 0.3966255187988281},
  {'corpus_id': 10168, 'score': 0.3938640356063843},
  {'cor

Since it took a lot of time, let's save the embedding we previously computed

In [27]:
torch.save(corpus_embeddings, 'corpus_embeddings.pt')
torch.save(normalised_docs_embeddings, 'norm_docs_embeddings.pt')
torch.save(normalised_query_embeddings, 'norm_query_embeddings.pt')

In order to load them the next time i have to do this

In [None]:
'''corpus_embeddings = torch.load('corpus_embeddings.pt')
norm_docs_embeddings = torch.load('normalised_docs_embeddings.pt')
norm_query_embeddings = torch.load('normalised_query_embeddings.pt')'''

Let's now try and do the same thing with the cross encoder. Basically this model will be used to compute the similarity instead of using cosine similarity.

In [None]:
import numpy as np

import textwrap  

MAX_CHARS = 150
TOP_K = 5

for query in queries:
    
    xenc_model_inputs = [[query, context] for context in contexts]
    # compute similaruty
    scores = xenc_model.predict(xenc_model_inputs)

    # i take highest TOP_K 
    top_idxs = np.argsort(-scores)[:TOP_K]

    print(f"Query: \"{query}\"")
    print("Top 5 most similar contexts:")
    print("---------------------------------------")
    for idx in top_idxs:
        full_ctx = contexts[idx]
        # hortening contex
        short_ctx = full_ctx[:MAX_CHARS] + ("..." if len(full_ctx) > MAX_CHARS else "")
       

        print(f"Score: {scores[idx]:.4f} – \"{short_ctx}\", {lab[idx]}")
    print("\n")

Batches:   0%|          | 0/342 [00:00<?, ?it/s]

Query: "She uploaded the latest app update to the cloud-based platform without a hitch."
Top 5 most similar contexts:
---------------------------------------
Score: -5.1386 – "5. eHarmony
eHarmony is known as a platform that is top locating a soulmate and building effective relationships. In line with the supplied informatio...", Digital/Online Services
Score: -5.3458 – "The latest Google real time auto-translation app update has promised to recognise a spoken language and make a real-time transcript of the words, as w...", Digital/Online Services
Score: -5.4027 – "Read the Original Article at
Google on Wednesday outlined its vision for the Internet, which will include a broad scope of technical ambitions using t...", Digital/Online Services
Score: -5.4532 – "Business New platform transforms Excel spreadsheets into working web apps in minutes, automatically Image Credit: Craig Chew-Moulding November 13, 201...", Digital/Online Services
Score: -5.8853 – "FatCloud Announces New Free Comm

Batches:   0%|          | 0/342 [00:00<?, ?it/s]

Query: "The director revealed the teaser trailer at the midnight screening."
Top 5 most similar contexts:
---------------------------------------
Score: -2.7779 – "'The Dark Knight Rises' Countdown: Fake Trailers and Feline Fatales
Chris Clow is a comic book expert and contributor to Batman-On-Film.com. You can f...", Film & Television
Score: -4.3955 – "210 – What Happens Nexton May 28th, 2009
IN-DEPTH REVIEW OF THE SUPERFOGEYS OVER AT “DIGITAL STRIPS!”
Wow. The Midnight Cartooner, a contributor at Di...", Film & Television
Score: -4.6109 – "Calm Down, A ‘Rogue One’ Trailer is Coming
There was plenty of buzz surrounding the upcoming addition to the Star Wars saga at San Diego Comic Con thi...", Film & Television
Score: -5.2504 – "Some Guy Who Kills People is set to be the sleeper hit of the Toronto After Dark Film Festival. Though it screened at 1:30 pm on a Sunday afternoon, i...", Film & Television
Score: -5.2556 – "Hi Folks, Have you watched this movie- 2012? Here's a little bit bac

Batches:   0%|          | 0/342 [00:00<?, ?it/s]

Query: "The assembly line hummed as robotic arms welded the steel frame with precision."
Top 5 most similar contexts:
---------------------------------------
Score: -4.2790 – "Robot Arm
This Project will produce a robot arm, approximately 1,2m long, with 3 knee-joints, 2 rotating joints and 5 degrees of freedom in total.
The...", Industrial & Machinery
Score: -6.2318 – "The giant steel-framed globe, a 31-foot steel structure on a roundabout in front of MOA, also known as the ‘MOA Globe’, greeted visitors of the mall s...", Industrial & Machinery
Score: -6.6602 – "2. Early Entry, Lethality, and Survivability Operational Capability Requirements
EEL01: Precision Line of Sight Munitions and Weapon Systems. Required...", Industrial & Machinery
Score: -6.9826 – "-CRAFTSMANSHIP-Manufacturing of Daishin
01/ 05
Specialized in ultra-fine parts
Daishin’s technology began with parts feeders of precision watch parts ...", Industrial & Machinery
Score: -7.1718 – "Tape, a circle, a wire 50 NP - alloy

Batches:   0%|          | 0/342 [00:00<?, ?it/s]

Query: "He laced up his cleats just before kick-off under the stadium lights."
Top 5 most similar contexts:
---------------------------------------
Score: -3.2247 – "Courtesy of Victor Cruz's twitter page.
In light of the horrible tragedy that shook the small town of Newtown, Connecticut, one NFL player is paying r...", Sports & Athletics
Score: -4.8232 – "Flag on the Play: Misleading Energy Responses to the Super Bowl Blackout
"Flag on the Play: Misleading Energy Responses to the Super Bowl Blackout" sp...", Environment & Ecology
Score: -5.2534 – "City lose late on to Brackley
Match Result
The Minstermen were denied their first league win of the season as two late second half goals earned Brackl...", Sports & Athletics
Score: -5.4090 – "TAMPA, Fla. --
*****
Bonani, a Sao Paolo native who moved to tiny Lake Wales, Fla., at age 11, learned before Wednesday's practice that he would suppl...", Sports & Athletics
Score: -5.5165 – "Discuss 'YOUTH HOLD OFF THE CITIZENS AT THE BRIDGE' on our 

Batches:   0%|          | 0/342 [00:00<?, ?it/s]

Query: "She diversified her portfolio by investing in emerging market bonds last quarter."
Top 5 most similar contexts:
---------------------------------------
Score: -4.2616 – "Community Bank N.A. decreased its holdings in shares of United Parcel Service, Inc. (NYSE:UPS) by 6.0% during the 3rd quarter, according to the compan...", Finance & Investment
Score: -4.8693 – "- : Emerging Stocks Rise as China Rallies on MSCI Index Speculation
LAGOS, Nigeria, Capital Markets in Africa: Emerging-market stocks rose to a three-...", Finance & Investment
Score: -6.0347 – "Simply Money Advisors decreased its holdings in SPDR S&P 500 ETF Trust (NYSEARCA:SPY) by 10.0% in the 4th quarter, according to the company in its mos...", Finance & Investment
Score: -6.1643 – "Dell Inc. unveiled four low-cost computer models Wednesday designed for China, India and other emerging economies in a new bid to tap the potential of...", Finance & Investment
Score: -6.1697 – "Does political risk deter FDI from emergin

Batches:   0%|          | 0/342 [00:00<?, ?it/s]

Query: "They sold out tickets to the championship gaming tournament in under an hour."
Top 5 most similar contexts:
---------------------------------------
Score: -2.9329 – "There are now 100 days to go until the FIFA Confederations Cup Brazil 2013 gets underway. More than 500,000 tickets have already been sold for the tou...", Gaming & Events
Score: -4.1465 – "ArtsFest draws crowd
Review Photo/James loewenstein Mark Panty of Etters, Pa., shows some of the trivets he was selling Sunday at ArtsFest.
Review Pho...", Gaming & Events
Score: -4.4571 – "A trip to the City Ground awaits for our upcoming fixture, an away EFL Cup clash with Championship side Nottingham Forest.
Considering our convincing ...", Sports & Athletics
Score: -5.1309 – "Coverage of The 147th Open on SiriusXM
Jul 12, 2018, 07:00 ET
SiriusXM's "The Open Radio" channel to present 48 hours of live hole-by-hole coverage Ju...", Gaming & Events
Score: -5.4538 – "2017 Clarkson Cup presented by Scotiabank
Sunday, March 5, 2017

Batches:   0%|          | 0/342 [00:00<?, ?it/s]

Query: "Guests strolled through the resort’s marble lobby, overlooking the sun-drenched terrace."
Top 5 most similar contexts:
---------------------------------------
Score: 0.9488 – "White marble floors glisten throughout the lobby and every Louis XIV style incidental table is adorned with a unique arrangement of freshly cut flower...", Hospitality & Real Estate
Score: -3.2683 – "The Chess Hotel
Grid of 31 images
Featured Image Lobby Lobby Lobby Lobby Sitting Area Lobby Sitting Area Guestroom Guestroom Guestroom Guestroom Guest...", Hospitality & Real Estate
Score: -3.3093 – " bunk beds (90/ 190) , 2 bedrooms with 2 single beds (90/ 190) and 2 master bedrooms ( 160/190 ), plus a reading corner with TV and a sofa bed for 1 p...", Hospitality & Real Estate
Score: -4.6680 – "Wander round St Cézaire
Explore the picturesque hilltop town, stop for a café au lait or a leisurely lunch on the square, meet the locals playing boul...", Hospitality & Real Estate
Score: -4.7284 – "Palm Meadows Clu

Batches:   0%|          | 0/342 [00:00<?, ?it/s]

Query: "He spent the afternoon lost in a novel about existentialism and the human condition."
Top 5 most similar contexts:
---------------------------------------
Score: -5.3549 – "One.
Meillassoux has proposed that dichotomy theist/deist and atheist is a false dilemma (see fallacy of black and white, etc.). He calls this dilemma...", Literature & Philosophy
Score: -5.8982 – "Ten years ago it would have been inconceivable for Philip Roth to publish a novel like The Plot Against America. From My Life as a Man to the blackly ...", Politics & Public Affairs
Score: -6.1371 – "George Wylie Henderson (1904-1965) was a novelist and short-story writer whose works reflected a transition between the literary style of the Harlem R...", Literature & Philosophy
Score: -6.4054 – "Truth
Ellie Carless
Geelong Grammar School
Daniel had lived a rather non-existent life before this all happened. He kept himself contained between the...", Literature & Philosophy
Score: -6.9438 – "Engineered to perfection


Batches:   0%|          | 0/342 [00:00<?, ?it/s]

Query: "The clinic introduced a cutting-edge diagnostic tool to improve patient outcomes."
Top 5 most similar contexts:
---------------------------------------
Score: -0.7211 – "Title
Providing patient-centered enhanced discharge planning and rural transition support: Building a rural transitions network between regional refer...", Healthcare & Medicine
Score: -2.2737 – "OrthoSensor Introduces NEW Clinical and Economic Advantages with its Sensors at the American Academy of Orthopaedic Surgeons
Mar 12, 2019, 08:42 ET
DA...", Wellness & Personal Care
Score: -2.8516 – "How CMOs Can Solve the Next Set of Hospital Challenges
FOR IMMEDIATE RELEASE
Hospitals have systems in place that were designed to store, access and d...", Healthcare & Medicine
Score: -3.3841 – "Connect Care: Strengthening Connections within Healthcare for Albertans
Scott Simmons, Connect Care CORe Lead (Diagnostic Imaging)
Alberta Health Serv...", Healthcare & Medicine
Score: -3.4202 – "Is VR physical therapy the key to h

Batches:   0%|          | 0/342 [00:00<?, ?it/s]

Query: "She curated a capsule wardrobe to streamline her daily style routine."
Top 5 most similar contexts:
---------------------------------------
Score: -4.0594 – "image: 2014AW Capsule Rain in Eclipse, photo by M. SILVESTER/ editorial by Oka HuiYun Lin
After viewing the post of Notjustalabel.com’s ‘Shirt’ select...", Art & Design
Score: -5.0852 – "London Based Fashion Technology start-up Metail has partnered with British Glamour for a European first in immersive online editorial fashion
Cambridg...", Art & Design
Score: -6.2861 – "Priming Is Essential
One of the biggest mistakes women with 'oily' skin make is not integrating a primer into their daily routine. Primers offer a hug...", Wellness & Personal Care
Score: -6.6774 – "The prices indicated in this catalogue do not include shipping costs.
Fabiana Filippi's shirts and blouses are the perfect base to create solid modern...", Art & Design
Score: -6.9450 – "Name: Sik'sani Qin'dalasque
Race: Drowolath
Age: 43
Equipment:
On and Off 

Batches:   0%|          | 0/342 [00:00<?, ?it/s]

Query: "Lawmakers convened to draft the new regulations on data privacy and trade compliance."
Top 5 most similar contexts:
---------------------------------------
Score: -1.4382 – "Care homes must have their house in order, now that the new General Data Protection Regulation (GDPR) has come into effect. Introduced two years ago, ...", Healthcare & Medicine
Score: -2.4777 – "Bangladesh Bank governor Fazle Kabir and Bangladesh Institute of Bank Management director general Toufic Ahmad Choudhury are seen with the officials h...", Finance & Investment
Score: -2.8367 – "BUSINESS RULES
by Bruce Silver
Major IT initiatives may be a tough sell in this economy, but many
organizations are investing in order to meet governm...", Finance & Investment
Score: -2.9966 – "Global Mapping S.A.C. operating company of globalmapping.biz (hereinafter Global Mapping)respects your right to online privacy when you use our websit...", Digital/Online Services
Score: -3.4424 – "James Thew - Fotolia
IAM a core bu

Batches:   0%|          | 0/342 [00:00<?, ?it/s]

Query: "He savored the flaky croissant while reading the recipe for boulangerie perfection."
Top 5 most similar contexts:
---------------------------------------
Score: -6.3480 – "This month I've been on a little holiday in Europe. I'm currently in the countryside not too far from Limoges, which is roughly in the centre of Franc...", Food & Culinary
Score: -6.3949 – "As soon as it hits October I start planning my Thanksgiving dinner. This year, I started planning in September. I blame the covid quarantine. In case ...", Food & Culinary
Score: -7.4903 – "Those who follow my blog might remember we hosted a very special reception at our home a couple of months ago. So far, I only shared one of the recipe...", Food & Culinary
Score: -7.6112 – "22 Mar Show your pride in the South African Culinary Olympics team!
It was a rainy evening and we were stuck in the middle of rush-hour traffic. “So w...", Food & Culinary
Score: -7.6635 – "I’m a lady who needs her coffee. Sure, I know how to make it

Batches:   0%|          | 0/342 [00:00<?, ?it/s]

Query: "The candidate outlined her vision for education reform during the televised debate."
Top 5 most similar contexts:
---------------------------------------
Score: -2.2542 – "Support for including Intelligent Design creationism in science classes deep sixed Judy Johnson's run for state senate.
For immediate release
Prehisto...", Politics & Public Affairs
Score: -3.4051 – "Rep. Steve Knight, challenger Katie Hill spar over special interest money in Simi debate
U.S. Rep. Steve Knight and challenger Katie Hill sparred over...", Politics & Public Affairs
Score: -4.3236 – "Tensions within the world's most powerful media family were dramatically laid bare on Thursday when Elisabeth Murdoch set out her own vision of media ...", Film & Television
Score: -4.6297 – "EDWARD F. MARONEY PHOTO
THIS WAY TO THE MICROPHONE – That seems to be the advice Gov. Deval Patrick, right, is offering to rivals Charlie Baker, left,...", Politics & Public Affairs
Score: -4.7157 – "Esther McVey has called for 

Batches:   0%|          | 0/342 [00:00<?, ?it/s]

Query: "Volunteers conducted a beach cleanup to protect the fragile coastal ecosystem."
Top 5 most similar contexts:
---------------------------------------
Score: -2.6140 – "Creek Cleanup FAQ’s
Q: What do I need to know about creek cleanups? Are they different from a beach or neighborhood cleanup?
A: Cleaning up a creek ha...", Environment & Ecology
Score: -4.8163 – "- Programs
- Boating
- Green boating
- Boat responsibility
Boat responsibility
Operate & maintain your boat responsibly
How boaters operate and mainta...", Wellness & Personal Care
Score: -4.9707 – "
Attraction by Types
beaches
Islands
waterfalls
temples
nationalparks
museums
animals
nightlife
Attraction by City
Amphoe Muang
Amphoe Kathu
Amphoe Th...", Environment & Ecology
Score: -6.2402 – "Coral Gardeners
Coral Gardeners
October-2020
Regenerate the coral reef systems with Coral Gardeners
We wanted to share with you a little bit about Cor...", Environment & Ecology
Score: -6.4343 – "Pete Ceglinski, an Australian surfer l

Batches:   0%|          | 0/342 [00:00<?, ?it/s]

Query: "She blended acrylics to capture the vibrant hues of the urban skyline at dusk."
Top 5 most similar contexts:
---------------------------------------
Score: -6.2397 – "Leading up to the shipping of my entire makeup collection from one country to another, I came across products in the archives that I'd almost forgotte...", Art & Design
Score: -7.7189 – "Good morning, crafters! Design Team Member Cynthia here with a final project featuring several of the latest Kat Scrappiness goodies. For this card, I...", Art & Design
Score: -7.7698 – "The Urban Decay Naked Skin Foundation (proper & long name = Urban Decay Weightless Ultra Definition Liquid Makeup) shot into the limelight a couple of...", Art & Design
Score: -8.2318 – "
16 comments:
"One of my favourite accessories here at the house is.."
And here I thought it was going to end with:
"...the girl in the second picture...", Art & Design
Score: -8.6477 – "At dusk, we stopped along our way to Tofino, parking at the signs marked 'Anc

Batches:   0%|          | 0/342 [00:00<?, ?it/s]

Query: "Researchers presented their groundbreaking findings on renewable energy sources."
Top 5 most similar contexts:
---------------------------------------
Score: 1.2387 – "AUSTIN—Today, the Texas Public Policy Foundation released the research paper, The Economic Fall and Political Rise of Renewable Energy.
The paper’s au...", Finance & Investment
Score: -1.2008 – "Please enable Javascript in your browser.
Researchers worldwide are working to develop new technologies for producing clean energy. A team of research...", Industrial & Machinery
Score: -2.6774 – "Renewable Energy and Health: Me, Myself and I
In the continuing debate surrounding how, why, when, where and whether we should move decisively ahead w...", Wellness & Personal Care
Score: -2.7935 – "Coal, oil, and natural gas are still dominant sources of energy despite the fact that renewable energy is getting more and more supporters. Wind power...", Finance & Investment
Score: -3.2396 – "State Grid China teams up with German 

Batches:   0%|          | 0/342 [00:00<?, ?it/s]

Query: "The DJ mixed vintage vinyl records to curate the perfect summer soundtrack."
Top 5 most similar contexts:
---------------------------------------
Score: -2.9979 – "Spotify released its official “Song of the Summer” list this week, and and it’s filled with tracks you’ve probably had stuck in your head over the pas...", Music & Audio
Score: -3.3983 – "Some trunk rattling awesomeness for your Mother’s Day weekend.
Ty Dolla $ign – Beach House 3 Deluxe
Last year, Ty Dolla $ign blessed the masses with t...", Music & Audio
Score: -3.6043 – "Artist(s): G.O.O.D. Music
Cruel Summer is looking more like just another summer gone by.
Review Score: 3.8
It’s been a long and grueling summer for mu...", Music & Audio
Score: -4.3465 – "
Mar
25
Sexy Funky Electro House Set Chris Humphrey
Posted (
admin
) in
House Music News
on March-25-2006
Coming from Ibiza Spain Chris Humphreys know...", Music & Audio
Score: -5.5894 – "The soundtrack to the much-hyped film House of Wax is on the shelves, thanks

Batches:   0%|          | 0/342 [00:00<?, ?it/s]

Query: "He practiced mindfulness meditation to maintain balance amid his hectic schedule."
Top 5 most similar contexts:
---------------------------------------
Score: -3.8127 – "One of the more recent discoveries of neuroscience is that when you stare into space, gather wool, daydream, or otherwise let your mind wander, the “d...", Lifestyle & Personal Interests
Score: -4.4015 – ""My husband knew that I found Therapeutic Touch helpful and comforting but didn't believe it really worked until Tama did a session for me and nurses ...", Healthcare & Medicine
Score: -4.5501 – "All About Yoga And Why It’s Good For You
All About Yoga and Why It’s Good For You
Yoga’s popularity is on the rise. No matter where you live, it is li...", Wellness & Personal Care
Score: -4.7199 – "QV, Sewickley Academy coaches value preseason practice for winter sports
TribLIVE Sports Videos
Yoga is part of one team's workouts as squads at Quake...", Sports & Athletics
Score: -5.1472 – "It quotes self-help author Ga

Very nice: as we can see in many cases the labels match!

When we use an embedding model with cosine similarity, we can pre-compute the embeddings in our data set and index them to speed-up the search. There are techniques for Approximate Nearest Neighbor (ANN), which use clustering to index the embedding space and speed-up the search process. Let's do indexing with HNSWLIB

In [35]:
import hnswlib

index = hnswlib.Index(space='cosine', dim=corpus_embeddings.size(1))

Now we can index our data. The index we compute can be saved and loaded, so we can check if it is already availabel and load it (this will save time)

In [36]:
import os

# Define hnswlib index path
index_path = "./hnswlib.index"

# Load index if available
if os.path.exists(index_path):
    print("Loading index...")
    index.load_index(index_path)
# Else index data collection
else:
    # Initialise the index
    print("Start creating HNSWLIB index")
    index.init_index(max_elements=corpus_embeddings.size(0), ef_construction=400, M=64)
    #  Compute the HNSWLIB index (it may take a while)
    index.add_items(corpus_embeddings.cpu(), list(range(len(corpus_embeddings))))
    # Save the index to a file for future loading
    print("Saving index to:", index_path)
    index.save_index(index_path)

Loading index...


Let's see if it's actually faster by measuring the running time

In [40]:
from datetime import datetime

# Search using index
t_start = datetime.now()
_ = index.knn_query(queries_embeddings[0].cpu(), k=128)
t_stop = datetime.now()
print(f"Search time with index: {t_stop - t_start}")

# Search without index
t_start = datetime.now()
_ = util.semantic_search(queries_embeddings, corpus_embeddings, score_function=util.cos_sim, top_k=128)
t_stop = datetime.now()
print(f"Search time without index: {t_stop - t_start}")

Search time with index: 0:00:00.002146
Search time without index: 0:00:00.003131


Reranking
Since the cross encoder gives better results but is slower than cosine similarity, we can take advantage of both: We can do a first search with bi-encoder models and then re-rank the top- k  results with a cross-encoder. We call this approach retrieve and re-rank.

Let's define a new random query (we could have used one of those generated before)

In [41]:
query = "Who is the president of the United States?"
query_embedding = semb_model.encode(query, convert_to_tensor=True)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Search using ANN index

In [42]:
corpus_ids, distances = index.knn_query(query_embedding.cpu(), k=3)
scores = 1 - distances

print("Cosine similarity model search results")
print(f"Query: \"{query}\"")
print("---------------------------------------")
for idx, score in zip(corpus_ids[0], scores[0]):
    print(f"Score: {score:.4f}\nDocument: \"{contexts[idx]}\"\n\n")

Cosine similarity model search results
Query: "Who is the president of the United States?"
---------------------------------------
Score: 0.4851
Document: "UUA Presidential Search Committee begins its work
Created by 2010 General Assembly vote, committee will pick two nominees for 2017 UUA presidential election.
That system is changing. In 2010, the General Assembly approved the creation of a Presidential Search Committee and charged it with selecting at least two nominees for president. The General Assembly also shortened the president's term of office from two four-year terms to a single six-year term.
Members of the search committee were selected in 2013. This spring they are exploring how a system might be created that would be open to more people. "The goal of the committee is to not just select candidates," said the Rev. Dr. Matthew Johnson, co-chair, "but to think creatively and systematically about what the job entails and to broaden the circle of people willing to consider thi

Let's add the re-ranking

In [43]:
corpus_ids, _ = index.knn_query(query_embedding.cpu(), k=128)

model_inputs = [(query, contexts[idx]) for idx in corpus_ids[0]]
cross_scores = xenc_model.predict(model_inputs)

print("Cross-encoder model re-ranking results")
print(f"Query: \"{query}\"")
print("---------------------------------------")
for idx in np.argsort(-cross_scores)[:3]:
    print(f"Score: {cross_scores[idx]:.4f}\nDocument: \"{contexts[corpus_ids[0][idx]]}\"\n\n")

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Cross-encoder model re-ranking results
Query: "Who is the president of the United States?"
---------------------------------------
Score: 0.7632
Document: "Why It Makes A Difference!
The President of the United States is a liar. Not a tiny weenie white liar, but a Sociopathic Liar. His word is not worth the air he is allowed to use to say it. Why Does It Make A Difference? It is because the leader of the Free World MUST be Trusted. When the President of the United States is willing to repeatedly Testify to lies, then who should our Children look up to? Sports Figures? When everyone within the Obama Administration is willing to SAY and do anything regardless of TRUTH, then why would anyone in the WORLD believe in America or Her Freedoms.
The recent admission to IRS intimidation against “Tea Party” shows the level of tyrannical Leadership that Mr. Obama is willing to stoop. With the passage of the Obama Care TAX legislation, imagine the CONTROL that our Massive “Progressive” Government w

Finally, let's see if our dataset is big enough to give correct answers to generic questions just by finding the most similar question in the dataset and giving its associated answer. It should work kind of like a chatbot

Let's first compute the answers' embeddings

In [44]:
answers = all_data['answer'].tolist()
answers_embeddings = semb_model.encode(answers, convert_to_tensor=True, show_progress_bar=True)

Batches:   0%|          | 0/342 [00:00<?, ?it/s]

In [45]:
questions = all_data['question'].tolist()
questions_embeddings = semb_model.encode(questions, convert_to_tensor=True, show_progress_bar=True)

Batches:   0%|          | 0/342 [00:00<?, ?it/s]

In [46]:
# Create empty index
hnswlib_index = hnswlib.Index(space='cosine', dim=questions_embeddings.size(1))

# Define hnswlib index path
index_path = "./emp_dialogue_hnswlib.index"

# Load index if available
if os.path.exists(index_path):
    print("Loading index...")
    hnswlib_index.load_index(index_path)
# Else index data collection
else:
    # Initialise the index
    print("Start creating HNSWLIB index")
    hnswlib_index.init_index(max_elements=questions_embeddings.size(0), ef_construction=400, M=64)
    #  Compute the HNSWLIB index (it may take a while)
    hnswlib_index.add_items(questions_embeddings.cpu(), list(range(len(questions_embeddings))))
    # Save the index to a file for future loading
    print("Saving index to:", index_path)
    hnswlib_index.save_index(index_path)

Start creating HNSWLIB index
Saving index to: ./emp_dialogue_hnswlib.index


In [None]:
import numpy as np
def get_response(
    message: str,
    questions_embeddings,     # array/tensor of questions embeddings
    answers: list[str],       # answers list
    index,                    # hnswlib.Index with questions_embeddings
    re_ranking_model=None,    #  cross-encoder
    top_k: int = 32
) -> str:
   
    message_emb = semb_model.encode(message, convert_to_tensor=True).cpu().numpy()

   
    corpus_ids, _ = index.knn_query(message_emb, k=top_k)

    
    candidate_idxs = corpus_ids[0]
    if re_ranking_model is not None:
        model_inputs = [
            (message, answers[i])
            for i in candidate_idxs
        ]
        cross_scores = re_ranking_model.predict(model_inputs) 
        best_pos = np.argmax(cross_scores)
        best_idx = candidate_idxs[best_pos]
    else:
       
        best_idx = candidate_idxs[0]
    return answers[best_idx]

In [None]:
chatbot_response = get_response(
    "who is einstein?", 
    questions_embeddings,                
    answers,                            
    hnswlib_index,                        
    re_ranking_model=xenc_model,          
    top_k=32                              
)

print("Chatbot says:", chatbot_response)


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Chatbot says: Albert Einstein's most famous paper was the Special Theory of Relativity. It revealed that the speed of light is a constant against which even time and space lose their absolute meaning.


As we could imagine, not the best result even thought if we are lucky the responses actully make some sense. Of course this is because the dataset we have is not that big