In [2]:
!pip install -U sentence-transformers rank_bm25

Collecting sentence-transformers
  Downloading sentence_transformers-2.6.1-py3-none-any.whl (163 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m163.3/163.3 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting rank_bm25
  Downloading rank_bm25-0.2.2-py3-none-any.whl (8.6 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 

In [3]:
import json
import gzip
import os
import torch
from sentence_transformers import SentenceTransformer, CrossEncoder, util
from rank_bm25 import BM25Okapi
from sklearn.feature_extraction import _stop_words
import string
from tqdm.autonotebook import tqdm
import numpy as np

if not torch.cuda.is_available():
    print("Warning: No GPU found. Please add GPU to your notebook")

# We use the Bi-Encoder to encode all passages, so that we can use it with semantic search
bi_encoder = SentenceTransformer('nq-distilbert-base-v1')
bi_encoder.max_seq_length = 256     # Truncate long passages to 256 tokens
top_k = 32                          # Number of passages we want to retrieve with the bi-encoder

# The bi-encoder will retrieve 100 documents. We use a cross-encoder, to re-rank the results list to improve the quality
cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

# As dataset, we use Simple English Wikipedia. Compared to the full English wikipedia, it has only
# about 170k articles. We split these articles into paragraphs and encode them with the bi-encoder
wikipedia_filepath = 'simplewiki-2020-11-01.jsonl.gz'

if not os.path.exists(wikipedia_filepath):
    util.http_get('http://sbert.net/datasets/simplewiki-2020-11-01.jsonl.gz', wikipedia_filepath)

passages = []
with gzip.open(wikipedia_filepath, 'rt', encoding='utf8') as fIn:
    for line in fIn:
        data = json.loads(line.strip())
        title = data['title']
        paragraphs = data['paragraphs']
        # Concatenate the Title and paragraphs
        passage = ": ".join([title] + paragraphs)
        passages.append(passage)

print("Passages:", len(passages))

# We encode all passages into our vector space. This takes about 5 minutes (depends on your GPU speed)
corpus_embeddings = bi_encoder.encode(passages, convert_to_tensor=True, show_progress_bar=True)

# We lower case our text and remove stop-words from indexing
def bm25_tokenizer(text):
    tokenized_doc = []
    for token in text.lower().split():
        token = token.strip(string.punctuation)

        if len(token) > 0 and token not in _stop_words.ENGLISH_STOP_WORDS:
            tokenized_doc.append(token)
    return tokenized_doc

tokenized_corpus = []
for passage in tqdm(passages):
    tokenized_corpus.append(bm25_tokenizer(passage))

bm25 = BM25Okapi(tokenized_corpus)

# This function will search all wikipedia articles for passages that answer the query
def search(query):
    print("Input question:", query)

    ##### BM25 search (lexical search) #####
    bm25_scores = bm25.get_scores(bm25_tokenizer(query))
    top_n = np.argpartition(bm25_scores, -5)[-5:]
    bm25_hits = [{'corpus_id': idx, 'score': bm25_scores[idx]} for idx in top_n]
    bm25_hits = sorted(bm25_hits, key=lambda x: x['score'], reverse=True)

    print("Top-3 lexical search (BM25) hits")
    for hit in bm25_hits[0:3]:
        print("\t{:.3f}\t{}".format(hit['score'], passages[hit['corpus_id']]))

    ##### Semantic Search #####
    # Encode the query using the bi-encoder and find potentially relevant passages
    question_embedding = bi_encoder.encode(query, convert_to_tensor=True)
    question_embedding = question_embedding.cuda()
    hits = util.semantic_search(question_embedding, corpus_embeddings, top_k=top_k)
    hits = hits[0]  # Get the hits for the first query

    ##### Re-Ranking #####
    # Now, score all retrieved passages with the cross_encoder
    cross_inp = [[query, passages[hit['corpus_id']]] for hit in hits]
    cross_scores = cross_encoder.predict(cross_inp)

    # Sort results by the cross-encoder scores
    for idx in range(len(cross_scores)):
        hits[idx]['cross-score'] = cross_scores[idx]

    # Output of top-5 hits from bi-encoder
    print("\n-------------------------\n")
    print("Top-3 Bi-Encoder Retrieval hits")
    hits = sorted(hits, key=lambda x: x['score'], reverse=True)
    for hit in hits[0:3]:
        print("\t{:.3f}\t{}".format(hit['score'], passages[hit['corpus_id']]))

    # Output of top-5 hits from re-ranker
    print("\n-------------------------\n")
    print("Top-3 Cross-Encoder Re-ranker hits")
    hits = sorted(hits, key=lambda x: x['cross-score'], reverse=True)
    for hit in hits[0:3]:
        print("\t{:.3f}\t{}".format(hit['cross-score'], passages[hit['corpus_id']]))



search("What is the capital of the United States?")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.73k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/540 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/265M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/554 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/794 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

  0%|          | 0.00/50.2M [00:00<?, ?B/s]

Passages: 169597


Batches:   0%|          | 0/5300 [00:00<?, ?it/s]

  0%|          | 0/169597 [00:00<?, ?it/s]

Input question: What is the capital of the United States?
Top-3 lexical search (BM25) hits
	11.854	Española, New Mexico: Española is a city in Rio Arriba County, New Mexico, in the United States. A part of the city is in Santa Fe County. It was founded as a capital for Nuevo México in 1598 as San Juan de los Caballeros. It has been called the first capital city in the United States. At the 2010 census, the city had a total population of 10,495.
	11.573	Hammonton, New Jersey: Hammonton is a town in Atlantic County, New Jersey, United States, known as the "Blueberry Capital of the World." As of the 2010 United States Census, the town's population was 14,791.
	11.414	Capital city: A capital city (or capital town or just capital) is a city or town, specified by law or constitution, by the government of a country, or part of a country, such as a state, province or county. It usually serves as the location of the government's central meeting place and offices. Most of the country's leaders a

In [4]:
search(query="When is Chinese New Year")

Input question: When is Chinese New Year
Top-3 lexical search (BM25) hits
	16.967	Japanese New Year: New Year in Japan is one of the most important festivals. Unlike the Chinese New Year, it is held on January 1.
	16.445	Chinese calendar: The Chinese calendar, also known as the "agriculture calendar" (農曆/农历), is a lunisolar calendar ("yinyangli"). It was used until 1912 when the Gregorian calendar was adopted. This is a system of organizing days for social, religious, commercial, or administrative purposes. It was developed in part from a lunar calendar (陰曆 "yinli") and in part from a solar calendar (陽曆 "yangli").: The current version of the Chinese calendar was developed for the Chongzhen Emperor in the 17th century. It has: Today in China the Gregorian calendar is used for most activities. At the same time, the Chinese calendar is still used for traditional Chinese holidays like Chinese New Year or Lunar New Year.
	16.369	CCTV New Year's Gala: The CCTV New Year's Gala (Simplified Chi

In [5]:
search(query = "Which US president was killed?")

Input question: Which US president was killed?
Top-3 lexical search (BM25) hits
	10.662	Sylvestre Ntibantunganya: Sylvestre Ntibantunganya (born 8 May 1956) is a Burundi politician. He was Speaker of the National Assembly of Burundi from December 1993 to 1 October 1994. He was also President of Burundi from 6 April 1994 to 25 July 1996 (interim to October 1994).: Ntibantunganya was born in Gishubi, Gitega. He is an ethnic Hutu. He served as foreign minister briefly during 1993.: He came into office when the previous president, Cyprien Ntaryamira, was killed in a plane crash. It was an assassination in which the Rwandan president Juvénal Habyarimana was also killed. Ntibantunganya left office when he was deposed by Pierre Buyoya in a military coup of 1996.: Ntiybantunganya is presently a Senator for life as a former head of state.
	10.135	Jacobo Majluta Azar: Jacobo Majluta Azar (October 9, 1934 – March 2, 1996) was a Dominican politician. He was Vice President of the Dominican Republic

In [6]:
search(query = "Paris eiffel tower")

Input question: Paris eiffel tower
Top-3 lexical search (BM25) hits
	29.006	Paris, Texas: Paris is a city in the U.S. state of Texas. It is in Lamar County, Texas. It had a population of 25,171 in 2010. It has been called the "Second Largest Paris in the World". It has a replica of the Eiffel Tower.
	28.730	Paris, Tennessee: Paris is a city in the U.S. state of Tennessee. It had a population of 25,171 in 2010. It has been called the "World's Biggest Fish Fry". It has a 70-foot replica of the Eiffel Tower.
	27.769	Eiffel Tower: The Eiffel Tower (French: La Tour Eiffel, ], IPA pronunciation: "EYE-full" English; "eh-FEHL" French) is a landmark in Paris. It was built between 1887 and 1889 for the Exposition Universelle (World Fair). The Tower was the Exposition's main attraction.: The Eiffel Tower cost 7,799,401.31 French gold francs to build in 1889, an amount equal to $1,495,139.89 at that time. Today, its cost would equal to $36,784,020.11. It took 2 years, 2 months and 5 days to build 

In [7]:
search(query = "Elon Musk year birth")

Input question: Elon Musk year birth
Top-3 lexical search (BM25) hits
	29.963	The Boring Company: The Boring Company is a tunnel boring company founded by Elon Musk, who earlier started SpaceX. It aims to reduce traffic congestion in urban areas. It is involved in the building of the Hyperloop in Los Angeles.: Elon Musk complained via Twitter about Los Angeles traffic and the same day, December 17, 2016, founded the company. It built a short test tunnel in Los Angeles.: To raise money for the expensive project, Elon Musk started selling merchandise - first caps and then flamethrowers (branded as 'Not a flamethrower' to thwart customs) and then fire extinguishers to go with. The company website is written in a lighthearted tone, for example describing the fire extinguishers as 'overpriced'.
	27.514	Elon Musk: Elon Reeve Musk (born June 28, 1971) is a businessman and philanthropist. He was born in South Africa. He moved to Canada and later became an American citizen. Musk is the current 

In [8]:
search(query = "Coldest place earth")

Input question: Coldest place earth
Top-3 lexical search (BM25) hits
	20.143	East Antarctica: East Antarctica, also called Greater Antarctica, is the largest part (two-thirds) of the Antarctic continent. It is on the Indian Ocean side of the Transantarctic Mountains. It is the coldest, windiest, and driest part of Earth. East Antarctica holds the record as the coldest place on earth.: East Antartica includes the following areas: Coats Land, Queen Maud Land, Enderby Land, Kemp Land, Mac. Robertson Land, Princess Elizabeth Land, Kaiser Wilhelm II Land, Queen Mary Land, Wilkes Land, Adélie Land, George V Land, Oates Land and Victoria Land. All but a small portion of this region lies within the Eastern Hemisphere.
	13.269	Herschel Space Observatory: The Herschel Space Observatory is a European Space Agency instrument. It is the largest infrared telescope ever put into orbit. Herschel is named after Sir William Herschel, the discoverer of the infrared spectrum, double stars, and planet Uran

In [9]:
search(query = "Oldest US president")

Input question: Oldest US president
Top-3 lexical search (BM25) hits
	11.402	José Celso de Mello Filho: José Celso de Mello Filho (Tatuí, November 1, 1945), is a Brazilian jurist. He is the oldest member of the Supreme Federal Court of Brazil. He was nominated by President José Sarney in 1989.: He became the youngest President of the Court in 1997.
	10.990	President of North Macedonia: The president of the Republic of North Macedonia (; is the head of state of North Macedonia. The presidency of the modern North Macedonia state began after the Macedonian declaration of independence on 8 September 1991.: Its first president was Kiro Gligorov, the oldest president in the world until his resignation in 1999.: North Macedonia's presidency is largely a ceremonial post and the prime minister of North Macedonia is the country's main political figure.: The president must be a citizen of North Macedonia, be over 40 years of age and have lived in North Macedonia for at least ten of the last fifte

In [10]:
search(query = "How many people live in Toronto?")

Input question: How many people live in Toronto?
Top-3 lexical search (BM25) hits
	17.510	Markham, Ontario: Markham, Ontario is a city in Regional Municipality of York, in the Greater Toronto Area of Southern Ontario, Canada. There are twice as many people there as in 1990. 261,573 people live in Markham. It is the 4th largest town in the Greater Toronto Area, after Toronto, Mississauga, and Brampton.
	15.333	The Best Damn Tour: Live in Toronto: The Best Damn Tour – Live in Toronto is a DVD from Avril Lavigne that was recorded in Toronto, Canada on April 7, 2008. The DVD is made up of nineteen songs that Lavigne sang during her "Best Damn Tour". Most of the songs are taken from her third album, "The Best Damn Thing", while two of the songs are cover songs that other artists have sung.
	14.543	Toronto: Toronto is the capital city of the province of Ontario in Canada. It is also the largest city in both Ontario and Canada. Found It is on the north-west side of Lake Ontario.: The City of 

In [11]:
search(query = "How long do cats live?")

Input question: How long do cats live?
Top-3 lexical search (BM25) hits
	23.261	Aging in cats: Reliable information on the lifespans of house cats is hard to find. However, research has been done to get an estimate (an educated guess) on how long cats usually live. Cats usually live for 13 to 20 years. Sometimes cats can live for 22 to 30 years but there are claims of cats dying at ages of more than 30 years old.: The "Guinness World Record" for the oldest cat was for a cat named Creme Puff, who was 38 years old. Female cats seem to live longer than male cats. Neutered cats live longer than cats that have not been neutered. Mixed breed cats also appear to live longer than purebred cats. Researchers have also found that cats that weigh more have shorter lifespans.: People sometimes guess how long a cat will live by comparing it to how long a human usually lives. You can estimate a cat's age in "cat years" by multiplying the cat's age in normal years by 7. A better guess for cat years us

In [12]:
search(query = "When did the cold war end?")

Input question: When did the cold war end?
Top-3 lexical search (BM25) hits
	17.884	Reagan Doctrine: The Reagan Doctrine was a document by the United States under the Reagan Administration. It was about being against the global influence of the Soviet Union during the final years of the Cold War. The doctrine lasted for less than a decade, it was the most important document of United States foreign policy from the early 1980s until the end of the Cold War in 1991.
	14.799	Cold Norton: Cold Norton is a village and civil parish in Maldon District, Essex, England. In 2001 there were 1103 people living in Cold Norton. Cold Norton is at the south-east end of the Danbury Ridge.
	14.355	Call of Duty: Black Ops Cold War: Call of Duty: Black Ops Cold War is an upcoming first-person shooter video game developed by Treyarch and Raven Software and published by Activision. It is the sixth installment of the "Black Ops" series, and the seventeenth installment in the overall "Call of Duty" series.: I

In [13]:
search(query = "Number countries Europe")

Input question: Number countries Europe
Top-3 lexical search (BM25) hits
	16.505	European Court of Human Rights: The European Court of Human Rights (ECtHR) hears and decides particular type of complaints. These complaints relate to abuse of human rights. ECoHR’s other popular name is "Strasbourg Court". Member countries of the Council of Europe created ECoHR to arrange all such complaints, listen to the complaints, and to give decisions. European Convention on Human Rights tells about many human rights. ECoHR sees that people should enjoy all these human rights.: ECoHR' has a number of judges. The number of judges is seven normally but at the case of dealing a great issue, the number will be 21 and the judges are equally from member countries of the Council of Europe. At present, there are forty seven member countries of the Council of Europe. Each country may have one judge in the ECoHR. But, judges work independently for the ECoHR, and not for their country.
	15.834	Eastern Europe: E

In [14]:
search(query = "What is the best orchestra in the world?")

Input question: What is the best orchestra in the world?
Top-3 lexical search (BM25) hits
	17.059	Cleveland Orchestra: The Cleveland Orchestra is an American orchestra based in Cleveland, Ohio. Their conductor is Franz Welser-Möst.: The orchestra was started in 1918 with Nikolai Sokoloff as conductor. In 1931 a new concert hall called the Severance Hall was opened. It has been the orchestra's concert hall ever since.: The conductor George Szell made them into one of the world's best orchestras. When he started conducting them after World War II he told twelve of the players that they had to leave. Another twelve musicians were so angry that they left as well. Szell then improved the orchestra during the 24 years he was with them. Even now the orchestra is so good because of what Szell did for them.: Today's conductor of the Cleveland Orchestra, Franz Welser-Möst, continues to conduct the orchestra in Cleveland as well as touring with them all over the world. They perform regularly in t

In [17]:
search(query = "Documentaries showcasing indigenous peoples' survival and daily life in Arctic regions")

Input question: Documentaries showcasing indigenous peoples' survival and daily life in Arctic regions
Top-3 lexical search (BM25) hits
	21.136	Indigenous peoples in Brazil: People have inhabited modern-day Brazil for over 10,000 years. They had developed several distinct cultures before the arrival of Europeans in the 1500's.: Most indigenous people died quickly after European contact due to Old World diseases, like smallpox.: Indigenous peoples may face discrimination and denial of rights because of their ethnicity or status.
	20.581	Indigenous peoples: Indigenous people means the first people who lived in any region, and not later immigrants.: Indigenous people can also be described as aborigines, native people, first people, first nations and autochthonous.: The United Nations prefers the phrase "Indigenous peoples" to these descriptions. The other descriptions sometimes sound negative (pejorative).: Very often, indigenous people were forced into slavery or badly treated by settler

In [18]:
search(query = "Western romance")

Input question: Western romance
Top-3 lexical search (BM25) hits
	19.727	Western Romance languages: The Western Romance languages are a branch of Romance languages. The main languages in the branch are Spanish, French, and Portuguese. The branch has two parts, Gallo-Romance and Iberian Romance.
	17.419	Italo-Western languages: The Italo-Western languages are the largest branch of Romance languages. They are made up of two branches, the Italo-Dalmation languages and the Western Romance languages.
	16.236	Romance languages: The Romance languages (also sometimes called Romanic languages) are a language family in the Indo-European languages. They started from Vulgar Latin (in Latin, "vulgar" is the word for "common" and so "Vulgar Latin" means "Common Latin"). The most spoken Romance languages are Spanish, Portuguese, French, Italian and Romanian.: They are called "Romance languages" because they originate from Latin, the language spoken by the Western Roman Empire. Their grammatical infle

In [19]:
search(query = "Silent film about a Parisian star moving to Egypt, leaving her husband for a baron, and later reconciling after finding her family in poverty in Cairo.")

Input question: Silent film about a Parisian star moving to Egypt, leaving her husband for a baron, and later reconciling after finding her family in poverty in Cairo.
Top-3 lexical search (BM25) hits
	24.433	Sylvain Sylvain: Sylvain Mizrahi (born February 14, 1951), better known as Sylvain Sylvain, is an American rock musician. He is most notable for being a member of the New York Dolls. He was a guitarist and pianist for the group. He was a member from 1971 to 1976. He has since had a solo career since the band's first split up. From 2004 to 2011, the New York Dolls surviving members regrouped.: Sylvain was born in Cairo, Egypt to a Jewish family. In the 1950s, his family fled Egypt to live in France, before moving to the U.S. state of New York. In April 2019, Sylvain made it known to the public that he had cancer.
	24.067	Cairo Governorate: Cairo Governorate is a governorate of Egypt. Its capital, the city of Cairo, is also the national capital of Egypt.
	23.594	Samir Farid: Samir F

In [20]:
search(query = "Comedy film, office disguises, boss's daughter, elopement.")

Input question: Comedy film, office disguises, boss's daughter, elopement.
Top-3 lexical search (BM25) hits
	32.074	My Boss's Daughter: My Boss's Daughter is a 2003 romantic comedy movie in which Ashton Kutcher, Molly Shannon and Tara Reid star. The subject in this romantic comedy is researchers working at a publishing company. One researcher, Tom, carries a crush on the daughter of the boss. This movie was released in August 2003. Its budget was $14 million; it got back over $18 million. Reviews, however, were rather substandard. The original of this movie was rated PG-13, though an R-rated version of this exists.
	25.230	Elopement (movie): Elopement is a 1951 American romantic comedy movie directed by Henry Koster and starring Clifton Webb, Anne Francis, Charles Bickford, William Lundigan. It was distributed by 20th Century Fox.
	22.019	Elopement (marriage): An elopement is a marriage done in secret or private. Elopement may be an alternative to a large expensive wedding. It may also

In [22]:
search (query = "Lost film, Cleopatra charms Caesar, plots world rule, treasures from mummy, revels with Antony, tragic end with serpent in Alexandria.")

Input question: Lost film, Cleopatra charms Caesar, plots world rule, treasures from mummy, revels with Antony, tragic end with serpent in Alexandria.
Top-3 lexical search (BM25) hits
	43.256	Cleopatra VII: Cleopatra, Queen of Egypt, was one of the most famous women in history. Her full name was "Cleopatra VII Thea Philopator" (69 BC – 12 August 30 BC). She was the last of the Ptolemaic dynasty of Pharaohs set up in Egypt after the death of Alexander the Great. By descent, she was a Macedonian Princess. After her death, Egypt became the Roman province of "Aegyptus".: The main historical source for her life is Plutarch's "Life of Antony", available in translations.: "Antony and Cleopatra" is the famous tragedy by William Shakespeare, believed to have been written sometime between 1603 and 1607. It was first printed in 1623.: Cleopatra was born in Alexandria, then the capital of Egypt. When she was 18 years old, her father, who was king, died. She and her brother, Ptolemy XIII, became th

In [23]:
search (query = "Denis Gage Deane-Tanner")

Input question: Denis Gage Deane-Tanner
Top-3 lexical search (BM25) hits
	17.872	Gage, Oklahoma: Gage is a town of Oklahoma in the United States.
	16.931	Thomas Gage: General Thomas Gage (10 March 1718/19 – 2 April 1787) was a British Army general officer and colonial official best known for his many years of service in North America, including his role as British commander-in-chief in the early days of the American Revolution.: Thomas Gage, on February 20, 1773, already communicated to the governor of Louisiana, Luis de Unzaga y Amézaga 'le Conciliateur', his intention to return to the United Kingdom with his family, a fact that occurred 4 months later, in June Therefore, Gage was not present when the Boston Tea Party took place in December of that year, a city in which both Gage and Unzaga left confidants to be informed by their respective spy networks.
	16.606	Gage County, Nebraska: Gage County is a county in the U.S. state of Nebraska. As of the 2010 census, 22,311 people lived the

In [29]:
import numpy as np

def search(query, verbose=True):
    if verbose:
        print("Input question:", query)

    ##### BM25 search (lexical search) #####
    bm25_scores = bm25.get_scores(bm25_tokenizer(query))
    top_n = np.argpartition(bm25_scores, -5)[-5:]
    bm25_hits = [{'corpus_id': idx, 'score': bm25_scores[idx]} for idx in top_n]
    bm25_hits = sorted(bm25_hits, key=lambda x: x['score'], reverse=True)

    if verbose:
        print("Top-3 lexical search (BM25) hits")
        for hit in bm25_hits[0:3]:
            print("\t{:.3f}\t{}".format(hit['score'], passages[hit['corpus_id']]))

    ##### Semantic Search #####
    # Encode the query using the bi-encoder and find potentially relevant passages
    question_embedding = bi_encoder.encode(query, convert_to_tensor=True)
    question_embedding = question_embedding.cuda()
    hits = util.semantic_search(question_embedding, corpus_embeddings, top_k=top_k)
    hits = hits[0]  # Get the hits for the first query

    ##### Re-Ranking #####
    # Now, score all retrieved passages with the cross_encoder
    cross_inp = [[query, passages[hit['corpus_id']]] for hit in hits]
    cross_scores = cross_encoder.predict(cross_inp)

    # Sort results by the cross-encoder scores
    for idx in range(len(cross_scores)):
        hits[idx]['cross-score'] = cross_scores[idx]

    if verbose:
        # Output of top-5 hits from bi-encoder
        print("\n-------------------------\n")
        print("Top-3 Bi-Encoder Retrieval hits")
        hits = sorted(hits, key=lambda x: x['score'], reverse=True)
        for hit in hits[0:3]:
            print("\t{:.3f}\t{}".format(hit['score'], passages[hit['corpus_id']]))

        # Output of top-5 hits from re-ranker
        print("\n-------------------------\n")
        print("Top-3 Cross-Encoder Re-ranker hits")
        hits = sorted(hits, key=lambda x: x['cross-score'], reverse=True)
        for hit in hits[0:3]:
            print("\t{:.3f}\t{}".format(hit['cross-score'], passages[hit['corpus_id']]))

    return bm25_hits, hits

def evaluate_recall_and_mrr(queries):
    recall_bm25 = 0
    mrr_bm25 = 0
    recall_reranker = 0
    mrr_reranker = 0
    total_queries = len(queries)

    for query in queries:
        print("Evaluating query:", query)
        bm25_hits, reranker_hits = search(query, verbose=False)

        relevant_documents_bm25 = set([hit['corpus_id'] for hit in bm25_hits[:1]])
        relevant_documents_reranker = set([hit['corpus_id'] for hit in reranker_hits[:1]])

        # Calculate Recall@1 and MRR for BM25
        recall_bm25 += len(relevant_documents_bm25 & relevant_documents_reranker) / len(relevant_documents_bm25)

        rr_bm25 = 0
        for doc_id in relevant_documents_bm25:
            try:
                rr_bm25 += 1 / (bm25_hits.index({'corpus_id': doc_id}) + 1)
            except ValueError:
                pass

        mrr_bm25 += rr_bm25 / len(relevant_documents_bm25) if relevant_documents_bm25 else 0

        # Calculate Recall@1 and MRR for Re-ranker
        recall_reranker += len(relevant_documents_bm25 & relevant_documents_reranker) / len(relevant_documents_reranker)

        rr_reranker = 0
        for doc_id in relevant_documents_reranker:
            try:
                rr_reranker += 1 / (reranker_hits.index({'corpus_id': doc_id}) + 1)
            except ValueError:
                pass

        mrr_reranker += rr_reranker / len(relevant_documents_reranker) if relevant_documents_reranker else 0

    # Average Recall and MRR over all queries
    recall_bm25 /= total_queries
    mrr_bm25 /= total_queries
    recall_reranker /= total_queries
    mrr_reranker /= total_queries

    print("Recall@1 for BM25:", recall_bm25)
    print("MRR for BM25:", mrr_bm25)
    print("Recall@1 for Re-ranker:", recall_reranker)
    print("MRR for Re-ranker:", mrr_reranker)

queries = ["Documentaries showcasing indigenous peoples' survival and daily life in Arctic regions?",
    "Western romance",
    "Silent film about a Parisian star moving to Egypt, leaving her husband for a baron, and laterreconciling after finding her family in poverty in Cairo",
           "Comedy film, office disguises, boss's daughter, elopement.",
    "Lost film, Cleopatra charms Caesar, plots world rule, treasures from mummy, revels with Antony, tragic end withserpent in Alexandria.",
    "Denis Gage Deane-Tanner"]

evaluate_recall_and_mrr(queries)


Evaluating query: Documentaries showcasing indigenous peoples' survival and daily life in Arctic regions?
Evaluating query: Western romance
Evaluating query: Silent film about a Parisian star moving to Egypt, leaving her husband for a baron, and laterreconciling after finding her family in poverty in Cairo
Evaluating query: Comedy film, office disguises, boss's daughter, elopement.
Evaluating query: Lost film, Cleopatra charms Caesar, plots world rule, treasures from mummy, revels with Antony, tragic end withserpent in Alexandria.
Evaluating query: Denis Gage Deane-Tanner
Recall@1 for BM25: 0.16666666666666666
MRR for BM25: 0.0
Recall@1 for Re-ranker: 0.16666666666666666
MRR for Re-ranker: 0.0
