# Question-Answering using Simple Wikipedia Index

**This script uses the smaller Simple English Wikipedia as the document collection to provide answers to user questions / search queries**. This example demonstrates the setup for Query / Question-Answer Retrieval. First, we split all Wikipedia articles into paragraphs and encode them with a bi-encoder *(Min comment: get sentence embedding)*. When a new query / question is entered, it is encoded by the same bi-encoder and the paragraphs with the highest cosine similarity are retrieved (see semantic search). Next, the retrieved candidates are scored by a Cross-Encoder re-ranker, and the 5 passages with the highest Cross-Encoder score are presented to the user.

https://colab.research.google.com/drive/1l6stpYdRMmeDBK_vw0L5NitdiAuhdsAr?usp=sharing

You can input a query or a question. The script then uses semantic search to find relevant passages in Simple English Wikipedia (as it is smaller and fits better in RAM).

For semantic search, we use SentenceTransformer('msmarco-distilbert-base-v2') and retrieve 100 passages that potentially answer the input query.

Next, we use a more powerful CrossEncoder (cross_encoder = CrossEncoder('cross-encoder/ms-marco-TinyBERT-L-6')) that
scores the query and all retrieved passages for their relevancy. The cross-encoder is necessary to filter out certain noise
that might be retrieved from the semantic search step.

In [1]:
import pandas as pd

In [None]:
!pip install -U sentence-transformers

In [4]:
from sentence_transformers import SentenceTransformer, CrossEncoder, util
import torch

# Print a warning (without stopping the notebook) when PyTorch reports no CUDA device.
if not torch.cuda.is_available():
    print("Warning: No GPU found. Please add GPU to your notebook")

In [5]:
!pip install rank-bm25

Collecting rank-bm25
  Downloading rank_bm25-0.2.1-py3-none-any.whl (8.5 kB)
Installing collected packages: rank-bm25
Successfully installed rank-bm25-0.2.1


In [6]:
from rank_bm25 import BM25Okapi
#from sklearn.feature_extraction import stop_words
import string
from tqdm.autonotebook import tqdm
import numpy as np

# Checking the Data

In [None]:
# Semicolon-separated CSV export that serves as the document collection.
# Local alternative used by the author: D:/python_working_dir/nlp/data/open_dmg_9.csv
embeddings_filepath = '/content/close_defects_small.csv'

df = pd.read_csv(embeddings_filepath, sep=";", encoding='utf-8')

# Replace missing values in the free-text columns with empty strings so the
# string concatenation below does not produce NaN.
text_columns = ['Skadebeskrivning', 'Skaderubrik', 'Åtgärdsbeskrivning']
df[text_columns] = df[text_columns].fillna(value='')

# One searchable passage per row: 'Skaderubrik' followed by 'Skadebeskrivning'.
df['Skade_text'] = df['Skaderubrik'] + ' ' + df['Skadebeskrivning']

# Keep only rows that actually have a passage text.
has_text = df['Skade_text'].notna()
df = df[has_text]

# Plain Python list of passages; the search functions index into it by position.
passages = df['Skade_text'].tolist()

# Show the first rows as the cell output.
df.head()

# Empty Memory of GPU

In [None]:
# Release the unused GPU memory that PyTorch's caching allocator is holding.
import torch
torch.cuda.empty_cache()

# Custom Function to Run Multiple Models 

In [None]:
def model_search(query, bi_encoder_name, cross_encoder_name, top_k_biencoder, top_n_res):
    """Print BM25, bi-encoder and cross-encoder search results for ``query``.

    Searches the module-level ``passages`` list and prints the best hits of
    each method to stdout. Both models are loaded and the whole corpus is
    encoded on every call, so a call can take a while.

    Args:
        query: Free-text question or search string.
        bi_encoder_name: Model name handed to ``SentenceTransformer``.
        cross_encoder_name: Model name handed to ``CrossEncoder``.
        top_k_biencoder: Number of candidates the bi-encoder retrieves; all of
            them are re-scored by the cross-encoder.
        top_n_res: Number of hits printed per method.

    Returns:
        None. All results are printed.
    """
    top_k = int(top_k_biencoder)
    top_n_res = int(top_n_res)

    bi_encoder = SentenceTransformer(bi_encoder_name)
    cross_encoder = CrossEncoder(cross_encoder_name)

    # Corpus embeddings are computed from scratch (slow without a GPU).
    corpus_embeddings = bi_encoder.encode(passages, batch_size=32, convert_to_tensor=True, show_progress_bar=True)

    print("Input question:", query)

    ##### BM25 search (lexical search) #####
    # The index is built here so the function does not depend on a global
    # `bm25` object / tokenizer that may not exist when this cell is run.
    def _tokenize(text):
        stripped = (token.strip(string.punctuation) for token in text.lower().split())
        return [token for token in stripped if token]

    bm25 = BM25Okapi([_tokenize(passage) for passage in passages])
    bm25_scores = bm25.get_scores(_tokenize(query))

    # np.argpartition raises if asked for more elements than exist, and a
    # slice of [-0:] would return everything, so clamp and guard explicitly.
    n_hits = min(top_n_res, len(passages))
    top_n = np.argpartition(bm25_scores, -n_hits)[-n_hits:] if n_hits > 0 else []
    bm25_hits = [{'corpus_id': idx, 'score': bm25_scores[idx]} for idx in top_n]
    bm25_hits = sorted(bm25_hits, key=lambda x: x['score'], reverse=True)
    print("Top-" + str(top_n_res) + "lexical search (BM25) hits")
    for hit in bm25_hits:
        print("\t{:.3f}\t{}".format(hit['score'], passages[hit['corpus_id']].replace("\n", " ")))

    ##### Semantic Search #####
    # Encode the query with the same bi-encoder and retrieve candidate passages.
    question_embedding = bi_encoder.encode(query, convert_to_tensor=True)
    # Follow the corpus embeddings' device instead of forcing CUDA, so the
    # function also works on CPU-only machines.
    question_embedding = question_embedding.to(corpus_embeddings.device)
    hits = util.semantic_search(question_embedding, corpus_embeddings, top_k=top_k)
    hits = hits[0]  # Get the hits for the first (and only) query

    ##### Re-Ranking #####
    # Score every retrieved passage against the query with the cross-encoder.
    cross_inp = [[query, passages[hit['corpus_id']]] for hit in hits]
    cross_scores = cross_encoder.predict(cross_inp)

    # Attach the cross-encoder score to each hit; sorting happens below.
    for hit, cross_score in zip(hits, cross_scores):
        hit['cross-score'] = cross_score

    # Hits ordered by bi-encoder (cosine similarity) score.
    print("Top-" + str(top_n_res) + "Bi-Encoder Retrieval hits")
    hits = sorted(hits, key=lambda x: x['score'], reverse=True)
    for hit in hits[0:top_n_res]:
        print("\t{:.3f}\t{}".format(hit['score'], passages[hit['corpus_id']].replace("\n", " ")))
        print(hit['corpus_id'])

    # Same candidates, ordered by cross-encoder score.
    print("Top-" + str(top_n_res) + "Cross-Encoder Re-ranker hits")
    hits = sorted(hits, key=lambda x: x['cross-score'], reverse=True)
    for hit in hits[0:top_n_res]:
        print("\t{:.3f}\t{}".format(hit['cross-score'], passages[hit['corpus_id']].replace("\n", " ")))

In [None]:
# Bi-encoder checkpoints to compare (used for the first retrieval stage).
bi_encoder_model_list = ["paraphrase-multilingual-mpnet-base-v2", "sentence-transformers/stsb-xlm-r-multilingual", 
                         "sentence-transformers/msmarco-distilbert-multilingual-en-de-v2-tmp-lng-aligned"]
# Cross-encoder checkpoints to compare (used for re-ranking).
cross_encoder_model_list = ["cross-encoder/ms-marco-TinyBERT-L-6", 
                            "cross-encoder/quora-roberta-large", 
                            "cross-encoder/qnli-electra-base",
                           "cross-encoder/stsb-roberta-large"]

top_k = 70     # Number of passages we want to retrieve with the bi-encoder
query = "Vilka fel ligger på vagn 2"  # alternative query tried: "Vilka fel på vagn a2"

# Single run: first bi-encoder combined with the first cross-encoder.
model_search(query, bi_encoder_name = bi_encoder_model_list[0], cross_encoder_name = cross_encoder_model_list[0], 
             top_k_biencoder=top_k, top_n_res=10)
# NOTE: the full grid below (every bi-encoder x every cross-encoder) ran into
# memory issues and could not be completed. The comprehension is used only for
# its side effects; the resulting list of return values is not needed.
[model_search(query, bi_encoder_name, cross_encoder_name, top_k_biencoder=top_k, top_n_res=10) 
 for bi_encoder_name in bi_encoder_model_list for cross_encoder_name in cross_encoder_model_list]        

In [None]:
# "cross-encoder/quora-roberta-large" : better
# Re-run with the first bi-encoder and the second cross-encoder in the list;
# `query` and `top_k` come from the previous cell.
model_search(query, bi_encoder_name = bi_encoder_model_list[0], cross_encoder_name = cross_encoder_model_list[1], 
             top_k_biencoder=top_k, top_n_res=10)

In [None]:
# Same candidate lists as in the earlier comparison cell, repeated so this cell
# can be run on its own (it still needs `query` and `top_k` from that cell).
bi_encoder_model_list = ["paraphrase-multilingual-mpnet-base-v2", "sentence-transformers/stsb-xlm-r-multilingual", 
                         "sentence-transformers/msmarco-distilbert-multilingual-en-de-v2-tmp-lng-aligned"]
cross_encoder_model_list = ["cross-encoder/ms-marco-TinyBERT-L-6", 
                            "cross-encoder/quora-roberta-large", 
                            "cross-encoder/qnli-electra-base",
                           "cross-encoder/stsb-roberta-large"]

# Author's note: bi-encoder "sentence-transformers/stsb-xlm-r-multilingual" gave good results.
# Here it is combined with the third cross-encoder ("cross-encoder/qnli-electra-base").
model_search(query, bi_encoder_name = bi_encoder_model_list[1], cross_encoder_name = cross_encoder_model_list[2], 
             top_k_biencoder=top_k, top_n_res=10)

# gradio

## Fixed Models

In [None]:
import json
from sentence_transformers import SentenceTransformer, CrossEncoder, util
import torch

from rank_bm25 import BM25Okapi
   #from sklearn.feature_extraction import stop_words
import string
from tqdm.autonotebook import tqdm
import numpy as np
import pandas as pd

import gradio as gr


def bm25_tokenizer(text):
    """Split ``text`` into lower-cased tokens for BM25 indexing and querying.

    The text is lower-cased, split on whitespace, and leading/trailing ASCII
    punctuation is stripped from every token. Tokens that end up empty
    (stand-alone punctuation such as "-" or "...") are dropped so they do not
    become a bogus vocabulary entry or inflate document lengths.

    No stop-word filtering is applied.

    Args:
        text: The passage or query string to tokenize.

    Returns:
        A list of non-empty token strings, in their original order.
    """
    stripped = (token.strip(string.punctuation) for token in text.lower().split())
    return [token for token in stripped if token]

def model_search(query):
    """Search the defects CSV for passages matching ``query`` (Gradio callback).

    Runs three rankings over the same corpus: BM25 (lexical), bi-encoder
    cosine similarity, and cross-encoder re-ranking of the bi-encoder
    candidates. The model names, the number of bi-encoder candidates (100)
    and the number of reported hits (10) are fixed.

    NOTE(review): the CSV is re-read, both models are re-loaded and the whole
    corpus is re-encoded on every call; hoisting that out of the callback
    would make queries much faster.

    Args:
        query: Free-text question or search string.

    Returns:
        A tuple ``(bm25_output, bi_encoder_output, cross_encoder_output)`` of
        lists of formatted result strings, best hit first, at most 10 each.
    """
    top_n_res = 10  # Hits reported per ranking method
    top_k = 100     # Number of passages we want to retrieve with the bi-encoder

    embeddings_filepath = '/content/close_defects_small.csv'  # D:/python_working_dir/nlp/data/open_dmg_9.csv
    df = pd.read_csv(embeddings_filepath, encoding='utf-8', sep=";")
    # Missing text cells become empty strings so the concatenation never yields NaN.
    text_columns = ['Skadebeskrivning', 'Skaderubrik', 'Åtgärdsbeskrivning']
    df[text_columns] = df[text_columns].fillna(value='')
    df['Skade_text'] = df['Skaderubrik'] + ' ' + df['Skadebeskrivning']
    df = df[df['Skade_text'].notna()]

    # `corpus_id` values below are positions in `passages`. Keep the action
    # texts in a parallel list so they are looked up by position too, rather
    # than by DataFrame index label (which differs once rows are filtered).
    passages = df['Skade_text'].tolist()
    actions = df['Åtgärdsbeskrivning'].tolist()

    bi_encoder = SentenceTransformer("paraphrase-multilingual-mpnet-base-v2")
    # The cross-encoder re-ranks the bi-encoder candidates to improve quality.
    cross_encoder = CrossEncoder("cross-encoder/ms-marco-TinyBERT-L-6")

    # Corpus embeddings are computed from scratch (slow without a GPU).
    corpus_embeddings = bi_encoder.encode(passages, batch_size=32, convert_to_tensor=True, show_progress_bar=True)

    ##### BM25 search (lexical search) #####
    tokenized_corpus = [bm25_tokenizer(passage) for passage in tqdm(passages)]
    bm25 = BM25Okapi(tokenized_corpus)
    bm25_scores = bm25.get_scores(bm25_tokenizer(query))

    # np.argpartition raises when asked for more elements than exist, so clamp
    # the hit count to the corpus size.
    n_hits = min(top_n_res, len(passages))
    top_n = np.argpartition(bm25_scores, -n_hits)[-n_hits:] if n_hits > 0 else []
    bm25_hits = [{'corpus_id': idx, 'score': bm25_scores[idx]} for idx in top_n]
    bm25_hits = sorted(bm25_hits, key=lambda x: x['score'], reverse=True)
    print("Top-" + str(10) + "lexical search (BM25) hits")
    bm25_output = [
        str(round(hit['score'], 2)) + " , " + passages[hit['corpus_id']]
        for hit in bm25_hits
    ]

    ##### Semantic Search #####
    # Encode the query with the same bi-encoder and retrieve candidate passages.
    question_embedding = bi_encoder.encode(query, convert_to_tensor=True)
    # Follow the corpus embeddings' device instead of forcing CUDA, so the
    # callback also works on CPU-only machines.
    question_embedding = question_embedding.to(corpus_embeddings.device)
    hits = util.semantic_search(question_embedding, corpus_embeddings, top_k=top_k)
    hits = hits[0]  # Get the hits for the first (and only) query

    ##### Re-Ranking #####
    # Score every retrieved passage against the query with the cross-encoder.
    cross_inp = [[query, passages[hit['corpus_id']]] for hit in hits]
    cross_scores = cross_encoder.predict(cross_inp)

    # Attach the cross-encoder score to each hit; sorting happens below.
    for hit, cross_score in zip(hits, cross_scores):
        hit['cross-score'] = cross_score

    # Hits ordered by bi-encoder (cosine similarity) score.
    print("Top-" + str(10) + "Bi-Encoder Retrieval hits")
    hits = sorted(hits, key=lambda x: x['score'], reverse=True)
    bi_encoder_output = []
    for hit in hits[0:top_n_res]:
        line_bi = str(round(hit['score'], 2)) + " , " + passages[hit['corpus_id']] + " . " + "Åtgärder: " + actions[hit['corpus_id']] + "\n  "
        bi_encoder_output.append(line_bi)

    # Same candidates, ordered by cross-encoder score.
    print("Top-" + str(10) + "Cross-Encoder Re-ranker hits")
    hits = sorted(hits, key=lambda x: x['cross-score'], reverse=True)
    cross_encoder_output = []
    for hit in hits[0:top_n_res]:
        line_c = str(round(hit['cross-score'], 2)) + " , " + passages[hit['corpus_id']] + " . " + "Åtgärder: " + actions[hit['corpus_id']] + "\n  "
        cross_encoder_output.append(line_c)

    return bm25_output, bi_encoder_output, cross_encoder_output


# Gradio UI around model_search: a single text box for the query and three
# text outputs, one per list returned by model_search (BM25, bi-encoder,
# cross-encoder).
iface = gr.Interface(fn=model_search, inputs=["text"], outputs=["text"] * 3)
iface.launch(debug=True)


## Non-fixed Models

In [None]:
import json
from sentence_transformers import SentenceTransformer, CrossEncoder, util
import torch

from rank_bm25 import BM25Okapi
   #from sklearn.feature_extraction import stop_words
import string
from tqdm.autonotebook import tqdm
import numpy as np
import pandas as pd

import gradio as gr


def bm25_tokenizer(text):
    """Split ``text`` into lower-cased tokens for BM25 indexing and querying.

    The text is lower-cased, split on whitespace, and leading/trailing ASCII
    punctuation is stripped from every token. Tokens that end up empty
    (stand-alone punctuation such as "-" or "...") are dropped so they do not
    become a bogus vocabulary entry or inflate document lengths.

    No stop-word filtering is applied.

    Args:
        text: The passage or query string to tokenize.

    Returns:
        A list of non-empty token strings, in their original order.
    """
    stripped = (token.strip(string.punctuation) for token in text.lower().split())
    return [token for token in stripped if token]

def model_search(query, bi_encoder_name, cross_encoder_name, top_k_biencoder, top_n_res):
    """Search the defects CSV for passages matching ``query`` (Gradio callback).

    Runs BM25 (lexical), bi-encoder and cross-encoder rankings with the given
    models. Every result line is printed to stdout and also collected into the
    returned text, so the single Gradio text output actually shows the
    results instead of staying empty.

    NOTE(review): the CSV is re-read, both models are re-loaded and the whole
    corpus is re-encoded on every call.

    Args:
        query: Free-text question or search string.
        bi_encoder_name: Model name handed to ``SentenceTransformer``.
        cross_encoder_name: Model name handed to ``CrossEncoder``.
        top_k_biencoder: Number of bi-encoder candidates to retrieve and
            re-rank; converted with ``int()`` because Gradio number inputs
            may arrive as floats.
        top_n_res: Number of hits reported per method; converted with ``int()``.

    Returns:
        The printed report as one newline-joined string.
    """
    top_k = int(top_k_biencoder)     # Number of passages we want to retrieve with the bi-encoder
    top_n_res = int(top_n_res)

    report = []

    def emit(line):
        # Echo to stdout and keep the line for the return value.
        print(line)
        report.append(line)

    embeddings_filepath = '/content/close_defects_small.csv'  # D:/python_working_dir/nlp/data/open_dmg_9.csv
    df = pd.read_csv(embeddings_filepath, encoding='utf-8', sep=";")
    # Missing text cells become empty strings so the concatenation never yields NaN.
    text_columns = ['Skadebeskrivning', 'Skaderubrik']
    df[text_columns] = df[text_columns].fillna(value='')
    df['Skade_text'] = df['Skaderubrik'] + ' ' + df['Skadebeskrivning']
    df = df[df['Skade_text'].notna()]
    passages = df['Skade_text'].tolist()

    bi_encoder = SentenceTransformer(bi_encoder_name)
    # The cross-encoder re-ranks the bi-encoder candidates to improve quality.
    cross_encoder = CrossEncoder(cross_encoder_name)

    # Corpus embeddings are computed from scratch (slow without a GPU).
    corpus_embeddings = bi_encoder.encode(passages, batch_size=32, convert_to_tensor=True, show_progress_bar=True)

    ##### BM25 search (lexical search) #####
    tokenized_corpus = [bm25_tokenizer(passage) for passage in tqdm(passages)]
    bm25 = BM25Okapi(tokenized_corpus)
    bm25_scores = bm25.get_scores(bm25_tokenizer(query))

    # np.argpartition raises when asked for more elements than exist, and a
    # slice of [-0:] would return everything, so clamp and guard explicitly.
    n_hits = min(top_n_res, len(passages))
    top_n = np.argpartition(bm25_scores, -n_hits)[-n_hits:] if n_hits > 0 else []
    bm25_hits = [{'corpus_id': idx, 'score': bm25_scores[idx]} for idx in top_n]
    bm25_hits = sorted(bm25_hits, key=lambda x: x['score'], reverse=True)

    emit("Input question: " + str(query))

    emit("Top-" + str(top_n_res) + "lexical search (BM25) hits")
    for hit in bm25_hits:
        emit("\t{:.3f}\t{}".format(hit['score'], passages[hit['corpus_id']].replace("\n", " ")))

    ##### Semantic Search #####
    # Encode the query with the same bi-encoder and retrieve candidate passages.
    question_embedding = bi_encoder.encode(query, convert_to_tensor=True)
    # Follow the corpus embeddings' device instead of forcing CUDA, so the
    # callback also works on CPU-only machines.
    question_embedding = question_embedding.to(corpus_embeddings.device)
    hits = util.semantic_search(question_embedding, corpus_embeddings, top_k=top_k)
    hits = hits[0]  # Get the hits for the first (and only) query

    ##### Re-Ranking #####
    # Score every retrieved passage against the query with the cross-encoder.
    cross_inp = [[query, passages[hit['corpus_id']]] for hit in hits]
    cross_scores = cross_encoder.predict(cross_inp)

    # Attach the cross-encoder score to each hit; sorting happens below.
    for hit, cross_score in zip(hits, cross_scores):
        hit['cross-score'] = cross_score

    # Hits ordered by bi-encoder (cosine similarity) score.
    emit("Top-" + str(top_n_res) + "Bi-Encoder Retrieval hits")
    hits = sorted(hits, key=lambda x: x['score'], reverse=True)
    for hit in hits[0:top_n_res]:
        emit("\t{:.3f}\t{}".format(hit['score'], passages[hit['corpus_id']].replace("\n", " ")))
        emit(str(hit['corpus_id']))

    # Same candidates, ordered by cross-encoder score.
    emit("Top-" + str(top_n_res) + "Cross-Encoder Re-ranker hits")
    hits = sorted(hits, key=lambda x: x['cross-score'], reverse=True)
    for hit in hits[0:top_n_res]:
        emit("\t{:.3f}\t{}".format(hit['cross-score'], passages[hit['corpus_id']].replace("\n", " ")))

    return "\n".join(report)
    

# Gradio UI around model_search. Inputs follow the function's parameter order:
# three text boxes (query, bi-encoder name, cross-encoder name) and two number
# fields (top_k_biencoder, top_n_res); there is a single text output.
iface = gr.Interface(
    fn=model_search,
    inputs=["text"] * 3 + ["number"] * 2,
    outputs=["text"],
)
iface.launch(debug=True)

### Return

In [None]:
import json
from sentence_transformers import SentenceTransformer, CrossEncoder, util
import torch

from rank_bm25 import BM25Okapi
   #from sklearn.feature_extraction import stop_words
import string
from tqdm.autonotebook import tqdm
import numpy as np
import pandas as pd

import gradio as gr


def bm25_tokenizer(text):
    """Split ``text`` into lower-cased tokens for BM25 indexing and querying.

    The text is lower-cased, split on whitespace, and leading/trailing ASCII
    punctuation is stripped from every token. Tokens that end up empty
    (stand-alone punctuation such as "-" or "...") are dropped so they do not
    become a bogus vocabulary entry or inflate document lengths.

    No stop-word filtering is applied.

    Args:
        text: The passage or query string to tokenize.

    Returns:
        A list of non-empty token strings, in their original order.
    """
    stripped = (token.strip(string.punctuation) for token in text.lower().split())
    return [token for token in stripped if token]

def model_search(query, bi_encoder_name, cross_encoder_name, top_k_biencoder, top_n_res):
    """Search the defects CSV for passages matching ``query`` (Gradio callback).

    Runs three rankings over the same corpus with the given models: BM25
    (lexical), bi-encoder cosine similarity, and cross-encoder re-ranking of
    the bi-encoder candidates.

    NOTE(review): the CSV is re-read, both models are re-loaded and the whole
    corpus is re-encoded on every call.

    Args:
        query: Free-text question or search string.
        bi_encoder_name: Model name handed to ``SentenceTransformer``.
        cross_encoder_name: Model name handed to ``CrossEncoder``.
        top_k_biencoder: Number of bi-encoder candidates to retrieve and
            re-rank; converted with ``int()`` because Gradio number inputs
            may arrive as floats.
        top_n_res: Number of hits reported per method; converted with ``int()``.

    Returns:
        A tuple ``(bm25_output, bi_encoder_output, cross_encoder_output)`` of
        lists of formatted result strings, best hit first, at most
        ``top_n_res`` each. Only the cross-encoder lines include the
        'Åtgärdsbeskrivning' text.
    """
    top_k = int(top_k_biencoder)     # Number of passages we want to retrieve with the bi-encoder
    top_n_res = int(top_n_res)

    embeddings_filepath = '/content/close_defects_small.csv'  # D:/python_working_dir/nlp/data/open_dmg_9.csv
    df = pd.read_csv(embeddings_filepath, encoding='utf-8', sep=";")
    # Missing text cells become empty strings so the concatenation never yields NaN.
    text_columns = ['Skadebeskrivning', 'Skaderubrik', 'Åtgärdsbeskrivning']
    df[text_columns] = df[text_columns].fillna(value='')
    df['Skade_text'] = df['Skaderubrik'] + ' ' + df['Skadebeskrivning']
    df = df[df['Skade_text'].notna()]

    # `corpus_id` values below are positions in `passages`. Keep the action
    # texts in a parallel list so they are looked up by position too, rather
    # than by DataFrame index label (which differs once rows are filtered).
    passages = df['Skade_text'].tolist()
    actions = df['Åtgärdsbeskrivning'].tolist()

    bi_encoder = SentenceTransformer(bi_encoder_name)
    # The cross-encoder re-ranks the bi-encoder candidates to improve quality.
    cross_encoder = CrossEncoder(cross_encoder_name)

    # Corpus embeddings are computed from scratch (slow without a GPU).
    corpus_embeddings = bi_encoder.encode(passages, batch_size=32, convert_to_tensor=True, show_progress_bar=True)

    ##### BM25 search (lexical search) #####
    tokenized_corpus = [bm25_tokenizer(passage) for passage in tqdm(passages)]
    bm25 = BM25Okapi(tokenized_corpus)
    bm25_scores = bm25.get_scores(bm25_tokenizer(query))

    # np.argpartition raises when asked for more elements than exist, and a
    # slice of [-0:] would return everything, so clamp and guard explicitly.
    n_hits = min(top_n_res, len(passages))
    top_n = np.argpartition(bm25_scores, -n_hits)[-n_hits:] if n_hits > 0 else []
    bm25_hits = [{'corpus_id': idx, 'score': bm25_scores[idx]} for idx in top_n]
    bm25_hits = sorted(bm25_hits, key=lambda x: x['score'], reverse=True)

    print("Top-" + str(top_n_res) + "lexical search (BM25) hits")
    bm25_output = [
        str(round(hit['score'], 2)) + " , " + passages[hit['corpus_id']]
        for hit in bm25_hits
    ]

    ##### Semantic Search #####
    # Encode the query with the same bi-encoder and retrieve candidate passages.
    question_embedding = bi_encoder.encode(query, convert_to_tensor=True)
    # Follow the corpus embeddings' device instead of forcing CUDA, so the
    # callback also works on CPU-only machines.
    question_embedding = question_embedding.to(corpus_embeddings.device)
    hits = util.semantic_search(question_embedding, corpus_embeddings, top_k=top_k)
    hits = hits[0]  # Get the hits for the first (and only) query

    ##### Re-Ranking #####
    # Score every retrieved passage against the query with the cross-encoder.
    cross_inp = [[query, passages[hit['corpus_id']]] for hit in hits]
    cross_scores = cross_encoder.predict(cross_inp)

    # Attach the cross-encoder score to each hit; sorting happens below.
    for hit, cross_score in zip(hits, cross_scores):
        hit['cross-score'] = cross_score

    # Hits ordered by bi-encoder (cosine similarity) score.
    print("Top-" + str(top_n_res) + "Bi-Encoder Retrieval hits")
    hits = sorted(hits, key=lambda x: x['score'], reverse=True)
    bi_encoder_output = [
        str(round(hit['score'], 2)) + " , " + passages[hit['corpus_id']]
        for hit in hits[0:top_n_res]
    ]

    # Same candidates, ordered by cross-encoder score.
    print("Top-" + str(top_n_res) + "Cross-Encoder Re-ranker hits")
    hits = sorted(hits, key=lambda x: x['cross-score'], reverse=True)
    cross_encoder_output = []
    for hit in hits[0:top_n_res]:
        line_c = str(round(hit['cross-score'], 2)) + " , " + passages[hit['corpus_id']] + " . " + "Åtgärder: " + actions[hit['corpus_id']] + "\n  "
        cross_encoder_output.append(line_c)

    return bm25_output, bi_encoder_output, cross_encoder_output

# Gradio UI around model_search. Inputs follow the function's parameter order:
# three text boxes (query, bi-encoder name, cross-encoder name) and two number
# fields (top_k_biencoder, top_n_res). The three text outputs receive the
# BM25, bi-encoder and cross-encoder result lists, in that order.
iface = gr.Interface(
    fn=model_search,
    inputs=["text"] * 3 + ["number"] * 2,
    outputs=["text"] * 3,
)
iface.launch(debug=True)