# Extractive QA

This notebook implements the full Extractive QA pipeline.

## Importing the libraries

In [1]:
from pathlib import Path
from pprint import pprint

from src.repositories import CsvDocumentsRepository
from src.clients import ChromaDatabaseClient
from src.encoders import SentenceTransformersEncoder
from src.indexer import DatabaseIndexer
from src.retriever import VectorSearchRetriever
from src.readers import Reader
from src.pipelines import ExtractiveQAPipeline

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/joao.barroca/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/joao.barroca/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
  from .autonotebook import tqdm as notebook_tqdm


## Constants and Config

In [2]:
# Project root: the parent of the directory the kernel was started in.
# `resolve()` already yields an absolute path.
MAIN_PATH = Path.cwd().resolve().parent

# Input corpus (CSV) and the names of the two columns read from it.
DOCUMENTS_FILEPATH = MAIN_PATH / "data/documents.csv"
DOCUMENT_ID_COLUMN = "document_id"
DOCUMENT_CONTENT_COLUMN = "document_content"

# Sentence encoder, loaded from a directory under the project root.
# Alternative that was tried: ENCODER_MODEL_FILEPATH = "all-MiniLM-L6-v2"
ENCODER_MODEL_FILEPATH = MAIN_PATH / "models/encoders/sentence-transformers/all-mpnet-base-v2-deus"

# Reader model, given as a model identifier string rather than a local path.
READER_MODEL_FILEPATH = "deepset/roberta-base-squad2"

# On-disk location and collection name of the vector database.
PERSIST_PATH = MAIN_PATH / "data/chroma_db"
COLLECTION_NAME = "documents-all-mpnet-base-deus"


## Setup 

In [3]:
# Document source: a CSV file plus the names of its id and content columns.
documents_repository = CsvDocumentsRepository(
    path=DOCUMENTS_FILEPATH,
    document_id_column=DOCUMENT_ID_COLUMN,
    document_content_column=DOCUMENT_CONTENT_COLUMN,
)

# Encoder shared by the indexer and the retriever.
encoder = SentenceTransformersEncoder(
    model_filepath=ENCODER_MODEL_FILEPATH,
)

# Vector database client; the persist path is passed as a plain string.
client = ChromaDatabaseClient(
    collection_name=COLLECTION_NAME,
    persist=True,
    persist_path=str(PERSIST_PATH),
)

# Indexer and retriever are built on the same client and encoder.
indexer = DatabaseIndexer(
    client=client,
    encoder=encoder,
)
retriever = VectorSearchRetriever(
    client=client,
    encoder=encoder,
)

# Reader model that the pipeline pairs with the retriever.
reader = Reader(
    model_filepath=READER_MODEL_FILEPATH,
)

# End-to-end pipeline assembled from the retriever and the reader.
extractive_qa = ExtractiveQAPipeline(
    retriever=retriever,
    reader=reader,
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


## Index the documents

In [5]:
# The database is persistent, so the documents only need to be indexed once.
# Set this flag to True for the run that should populate the collection,
# then switch it back to False so later runs skip the indexing step.
REINDEX_DOCUMENTS = False

if REINDEX_DOCUMENTS:
    docs = documents_repository.get_all()
    indexer.index(documents=docs)

## Run ExtractiveQAPipeline

In [18]:
# Number of answers requested from the pipeline for each question.
TOP_K = 3

questions = [
    "When was the Premier League created?",
    "What is the primary cue for migration?",
    "What did Whitehead believe was the relationship between the individual and social aspects of religion?",
    "When can the EU expect Armenia to attempt to join it?",
    "Pope Sixtus V limited the number of cardinals to?",
    "A progam inside the ROM of a PC is called what?",
    "In which areas are the four standard Serbo-Croatian variants spoken?",
    "Who was appointed justiciar?",
]

for question in questions:
    answers = extractive_qa.run(question=question, top_k=TOP_K, include_contexts=True)

    # Weight each answer's score by the relevance of its context, then rank
    # best-first. `reverse=True` keeps the sort stable, so tied answers stay
    # in the order the pipeline returned them.
    ranked_answers = sorted(
        ((answer.content, answer.score * answer.context.relevance) for answer in answers),
        key=lambda pair: pair[1],
        reverse=True,
    )

    print("Question: ", question)
    print("Answers: ", ranked_answers)
    print()
    # Full answer payloads, in the order returned by the pipeline (not re-ranked).
    pprint([answer.as_dict() for answer in answers])
    print("\n=========================================================\n")

Question:  When was the Premier League created?
Answers:  [('27 May 1992', 0.2723108695059864), ('1992–', 0.2514834761534068), ('1888', 0.09275213617686084)]

[{'content': '1888',
  'context': {'document': {'content': "The world's oldest football competition "
                                      'is the FA Cup, which was founded by C. '
                                      'W. Alcock and has been contested by '
                                      'English teams since 1872. The first '
                                      'official international football match '
                                      'also took place in 1872, between '
                                      'Scotland and England in Glasgow, again '
                                      'at the instigation of C. W. Alcock. '
                                      "England is also home to the world's "
                                      'first football league, which was '
                                      'found