# Retrieval-Augmented Generation

In [None]:
!pip -q install langchain tiktoken bert-score chromadb InstructorEmbedding unstructured jq datasets rouge-score sentence_transformers
!pip -q install git+https://github.com/huggingface/transformers # need to install from github

In [29]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [4]:
%cd gdrive/MyDrive/

/content/gdrive/MyDrive


In [5]:
from langchain.vectorstores import Chroma
from langchain.chains import RetrievalQA
from langchain.document_loaders import TextLoader
from langchain.document_loaders import PyPDFLoader
from langchain.document_loaders import DirectoryLoader, JSONLoader


from InstructorEmbedding import INSTRUCTOR
from langchain.embeddings import HuggingFaceInstructEmbeddings

  from tqdm.autonotebook import trange


# Creating the vector database

In [6]:
DATA_FOLDER = 'pubmed_qaa_context.json'
loader = JSONLoader(file_path=DATA_FOLDER, jq_schema=".[].context")
docs = loader.load()

In [7]:
# Because of limited ressources we select 100k samples
docs = docs[:100000]

#### Here we will use the HuggingFaceInstructEmbeddings

In [6]:
instructor_embeddings = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-base",
                                                      model_kwargs={"device": "cuda"})

load INSTRUCTOR_Transformer
max_seq_length  512


In [8]:
def get_vector_db(docs, embeddings):
    persist_directory = 'vector_db'

    vectordb = Chroma.from_documents(documents=docs, embedding=embeddings, persist_directory=persist_directory)

    vectordb.persist()
    #vectordb=None

    # now we load the persisted db from disk
    #vectordb = Chroma(persist_directory=persist_directory, embedding_function=embeddings)

    return vectordb

def get_retriever(vectordb):
    retriever = vectordb.as_retriever(search_kwargs={"k": 1})
    return retriever

In [7]:
#%%time

# Use this when it is the first time generating the vector database
#vector_db = get_vector_db(docs, instructor_embeddings)
persist_directory = 'vector_db'
vector_db = Chroma(persist_directory=persist_directory, embedding_function=instructor_embeddings)

In [12]:
!zip -r /vector_db.zip vector_db

  adding: vector_db/ (stored 0%)
  adding: vector_db/chroma.sqlite3 (deflated 34%)
  adding: vector_db/333e2873-2778-4cb3-995c-963fd819821f/ (stored 0%)
  adding: vector_db/333e2873-2778-4cb3-995c-963fd819821f/header.bin (deflated 53%)
  adding: vector_db/333e2873-2778-4cb3-995c-963fd819821f/data_level0.bin (deflated 8%)
  adding: vector_db/333e2873-2778-4cb3-995c-963fd819821f/length.bin (deflated 64%)
  adding: vector_db/333e2873-2778-4cb3-995c-963fd819821f/link_lists.bin (deflated 67%)
  adding: vector_db/333e2873-2778-4cb3-995c-963fd819821f/index_metadata.pickle (deflated 78%)


In [19]:
from google.colab import files
files.download("/vector_db.zip")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Once we have created the vector database, we can create

In [9]:
db_retriever = get_retriever(vector_db)

In [10]:
## Here is an example of how the approach can be used

query = "Are the long-term results of the transanal pull-through equal to those of the transabdominal pull-through?"

In [11]:
retrieved_docs = db_retriever.get_relevant_documents(query)

In [12]:
retrieved_docs[0].page_content

'We administered high-frequency jet ventilation (HFJV) to a tracheal-lung model with connectors of internal diameter 2.5-8.5 mm to simulate ventilation through varying degrees of laryngotracheal stenosis. With reductions in diameter, end-expiratory pressure (EEP) and peak inspiratory pressure (PIP) increased. During supraglottic, translaryngeal, and transtracheal HFJV, respectively, EEP was > or =10 mm Hg at diameters narrower than 5.5, 4.0, and 3.5 cm, and PIP was >20 mm Hg at diameters narrower than 5.5, 3.5, and 3.0 cm. EEP and PIP were greater during supraglottic HFJV than during translaryngeal and transtracheal HFJV (P < 0.01). At diameters of <3.5 and 4.0 cm, respectively, PIP and EEP increased and were significantly greater (P < 0.01) during translaryngeal HFJV than during transtracheal HFJV. In a second experiment, the degree of ventilation and air entrainment was assessed by administering nitrous oxide 4 L/min to the model. Nitrous oxide concentrations were significantly (P < 

Once we have retrieved the most relevant document out of the vector db, we will use the EPFL model Meditron to answer the query (question).

In [13]:
from transformers import AutoConfig, AutoTokenizer, AutoModelForCausalLM, GenerationConfig

In [14]:
tokenizer = AutoTokenizer.from_pretrained("epfl-llm/meditron-7b", token='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', add_eos_token=True)
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


1

In [15]:
model = AutoModelForCausalLM.from_pretrained("epfl-llm/meditron-7b", token='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',
    device_map="cuda")

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

In [31]:
def get_prompt(context, query):
  prompt = f"""Given this information  :
    ### Info : {context}
    Answer this medical question below, provide a reponse for it :
    ### Question : {query}
    ### Response:"""
  return prompt

In [40]:
def model_generate(query, model, tokenizer, retriever):

  retrieved_docs = retriever.get_relevant_documents(query)

  context = retrieved_docs[0].page_content

  text = get_prompt(context, query)

  inputs = tokenizer(
      text,
      return_tensors="pt",
  )
  input_ids = inputs["input_ids"].cuda()

  generation_config = GenerationConfig(
      temperature=0.6,
      top_p=0.95,
      repetition_penalty=1.2,
  )

  generation_output = model.generate(
      input_ids=input_ids,
      generation_config=generation_config,
      return_dict_in_generate=True,
      output_scores=True,
      max_new_tokens=512,
      pad_token_id = 0,
      eos_token_id = 50256,
  )

  response = ''
  for s in generation_output.sequences:
      response += (tokenizer.decode(s) + '\n')

  return response

In [41]:
query = "Are the long-term results of the transanal pull-through equal to those of the transabdominal pull-through?"

response = model_generate(query, model, tokenizer, db_retriever)

print(response)



<s> Given this information  : 
    ### Info : We administered high-frequency jet ventilation (HFJV) to a tracheal-lung model with connectors of internal diameter 2.5-8.5 mm to simulate ventilation through varying degrees of laryngotracheal stenosis. With reductions in diameter, end-expiratory pressure (EEP) and peak inspiratory pressure (PIP) increased. During supraglottic, translaryngeal, and transtracheal HFJV, respectively, EEP was > or =10 mm Hg at diameters narrower than 5.5, 4.0, and 3.5 cm, and PIP was >20 mm Hg at diameters narrower than 5.5, 3.5, and 3.0 cm. EEP and PIP were greater during supraglottic HFJV than during translaryngeal and transtracheal HFJV (P < 0.01). At diameters of <3.5 and 4.0 cm, respectively, PIP and EEP increased and were significantly greater (P < 0.01) during translaryngeal HFJV than during transtracheal HFJV. In a second experiment, the degree of ventilation and air entrainment was assessed by administering nitrous oxide 4 L/min to the model. Nitrous 

# Evaluation

For evaluation we will be using the BERT-score. We will use the pubmed-qa dataset of questions and answers. This will allow us to compare our approach to the fine-tuning approach.

In [22]:
# Here we load a the pubmed_qa dataset processed, with a new field text that is created using the generate_prompt function in the pubmed_qa_processing.py file
from datasets import load_from_disk
data = load_from_disk("dl_chatbot/tokenized_dataset")

In [58]:
data

DatasetDict({
    train: Dataset({
        features: ['question', 'text', 'input_ids', 'attention_mask'],
        num_rows: 221085
    })
    test: Dataset({
        features: ['question', 'text', 'input_ids', 'attention_mask'],
        num_rows: 11637
    })
})

In [48]:
generated_answers = [model_generate(q['question'], model, tokenizer, db_retriever) for q in data['test']]



In [52]:
from bert_score import score

P, R, F1 = score(generated_answers, data['test']['text'], lang='en')

average_score = F1.mean().item()

print("The F1 score for our model : ", average_score)

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


The F1 score for our model :  0.8393945097923279


In [55]:
from rouge_score import rouge_scorer

def calculate_rouge_scores(hypotheses, references):
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

    rouge_scores = {
        'rouge1': {'precision': 0, 'recall': 0, 'fmeasure': 0},
        'rouge2': {'precision': 0, 'recall': 0, 'fmeasure': 0},
        'rougeL': {'precision': 0, 'recall': 0, 'fmeasure': 0},
    }

    total_samples = len(hypotheses)

    for hyp, ref in zip(hypotheses, references):
        scores = scorer.score(hyp, ref)
        for metric in rouge_scores.keys():
            rouge_scores[metric]['precision'] += scores[metric].precision
            rouge_scores[metric]['recall'] += scores[metric].recall
            rouge_scores[metric]['fmeasure'] += scores[metric].fmeasure

    # Calculate average scores
    for metric in rouge_scores.keys():
        rouge_scores[metric]['precision'] /= total_samples
        rouge_scores[metric]['recall'] /= total_samples
        rouge_scores[metric]['fmeasure'] /= total_samples

    return rouge_scores

In [56]:
rouge_scores = calculate_rouge_scores(generated_answers, data['test']['text'])

for metric, scores in rouge_scores.items():
    print(f"{metric}:")
    print(f"  Precision: {scores['precision']:.4f}")
    print(f"  Recall   : {scores['recall']:.4f}")
    print(f"  F1 Score : {scores['fmeasure']:.4f}")

rouge1:
  Precision: 0.6574
  Recall   : 0.1243
  F1 Score : 0.2009
rouge2:
  Precision: 0.3362
  Recall   : 0.0584
  F1 Score : 0.0972
rougeL:
  Precision: 0.4614
  Recall   : 0.0840
  F1 Score : 0.1376
