### Installations

In [None]:
!pip install datasets evaluate rouge_score py7zr pinecone-client --quiet
!pip install accelerate -U --quiet

### Set Up

In [1]:
import gc
import re
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datasets import load_dataset, Dataset
import torch
from torch.utils.data import Subset
from transformers import (
      AutoTokenizer, AutoModel,pipeline,
      DataCollatorWithPadding,
      AutoModelForSpeechSeq2Seq,
      BartForConditionalGeneration,
      TrainingArguments,
      Seq2SeqTrainingArguments,
      Trainer
)

from pinecone import Pinecone, ServerlessSpec
import evaluate
from tqdm.auto import tqdm
import warnings
from IPython.display import clear_output

warnings.filterwarnings("ignore")

In [2]:
# Fixed seed, reused by the train/validation/test splits below so they are reproducible.
seed = 42
np.random.seed(seed)

### Data Preparation

#### iCliniq

In [4]:
# Load the 'train' split of the ChatDoctor iCliniq dataset; it is the source of the evaluation questions later on.
icliniq = load_dataset('lavita/ChatDoctor-iCliniq')['train']
icliniq

Dataset({
    features: ['input', 'answer_icliniq', 'answer_chatgpt', 'answer_chatdoctor'],
    num_rows: 7321
})

#### Healthcare Magic

In [3]:
# Carve the Hub 'train' split into 80% train / 10% validation / 10% test, seeded for reproducibility.
HMCDataset = load_dataset('hakeematyab/HealthCareMagicWithSummary-100k')['train'].train_test_split(
    train_size=0.8, seed=seed
)
# Halve the held-out 20%; 'validation' is assigned before 'test' is overwritten.
hmcHeldOut = HMCDataset['test'].train_test_split(train_size=0.5, seed=seed)
HMCDataset['validation'] = hmcHeldOut.pop('train')
HMCDataset['test'] = hmcHeldOut.pop('test')
del hmcHeldOut
HMCDataset

DatasetDict({
    train: Dataset({
        features: ['input', 'preprocessed_output', 'summarized_input', 'summarized_output'],
        num_rows: 89732
    })
    test: Dataset({
        features: ['input', 'preprocessed_output', 'summarized_input', 'summarized_output'],
        num_rows: 11217
    })
    validation: Dataset({
        features: ['input', 'preprocessed_output', 'summarized_input', 'summarized_output'],
        num_rows: 11216
    })
})

In [None]:
def add_ids(batch):
    """Assign consecutive 'HCM-<n>' identifiers to one batch of rows.

    Reads and advances the module-level counter ``start_id`` so that
    successive batches receive non-overlapping id ranges.

    Args:
        batch: Mapping with an 'input' column; only its length is used.

    Returns:
        dict with a single 'ids' key holding one id string per row.
    """
    global start_id
    first = start_id
    # Advance the shared counter so the next batch continues where this one ends.
    start_id = first + len(batch['input'])
    return {'ids': ['HCM-' + str(n) for n in range(first, start_id)]}
# Start numbering at 0; add_ids advances this module-level counter as each batch is processed.
start_id=0
# Adds an 'ids' column. HMCDataset is a DatasetDict, so the counter presumably keeps
# running across its splits — verify the resulting ids if per-split numbering matters.
HMCDataset = HMCDataset.map(add_ids, batched=True)

In [None]:
# Re-number the validation split on its own, starting after the training ids.
# NOTE(review): the train split has 89732 rows (ids HCM-0..HCM-89731), so starting at 89733
# skips one value; `len(HMCDataset['train'])` may have been intended — confirm.
start_id = 89733
HMCDatasetVal = HMCDataset['validation'].map(add_ids, batched=True)

In [6]:
# Sanity check: first ids of the train split and of the re-numbered validation split.
HMCDataset['train']['ids'][:5],HMCDatasetVal['ids'][:5]

(['HCM-0', 'HCM-1', 'HCM-2', 'HCM-3', 'HCM-4'],
 ['HCM-89733', 'HCM-89734', 'HCM-89735', 'HCM-89736', 'HCM-89737'])

#### MedAlpaca

In [7]:
# Carve the flashcards 'train' split into 80% train / 10% validation / 10% test, seeded for reproducibility.
MedAlpacaDataset = load_dataset('medalpaca/medical_meadow_medical_flashcards')['train'].train_test_split(
    train_size=0.8, seed=seed
)
# Halve the held-out 20%; 'validation' is assigned before 'test' is overwritten.
medAlpacaHeldOut = MedAlpacaDataset['test'].train_test_split(train_size=0.5, seed=seed)
MedAlpacaDataset['validation'] = medAlpacaHeldOut['train']
MedAlpacaDataset['test'] = medAlpacaHeldOut['test']
del medAlpacaHeldOut
MedAlpacaDataset

DatasetDict({
    train: Dataset({
        features: ['input', 'output', 'instruction'],
        num_rows: 27164
    })
    test: Dataset({
        features: ['input', 'output', 'instruction'],
        num_rows: 3396
    })
    validation: Dataset({
        features: ['input', 'output', 'instruction'],
        num_rows: 3395
    })
})

In [8]:
# Size of the training split.
len(MedAlpacaDataset['train'])

27164

In [9]:
# Restart the shared id counter before numbering the MedAlpaca rows.
start_id = 0
def add_ids(batch):
    """Assign consecutive 'MedAlpaca-<n>' identifiers to one batch of rows.

    NOTE: this shares its name with the earlier HealthCareMagic helper, so
    from this cell onward ``add_ids`` produces 'MedAlpaca-' ids.

    Reads and advances the module-level counter ``start_id`` so that
    successive batches receive non-overlapping id ranges.

    Args:
        batch: Mapping with an 'input' column; only its length is used.

    Returns:
        dict with a single 'ids' key holding one id string per row.
    """
    global start_id
    first = start_id
    # Advance the shared counter so the next batch continues where this one ends.
    start_id = first + len(batch['input'])
    return {'ids': ['MedAlpaca-' + str(n) for n in range(first, start_id)]}

# Add an 'ids' column to the MedAlpaca data via add_ids (batched).
MedAlpacaDataset = MedAlpacaDataset.map(add_ids, batched=True)

In [10]:
# Re-number the validation split on its own, starting after the training ids.
# NOTE(review): the train split has 27164 rows (ids MedAlpaca-0..MedAlpaca-27163), so starting
# at 27165 skips one value; `len(MedAlpacaDataset['train'])` may have been intended — confirm.
start_id = 27165
MedAlpacaDatasetVal = MedAlpacaDataset['validation'].map(add_ids, batched=True)

In [11]:
# Sanity check: first ids of the train split and of the re-numbered validation split.
MedAlpacaDataset['train']['ids'][:5],MedAlpacaDatasetVal['ids'][:5]

(['MedAlpaca-0', 'MedAlpaca-1', 'MedAlpaca-2', 'MedAlpaca-3', 'MedAlpaca-4'],
 ['MedAlpaca-27165',
  'MedAlpaca-27166',
  'MedAlpaca-27167',
  'MedAlpaca-27168',
  'MedAlpaca-27169'])

### Vectorizing Documents and Indexing to VectorDB

In [5]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Load the pubmedbert-base-embeddings tokenizer and encoder from the HuggingFace Hub;
# both are used by create_embeddings below.
tokenizer = AutoTokenizer.from_pretrained("neuml/pubmedbert-base-embeddings")
model = AutoModel.from_pretrained("neuml/pubmedbert-base-embeddings")

In [7]:
# Move the encoder to the selected device (the cell output is the model's repr).
model.to(device)

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

In [8]:
# Mean Pooling - Take attention mask into account for correct averaging
def meanpooling(output, mask):
    """Average token embeddings over the sequence axis, ignoring masked-out positions.

    Args:
        output: Model output whose first element holds the per-token embeddings.
        mask: Attention mask with one entry per token (non-zero = keep).

    Returns:
        Tensor with one pooled embedding per sequence.
    """
    token_embeddings = output[0]
    # Broadcast the mask across the hidden dimension so it can weight every feature.
    weights = mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    summed = (token_embeddings * weights).sum(dim=1)
    # Clamp the token count so fully-masked rows do not divide by zero.
    counts = weights.sum(dim=1).clamp(min=1e-9)
    return summed / counts

def create_embeddings(text):
    """Encode texts into mean-pooled sentence embeddings.

    Args:
        text: The texts to embed (callers in this notebook pass a list of strings).

    Returns:
        torch.Tensor on ``device`` with one embedding row per input text.
    """
    # Pad only up to the longest sequence in the batch (still truncated at 512 tokens).
    # Padded positions are excluded by the attention mask in both the encoder and the
    # pooling step, so padding every batch to the full 512 tokens only adds compute.
    inputs = tokenizer(text, padding=True, truncation=True, max_length=512, return_tensors='pt').to(device)
    # Inference only: skip gradient tracking.
    with torch.no_grad():
        output = model(**inputs)
    # Collapse the per-token embeddings into one vector per text.
    return meanpooling(output, inputs['attention_mask'])

In [16]:
# Build five sample documents in the "question\nanswer" layout used for indexing.
inputText = [
    row['input'] + '\n' + row['output']
    for row in (MedAlpacaDataset['train'][i] for i in range(5))
]
inputText

['What is the potential consequence of acute hypopituitarism caused by Pituitary apoplexy?\nPituitary apoplexy may lead to cardiovascular collapse due to acute hypopituitarism causing decreased ACTH.',
 'What type of motoneurons mediate the motor component of skeletal muscle reflexes?\nThe motor component of skeletal muscle reflexes is mediated by lower motoneurons.',
 'What are the histological features of Ménétrier disease?\nMénétrier disease is characterized by hyperplasia of the gastric mucosa and atrophy of the parietal/chief cells.',
 "What is the vector responsible for transmitting Rift Valley Fever, and how does it transmit the virus?\nRift Valley Fever is a viral disease that is primarily transmitted via the Aedes mosquito vector. Mosquitoes become infected with the virus by feeding on infected animals, such as cattle, sheep, and camels. The virus then multiplies in the mosquito's body and can be transmitted to humans and other animals through mosquito bites. In addition to mo

In [17]:
# Smoke-test the encoder on the sample documents and inspect the embedding shape.
emb = create_embeddings(inputText)
# NOTE: `dim` holds a torch.Size here; a later cell rebinds it to the integer 768.
dim = emb.shape
emb, dim

(tensor([[ 0.2752, -0.5027,  0.1954,  ...,  0.1117,  0.5115,  0.6854],
         [ 0.0665,  0.3465,  0.3925,  ...,  0.5131, -0.6344,  0.5566],
         [ 0.7244, -0.1561,  0.7682,  ..., -0.2008,  0.6889,  0.3719],
         [ 0.4082,  0.7751,  0.0725,  ..., -0.2482, -1.5272, -0.6972],
         [ 0.3154, -0.0864, -0.0266,  ..., -0.1454, -0.2869,  0.1163]],
        device='cuda:0'),
 torch.Size([5, 768]))

In [9]:
# initialize connection to pinecone (get API key at app.pinecone.io)
# The key and proxy URL are blank in this notebook; supply your own values before
# running and avoid committing a real key.
api_key = ""
proxy_url = ''
# configure client
pc = Pinecone(api_key=api_key,proxy_url=proxy_url)
# Embedding width passed to create_index below (matches the 768-wide encoder output).
dim = 768

In [10]:
# Serverless deployment target used when the index is created: AWS, us-east-1.
spec = ServerlessSpec(
    cloud="aws", region="us-east-1"
)


In [11]:
# Name of the index that the rest of the notebook creates/connects to.
index_name = 'careconnect-knowledge-cosine'
# index_name = 'careconnect-knowledge'
# Collect the names of the indexes reported by pc.list_indexes().
existing_indexes = [
    index_info["name"] for index_info in pc.list_indexes()
]


In [12]:
# check if index already exists (it shouldn't if this is first time)
if index_name not in existing_indexes:
    # if does not exist, create index
    pc.create_index(
        index_name,
        dimension=dim,  # embedding width of the pubmedbert-base-embeddings encoder (768)
        metric='cosine',
        spec=spec
    )
    # wait for index to be initialized (polls once per second until status reports ready)
    while not pc.describe_index(index_name).status['ready']:
        time.sleep(1)

# connect to index
index = pc.Index(index_name)
time.sleep(1)
# view index stats
index.describe_index_stats()

{'dimension': 768,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 131507}},
 'total_vector_count': 131507}

In [49]:
def index_vector_embeddings(batch):
    """Embed one batch of question/answer pairs and upsert them into the index.

    Args:
        batch: Mapping with 'input', 'output' and 'ids' columns of equal length.

    Returns:
        None; the function is called for its upsert side effect.
    """
    questions = batch['input']
    answers = batch['output']
    # Each document is embedded as "question\nanswer".
    documents = [question + '\n' + answer for question, answer in zip(questions, answers)]
    vectors = create_embeddings(documents)
    # The raw question and answer are stored as metadata alongside each vector.
    payloads = [
        {'question': question, 'answer': answer}
        for question, answer in zip(questions, answers)
    ]
    index.upsert(vectors=zip(batch['ids'], vectors, payloads))

In [None]:
# Number of rows embedded and upserted per call to index_vector_embeddings.
batch_size = 256
# NOTE(review): last_index is not read anywhere in the visible notebook — possibly a leftover.
last_index = 0
# map() is used here for its side effect: each batch is embedded and upserted to the index.
MedAlpacaDataset['train'].map(index_vector_embeddings,batched=True,batch_size=batch_size)

In [29]:
# Check the vector count reported by the index.
index.describe_index_stats()

{'dimension': 768,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 131507}},
 'total_vector_count': 131507}

### Creating RAG Evaluation Dataset

In [13]:
def search(query,top_k=3,num_results=3):
    """Embed a query string and look up its nearest neighbours in the index.

    Args:
        query: The question text to search for.
        top_k: Number of matches requested from the index.
        num_results: Forwarded to ``index.query`` as ``k``.
            NOTE(review): confirm the Pinecone client actually honours ``k``.

    Returns:
        The response of ``index.query`` (metadata included).
    """
    # Embed as a batch of one and convert that single row to a plain list.
    query_vector = create_embeddings([query])[0].tolist()
    return index.query(
        vector=query_vector,
        top_k=top_k,
        k=num_results,
        include_metadata=True,
    )

def parse_matches(matches,top_p=0.3):
    """Keep the best-scoring retrieved answers according to a top-p cut-off.

    Args:
        matches: Sequence of match records, each exposing ``['metadata']['answer']``
            and ``['score']``.
        top_p: Cumulative share of the total score mass to keep (see filter_by_top_p).

    Returns:
        List of ``(answer, score)`` tuples, best score first. An empty list is
        returned when there are no matches.
    """
    # Nothing retrieved: return early instead of failing inside the top-p filter.
    if len(matches) == 0:
        return []
    contexts = [match['metadata']['answer'] for match in matches]
    scores = [match['score'] for match in matches]
    kept_positions = filter_by_top_p(scores, top_p)
    return [(contexts[position], scores[position]) for position in kept_positions]

def filter_by_top_p(scores, top_p):
    """Select the indices of the highest scores covering a ``top_p`` share of the total.

    Scores are ranked in descending order and normalised by their sum; indices are
    kept up to and including the first one whose cumulative share reaches ``top_p``.
    The computation assumes non-negative scores.

    Args:
        scores: Sequence of numeric scores.
        top_p: Target cumulative share of the total score.

    Returns:
        List of indices into ``scores``, best score first. Empty input yields an
        empty list; if the scores sum to zero only the top-ranked index is returned.
    """
    # Empty input has no cumulative total to normalise by.
    if len(scores) == 0:
        return []

    # Rank (index, score) pairs from best to worst; ties keep their original order.
    ranked = sorted(enumerate(scores), key=lambda pair: pair[1], reverse=True)
    sorted_indices = [position for position, _ in ranked]

    cumulative_sum = np.cumsum([score for _, score in ranked])
    total = cumulative_sum[-1]
    # A zero total cannot be normalised; fall back to the single best-ranked index.
    if total == 0:
        return sorted_indices[:1]

    cumulative_probabilities = cumulative_sum / total
    # +1 so the element that crosses the threshold is included.
    top_p_index = int(np.searchsorted(cumulative_probabilities, top_p)) + 1
    return sorted_indices[:top_p_index]


In [24]:
# Pick one training example as a retrieval probe: its question is the query, its answer the reference.
query= MedAlpacaDataset['train'][9]['input']
answer = MedAlpacaDataset['train'][9]['output']
query, answer

('What happens in megaloblastic anemia when immature megaloblasts in the bone marrow rupture?',
 'In megaloblastic anemia, immature megaloblasts in the bone marrow can rupture, releasing LDH and unconjugated bilirubin.')

In [26]:
# Inspect the top-ranked match for the probe query.
search(query)['matches'][0]

{'id': 'MedAlpaca-15926',
 'metadata': {'answer': 'No, the presence of hypersegmented neutrophils is not '
                        'a common finding in non-megaloblastic macrocytic '
                        'anemia.',
              'question': 'Is the presence of hypersegmented neutrophils a '
                          'common finding in non-megaloblastic macrocytic '
                          'anemia?'},
 'score': 117.957268,
 'values': []}

In [34]:
# Retrieve matches for the probe query, keep the top-p subset and print each one.
matches = search(query)['matches']
output = parse_matches(matches)
print('Matching Vectors')
print('='*150)
for text, score in output:
    print(f'Document: {text}\nScore:{score}')
    print('-'*150)

Matching Vectors
Document: In megaloblastic anemia, immature megaloblasts in the bone marrow can rupture, releasing LDH and unconjugated bilirubin.
Score:0.924022257
------------------------------------------------------------------------------------------------------------------------------------------------------


In [15]:
# Build the RAG evaluation set: for each sampled question, record the reference answer
# together with the single best match retrieved from the index.
evaluationDataset = []
testSize = 100
# Shuffle once, outside the loop: with a fixed seed the permutation is the same on every
# call, so shuffling inside the loop would only repeat identical work per iteration.
# Alternative source: HMCDataset['test'].shuffle(seed=42)
shuffledIcliniq = icliniq.shuffle(seed=42)
for i in range(testSize):
    data = shuffledIcliniq[i]
    inputText = data['input']
    outputText = data['answer_icliniq']
    # Only the top-ranked match is kept for evaluation.
    matches = search(inputText)['matches'][0]
    retDoc = matches['metadata']
    finData = {'question':inputText, 'answer':outputText, 'retrieved_question': retDoc['question'],'retrieved_answer': retDoc['answer'], 'scores':matches['score']}
    evaluationDataset.append(finData)

In [18]:
# Convert the list of record dicts into a Dataset.
evaluationData = Dataset.from_list(evaluationDataset)
evaluationData

Dataset({
    features: ['question', 'answer', 'retrieved_question', 'retrieved_answer', 'scores'],
    num_rows: 100
})

In [19]:
# Inspect the first evaluation record.
evaluationData[0]

{'question': 'Hi doctor,I am suffering from irregular periods. I am currently taking medication Levothyroxine 50. My T3 is 0.87 ng/mL, T4 is 8.30 ug/dL, TSH is 2.43 uIU/mL. I am 34 years old, weigh 75 kg, and 5 feet tall. Please advice.',
 'answer': 'Hi. From your query, I understand that you are suffering from hypothyroidism. I need to know when was it diagnosed, and what was the thyroid profile at that time. Irregular periods and weight gain is a part of hypothyroidism. I would like to modify your dose as well as advise you ways to reduce your weight. All will be well once we modify your dose.',
 'retrieved_question': 'Sir suggests homoeo medicine for high tsh on 57-year-old woman.',
 'retrieved_answer': 'If you are hypothyroid, you should seek medical attention and start taking thyroid replacement medication.',
 'scores': 0.890752733}

In [None]:
# Publish the evaluation set to the Hub repository named below.
# NOTE(review): presumably requires being logged in with write access to that namespace — confirm.
evaluationData.push_to_hub(f"hakeematyab/icliniq-Cosine-Test-100",commit_message='icliniq test data for RAG application.')