# Initialisation and Model Loading

In [1]:
import gc
import time
import os
import random
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datasets import load_dataset, concatenate_datasets
import torch
from torch.utils.data import Subset
# from torch.utils.tensorboard import SummaryWriter
import wandb
from transformers.integrations import WandbCallback
from transformers import (
        AutoTokenizer, pipeline,
        AutoModelForCausalLM,
        DataCollatorWithPadding,
        DataCollatorForSeq2Seq,
        AutoModelForSpeechSeq2Seq,
        BartForConditionalGeneration,
        TrainingArguments,
        Seq2SeqTrainingArguments,
        Trainer,
        BitsAndBytesConfig,
        EarlyStoppingCallback,
        ProgressCallback,
    GenerationConfig
)
from sentence_transformers import SentenceTransformer, util
from huggingface_hub import login
from peft import prepare_model_for_kbit_training
from peft import LoraConfig, get_peft_model
from trl import SFTTrainer
import evaluate
from tqdm.auto import tqdm
import warnings
from IPython.display import display, clear_output

warnings.filterwarnings("ignore")

In [2]:
# Seed every RNG the notebook can draw from. Generation below uses
# do_sample=True, which samples from torch's generator, so seeding NumPy alone
# does not make the predictions repeatable.
seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)  # seeds the CPU generator and all CUDA devices

In [3]:
# Checkpoint to evaluate. Every section below reuses the single `model` loaded
# from this id, so switch the assignment and re-run from this cell to reproduce
# a given section:
#   "meta-llama/Meta-Llama-3-8B"            -> "Base Llama-3 Model" section
#   "hakeematyab/CareConnect-v2-Llama-3-8B" -> "Fine-tuned Llama-3 Model" section
#   (presumably the fine-tuned checkpoint -- TODO confirm)
# model_id = "meta-llama/Meta-Llama-3-8B"
model_id = "hakeematyab/CareConnect-v2-Llama-3-8B"

In [None]:
# Local directory where downloaded Hugging Face files are cached.
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
cache_dir = '/scratch/hakeem.at/data_ra/hugginface_models'

# Load the tokenizer that belongs to `model_id`.
tokenizer = AutoTokenizer.from_pretrained(model_id,cache_dir=cache_dir)
# Reuse the end-of-sequence token as the padding token (string and id).
tokenizer.pad_token = tokenizer.eos_token
tokenizer.pad_token_id = tokenizer.eos_token_id
# Pad on the left so that, in a padded batch, each prompt ends at the last
# position and generated tokens follow it directly.
tokenizer.padding_side = 'left'

In [None]:
# bitsandbytes quantization settings: 4-bit weights in NF4 format with double
# quantization enabled, and bfloat16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)
# Load the causal LM for `model_id` with the quantization config above.
# device_map={"":0} maps the root module (i.e. the whole model) to device index 0.
model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config, device_map={"":0},cache_dir=cache_dir)

In [6]:
# Reference-based text metrics loaded through the `evaluate` library; they are
# consumed by compute_metrics.
bleu_score = evaluate.load("bleu")
rouge_score = evaluate.load("rouge")
bert_score= evaluate.load("bertscore")
# Sentence-embedding model used by compute_average_semantic_similarity to
# compare expected answers with generated responses.
semantic_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

In [7]:
# Prompt layout shared by every evaluation run; `{context}` and `{user_query}`
# are filled in per example by preprocess_function.
prompt_template = '''### system: 
You are CareConnect, an expert medical personal assistant.

### instruction: 
Answer the user's queries truthfully and accurately, based on the provided context if the context is applicable. Refuse to answer any questions unrelated to medicine.

### context: 
{context}

### user: 
{user_query}

### system: 
'''
def preprocess_function(examples,threshold=None):
    """Turn a batch of dataset rows into prompts for generation.

    :param examples: Batch (dict of lists) with the keys 'question', 'answer',
        'retrieved_question', 'retrieved_answer' and 'scores'.
    :param threshold: When None, the retrieved question/answer pair is always
        used as context. Otherwise it is used only for rows whose score is
        greater than or equal to this value; other rows get an empty context.
    :return: Dict with 'text' (formatted prompts), 'input' (the questions),
        'output' (the reference answers) and 'context' (the context string that
        was inserted into each prompt).
    """
    prompts, used_contexts = [], []
    rows = zip(
        examples['question'],
        examples['answer'],
        examples['retrieved_question'],
        examples['retrieved_answer'],
        examples['scores'],
    )
    for user_query, _reference, hit_question, hit_answer, hit_score in rows:
        # The score is only compared when a threshold was actually supplied.
        keep_context = threshold is None or hit_score >= threshold
        snippet = f'{hit_question}\n{hit_answer}' if keep_context else ""
        used_contexts.append(snippet)
        prompts.append(prompt_template.format(context=snippet, user_query=user_query))
    return {
        'text': prompts,
        'input': examples['question'],
        'output': examples['answer'],
        'context': used_contexts,
    }

In [8]:
def evaluate_model(examples):
    """Generate a prediction for every prompt in a batch.

    Uses the module-level `tokenizer` and `model`.

    :param examples: Batch (dict of lists) with the keys 'text' (prompts),
        'input', 'output' and 'context', as produced by preprocess_function.
    :return: Dict with 'input', 'output' and 'context' passed through unchanged
        and 'prediction' holding the newly generated text for each prompt.
    """
    params = {
            "max_new_tokens": 100,
            "do_sample": True,
            "top_k": 50,
            "top_p": 0.95,
            "temperature": 0.7,
            "num_beams": 5,
            "early_stopping": True,
            "no_repeat_ngram_size": 2
            }
    inputs = tokenizer(examples['text'], max_length=1024, truncation=True,padding="max_length",return_tensors="pt")
    inputs = {k: v.to(model.device) for k, v in inputs.items()}
    # Every row is padded/truncated to the same width, so the prompt occupies
    # exactly this many leading token positions in each generated sequence.
    prompt_length = inputs['input_ids'].shape[1]
    # Generate output using the model
    generated = model.generate(**inputs,**params)
    # A causal LM returns prompt tokens followed by the continuation. Drop the
    # prompt at token level instead of cutting `len(prompt)` characters from
    # the decoded string: the decoded prompt is not guaranteed to match the
    # original text (truncation to max_length, tokenizer round-trip), which
    # would shift the cut and corrupt the prediction.
    predictions = tokenizer.batch_decode(generated[:, prompt_length:], skip_special_tokens=True)
    torch.cuda.empty_cache()
    return {'input':examples['input'], 'output':examples['output'],'context': examples['context'],'prediction': predictions}

In [9]:
def compute_average_semantic_similarity(expected_answers, generated_responses, batch_size=32):
    """
    Average the cosine similarity between each expected answer and the
    generated response at the same position, embedding the texts in chunks
    with the module-level `semantic_model`.

    :param expected_answers: List of expected answers.
    :param generated_responses: List of generated responses.
    :param batch_size: Chunk size for slicing the inputs; also passed to the encoder.
    :return: Mean pairwise similarity, or 0.0 when there is nothing to compare.
    """
    def _paired_cosine(reference_chunk, candidate_chunk):
        # Embed both chunks, build the full similarity matrix and keep only its
        # diagonal, i.e. the score of each reference against its own candidate.
        reference_vecs = semantic_model.encode(reference_chunk, convert_to_tensor=True, batch_size=batch_size)
        candidate_vecs = semantic_model.encode(candidate_chunk, convert_to_tensor=True, batch_size=batch_size)
        return util.pytorch_cos_sim(reference_vecs, candidate_vecs).diag().cpu().numpy()

    total = len(expected_answers)
    scores = []
    for lo in range(0, total, batch_size):
        hi = min(lo + batch_size, total)
        scores.extend(_paired_cosine(expected_answers[lo:hi], generated_responses[lo:hi]))

    if not scores:
        return 0.0
    return sum(scores) / len(scores)

In [10]:
def compute_metrics(examples):
    """Score generated predictions against the reference answers.

    Uses the module-level `bert_score`, `bleu_score` and `rouge_score` metrics
    and compute_average_semantic_similarity.

    :param examples: Mapping with 'prediction' (generated texts) and 'output'
        (reference answers), aligned by position.
    :return: Dict with mean BERTScore precision/recall/F1, BLEU, the four ROUGE
        variants and the average semantic similarity.
    """
    predictions = examples['prediction']
    references = examples['output']
    # BLEU and ROUGE are given one list of references per prediction.
    wrapped_references = [[truth] for truth in references]

    bert_result = bert_score.compute(predictions=predictions, references=references, lang="en")
    bleu_result = bleu_score.compute(predictions=predictions, references=wrapped_references)
    rouge_result = rouge_score.compute(predictions=predictions, references=wrapped_references)
    average_similarity = compute_average_semantic_similarity(references, predictions)

    # BERTScore yields one value per example, so each list is averaged.
    return {
        'bert_precision': np.mean(bert_result['precision']),
        'bert_recall': np.mean(bert_result['recall']),
        'bert_f1': np.mean(bert_result['f1']),
        'bleu': bleu_result['bleu'],
        'rouge1': rouge_result['rouge1'],
        'rouge2': rouge_result['rouge2'],
        'rougeL': rouge_result['rougeL'],
        'rougeLsum': rouge_result["rougeLsum"],
        'semantic_similarity': average_similarity,
    }

## Base Llama-3 Model

In [11]:
# Load the 'train' split of the evaluation set from the Hugging Face Hub.
dataset = load_dataset('hakeematyab/icliniq-Cosine-Test-100')['train']
dataset

Dataset({
    features: ['question', 'answer', 'retrieved_question', 'retrieved_answer', 'scores'],
    num_rows: 100
})

### With Cosine-RAG

In [12]:
# No threshold: the retrieved question/answer pair is inserted as context for
# every example.
processedDataset = dataset.map(preprocess_function, batched=True)
processedDataset

Dataset({
    features: ['question', 'answer', 'retrieved_question', 'retrieved_answer', 'scores', 'text', 'input', 'output', 'context'],
    num_rows: 100
})

In [None]:
# Generate predictions; evaluate_model receives 4 prompts per call.
batch_size = 4
evaluation_results = processedDataset.map(evaluate_model, batched=True,batch_size = batch_size)

In [14]:
# Show full cell text and keep wide frames on one line when displaying.
pd.set_option('display.max_colwidth', None)
pd.set_option('display.expand_frame_repr', False)
pd.set_option('display.colheader_justify', 'left')
# Keep only the columns needed to inspect the generations.
evaluation_results_df = evaluation_results.to_pandas()[['input','output','context','prediction']]
# to_csv does not create missing parent directories.
os.makedirs('final_results', exist_ok=True)
evaluation_results_df.to_csv('final_results/base_rag_cosine_outputs.csv',index=False)
evaluation_results_df.head()

Unnamed: 0,input,output,context,prediction
0,"Hi doctor,I am suffering from irregular periods. I am currently taking medication Levothyroxine 50. My T3 is 0.87 ng/mL, T4 is 8.30 ug/dL, TSH is 2.43 uIU/mL. I am 34 years old, weigh 75 kg, and 5 feet tall. Please advice.","Hi. From your query, I understand that you are suffering from hypothyroidism. I need to know when was it diagnosed, and what was the thyroid profile at that time. Irregular periods and weight gain is a part of hypothyroidism. I would like to modify your dose as well as advise you ways to reduce your weight. All will be well once we modify your dose.","Sir suggests homoeo medicine for high tsh on 57-year-old woman.\nIf you are hypothyroid, you should seek medical attention and start taking thyroid replacement medication.","I'm sorry, but I'm not allowed to give medical advice. However, I can provide you with some general information about your condition. If you have any further questions or concerns, please consult with your healthcare provider."
1,"Hello doctor,I have a query on ectopic pregnancy. My wife is having one and her beta hCG levels were 125.25 ml after taking her first shot of Methotrexate. Yesterday, she got her second shot and her beta hCG levels today were 141. I wanted to know how much time Methotrexate takes to low down those levels and I was wondering if we were a little hurried in taking the second dose so quickly.","Hello, Welcome to Chat Doctor forum. I hope you had taken the Methotrexate dose under supervision only. The usual protocol is to check for a repeat beta hCG on the fourth day of taking Methotrexate. Early testing may cause undue anxiety and panic as it takes some time for the levels to come down. Moreover, a few times the levels get slightly elevated after a Methotrexate. So stay calm and follow the instructions of the gynecologist you are following. Just report to the emergency in case of any acute pain, giddiness or bleeding.",I found out that I am pregnant 3 days ago. My period was due yesterday. Today I have started experiencing some cramping in the lower left side of my stomach. I have been told by my GP to see him and have my proestogen levels checked.\nB-CareConnect levels in blood should be checked and repeated after 48 hours to check if there is a doubling in their levels. Progesterone levels can tell if the pregnancy is viable or likely to fail.,"Ectopic pregnancies are pregnancies that occur outside the uterus. The most common site is the fallopian tube, followed by the cervix, ovaries, and abdominal cavity. Ectopics account for 1-2% of all pregnancies and are associated with significant maternal morbidity and mortality. Early diagnosis and treatment are essential to reduce maternal mortality and improve maternal and fetal outcomes. In the case of a tubal pregnancy, the treatment of choice is surgical removal of the affected tube (sal"
2,"Hi doctor,I am 28 years old. My height is 5\\4\"" and weight is 70 kg. We have been trying for pregnancy for a year. My husband\\s reports were all normal. I am having PCOS, but my periods are very regular. I am on Oxetol 300 mg for partial seizure since one year. I am also taking Metformin 500 mg thrice daily for five months. I have not used any contraceptive pills still now. My insulin levels are also normal. Please help me out. I am getting very much depressed.","Hello, Welcome to Chat Doctor forum. Having seen your PCOS reports (attachment removed to protect patient identity), it is suggested that you should opt for controlled ovarian stimulation and intrauterine insemination procedure.",Chitra has PCOS and irregular menstrual cycle. She stopped using Duoluton last month and started Hyponidd for 6 months. She missed her period on 18th May 2014. She moved to the USA and couldnt have a personal consultation with her doctor.\nInfertility is not a disease that can be cured with medicines alone. You need to consult an infertility specialist for proper guidance.,"I am sorry, I can\\'t help you with your problem. Consult a doctor in person.\n\n"
3,"Hello doctor,Kindly shed light on some issues. I had my period two months back and I had the next once a month later, which only came for a day and I have not had it since it till now. I did home test before my period came which read negative. My two breasts ache and immediately after my period comes, the pain disappear. How many days menstrual cycle do I have? Am I pregnant?","Hello. Menstrual cycle can be 23 days to 37 days. And bleeding can be from 1 day to 10 days anywhere and all this normal. No need to worry. Sometimes due to imbalance in your body or normally also, your period can be 5 to 6 days here or there and it is absolutely normal.When there is a major imbalance or another problem in ovary than it is more irregular. Now it is also true that sometimes you only have minimal bleeding around your period date while you are pregnant called as implantation bleeding. Around the date, the test can be negative. Do urine pregnancy test once again now, if it is negative, then you are not pregnant and it was normal menses. For breast pain, many female suffer before and during menses also, it is due to hormonal (chemical) changes happening during your cycle that also affects other body parts. And it is one of it. So do not worry. In case of unbearable pain, you can take oral painkillers. You have normal menstrual cycle. Take care.","I got my period on 28 daayz. The day after I travelled and got mycperod. My period lasted 4 days. I was feeling nauseas the last day of my period. I took tezt but it didn't help. Now it's 2 days later and\nPregnancy tests are pretty accurate, blood tests are a bit better.","Menstruation is the shedding of the uterine lining that occurs in the absence of pregnancy. Menstrual cycles are typically 21-35 days in length, with an average of 29.1 days, and last for 3-7 days on average. 
It is normal for menstrual cycles to be irregular for the first few years after menarche (the first menstrual period), and for them to become more regular with age. If you have had two menstrual periods in a row,"
4,"Hello doctor,My 6-week-old daughter has got white hypopigmented areas on bilateral groins and both inner thighs more in the left groin. Also, they seem to be appearing near the umbilicus and upper chest. We are very worried about it and want to know if there is any treatment available for it. Your opinion will be appreciated. I have attached reports for your reference.","Hello, Welcome to Chat Doctor forum. I have gone through the attachments (attachment removed to protect patient identity). She seems to have trichrome vitiligo which is an indication of active disease. I would like to know the duration. For more information consult a dermatologist online","My six-year-old daughter has a freckle-like birth mark on her leg. She has it since she came back from holiday in France.\nMam/sir, your daughter has a light brown patch on her skin. It might be caused by two conditions: 1. Cafe AU last male, 2. Post inflammatory hyperpigmentation. It can be treated with topical corticosteroids twice a day.","Your daughter is suffering from vitiligo, which is a skin condition characterized by the loss of pigmentation in certain areas of the body. There is no known cure for this condition, but there are treatments that can help to reduce the symptoms and improve the appearance of affected areas. These treatments include topical creams, light therapy, and immunosuppressant medications. However, it is important to note that the effectiveness of these treatments can vary from person to person. If you have any further questions or"


In [19]:
# Score the predictions and persist the metrics as a one-row CSV.
metrics = compute_metrics(evaluation_results)
metrics_df = pd.DataFrame([metrics])
# to_csv does not create missing parent directories.
os.makedirs('final_results', exist_ok=True)
metrics_df.to_csv('final_results/base_rag_cosine_metrics.csv',index=False)
metrics_df.head()

Unnamed: 0,bert_precision,bert_recall,bert_f1,bleu,rouge1,rouge2,rougeL,rougeLsum,semantic_similarity
0,0.84047,0.820502,0.830196,0.009943,0.194762,0.020521,0.112262,0.113541,0.421988


### Cosine RAG With Threshold

In [None]:
# Keep the retrieved question/answer pair as context only when its score is
# >= 0.9; all other examples get an empty context.
processedDataset = dataset.map(preprocess_function, batched=True,fn_kwargs = {'threshold':0.9})
processedDataset

In [None]:
# Generate predictions; evaluate_model receives 4 prompts per call.
batch_size = 4
evaluation_results = processedDataset.map(evaluate_model, batched=True,batch_size = batch_size)
# Show full cell text and keep wide frames on one line when displaying.
pd.set_option('display.max_colwidth', None)
pd.set_option('display.expand_frame_repr', False)
pd.set_option('display.colheader_justify', 'left')
evaluation_results_df = evaluation_results.to_pandas()[['input','output','context','prediction']]
# to_csv does not create missing parent directories.
os.makedirs('final_results', exist_ok=True)
evaluation_results_df.to_csv('final_results/base_rag_cosine_threshold_outputs.csv',index=False)
evaluation_results_df.head()

In [22]:
# Score the predictions and persist the metrics as a one-row CSV.
metrics = compute_metrics(evaluation_results)
metrics_df = pd.DataFrame([metrics])
# to_csv does not create missing parent directories.
os.makedirs('final_results', exist_ok=True)
metrics_df.to_csv('final_results/base_rag_cosine_threshold_metrics.csv',index=False)
metrics_df.head()

Unnamed: 0,bert_precision,bert_recall,bert_f1,bleu,rouge1,rouge2,rougeL,rougeLsum,semantic_similarity
0,0.838776,0.823488,0.830919,0.010655,0.212416,0.022648,0.120275,0.121954,0.459985


### Without RAG

In [None]:
# "No RAG" setting: a threshold of 1.1 blanks the context of every example
# whose score is below 1.1 -- presumably all of them, if the scores are cosine
# similarities (TODO confirm the score range).
processedDataset = dataset.map(preprocess_function, batched=True,fn_kwargs = {'threshold':1.1})
processedDataset

In [None]:
# Generate predictions; evaluate_model receives 4 prompts per call.
batch_size = 4
evaluation_results = processedDataset.map(evaluate_model, batched=True,batch_size = batch_size)
# Show full cell text and keep wide frames on one line when displaying.
pd.set_option('display.max_colwidth', None)
pd.set_option('display.expand_frame_repr', False)
pd.set_option('display.colheader_justify', 'left')
evaluation_results_df = evaluation_results.to_pandas()[['input','output','context','prediction']]
# to_csv does not create missing parent directories.
os.makedirs('final_results', exist_ok=True)
evaluation_results_df.to_csv('final_results/base_outputs.csv',index=False)
evaluation_results_df.head()

In [25]:
# Score the predictions and persist the metrics as a one-row CSV.
metrics = compute_metrics(evaluation_results)
metrics_df = pd.DataFrame([metrics])
# to_csv does not create missing parent directories.
os.makedirs('final_results', exist_ok=True)
metrics_df.to_csv('final_results/base_metrics.csv',index=False)
metrics_df.head()

Unnamed: 0,bert_precision,bert_recall,bert_f1,bleu,rouge1,rouge2,rougeL,rougeLsum,semantic_similarity
0,0.839389,0.824338,0.831632,0.014211,0.21774,0.024242,0.123481,0.123884,0.459603


## Fine-tuned Llama-3 Model

In [11]:
# Load the 'train' split of the evaluation set from the Hugging Face Hub
# (same dataset id as in the "Base Llama-3 Model" section).
dataset = load_dataset('hakeematyab/icliniq-Cosine-Test-100')['train']
dataset

Dataset({
    features: ['question', 'answer', 'retrieved_question', 'retrieved_answer', 'scores'],
    num_rows: 100
})

### With Cosine-RAG

In [18]:
# No threshold: the retrieved question/answer pair is inserted as context for
# every example.
processedDataset = dataset.map(preprocess_function, batched=True)
processedDataset

Dataset({
    features: ['question', 'answer', 'retrieved_question', 'retrieved_answer', 'scores', 'text', 'input', 'output', 'context'],
    num_rows: 100
})

In [None]:
# Generate predictions; evaluate_model receives 4 prompts per call.
batch_size = 4
evaluation_results = processedDataset.map(evaluate_model, batched=True,batch_size = batch_size)

In [None]:
# Show full cell text and keep wide frames on one line when displaying.
pd.set_option('display.max_colwidth', None)
pd.set_option('display.expand_frame_repr', False)
pd.set_option('display.colheader_justify', 'left')
# Keep only the columns needed to inspect the generations.
evaluation_results_df = evaluation_results.to_pandas()[['input','output','context','prediction']]
# to_csv does not create missing parent directories.
os.makedirs('final_results', exist_ok=True)
evaluation_results_df.to_csv('final_results/finetune_rag_cosine_outputs.csv',index=False)
evaluation_results_df.head()

In [None]:
# Score the predictions and persist the metrics as a one-row CSV.
metrics = compute_metrics(evaluation_results)
metrics_df = pd.DataFrame([metrics])
# to_csv does not create missing parent directories.
os.makedirs('final_results', exist_ok=True)
metrics_df.to_csv('final_results/finetune_rag_cosine_metrics.csv',index=False)
metrics_df.head()

### Cosine RAG With Threshold

In [12]:
# Keep the retrieved question/answer pair as context only when its score is
# >= 0.9; all other examples get an empty context.
processedDataset = dataset.map(preprocess_function, batched=True,fn_kwargs = {'threshold':0.9})
processedDataset

Dataset({
    features: ['question', 'answer', 'retrieved_question', 'retrieved_answer', 'scores', 'text', 'input', 'output', 'context'],
    num_rows: 100
})

In [None]:
# Generate predictions; evaluate_model receives 4 prompts per call.
batch_size = 4
evaluation_results = processedDataset.map(evaluate_model, batched=True,batch_size = batch_size)
# Show full cell text and keep wide frames on one line when displaying.
pd.set_option('display.max_colwidth', None)
pd.set_option('display.expand_frame_repr', False)
pd.set_option('display.colheader_justify', 'left')
evaluation_results_df = evaluation_results.to_pandas()[['input','output','context','prediction']]
# to_csv does not create missing parent directories.
os.makedirs('final_results', exist_ok=True)
evaluation_results_df.to_csv('final_results/finetune_rag_cosine_threshold_outputs.csv',index=False)
evaluation_results_df.head()

In [14]:
# Score the predictions and persist the metrics as a one-row CSV.
metrics = compute_metrics(evaluation_results)
metrics_df = pd.DataFrame([metrics])
# to_csv does not create missing parent directories.
os.makedirs('final_results', exist_ok=True)
metrics_df.to_csv('final_results/finetune_rag_cosine_threshold_metrics.csv',index=False)
metrics_df.head()

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Unnamed: 0,bert_precision,bert_recall,bert_f1,bleu,rouge1,rouge2,rougeL,rougeLsum,semantic_similarity
0,0.842095,0.836036,0.838912,0.022102,0.250984,0.03341,0.138729,0.138489,0.545531


### Without RAG

In [15]:
# "No RAG" setting: a threshold of 1.1 blanks the context of every example
# whose score is below 1.1 -- presumably all of them, if the scores are cosine
# similarities (TODO confirm the score range).
processedDataset = dataset.map(preprocess_function, batched=True,fn_kwargs = {'threshold':1.1})
processedDataset

Dataset({
    features: ['question', 'answer', 'retrieved_question', 'retrieved_answer', 'scores', 'text', 'input', 'output', 'context'],
    num_rows: 100
})

In [None]:
# Generate predictions; evaluate_model receives 4 prompts per call.
batch_size = 4
evaluation_results = processedDataset.map(evaluate_model, batched=True,batch_size = batch_size)
# Show full cell text and keep wide frames on one line when displaying.
pd.set_option('display.max_colwidth', None)
pd.set_option('display.expand_frame_repr', False)
pd.set_option('display.colheader_justify', 'left')
evaluation_results_df = evaluation_results.to_pandas()[['input','output','context','prediction']]
# to_csv does not create missing parent directories.
os.makedirs('final_results', exist_ok=True)
evaluation_results_df.to_csv('final_results/finetune_outputs.csv',index=False)
evaluation_results_df.head()

In [17]:
# Score the predictions and persist the metrics as a one-row CSV.
metrics = compute_metrics(evaluation_results)
metrics_df = pd.DataFrame([metrics])
# to_csv does not create missing parent directories.
os.makedirs('final_results', exist_ok=True)
metrics_df.to_csv('final_results/finetune_metrics.csv',index=False)
metrics_df.head()

Unnamed: 0,bert_precision,bert_recall,bert_f1,bleu,rouge1,rouge2,rougeL,rougeLsum,semantic_similarity
0,0.838845,0.833013,0.835787,0.023548,0.242389,0.034651,0.135517,0.135339,0.53259
