In [None]:
!pip install sentence_transformers
!pip install llama_index
!pip install llama-index-llms-huggingface
!pip install llama-index-embeddings-langchain

In [None]:
!pip install torch torchvision

In [1]:
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, ServiceContext
from llama_index.llms.huggingface import HuggingFaceLLM
from llama_index.core.prompts.prompts import SimpleInputPrompt

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
%pip install llama-index-llms-openai
%pip install llama-index-embeddings-openai
%pip install llama-index-graph-stores-nebula
%pip install llama-index-llms-azure-openai

In [2]:
import json


with open("./Datasets/medical_dialog_dataset/en_medical_dialog.json", "r") as f:
    data = json.load(f)

# print(data[0])

data = data[0:20000]

In [3]:
len(data)

20000

In [4]:
import multiprocessing
from functools import partial

def process_entry(entry):
    patient_query = entry["Description"] + entry["Patient"]
    doctor_response = entry["Doctor"]
    return "<Patient>" + patient_query + "<Doctor>" + doctor_response + "\n\n"

# Adjust the number of processes according to your system's capabilities
num_processes = multiprocessing.cpu_count()

with multiprocessing.Pool(processes=num_processes) as pool:
    processed_data = pool.map(process_entry, data)
    pool.close()  # Close the pool to prevent any more tasks from being submitted
    pool.join()   # Wait for all processes to complete

refined_data = "".join(processed_data)

file_path = "./Datasets/medical_dialog_dataset/refined_data/final_dataset.txt"
with open(file_path, "w") as file1:
    file1.write(refined_data)

print("Data written to:", file_path)


Data written to: ./Datasets/medical_dialog_dataset/refined_data/final_dataset.txt


In [3]:
!huggingface-cli login --token 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

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /home/hrudayte.akkalad/.cache/huggingface/token
Login successful


In [4]:
system_prompt=""""

You are a QA Assistant. Your goal is to answer questions as accurates as possible based onthe instructions and context provided
"""

query_wrapper_prompt = SimpleInputPrompt("<|USER|>{query_str}<|ASSISTANT|>")

In [None]:
!pip install -i https://pypi.org/simple/ bitsandbytes

In [5]:
import torch

llm = HuggingFaceLLM(
    context_window = 4096,
    max_new_tokens=256,
    generate_kwargs={"temperature": 0.0, "do_sample": False},
    system_prompt=system_prompt,
    query_wrapper_prompt=query_wrapper_prompt,
    tokenizer_name = "meta-llama/Llama-2-7b-chat-hf",
    model_name="meta-llama/Llama-2-7b-chat-hf",
    device_map = "auto",
    model_kwargs={"torch_dtype": torch.float16, "load_in_8bit": True}
)

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
Loading checkpoint shards: 100%|██████████| 2/2 [00:29<00:00, 14.89s/it]


In [None]:
!pip install langchain

In [6]:
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from llama_index.core import ServiceContext
from llama_index.embeddings.langchain import LangchainEmbedding

embed_model = LangchainEmbedding(
    HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2"))

In [7]:
docs = SimpleDirectoryReader("./Datasets/medical_dialog_dataset/refined_data/").load_data()

In [8]:
service_context = ServiceContext.from_defaults(
    chunk_size=1024,
    llm=llm,
    embed_model=embed_model
)

  service_context = ServiceContext.from_defaults(


In [9]:
type(docs)

list

In [10]:
index=VectorStoreIndex.from_documents(docs, service_context=service_context)

In [11]:
index.storage_context.persist(persist_dir="./VectorStores/new-test/")

In [12]:
query_engine = index.as_query_engine()

In [15]:
response = query_engine.query("Ive had a cold which started on Christmas eve but appeared to be getting better over the following week. However I now have what I think may be sinusitis - pain in the head, yellow mucus from the nose and stuffiness-squeaking from the sinuses. Will this go away on its own or should I see my GP?")

In [16]:
ground_truth = "Hi, Welcome to Chat Doctor! Yes, from what you have described, it appears that you are having Sinusitis. However, you need to be examined to diagnose it finally. So I'd recommend you to visit your GP and get yourself examined. In case the diagnosis is confirmed, you'll require a dose of Antibiotics to ward off the infection. Also, your GP will advise you certain precautions which need to be followed. Hope this information helps. Feel free to ask if you have any doubt. Wishing you a speedy recovery. With warm regards,"

In [20]:
str(response)

"Hi there! I'm just an AI, I don't have personal opinions or experiences, but I can provide you with some general information and advice based on the context you've provided.\n\nIt's possible that your symptoms could be related to sinusitis, which is an infection or inflammation of the sinuses. If you've recently had a cold, it's possible that your sinuses are taking longer to clear up than you expected, and this could be causing your current symptoms.\n\nIt's always a good idea to consult with a medical professional if you're experiencing persistent or severe symptoms, especially if they're affecting your quality of life. Your GP can assess your symptoms and provide a proper diagnosis, as well as recommend appropriate treatment options.\n\nIn the meantime, there are some things you can try to help manage your symptoms:\n\n* Use saline nasal sprays or drops to help loosen and clear out mucus from your nose.\n* Apply a warm compress to your face to help loosen up any sinus pressure or t

In [21]:
from bert_score import score


P, R, F1 = score([str(response)], [ground_truth], lang="en", verbose=True)

print(f"Precision: {P.mean():.2f}")
print(f"Recall: {R.mean():.2f}")
print(f"F1-Score: {F1.mean():.2f}")

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


100%|██████████| 1/1 [00:00<00:00,  6.50it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 12.99it/s]

done in 0.24 seconds, 4.12 sentences/sec
Precision: 0.83
Recall: 0.86
F1-Score: 0.85





In [None]:
P, R, F1 = score([generated_response], [ground_truth_response], lang="en", verbose=True)

print(f"Precision: {P.mean():.2f}")
print(f"Recall: {R.mean():.2f}")
print(f"F1-Score: {F1.mean():.2f}")

In [24]:
import nltk
from nltk.translate.bleu_score import corpus_bleu
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.meteor_score import meteor_score
from nltk.translate.bleu_score import SmoothingFunction
from nltk.translate.bleu_score import modified_precision, brevity_penalty
from rouge_score import rouge_scorer


# # Ground truth responses and model-generated responses
ground_truth_responses = [ground_truth]  # List of ground truth responses
generated_responses = [str(response)]     # List of model-generated responses

# 1. BLEU Score
def calculate_bleu_score(reference, hypothesis):
    return sentence_bleu([reference], hypothesis, smoothing_function=SmoothingFunction().method1)

bleu_scores = [calculate_bleu_score(reference, hypothesis) for reference, hypothesis in zip(ground_truth_responses, generated_responses)]
average_bleu_score = sum(bleu_scores) / len(bleu_scores)
print("Average BLEU Score:", average_bleu_score)

# 2. ROUGE Score
def calculate_rouge_score(reference, hypothesis):
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)
    scores = scorer.score(reference, hypothesis)
    return (scores['rouge1'].fmeasure + scores['rougeL'].fmeasure) / 2

rouge_scores = [calculate_rouge_score(reference, hypothesis) for reference, hypothesis in zip(ground_truth_responses, generated_responses)]
average_rouge_score = sum(rouge_scores) / len(rouge_scores)
print("Average ROUGE Score:", average_rouge_score)

# 3. METEOR Score
def calculate_meteor_score(reference, hypothesis):
    return meteor_score(reference, hypothesis)

meteor_scores = [calculate_meteor_score(reference, hypothesis) for reference, hypothesis in zip(ground_truth_responses, generated_responses)]
average_meteor_score = sum(meteor_scores) / len(meteor_scores)
print("Average METEOR Score:", average_meteor_score)

# 4. Accuracy (for exact matches)
exact_matches = sum(1 for ref, gen in zip(ground_truth_responses, generated_responses) if ref == gen)
accuracy = exact_matches / len(ground_truth_responses)
print("Accuracy:", accuracy)


Average BLEU Score: 0.28849239894886297
Average ROUGE Score: 0.24731182795698925


TypeError: "hypothesis" expects pre-tokenized hypothesis (Iterable[str]): Hi there! I'm just an AI, I don't have personal opinions or experiences, but I can provide you with some general information and advice based on the context you've provided.

It's possible that your symptoms could be related to sinusitis, which is an infection or inflammation of the sinuses. If you've recently had a cold, it's possible that your sinuses are taking longer to clear up than you expected, and this could be causing your current symptoms.

It's always a good idea to consult with a medical professional if you're experiencing persistent or severe symptoms, especially if they're affecting your quality of life. Your GP can assess your symptoms and provide a proper diagnosis, as well as recommend appropriate treatment options.

In the meantime, there are some things you can try to help manage your symptoms:

* Use saline nasal sprays or drops to help loosen and clear out mucus from your nose.
* Apply a warm compress to your face to help loosen up any sinus pressure or tension.
* Try to breathe in some steam from a hot

In [23]:
!pip install rouge_score

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Defaulting to user installation because normal site-packages is not writeable
Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting absl-py (from rouge_score)
  Downloading absl_py-2.1.0-py3-none-any.whl.metadata (2.3 kB)
Downloading absl_py-2.1.0-py3-none-any.whl (133 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m133.7/133.7 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hBuilding wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25ldone
[?25h  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24933 sha256=615d37bf0cf12941c4960cb019ab56bf683da1ac1abb4adf00ba80128ba6eaf4
  Stored in directory: /home/hrudayte.akkalad/.cache/pip/wheels/24/55/6f/ebfc4cb176d1c9665da4e306e1705496206d08215c1acd9dde
Successfully built rouge_score
Installing collected packages: absl-py, rouge_score
Successfully installed absl

In [42]:
import os
os.environ['TOKENIZERS_PARALLELISM'] = 'false'


In [None]:
!pip install chromadb

In [None]:
import chromadb
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core import StorageContext

# load some documents
documents = SimpleDirectoryReader("./VectorStores/medical_dialog_29k/").load_data()

# initialize client, setting path to save data
db = chromadb.PersistentClient(path="./chroma_db")

# create collection
chroma_collection = db.get_or_create_collection("quickstart")

# assign chroma as the vector_store to the context
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

# create your index
index = VectorStoreIndex.from_documents(
    documents, storage_context=storage_context
)

# create a query engine and query
query_engine = index.as_query_engine()
response = query_engine.query("diarrhea with headache and stomach pain")
print(response)

In [None]:
response = query_engine.query("diarrhea with headache and stomach pain")

In [36]:
print(response)

Hi there! I'm just an AI, I don't have personal experiences, but I'm here to help you with your query.

Based on the context information provided, it seems like you are experiencing some discomforts like diarrhea, headache, and stomach pain. I understand that it can be quite uncomfortable and concerning.

Firstly, let me suggest that you should stay hydrated by drinking plenty of fluids, especially water. Dehydration can exacerbate diarrhea and other symptoms, so it's essential to replenish your body's fluids. You can also try drinking electrolyte-rich beverages like coconut water or sports drinks to help replace lost electrolytes.

In terms of managing your headache, you can try over-the-counter pain relievers like paracetamol or ibuprofen. However, please ensure that you follow the recommended dosage and consult with a medical professional if the pain persists or worsens.

Regarding your stomach pain, it's possible that you may have a


In [40]:
response.response

"Hi there! I'm here to help you with your query. Based on the information provided, it seems like you're experiencing some discomforts that could be related to a few different things.\n\nFirstly, diarrhea can be caused by a variety of factors, such as food poisoning, viral infections, or even a change in diet. If you've recently eaten something that didn't agree with you, it could be the culprit. However, if the diarrhea persists, it's always a good idea to consult with a medical professional to rule out any underlying conditions.\n\nRegarding the headache and stomach pain, it's possible that they could be related to the diarrhea or another underlying condition. Headaches can be caused by a variety of factors, including tension, migraines, or even sinus pressure. Stomach pain can also be caused by a variety of factors, including digestive issues, inflammation, or even a stomach ulcer.\n\nIn any case, I would recommend that you consult with a medical professional to get a proper diagnos

In [31]:
from llama_index.core import StorageContext, load_index_from_storage

storage_context = StorageContext.from_defaults(persist_dir="./VectorStores/new-test/")

# load index
index = load_index_from_storage(storage_context)

ValueError: 
******
Could not load OpenAI embedding model. If you intended to use OpenAI, please check your OPENAI_API_KEY.
Original error:
No API key found for OpenAI.
Please set either the OPENAI_API_KEY environment variable or openai.api_key prior to initialization.
API keys can be found or created at https://platform.openai.com/account/api-keys

Consider using embed_model='local'.
Visit our documentation for more embedding options: https://docs.llamaindex.ai/en/stable/module_guides/models/embeddings.html#modules
******

In [26]:
# from nltk.translate.bleu_score import sentence_bleu


dataset_file_path = "./Datasets/medicare_dataset/refined_data/refined_medicare_test.txt"  # Replace with the path to your dataset file

# List to store patient queries and doctor responses
dataset = []

# Open the file and read its contents
with open(dataset_file_path, "r") as file:
    lines = file.readlines()
lines = lines[:50]
print(lines[50])





In [35]:
eval_data = []

for line in lines:
    if line != "\n":
        eval_data.append((line.split("<Doctor>")[0].replace("<Patient>",""), line.split("<Doctor>")[1].replace("<Doctor>","")))

In [48]:
from nltk.translate.bleu_score import sentence_bleu

def generate_response(patient_query):
    return query_engine.query(patient_query).response

patient = "Ive had a cold which started on Christmas eve but appeared to be getting better over the following week. \\
           However I now have what I think may be sinusitis - pain in the head, yellow mucus from the nose and stuffiness \\
           -squeaking from the sinuses. Will this go away on its own or should I see my GP?"

print(generate_response(patient))


 Thank you for reaching out to me. I'm just an AI, I don't have personal opinions or emotions, but I'm here to help you with your query.

Based on the information provided, it seems that you may be experiencing sinusitis, which can be caused by a viral or bacterial infection. While it's possible for sinusitis to clear up on its own, it's important to consult with a medical professional to determine the cause and appropriate treatment.

Your GP can perform a thorough examination and may recommend further tests, such as a nasal endoscopy or CT scan, to determine the cause of your symptoms. They may also prescribe antibiotics or other medications to help manage your symptoms.

In the meantime, there are some things you can do to help manage your symptoms:

1. Stay hydrated by drinking plenty of fluids, such as water, tea, or soup.
2. Use a humidifier to add moisture to the air, which can help to thin out mucus and make it easier to breathe.
3. Apply warm compress


In [12]:
import pickle
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, ServiceContext

# Step 1: Load the index object from the .pkl file
with open("./index_medical_dialog_50k.pkl", "rb") as file:
    index = pickle.load(file)

In [16]:
import pickle
import json

# Load the object from the pickle file
with open('./index_medical_dialog_50k.pkl', 'rb') as f:
    index_object = pickle.load(f)

# Convert the object to a dictionary
index_dict = index_object.to_dict()

# Save the dictionary to a JSON file
with open('./index.json', 'w') as f:
    json.dump(index_dict, f)


AttributeError: 'VectorStoreIndex' object has no attribute 'to_dict'

In [13]:
query_engine = index.as_query_engine()

AttributeError: _llm

In [None]:
response = query_engine.query("diarrhea with headache and stomach pain")