In [7]:
import pandas as pd
import kagglehub
from kagglehub import KaggleDatasetAdapter
from datasets import load_dataset

In [40]:
qna_data_path = "../data/qna/"
train_size = 110_000
test_size = 2000
val_size = 300

# Medical Conversation Corpus (100k+)

In [1]:

# Download latest version
path = kagglehub.dataset_download("thedevastator/medical-conversation-corpus-100k")

print("Path to dataset files:", path)

  from .autonotebook import tqdm as notebook_tqdm


Downloading from https://www.kaggle.com/api/v1/datasets/download/thedevastator/medical-conversation-corpus-100k?dataset_version_number=2...


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 44.3M/44.3M [00:00<00:00, 49.8MB/s]

Extracting files...





Path to dataset files: /home/super_admin/.cache/kagglehub/datasets/thedevastator/medical-conversation-corpus-100k/versions/2


In [3]:
raw_path = "../data/medical_convo_corpus/2/"


In [9]:
raw_test_df = pd.read_csv(raw_path + "test.csv")
raw_train_df = pd.read_csv(raw_path + "train.csv")

In [21]:
raw_corpus_df = pd.concat([raw_train_df, raw_test_df], ignore_index=True)

In [23]:
raw_corpus_df.iloc[-1, -1].replace("ChatDoctor", " Doctor ").split("\n")

['The conversation between human and AI assistant.',
 '[|Human|] hii, i  am male 31 years old. i have high cholestrol and i have been taking cholestrol tablets(atrovostatin) for the past 6 years. i have a fatlly liver, stage 2. my mother has a auto immune hep B  and liver cirrhosis.i had my liver function tests and all my readings were high. i am concerned about liver now. what odes the stage 2 means as far as fatty liver is concerned? any tips to slove this problem??',
 "[|AI|]  HelloThanks for writing to  Doctor Fatty liver is a very common disease among young adults due to changing lifestyle. Most commonly it is due to obesity and sedentary life. It is also related to alcohol intake, altered lipid profile etc. In your case it is related to altered lipid profile. You need few other investigations like routine hologram, Random blood sugar(RBS), Liver function test(LFT), Renal function test(RFT). You should try to lose your weight if you are overweight. Fatty liver is a reversible cond

In [32]:
def process_row(row):
    conversation_split = row["Conversation"].split("\n")
    quesion = conversation_split[1].replace("[|Human|]", "").strip()
    answer = conversation_split[2].replace("[|AI|]", "").strip()
    return pd.Series([quesion, answer])

raw_corpus_df[["question", "answer"]] = raw_corpus_df.apply(process_row, axis=1)

In [38]:
raw_corpus_df = raw_corpus_df.rename(columns={"quesion": "question"})

In [39]:
raw_corpus_df.head()

Unnamed: 0,Conversation,question,answer
0,The conversation between human and AI assistan...,Hi.My names Ahdieh.I m from a small city in Ir...,Hello Addie! Welcome and thank you for asking ...
1,The conversation between human and AI assistan...,Hello and thank you for your time and assistan...,"Hi, You need to focus more on doing exercises ..."
2,The conversation between human and AI assistan...,Yes! I had double knee replacements on Feb. 20...,Hi Hope this message finds you in good health....
3,The conversation between human and AI assistan...,I m extremely itchy ALL OVER the place? I m ex...,YOUR DURATION OF DISEASE IS NOT KNOWN FROM HIS...
4,The conversation between human and AI assistan...,"Hi Doctor, I need your help, im really worried...","Hello there, Thanks for writing your query, Br..."


In [41]:
raw_corpus_df.loc[10, ["question", "answer"]]

question    i have shortness of breath, there is a pain on...
answer      Hi, Dear,Thanks for your query to Chat Doctor....
Name: 10, dtype: object

# Healthcare NLP: LLMs, Transformers, Datasets

In [45]:
kagglehub.dataset_download_file?

Object `kagglehub.dataset_download_file` not found.


In [46]:
path = kagglehub.dataset_download("jpmiller/layoutlm", path="medquad.csv")

print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/jpmiller/layoutlm?dataset_version_number=16&file_name=medquad.csv...


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4.95M/4.95M [00:00<00:00, 98.9MB/s]

Extracting zip of medquad.csv...
Path to dataset files: /home/super_admin/.cache/kagglehub/datasets/jpmiller/layoutlm/versions/16/medquad.csv





In [50]:
raw_healthcare_df = pd.read_csv(path)

In [51]:
raw_healthcare_df.head()

Unnamed: 0,question,answer,source,focus_area
0,What is (are) Glaucoma ?,Glaucoma is a group of diseases that can damag...,NIHSeniorHealth,Glaucoma
1,What causes Glaucoma ?,"Nearly 2.7 million people have glaucoma, a lea...",NIHSeniorHealth,Glaucoma
2,What are the symptoms of Glaucoma ?,Symptoms of Glaucoma Glaucoma can develop in ...,NIHSeniorHealth,Glaucoma
3,What are the treatments for Glaucoma ?,"Although open-angle glaucoma cannot be cured, ...",NIHSeniorHealth,Glaucoma
4,What is (are) Glaucoma ?,Glaucoma is a group of diseases that can damag...,NIHSeniorHealth,Glaucoma


# UCSD26/medical_dialog

In [54]:
raw_meddialog_dataset = load_dataset("UCSD26/medical_dialog", "processed.en", split="all")

In [56]:
raw_meddialog_df = raw_meddialog_dataset.to_pandas()

In [60]:
raw_meddialog_df.head()

Unnamed: 0,description,utterances
0,throat a bit sore and want to get a good imune...,[patient: throat a bit sore and want to get a ...
1,"hey there i have had cold ""symptoms"" for over ...","[patient: hey there i have had cold ""symptoms""..."
2,i have a tight and painful chest with a dry co...,[patient: i have a tight and painful chest wit...
3,what will happen after the incubation period f...,[patient: what will happen after the incubatio...
4,suggest treatment for pneumonia,[patient: just found out i was pregnant. yeste...


In [79]:
# We only need q and a pairs
raw_meddialog_df = raw_meddialog_df[raw_meddialog_df["utterances"].apply(len) == 2]

In [80]:
raw_meddialog_df.iloc[0, 1]

array(['patient: throat a bit sore and want to get a good imune booster, especially in light of the virus. please advise. have not been in contact with nyone with the virus.',
       "doctor: during this pandemic. throat pain can be from a strep throat infection (antibiotics needed), a cold or influenza or other virus, or from some other cause such as allergies or irritants. usually, a person sees the doctor (call first) if the sore throat is bothersome, recurrent, or doesn't go away quickly. covid-19 infections tend to have cough, whereas strep throat usually lacks cough but has more throat pain. (3/21/20)"],
      dtype=object)

In [83]:
def get_q_and_a(utterences):
    question = utterences[0].replace("patient:", "").strip()
    answer = utterences[1].replace("doctor:", "").strip()
    return pd.Series([question, answer])

raw_meddialog_df[["question", "answer"]] = raw_meddialog_df["utterances"].apply(get_q_and_a)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  raw_meddialog_df[["question", "answer"]] = raw_meddialog_df["utterances"].apply(get_q_and_a)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  raw_meddialog_df[["question", "answer"]] = raw_meddialog_df["utterances"].apply(get_q_and_a)


In [84]:
raw_meddialog_df.head()

Unnamed: 0,description,utterances,question,answer
0,throat a bit sore and want to get a good imune...,[patient: throat a bit sore and want to get a ...,throat a bit sore and want to get a good imune...,during this pandemic. throat pain can be from ...
1,"hey there i have had cold ""symptoms"" for over ...","[patient: hey there i have had cold ""symptoms""...","hey there i have had cold ""symptoms"" for over ...",yes. protection. it is not enough symptoms to ...
2,i have a tight and painful chest with a dry co...,[patient: i have a tight and painful chest wit...,i have a tight and painful chest with a dry co...,"possible. top symptoms include fever, dry coug..."
3,what will happen after the incubation period f...,[patient: what will happen after the incubatio...,what will happen after the incubation period f...,"in brief: symptoms if you are infected, sympto..."
4,suggest treatment for pneumonia,[patient: just found out i was pregnant. yeste...,just found out i was pregnant. yesterday diagn...,thanks for your question on healthcare magic.i...


# Combine

In [90]:
qna_df = pd.concat([raw_corpus_df[["question", "answer"]], raw_healthcare_df[["question", "answer"]], raw_meddialog_df[["question", "answer"]]], ignore_index=True)

In [88]:
qna_df.sample?

[0;31mSignature:[0m
[0mqna_df[0m[0;34m.[0m[0msample[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mn[0m[0;34m:[0m [0;34m'int | None'[0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mfrac[0m[0;34m:[0m [0;34m'float | None'[0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mreplace[0m[0;34m:[0m [0;34m'bool_t'[0m [0;34m=[0m [0;32mFalse[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mweights[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mrandom_state[0m[0;34m:[0m [0;34m'RandomState | None'[0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0maxis[0m[0;34m:[0m [0;34m'Axis | None'[0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mignore_index[0m[0;34m:[0m [0;34m'bool_t'[0m [0;34m=[0m [0;32mFalse[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m [0;34m->[0m [0;34m'Self'[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Return a ran

In [91]:
qna_df_shuffled = qna_df.sample(frac=1, random_state=32).reset_index(drop=True)

In [97]:
qna_df_shuffled.shape

(129177, 2)

In [92]:
qna_df_shuffled.head(10)

Unnamed: 0,question,answer
0,I have had a cyst now for 6 months.6weeks ago ...,Welcome to Chat Doctor It needs to be examined...
1,What are the symptoms of Pili torti ?,What are the signs and symptoms of Pili torti?...
2,I have a mass just under my ribs on the right ...,"Hi, Good Morning. I am Chat Doctor. S.CUTS -su..."
3,"My sister is on Xanax, feyntnol patch and a pa...","Welcome to Chat Doctor, Though you have not gi..."
4,Hello. So today I woke up and was perfectly fi...,Thanks for your question on Chat Doctor. I can...
5,I was playing basketball the other night and w...,Hi Thanks for posting your query on Chat Docto...
6,"Hello, my Mother passed away suddenly on Chris...",Hi. I understand your concern. Stress can affe...
7,For the past few days I have been having a tig...,Thanks for your question on Chat Doctor. I can...
8,i have large cyst on either side on my lower ...,Hello and welcome to Chat Doctor . I thank you...
9,my semen analysis was conducted and the result...,"Hello, Welcome to Chat Doctor, I am Chat Docto..."


In [96]:
qna_df_shuffled.to_csv(qna_data_path + "qna.csv")

In [100]:
train_df = qna_df_shuffled.loc[:train_size -1, :]
test_df = qna_df_shuffled.loc[train_size:train_size + test_size - 1, :]
val_df = qna_df_shuffled.loc[train_size + test_size: train_size + test_size + val_size - 1, :]

In [101]:
train_df.shape, test_df.shape, val_df.shape

((120000, 2), (2000, 2), (300, 2))

In [102]:
train_df.to_csv(qna_data_path + "train.csv", index=False)
test_df.to_csv(qna_data_path + "test.csv", index=False)
val_df.to_csv(qna_data_path + "val.csv", index=False)

In [2]:
from transformers import AutoTokenizer
model_id = "microsoft/Phi-3.5-mini-instruct"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)

In [3]:
prompt_template = """
# Instruction:
Assume you are an excellent doctor. Using your knowledge, answer the quesion given below.

# Question: {question}

# Answer: """
prompt_template = prompt_template.strip()
print(prompt_template)

# Instruction:
Assume you are an excellent doctor. Using your knowledge, answer the quesion given below.

# Question: {question}

# Answer:


In [13]:
def calculate_token_counts(example):
    prompt = prompt_template.format(question=example['question'])
    answer = example["answer"] + tokenizer.eos_token
    
    # Tokenize prompt to get its length
    prompt_tokens = tokenizer(
        prompt,
        truncation=False
    )
    
    prompt_len = len(prompt_tokens["input_ids"])
    
    # Tokenize full sequence once
    tokenized = tokenizer(
        prompt,
        answer,
        truncation=False
    )

    full_len = len(tokenized["input_ids"])
    return pd.Series([prompt_len, full_len])

In [8]:
qna_df = pd.read_csv(qna_data_path + "qna.csv")

In [17]:
qna_df = qna_df[qna_df["answer"].apply(type) == str]

In [24]:
qna_df = qna_df[["question", "answer"]].reset_index(drop=True)

In [25]:
qna_df.iloc[:10, :].apply(calculate_token_counts, axis=1)

Unnamed: 0,0,1
0,227,364
1,46,464
2,110,323
3,145,277
4,105,283
5,113,254
6,214,360
7,182,356
8,119,273
9,158,339


In [26]:
qna_df[["n_prompt_tokens", "n_full_tokens"]] = qna_df.apply(calculate_token_counts, axis=1)

In [27]:
qna_df.head(30)

Unnamed: 0,question,answer,n_prompt_tokens,n_full_tokens
0,I have had a cyst now for 6 months.6weeks ago ...,Welcome to Chat Doctor It needs to be examined...,227,364
1,What are the symptoms of Pili torti ?,What are the signs and symptoms of Pili torti?...,46,464
2,I have a mass just under my ribs on the right ...,"Hi, Good Morning. I am Chat Doctor. S.CUTS -su...",110,323
3,"My sister is on Xanax, feyntnol patch and a pa...","Welcome to Chat Doctor, Though you have not gi...",145,277
4,Hello. So today I woke up and was perfectly fi...,Thanks for your question on Chat Doctor. I can...,105,283
5,I was playing basketball the other night and w...,Hi Thanks for posting your query on Chat Docto...,113,254
6,"Hello, my Mother passed away suddenly on Chris...",Hi. I understand your concern. Stress can affe...,214,360
7,For the past few days I have been having a tig...,Thanks for your question on Chat Doctor. I can...,182,356
8,i have large cyst on either side on my lower ...,Hello and welcome to Chat Doctor . I thank you...,119,273
9,my semen analysis was conducted and the result...,"Hello, Welcome to Chat Doctor, I am Chat Docto...",158,339


In [38]:
qna_df.shape

(129172, 4)

In [41]:
qna_df = qna_df[qna_df["n_full_tokens"] < 512]

In [47]:
qna_df = qna_df.reset_index(drop=True)

In [50]:
train_df = qna_df.loc[:train_size -1, ["question", "answer"]]
test_df = qna_df.loc[train_size:train_size + test_size - 1, ["question", "answer"]]
val_df = qna_df.loc[train_size + test_size: train_size + test_size + val_size - 1, ["question", "answer"]]

In [51]:
train_df.head()

Unnamed: 0,question,answer
0,I have had a cyst now for 6 months.6weeks ago ...,Welcome to Chat Doctor It needs to be examined...
1,What are the symptoms of Pili torti ?,What are the signs and symptoms of Pili torti?...
2,I have a mass just under my ribs on the right ...,"Hi, Good Morning. I am Chat Doctor. S.CUTS -su..."
3,"My sister is on Xanax, feyntnol patch and a pa...","Welcome to Chat Doctor, Though you have not gi..."
4,Hello. So today I woke up and was perfectly fi...,Thanks for your question on Chat Doctor. I can...


In [52]:
train_df.shape, test_df.shape, val_df.shape

((110000, 2), (2000, 2), (300, 2))

In [53]:
train_df.to_csv(qna_data_path + "train.csv", index=False)
test_df.to_csv(qna_data_path + "test.csv", index=False)
val_df.to_csv(qna_data_path + "val.csv", index=False)

In [54]:
qna_df.to_csv(qna_data_path + "qna_long_seq_filtered.csv", index=False)