# Prepare data for Q and A

## Imports

In [None]:
import pandas as pd
import kagglehub
from kagglehub import KaggleDatasetAdapter
from datasets import load_dataset

In [None]:
# Directory that receives every combined Q&A CSV written by this notebook.
qna_data_path = "../data/qna/"
# Row counts used when the shuffled data is cut into consecutive
# train / test / validation ranges further down.
train_size = 110_000
test_size = 2000
val_size = 300

# Medical Conversation Corpus (100k+)

In [None]:
# Download latest version
# The returned value is printed below as the location of the dataset files.
path = kagglehub.dataset_download("thedevastator/medical-conversation-corpus-100k")

print("Path to dataset files:", path)

In [None]:
# NOTE(review): the CSVs are read from this hard-coded directory rather than
# from `path` printed above — presumably the download was placed here by
# hand; confirm both locations hold the same files.
raw_path = "../data/medical_convo_corpus/2/"

In [None]:
# Load the two CSV files shipped with the corpus.
raw_test_df = pd.read_csv(raw_path + "test.csv")
raw_train_df = pd.read_csv(raw_path + "train.csv")

In [None]:
# Stack both files into a single frame with a fresh 0..n-1 index; the data
# is shuffled and split again later in this notebook.
raw_corpus_df = pd.concat([raw_train_df, raw_test_df], ignore_index=True)

In [None]:
# Look at the last column of the last row, split into lines, to see how a
# raw record is laid out.
raw_corpus_df.iloc[-1, -1].replace("ChatDoctor", " Doctor ").split("\n")

In [None]:
def process_row(row):
    """Extract the first question/answer pair from one conversation record.

    The text between the first ``[|Human|]`` marker and the following
    ``[|AI|]`` marker is the question; the text after that ``[|AI|]`` marker,
    up to the next ``[|Human|]`` marker (or the end of the string), is the
    answer. Parsing by marker instead of by line position keeps questions
    and answers that contain line breaks intact.

    If the markers are not both present, the function falls back to treating
    the second line as the question and the third line as the answer.

    Args:
        row: Mapping-like object with a "Conversation" string entry.

    Returns:
        pd.Series containing ``[question, answer]``, both whitespace-stripped.

    Raises:
        IndexError: In the fallback path, when the text has fewer than
            three lines.
    """
    human_tag = "[|Human|]"
    ai_tag = "[|AI|]"
    conversation = row["Conversation"]

    _, found_human, after_human = conversation.partition(human_tag)
    question, found_ai, after_ai = after_human.partition(ai_tag)

    if found_human and found_ai:
        # Stop at the next human turn so a trailing "[|Human|]" marker (or a
        # follow-up question) never leaks into the answer.
        answer = after_ai.partition(human_tag)[0]
        return pd.Series([question.strip(), answer.strip()])

    # Markers missing or out of order: keep the positional behaviour.
    lines = conversation.split("\n")
    question = lines[1].replace(human_tag, "").strip()
    answer = lines[2].replace(ai_tag, "").strip()
    return pd.Series([question, answer])

# Parse every record row by row and store the results as two new columns.
raw_corpus_df[["question", "answer"]] = raw_corpus_df.apply(process_row, axis=1)

In [None]:
# NOTE(review): the assignment above already names the column "question", and
# no code visible here creates a "quesion" column, so this rename looks like
# a leftover no-op — confirm and remove.
raw_corpus_df = raw_corpus_df.rename(columns={"quesion": "question"})

In [None]:
raw_corpus_df.head()

In [None]:
# Spot-check one parsed pair.
raw_corpus_df.loc[10, ["question", "answer"]]

# Healthcare NLP: LLMs, Transformers, Datasets

In [None]:
# IPython help lookup (trailing "?"); this line is not valid plain Python.
kagglehub.dataset_download_file?

In [None]:
# Request only medquad.csv from the dataset. `path` is reused here and now
# refers to this download instead of the earlier one.
path = kagglehub.dataset_download("jpmiller/layoutlm", path="medquad.csv")

print("Path to dataset files:", path)

In [None]:
# The returned path is read directly as a CSV file.
raw_healthcare_df = pd.read_csv(path)

In [None]:
raw_healthcare_df.head()

# UCSD26/medical_dialog

In [None]:
# Load the "processed.en" configuration; split="all" requests every split at
# once rather than a single named one.
raw_meddialog_dataset = load_dataset("UCSD26/medical_dialog", "processed.en", split="all")

In [None]:
# Convert to a pandas DataFrame so it can be combined with the other sources.
raw_meddialog_df = raw_meddialog_dataset.to_pandas()

In [None]:
raw_meddialog_df.head()

In [None]:
# We only need q and a pairs: keep dialogs with exactly two utterances.
# .copy() detaches the filtered rows from the original frame so that adding
# columns to the result later does not trigger pandas' SettingWithCopyWarning.
raw_meddialog_df = raw_meddialog_df[raw_meddialog_df["utterances"].apply(len) == 2].copy()

In [None]:
# Peek at the value in the first row, second column of the filtered frame.
raw_meddialog_df.iloc[0, 1]

In [None]:
def get_q_and_a(utterences):
    """Turn a two-utterance dialog into a (question, answer) pair.

    Only a leading "patient:" / "doctor:" speaker label is removed. Later
    occurrences of those words inside the text are part of the content and
    are left untouched.

    Args:
        utterences: Indexable sequence whose first item is the patient's
            utterance and whose second item is the doctor's reply.

    Returns:
        pd.Series containing ``[question, answer]``, both whitespace-stripped.
    """

    def _strip_speaker(text, label):
        # Remove `label` only when it opens the utterance.
        text = text.strip()
        if text.startswith(label):
            text = text[len(label):]
        return text.strip()

    question = _strip_speaker(utterences[0], "patient:")
    answer = _strip_speaker(utterences[1], "doctor:")
    return pd.Series([question, answer])

# Apply the parser to every "utterances" entry and store the two results as
# "question" and "answer" columns.
raw_meddialog_df[["question", "answer"]] = raw_meddialog_df["utterances"].apply(get_q_and_a)

In [None]:
raw_meddialog_df.head()

# Combine

In [None]:
# Stack the question/answer columns of all three sources into one frame with
# a fresh 0..n-1 index.
qna_df = pd.concat([raw_corpus_df[["question", "answer"]], raw_healthcare_df[["question", "answer"]], raw_meddialog_df[["question", "answer"]]], ignore_index=True)

In [None]:
# IPython help lookup (trailing "?"); this line is not valid plain Python.
qna_df.sample?

In [None]:
# Shuffle all rows (frac=1 samples the whole frame) with a fixed seed so the
# order is reproducible, then renumber the rows from 0.
qna_df_shuffled = qna_df.sample(frac=1, random_state=32).reset_index(drop=True)

In [None]:
qna_df_shuffled.shape

In [None]:
qna_df_shuffled.head(10)

In [None]:
# index=False keeps the row index out of the file, matching the other CSV
# exports in this notebook and avoiding an extra "Unnamed: 0" column when
# the file is read back.
qna_df_shuffled.to_csv(qna_data_path + "qna.csv", index=False)

In [None]:
# Cut the shuffled frame into three consecutive, non-overlapping row ranges:
# train first, then test, then validation. The frame carries a fresh 0..n-1
# index from reset_index, so positional slices select the same rows as
# inclusive label slices would.
train_df = qna_df_shuffled.iloc[:train_size]
test_df = qna_df_shuffled.iloc[train_size:train_size + test_size]
val_df = qna_df_shuffled.iloc[train_size + test_size:train_size + test_size + val_size]

In [None]:
# Confirm the three splits have the expected number of rows.
train_df.shape, test_df.shape, val_df.shape

In [None]:
# Write each split to its own CSV without the row index.
train_df.to_csv(qna_data_path + "train.csv", index=False)
test_df.to_csv(qna_data_path + "test.csv", index=False)
val_df.to_csv(qna_data_path + "val.csv", index=False)

In [None]:
from transformers import AutoTokenizer
# Tokenizer of the model whose token counts are measured below.
model_id = "microsoft/Phi-3.5-mini-instruct"
# NOTE(review): trust_remote_code=True permits code shipped in the model
# repository to run locally — confirm this is required for this tokenizer.
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)

In [None]:
# Prompt wrapped around every question; "{question}" is filled in with
# str.format when token counts are computed.
# NOTE(review): the instruction text spells "question" as "quesion". It is
# left untouched here because it is part of the runtime prompt — if it is
# corrected, any training/inference code using this prompt must change too.
prompt_template = """
# Instruction:
Assume you are an excellent doctor. Using your knowledge, answer the quesion given below.

# Question: {question}

# Answer: """
# Drop the leading newline (and any surrounding whitespace, including the
# space after "# Answer:") introduced by the triple-quoted layout.
prompt_template = prompt_template.strip()
print(prompt_template)

In [None]:
def calculate_token_counts(example):
    """Count tokens for one Q&A row, with and without the answer.

    The question is inserted into the notebook-level ``prompt_template`` and
    the answer gets the tokenizer's EOS token appended. Neither tokenizer
    call truncates its input.

    Args:
        example: Row-like object with "question" and "answer" entries; the
            answer must be a string.

    Returns:
        pd.Series of two ints: the token count of the prompt alone, then the
        token count of the prompt and answer tokenized together.
    """
    prompt_text = prompt_template.format(question=example['question'])
    answer_text = example["answer"] + tokenizer.eos_token

    # Prompt on its own.
    n_prompt = len(tokenizer(prompt_text, truncation=False)["input_ids"])

    # Prompt and answer handed to the tokenizer as two separate arguments.
    # NOTE(review): how the two pieces are joined (e.g. special tokens in
    # between) is decided by the tokenizer — confirm this matches the way
    # training sequences are built.
    n_full = len(tokenizer(prompt_text, answer_text, truncation=False)["input_ids"])

    return pd.Series([n_prompt, n_full])

In [None]:
# Reload the combined, shuffled Q&A table that was written to disk earlier.
qna_df = pd.read_csv(qna_data_path + "qna.csv")

In [None]:
# Keep only rows whose question AND answer are real strings. Missing values
# come back from read_csv as float NaN: a NaN answer cannot be concatenated
# with the EOS token, and a NaN question would be formatted into the prompt
# as the literal text "nan".
qna_df = qna_df[
    qna_df["question"].map(lambda value: isinstance(value, str)).astype(bool)
    & qna_df["answer"].map(lambda value: isinstance(value, str)).astype(bool)
]

In [None]:
# Drop every column except the Q&A text and renumber the rows from 0.
qna_df = qna_df[["question", "answer"]].reset_index(drop=True)

In [None]:
# Dry run on the first 10 rows before processing the whole frame.
qna_df.iloc[:10, :].apply(calculate_token_counts, axis=1)

In [None]:
# Run the token counter on every row and store both counts as new columns.
qna_df[["n_prompt_tokens", "n_full_tokens"]] = qna_df.apply(calculate_token_counts, axis=1)

In [None]:
qna_df.head(30)

In [None]:
qna_df.shape

In [None]:
# Keep only rows whose combined prompt + answer token count is strictly
# below 512.
qna_df = qna_df[qna_df["n_full_tokens"] < 512]

In [None]:
# Renumber the remaining rows from 0 so later row slicing is contiguous.
qna_df = qna_df.reset_index(drop=True)

In [None]:
# Re-create the three splits from the length-filtered table, keeping only
# the text columns. The frame was just re-indexed 0..n-1, so positional
# slices pick the same consecutive row ranges as inclusive label slices.
# A slice that runs past the last row simply yields fewer (or zero) rows.
train_df = qna_df[["question", "answer"]].iloc[:train_size]
test_df = qna_df[["question", "answer"]].iloc[train_size:train_size + test_size]
val_df = qna_df[["question", "answer"]].iloc[train_size + test_size:train_size + test_size + val_size]

In [None]:
train_df.head()

In [None]:
# Confirm the split sizes after filtering.
train_df.shape, test_df.shape, val_df.shape

In [None]:
# Write the filtered splits; these are the same file names used by the
# earlier export in this notebook, so those files are overwritten.
train_df.to_csv(qna_data_path + "train.csv", index=False)
test_df.to_csv(qna_data_path + "test.csv", index=False)
val_df.to_csv(qna_data_path + "val.csv", index=False)

In [None]:
# Also save the whole filtered table, which still carries the two
# token-count columns.
qna_df.to_csv(qna_data_path + "qna_long_seq_filtered.csv", index=False)

In [None]:
# Read the training split back from disk as a sanity check.
train_df = pd.read_csv(qna_data_path + "train.csv")

In [None]:
# Show the first column of the sixth row.
train_df.iloc[5, 0]