In [1]:
# Fetch the Hugging Face access token from Kaggle's secret store instead of hard-coding it.
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
HF_TOKEN = user_secrets.get_secret("HF_TOKEN")  # passed as `token=` to create_repo / upload_file below

# Create dataset for Sleep and fitness

In [2]:
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from huggingface_hub import create_repo, upload_file

In [3]:
# Hugging Face Hub dataset repository that the upload cells below create and write to.
repo_id = "johnjehiel/ph-llm-dataset"

In [None]:
# Read the raw CSV (expected columns: 'Category', 'ID', 'Prompt', 'Response').
csv_path = "/kaggle/input/sleep-and-fitness-dataset/PH-LLM Custom Dataset.csv"
df = pd.read_csv(csv_path)

# 90/10 split that preserves the per-'Category' proportions; the fixed seed makes it repeatable.
train_df, test_df = train_test_split(
    df,
    test_size=0.1,
    random_state=42,
    stratify=df["Category"],
)

# Write each split to its own local Parquet file, without the pandas index.
train_parquet_path = "ph-llm-dataset_train.parquet"
test_parquet_path = "ph-llm-dataset_test.parquet"
for frame, parquet_path in ((train_df, train_parquet_path), (test_df, test_parquet_path)):
    frame.to_parquet(parquet_path, index=False)

# Make sure the dataset repository exists (exist_ok=True: no error if it already does).
create_repo(repo_id=repo_id, token=HF_TOKEN, repo_type="dataset", exist_ok=True)

# Upload train first, then test; each file gets its own commit.
for parquet_path, commit_message in (
    (train_parquet_path, "Upload train split (90%) for sleep and fitness"),
    (test_parquet_path, "Upload test split (10%) for sleep and fitness"),
):
    upload_file(
        path_or_fileobj=parquet_path,
        path_in_repo=parquet_path,  # the repo file name is the same as the local one
        repo_id=repo_id,
        token=HF_TOKEN,
        repo_type="dataset",  # IMPORTANT: target the dataset repo, not a model repo
        commit_message=commit_message,
    )

In [None]:
from datasets import load_dataset

# Reload the dataset that was just uploaded, this time from the Hub.
# Authenticate first (e.g. `huggingface-cli login`) if the repository is not public.
ds = load_dataset("johnjehiel/ph-llm-dataset")

In [None]:
ds

# Create dataset for MMLU clinical knowledge

In [7]:
from datasets import load_dataset, DatasetDict, concatenate_datasets
import pandas as pd

In [None]:
# Load the MMLU clinical-knowledge dataset from the Hub; its "test" and "validation"
# splits are converted in the cells below.
mmlu_ds = load_dataset("openlifescienceai/mmlu_clinical_knowledge")

In [None]:
# Prompt skeleton for four-option multiple-choice questions. format_prompt_response fills
# {specialization} with "specialized in <subject>" (or an empty string) and the remaining
# placeholders from each record's 'data' dictionary.
prompt_template = """You are a medical expert {specialization}. Answer the following question by selecting the correct option.

### Question:
{question}

### Options:
(A) {option_A}
(B) {option_B}
(C) {option_C}
(D) {option_D}
"""
# Target completion: the correct option letter followed by its text.
response_template = "Answer: ({correct_option}) {correct_answer}"

In [None]:
def format_prompt_response(record):
    """Convert one four-option multiple-choice record into a prompt/response row.

    Args:
        record: Mapping with 'id', 'subject_name' and a 'data' dict holding
            'Question', 'Options' (keys 'A'..'D'), 'Correct Answer' and
            'Correct Option'. Missing 'data' entries fall back to "".

    Returns:
        dict with 'Category' (the subject name), 'ID' (the upstream id),
        'Prompt' and 'Response', built from the module-level
        ``prompt_template`` / ``response_template``.
    """
    data = record["data"]
    # `or {}` also covers an 'Options' key that is present but None.
    options = data.get("Options") or {}

    subject_name = record["subject_name"]
    if subject_name:
        template = prompt_template
        specialization = "specialized in " + subject_name.replace("_", " ")
    else:
        # With no subject, "expert {specialization}." would render as "expert ."
        # (stray space before the period), so drop the separating space too.
        template = prompt_template.replace(" {specialization}", "{specialization}")
        specialization = ""

    prompt = template.format(
        specialization=specialization,
        question=data.get("Question", ""),
        option_A=options.get("A", ""),
        option_B=options.get("B", ""),
        option_C=options.get("C", ""),
        option_D=options.get("D", ""),
    )
    response = response_template.format(
        correct_option=data.get("Correct Option", ""),
        correct_answer=data.get("Correct Answer", ""),
    )
    return {
        "Category": subject_name,
        "ID": record["id"],
        "Prompt": prompt,
        "Response": response,
    }

In [None]:
# Upstream columns that are dropped once the Category/ID/Prompt/Response fields exist.
raw_columns = ['subject_name', 'data', 'id']

# Upstream "test" rows -> prompt–response pairs (they go into our "train" split).
converted_test = mmlu_ds["test"].map(format_prompt_response).remove_columns(raw_columns)
# Upstream "validation" rows -> prompt–response pairs (they go into our "test" split).
converted_validation = mmlu_ds["validation"].map(format_prompt_response).remove_columns(raw_columns)

# Bundle both under the split names used by the target dataset.
new_data = DatasetDict({"train": converted_test, "test": converted_validation})

In [None]:
def add_sequential_ids(record, idx):
    """Overwrite the record's 'ID' with its 1-based position and return the same record.

    Meant for ``Dataset.map(..., with_indices=True)``, where ``idx`` is the
    zero-based row index.
    """
    record["ID"] = 1 + idx
    return record

# Replace the upstream ids with 1-based row numbers in both splits, then show the result.
for split_name in ("train", "test"):
    new_data[split_name] = new_data[split_name].map(add_sequential_ids, with_indices=True)

print(new_data)

In [None]:
# Pull the current state of the target dataset from the Hub.
existing_ds = load_dataset("johnjehiel/ph-llm-dataset")

# Per split, place the new prompt–response rows after the rows already published.
updated_ds = DatasetDict({
    split_name: concatenate_datasets([existing_ds[split_name], new_data[split_name]])
    for split_name in ("train", "test")
})
updated_train = updated_ds["train"]
updated_test = updated_ds["test"]

In [None]:
# Local scratch files for the merged splits.
train_parquet_path = "updated_train.parquet"
test_parquet_path = "updated_test.parquet"

# Serialise both merged splits before touching the Hub.
for split_name, parquet_path in (("train", train_parquet_path), ("test", test_parquet_path)):
    updated_ds[split_name].to_parquet(parquet_path)

repo_id = "johnjehiel/ph-llm-dataset"
create_repo(repo_id=repo_id, token=HF_TOKEN, repo_type="dataset", exist_ok=True)

# Upload under the existing repo file names (train first, then test), one commit each.
for split_name, parquet_path in (("train", train_parquet_path), ("test", test_parquet_path)):
    upload_file(
        path_or_fileobj=parquet_path,
        path_in_repo=f"ph-llm-dataset_{split_name}.parquet",
        repo_id=repo_id,
        token=HF_TOKEN,
        repo_type="dataset",
        commit_message=f"Update {split_name} split with mmlu clinical knowledge data",
    )

In [None]:
# Reload the target dataset from the Hub after the upload above.
ds = load_dataset(repo_id)

In [None]:
ds

# Create dataset for MMLU college medicine

In [None]:
# Load the MMLU college-medicine dataset; this rebinds `ds`, which held the reloaded target dataset.
ds = load_dataset("openlifescienceai/mmlu_college_medicine")

In [None]:
# Upstream columns that are dropped once the Category/ID/Prompt/Response fields exist.
raw_columns = ['subject_name', 'data', 'id']

# Upstream "test" rows -> prompt–response pairs (they go into our "train" split).
converted_test = ds["test"].map(format_prompt_response).remove_columns(raw_columns)
# Upstream "validation" rows -> prompt–response pairs (they go into our "test" split).
converted_validation = ds["validation"].map(format_prompt_response).remove_columns(raw_columns)

# Bundle both under the split names used by the target dataset.
new_data = DatasetDict({"train": converted_test, "test": converted_validation})

In [None]:
def add_sequential_ids(record, idx):
    """Overwrite the record's 'ID' with its 1-based position and return the same record.

    Meant for ``Dataset.map(..., with_indices=True)``, where ``idx`` is the
    zero-based row index.
    """
    record["ID"] = 1 + idx
    return record

# Replace the upstream ids with 1-based row numbers in both splits, then show the result.
for split_name in ("train", "test"):
    new_data[split_name] = new_data[split_name].map(add_sequential_ids, with_indices=True)

print(new_data)

In [None]:
# Pull the current state of the target dataset from the Hub.
existing_ds = load_dataset("johnjehiel/ph-llm-dataset")

# Per split, place the new prompt–response rows after the rows already published.
updated_ds = DatasetDict({
    split_name: concatenate_datasets([existing_ds[split_name], new_data[split_name]])
    for split_name in ("train", "test")
})
updated_train = updated_ds["train"]
updated_test = updated_ds["test"]

In [None]:
# Local scratch files for the merged splits.
train_parquet_path = "updated_train.parquet"
test_parquet_path = "updated_test.parquet"

# Serialise both merged splits before touching the Hub.
for split_name, parquet_path in (("train", train_parquet_path), ("test", test_parquet_path)):
    updated_ds[split_name].to_parquet(parquet_path)

repo_id = "johnjehiel/ph-llm-dataset"
create_repo(repo_id=repo_id, token=HF_TOKEN, repo_type="dataset", exist_ok=True)

# Upload under the existing repo file names (train first, then test), one commit each.
for split_name, parquet_path in (("train", train_parquet_path), ("test", test_parquet_path)):
    upload_file(
        path_or_fileobj=parquet_path,
        path_in_repo=f"ph-llm-dataset_{split_name}.parquet",
        repo_id=repo_id,
        token=HF_TOKEN,
        repo_type="dataset",
        commit_message=f"Update {split_name} split with mmlu college medicine data",
    )

In [None]:
# Reload the target dataset from the Hub after the upload above.
ds = load_dataset(repo_id)

In [None]:
ds

# Create dataset for MMLU college biology

In [None]:
# Load the MMLU college-biology dataset; this rebinds `ds`, which held the reloaded target dataset.
ds = load_dataset("openlifescienceai/mmlu_college_biology")

In [None]:
# Upstream columns that are dropped once the Category/ID/Prompt/Response fields exist.
raw_columns = ['subject_name', 'data', 'id']

# Upstream "test" rows -> prompt–response pairs (they go into our "train" split).
converted_test = ds["test"].map(format_prompt_response).remove_columns(raw_columns)
# Upstream "validation" rows -> prompt–response pairs (they go into our "test" split).
converted_validation = ds["validation"].map(format_prompt_response).remove_columns(raw_columns)

# Bundle both under the split names used by the target dataset.
new_data = DatasetDict({"train": converted_test, "test": converted_validation})

In [None]:
def add_sequential_ids(record, idx):
    """Overwrite the record's 'ID' with its 1-based position and return the same record.

    Meant for ``Dataset.map(..., with_indices=True)``, where ``idx`` is the
    zero-based row index.
    """
    record["ID"] = 1 + idx
    return record

# Replace the upstream ids with 1-based row numbers in both splits, then show the result.
for split_name in ("train", "test"):
    new_data[split_name] = new_data[split_name].map(add_sequential_ids, with_indices=True)

print(new_data)

In [None]:
# Pull the current state of the target dataset from the Hub.
existing_ds = load_dataset("johnjehiel/ph-llm-dataset")

# Per split, place the new prompt–response rows after the rows already published.
updated_ds = DatasetDict({
    split_name: concatenate_datasets([existing_ds[split_name], new_data[split_name]])
    for split_name in ("train", "test")
})
updated_train = updated_ds["train"]
updated_test = updated_ds["test"]

In [None]:
# Local scratch files for the merged splits.
train_parquet_path = "updated_train.parquet"
test_parquet_path = "updated_test.parquet"

# Serialise both merged splits before touching the Hub.
for split_name, parquet_path in (("train", train_parquet_path), ("test", test_parquet_path)):
    updated_ds[split_name].to_parquet(parquet_path)

repo_id = "johnjehiel/ph-llm-dataset"
create_repo(repo_id=repo_id, token=HF_TOKEN, repo_type="dataset", exist_ok=True)

# Upload under the existing repo file names (train first, then test), one commit each.
for split_name, parquet_path in (("train", train_parquet_path), ("test", test_parquet_path)):
    upload_file(
        path_or_fileobj=parquet_path,
        path_in_repo=f"ph-llm-dataset_{split_name}.parquet",
        repo_id=repo_id,
        token=HF_TOKEN,
        repo_type="dataset",
        commit_message=f"Update {split_name} split with mmlu college biology data",
    )

In [None]:
# Reload the target dataset from the Hub after the upload above.
ds = load_dataset(repo_id)

In [None]:
ds

# Create dataset for MMLU anatomy

In [None]:
# Load the MMLU anatomy dataset; this rebinds `ds`, which held the reloaded target dataset.
ds = load_dataset("openlifescienceai/mmlu_anatomy")

In [None]:
# Upstream columns that are dropped once the Category/ID/Prompt/Response fields exist.
raw_columns = ['subject_name', 'data', 'id']

# Upstream "test" rows -> prompt–response pairs (they go into our "train" split).
converted_test = ds["test"].map(format_prompt_response).remove_columns(raw_columns)
# Upstream "validation" rows -> prompt–response pairs (they go into our "test" split).
converted_validation = ds["validation"].map(format_prompt_response).remove_columns(raw_columns)

# Bundle both under the split names used by the target dataset.
new_data = DatasetDict({"train": converted_test, "test": converted_validation})

In [None]:
def add_sequential_ids(record, idx):
    """Overwrite the record's 'ID' with its 1-based position and return the same record.

    Meant for ``Dataset.map(..., with_indices=True)``, where ``idx`` is the
    zero-based row index.
    """
    record["ID"] = 1 + idx
    return record

# Replace the upstream ids with 1-based row numbers in both splits, then show the result.
for split_name in ("train", "test"):
    new_data[split_name] = new_data[split_name].map(add_sequential_ids, with_indices=True)

print(new_data)

In [None]:
# Pull the current state of the target dataset from the Hub.
existing_ds = load_dataset("johnjehiel/ph-llm-dataset")

# Per split, place the new prompt–response rows after the rows already published.
updated_ds = DatasetDict({
    split_name: concatenate_datasets([existing_ds[split_name], new_data[split_name]])
    for split_name in ("train", "test")
})
updated_train = updated_ds["train"]
updated_test = updated_ds["test"]

In [None]:
# Local scratch files for the merged splits.
train_parquet_path = "updated_train.parquet"
test_parquet_path = "updated_test.parquet"

# Serialise both merged splits before touching the Hub.
for split_name, parquet_path in (("train", train_parquet_path), ("test", test_parquet_path)):
    updated_ds[split_name].to_parquet(parquet_path)

repo_id = "johnjehiel/ph-llm-dataset"
create_repo(repo_id=repo_id, token=HF_TOKEN, repo_type="dataset", exist_ok=True)

# Upload under the existing repo file names (train first, then test), one commit each.
for split_name, parquet_path in (("train", train_parquet_path), ("test", test_parquet_path)):
    upload_file(
        path_or_fileobj=parquet_path,
        path_in_repo=f"ph-llm-dataset_{split_name}.parquet",
        repo_id=repo_id,
        token=HF_TOKEN,
        repo_type="dataset",
        commit_message=f"Update {split_name} split with mmlu anatomy data",
    )

In [None]:
# Reload the target dataset from the Hub after the upload above.
ds = load_dataset(repo_id)

In [None]:
ds

# Create dataset for MMLU professional medicine

In [None]:
# Load the MMLU professional-medicine dataset; this rebinds `ds`, which held the reloaded target dataset.
ds = load_dataset("openlifescienceai/mmlu_professional_medicine")

In [None]:
# Upstream columns that are dropped once the Category/ID/Prompt/Response fields exist.
raw_columns = ['subject_name', 'data', 'id']

# Upstream "test" rows -> prompt–response pairs (they go into our "train" split).
converted_test = ds["test"].map(format_prompt_response).remove_columns(raw_columns)
# Upstream "validation" rows -> prompt–response pairs (they go into our "test" split).
converted_validation = ds["validation"].map(format_prompt_response).remove_columns(raw_columns)

# Bundle both under the split names used by the target dataset.
new_data = DatasetDict({"train": converted_test, "test": converted_validation})

In [None]:
def add_sequential_ids(record, idx):
    """Overwrite the record's 'ID' with its 1-based position and return the same record.

    Meant for ``Dataset.map(..., with_indices=True)``, where ``idx`` is the
    zero-based row index.
    """
    record["ID"] = 1 + idx
    return record

# Replace the upstream ids with 1-based row numbers in both splits, then show the result.
for split_name in ("train", "test"):
    new_data[split_name] = new_data[split_name].map(add_sequential_ids, with_indices=True)

print(new_data)

In [None]:
# Pull the current state of the target dataset from the Hub.
existing_ds = load_dataset("johnjehiel/ph-llm-dataset")

# Per split, place the new prompt–response rows after the rows already published.
updated_ds = DatasetDict({
    split_name: concatenate_datasets([existing_ds[split_name], new_data[split_name]])
    for split_name in ("train", "test")
})
updated_train = updated_ds["train"]
updated_test = updated_ds["test"]

In [None]:
# Local scratch files for the merged splits.
train_parquet_path = "updated_train.parquet"
test_parquet_path = "updated_test.parquet"

# Serialise both merged splits before touching the Hub.
for split_name, parquet_path in (("train", train_parquet_path), ("test", test_parquet_path)):
    updated_ds[split_name].to_parquet(parquet_path)

repo_id = "johnjehiel/ph-llm-dataset"
create_repo(repo_id=repo_id, token=HF_TOKEN, repo_type="dataset", exist_ok=True)

# Upload under the existing repo file names (train first, then test), one commit each.
for split_name, parquet_path in (("train", train_parquet_path), ("test", test_parquet_path)):
    upload_file(
        path_or_fileobj=parquet_path,
        path_in_repo=f"ph-llm-dataset_{split_name}.parquet",
        repo_id=repo_id,
        token=HF_TOKEN,
        repo_type="dataset",
        commit_message=f"Update {split_name} split with mmlu professional medicine data",
    )

In [None]:
# Reload the target dataset from the Hub after the upload above.
ds = load_dataset(repo_id)

In [None]:
ds

# Create dataset for MMLU medical genetics

In [None]:
# Load the MMLU medical-genetics dataset; this rebinds `ds`, which held the reloaded target dataset.
ds = load_dataset("openlifescienceai/mmlu_medical_genetics")

In [None]:
# Upstream columns that are dropped once the Category/ID/Prompt/Response fields exist.
raw_columns = ['subject_name', 'data', 'id']

# Upstream "test" rows -> prompt–response pairs (they go into our "train" split).
converted_test = ds["test"].map(format_prompt_response).remove_columns(raw_columns)
# Upstream "validation" rows -> prompt–response pairs (they go into our "test" split).
converted_validation = ds["validation"].map(format_prompt_response).remove_columns(raw_columns)

# Bundle both under the split names used by the target dataset.
new_data = DatasetDict({"train": converted_test, "test": converted_validation})

In [None]:
def add_sequential_ids(record, idx):
    """Overwrite the record's 'ID' with its 1-based position and return the same record.

    Meant for ``Dataset.map(..., with_indices=True)``, where ``idx`` is the
    zero-based row index.
    """
    record["ID"] = 1 + idx
    return record

# Replace the upstream ids with 1-based row numbers in both splits, then show the result.
for split_name in ("train", "test"):
    new_data[split_name] = new_data[split_name].map(add_sequential_ids, with_indices=True)

print(new_data)

In [None]:
# Pull the current state of the target dataset from the Hub.
existing_ds = load_dataset("johnjehiel/ph-llm-dataset")

# Per split, place the new prompt–response rows after the rows already published.
updated_ds = DatasetDict({
    split_name: concatenate_datasets([existing_ds[split_name], new_data[split_name]])
    for split_name in ("train", "test")
})
updated_train = updated_ds["train"]
updated_test = updated_ds["test"]

In [None]:
# Local scratch files for the merged splits.
train_parquet_path = "updated_train.parquet"
test_parquet_path = "updated_test.parquet"

# Serialise both merged splits before touching the Hub.
for split_name, parquet_path in (("train", train_parquet_path), ("test", test_parquet_path)):
    updated_ds[split_name].to_parquet(parquet_path)

repo_id = "johnjehiel/ph-llm-dataset"
create_repo(repo_id=repo_id, token=HF_TOKEN, repo_type="dataset", exist_ok=True)

# Upload under the existing repo file names (train first, then test), one commit each.
for split_name, parquet_path in (("train", train_parquet_path), ("test", test_parquet_path)):
    upload_file(
        path_or_fileobj=parquet_path,
        path_in_repo=f"ph-llm-dataset_{split_name}.parquet",
        repo_id=repo_id,
        token=HF_TOKEN,
        repo_type="dataset",
        commit_message=f"Update {split_name} split with mmlu medical genetics data",
    )

In [None]:
# Reload the target dataset from the Hub after the upload above.
ds = load_dataset(repo_id)

In [None]:
ds

# Create dataset for MedQA

In [None]:
# Load the MedQA dataset; unlike the MMLU sections, its "train" and "test" splits are used below.
ds = load_dataset("openlifescienceai/medqa")

In [None]:
# Upstream columns that are dropped once the Category/ID/Prompt/Response fields exist.
raw_columns = ['subject_name', 'data', 'id']

# MedQA "train" rows -> our "train" split. The variable keeps the `converted_test`
# name used by the MMLU cells even though it holds training rows here.
converted_test = ds["train"].map(format_prompt_response).remove_columns(raw_columns)
# MedQA "test" rows -> our "test" split.
converted_validation = ds["test"].map(format_prompt_response).remove_columns(raw_columns)

# Bundle both under the split names used by the target dataset.
new_data = DatasetDict({"train": converted_test, "test": converted_validation})

In [None]:
def add_sequential_ids(record, idx):
    """Overwrite the record's 'ID' with its 1-based position and return the same record.

    Meant for ``Dataset.map(..., with_indices=True)``, where ``idx`` is the
    zero-based row index.
    """
    record["ID"] = 1 + idx
    return record

# Replace the upstream ids with 1-based row numbers in both splits, then show the result.
for split_name in ("train", "test"):
    new_data[split_name] = new_data[split_name].map(add_sequential_ids, with_indices=True)

print(new_data)

In [None]:
# Overwrite 'Category' (set from subject_name by format_prompt_response) with the
# fixed label 'medqa' in both splits.
for split_name in ("train", "test"):
    new_data[split_name] = new_data[split_name].map(lambda x: {'Category': 'medqa'})

In [None]:
# Pull the current state of the target dataset from the Hub.
existing_ds = load_dataset("johnjehiel/ph-llm-dataset")

# Per split, place the new prompt–response rows after the rows already published.
updated_ds = DatasetDict({
    split_name: concatenate_datasets([existing_ds[split_name], new_data[split_name]])
    for split_name in ("train", "test")
})
updated_train = updated_ds["train"]
updated_test = updated_ds["test"]

In [None]:
# Local scratch files for the merged splits.
train_parquet_path = "updated_train.parquet"
test_parquet_path = "updated_test.parquet"

# Serialise both merged splits before touching the Hub.
for split_name, parquet_path in (("train", train_parquet_path), ("test", test_parquet_path)):
    updated_ds[split_name].to_parquet(parquet_path)

repo_id = "johnjehiel/ph-llm-dataset"
create_repo(repo_id=repo_id, token=HF_TOKEN, repo_type="dataset", exist_ok=True)

# Upload under the existing repo file names (train first, then test), one commit each.
for split_name, parquet_path in (("train", train_parquet_path), ("test", test_parquet_path)):
    upload_file(
        path_or_fileobj=parquet_path,
        path_in_repo=f"ph-llm-dataset_{split_name}.parquet",
        repo_id=repo_id,
        token=HF_TOKEN,
        repo_type="dataset",
        commit_message=f"Update {split_name} split with medqa data",
    )

In [None]:
# Reload the target dataset from the Hub after the upload above.
ds = load_dataset(repo_id)

In [None]:
ds

# Create dataset for PubMedQA

In [None]:
# Load the PubMedQA dataset; the cells below use its "train", "test" and "validation" splits.
ds = load_dataset("openlifescienceai/pubmedqa")

In [None]:
# Fold the upstream "test" split into "train" exactly once. Dropping "test" after the
# merge makes the cell safe to re-run: previously every re-run appended the same test
# rows to "train" again. Later cells only read ds["train"] and ds["validation"].
if "test" in ds:
    ds["train"] = concatenate_datasets([ds["train"], ds["test"]])
    del ds["test"]
ds

In [None]:
# PubMedQA templates: three options plus a context passage. This rebinds the global
# prompt_template / response_template names that the earlier sections also used.
prompt_template = """You are a medical expert specialized in bio-medical research. Analyze the given context and answer the following question by selecting the correct or best option.

### Context:
{context}

### Question:
{question}

### Options:
(A) {option_A}
(B) {option_B}
(C) {option_C}
"""
# Target completion: the correct option followed by the long-form explanation.
response_template = """Answer: ({correct_option}) {correct_answer}
Explanation: {long_answer}"""

In [None]:
def format_prompt_response(record):
    """Convert one PubMedQA record into a prompt/response row.

    Args:
        record: Mapping with 'id' and a 'data' dict holding 'Context',
            'Question', 'Options' (keys 'A'..'C'), 'Correct Answer',
            'Correct Option' and 'Long Answer'. Missing entries fall back to "".

    Returns:
        dict with the constant 'Category' "pubmedqa", 'ID' (the upstream id),
        'Prompt' and 'Response', built from the module-level
        ``prompt_template`` / ``response_template``.
    """
    data = record["data"]

    # 'Context' is joined one passage per line. A plain string is used as-is,
    # because joining a str would put every character on its own line; a
    # missing or None context becomes "".
    context = data.get("Context") or ""
    if not isinstance(context, str):
        context = "\n".join(context)

    # `or {}` also covers an 'Options' key that is present but None.
    options = data.get("Options") or {}

    prompt = prompt_template.format(
        context=context,
        question=data.get("Question", ""),
        option_A=options.get("A", ""),
        option_B=options.get("B", ""),
        option_C=options.get("C", ""),
    )
    response = response_template.format(
        correct_option=data.get("Correct Option", ""),
        correct_answer=data.get("Correct Answer", ""),
        long_answer=data.get("Long Answer", ""),
    )
    return {
        "Category": "pubmedqa",
        "ID": record["id"],
        "Prompt": prompt,
        "Response": response,
    }

In [None]:
# Upstream columns that are dropped once the Category/ID/Prompt/Response fields exist.
raw_columns = ['data', 'id']

# PubMedQA "train" rows -> our "train" split. The variable keeps the `converted_test`
# name used by the MMLU cells even though it holds training rows here.
converted_test = ds["train"].map(format_prompt_response).remove_columns(raw_columns)
# PubMedQA "validation" rows -> our "test" split.
converted_validation = ds["validation"].map(format_prompt_response).remove_columns(raw_columns)

# Bundle both under the split names used by the target dataset.
new_data = DatasetDict({"train": converted_test, "test": converted_validation})

In [None]:
def add_sequential_ids(record, idx):
    """Overwrite the record's 'ID' with its 1-based position and return the same record.

    Meant for ``Dataset.map(..., with_indices=True)``, where ``idx`` is the
    zero-based row index.
    """
    record["ID"] = 1 + idx
    return record

# Replace the upstream ids with 1-based row numbers in both splits, then show the result.
for split_name in ("train", "test"):
    new_data[split_name] = new_data[split_name].map(add_sequential_ids, with_indices=True)

print(new_data)

In [None]:
# Pull the current state of the target dataset from the Hub.
existing_ds = load_dataset("johnjehiel/ph-llm-dataset")

# Per split, place the new prompt–response rows after the rows already published.
updated_ds = DatasetDict({
    split_name: concatenate_datasets([existing_ds[split_name], new_data[split_name]])
    for split_name in ("train", "test")
})
updated_train = updated_ds["train"]
updated_test = updated_ds["test"]

In [None]:
# Local scratch files for the merged splits.
train_parquet_path = "updated_train.parquet"
test_parquet_path = "updated_test.parquet"

# Serialise both merged splits before touching the Hub.
for split_name, parquet_path in (("train", train_parquet_path), ("test", test_parquet_path)):
    updated_ds[split_name].to_parquet(parquet_path)

repo_id = "johnjehiel/ph-llm-dataset"
create_repo(repo_id=repo_id, token=HF_TOKEN, repo_type="dataset", exist_ok=True)

# Upload under the existing repo file names (train first, then test), one commit each.
for split_name, parquet_path in (("train", train_parquet_path), ("test", test_parquet_path)):
    upload_file(
        path_or_fileobj=parquet_path,
        path_in_repo=f"ph-llm-dataset_{split_name}.parquet",
        repo_id=repo_id,
        token=HF_TOKEN,
        repo_type="dataset",
        commit_message=f"Update {split_name} split with pubmedqa data",
    )

In [None]:
# Reload the target dataset from the Hub after the upload above.
ds = load_dataset(repo_id)

In [None]:
ds

# Create dataset for MedMCQA

In [None]:
# Load the MedMCQA dataset; it is keyword-filtered below before conversion.
ds = load_dataset("openlifescienceai/medmcqa")

In [4]:
# Lower-case keywords that contains_keyword looks for in each record's text.
# Matching is by plain substring on the lower-cased text, so short entries such as
# 'rest', 'run' or 'care' also hit longer words that merely contain them; entries
# padded with spaces (' rem ', ' train') only match where those spaces are present.
keywords = [
    'sleep', 'fitness', 'stress', 'heart', 'health', 'health care', 'personal', 'medical', 'cardio', 'medicine', 'exercise', 'smoking', 'smoker', 'alcohol', 'alcoholic', 'bmi', 'blood pressure', 'steps', 'step', 'run', 'jog', ' rem ', 'circadian', 'homeostatic', 'injury', ' train', 'etiology', 'etiological', 'recommend', 'advice', 'advise', 'assistance', 'assist', 'workout', 'work out', 'lifestyle', 'z-score', 'z score', 'athelete', 'athlete', 'sport', 'respiratory', 'fatigue', 'pressure', 'recover', 'hydrate', 'faint', 'drowsy', 'drowsiness', 'gym', 'muscle', 'sore', 'wake', 'rest', 'relax', 'insomnia', 'physic', 'care', 'calorie', 'fat loss', 'weight', 'height', 'mobility', 'activity', 'active'
]

In [5]:
len(keywords)

65

In [None]:
# Predicate for Dataset.filter: keeps records whose question mentions any keyword.
def contains_keyword(example):
    """Return True if the record's 'question' contains any entry of ``keywords``.

    The question is lower-cased once and each keyword is tested as a plain
    substring. A missing (None) or empty question never matches.
    """
    text = (example['question'] or "").lower()
    return any(keyword in text for keyword in keywords)

In [None]:
# Keep, in every split, only the records that hit at least one keyword.
for split, subset in list(ds.items()):
    ds[split] = subset.filter(contains_keyword)

print(ds)

In [None]:
# MedMCQA templates: four options addressed by the dataset's opa..opd columns. This
# rebinds the global prompt_template / response_template names used by earlier sections.
prompt_template = """You are a medical expert specialized in {subject_name}. Answer the following question by selecting the correct or best option.

### Question:
{question}

### Options:
(A) {opa}
(B) {opb}
(C) {opc}
(D) {opd}
"""
# {exp} is either "Explanation: ..." or an empty string (see format_prompt_response).
response_template = """Answer: ({correct_option}) {correct_answer}
{exp}"""

In [None]:
def format_prompt_response(record):
    """Convert one MedMCQA record into a prompt/response row.

    Args:
        record: Mapping with 'id', 'question', 'opa'..'opd', 'cop' (index
            0-3 of the correct option), 'exp' (explanation, may be empty or
            None) and 'subject_name'.

    Returns:
        dict with the constant 'Category' "MedMCQA", 'ID' (the upstream id),
        'Prompt' and 'Response', built from the module-level
        ``prompt_template`` / ``response_template``.

    Raises:
        KeyError: if 'cop' is not one of 0, 1, 2, 3.
    """
    opa = record["opa"]
    opb = record["opb"]
    opc = record["opc"]
    opd = record["opd"]
    prompt = prompt_template.format(
        subject_name=record['subject_name'],
        question=record["question"],
        opa=opa,
        opb=opb,
        opc=opc,
        opd=opd,
    )

    # Map the numeric correct-option index to its letter and option text.
    option_map = {0: ('A', opa), 1: ('B', opb), 2: ('C', opc), 3: ('D', opd)}
    correct_option, correct_answer = option_map[record["cop"]]

    exp = f"Explanation: {record['exp']}" if record["exp"] else ""
    response = response_template.format(
        correct_option=correct_option,
        correct_answer=correct_answer,
        exp=exp,
    )
    # With no explanation the template leaves a dangling line break after the
    # answer line; drop it so responses do not end in an empty line.
    if not exp and response.endswith("\n"):
        response = response[:-1]

    return {
        "Category": "MedMCQA",
        "ID": record["id"],
        "Prompt": prompt,
        "Response": response,
    }

In [None]:
# Upstream MedMCQA columns that are dropped once the Category/ID/Prompt/Response fields exist.
raw_columns = ['id', 'question', 'opa', 'opb', 'opc', 'opd', 'cop', 'choice_type', 'exp', 'subject_name', 'topic_name']

# Filtered "train" rows -> our "train" split.
converted_train = ds["train"].map(format_prompt_response).remove_columns(raw_columns)
# Filtered "validation" rows -> our "test" split.
converted_validation = ds["validation"].map(format_prompt_response).remove_columns(raw_columns)

# Bundle both under the split names used by the target dataset.
new_data = DatasetDict({"train": converted_train, "test": converted_validation})

In [None]:
def add_sequential_ids(record, idx):
    """Overwrite the record's 'ID' with its 1-based position and return the same record.

    Meant for ``Dataset.map(..., with_indices=True)``, where ``idx`` is the
    zero-based row index.
    """
    record["ID"] = 1 + idx
    return record

# Replace the upstream ids with 1-based row numbers in both splits, then show the result.
for split_name in ("train", "test"):
    new_data[split_name] = new_data[split_name].map(add_sequential_ids, with_indices=True)

print(new_data)

In [None]:
# Pull the current state of the target dataset from the Hub.
existing_ds = load_dataset("johnjehiel/ph-llm-dataset")

# Per split, place the new prompt–response rows after the rows already published.
updated_ds = DatasetDict({
    split_name: concatenate_datasets([existing_ds[split_name], new_data[split_name]])
    for split_name in ("train", "test")
})
updated_train = updated_ds["train"]
updated_test = updated_ds["test"]

In [None]:
# Local scratch files for the merged splits.
train_parquet_path = "updated_train.parquet"
test_parquet_path = "updated_test.parquet"

# Serialise both merged splits before touching the Hub.
for split_name, parquet_path in (("train", train_parquet_path), ("test", test_parquet_path)):
    updated_ds[split_name].to_parquet(parquet_path)

repo_id = "johnjehiel/ph-llm-dataset"
create_repo(repo_id=repo_id, token=HF_TOKEN, repo_type="dataset", exist_ok=True)

# Upload under the existing repo file names (train first, then test), one commit each.
for split_name, parquet_path in (("train", train_parquet_path), ("test", test_parquet_path)):
    upload_file(
        path_or_fileobj=parquet_path,
        path_in_repo=f"ph-llm-dataset_{split_name}.parquet",
        repo_id=repo_id,
        token=HF_TOKEN,
        repo_type="dataset",
        commit_message=f"Update {split_name} split with medmcqa data",
    )

In [None]:
# Reload the target dataset from the Hub after the upload above.
ds = load_dataset(repo_id)

In [None]:
ds

# Create dataset for Patient-Doctor interaction

In [8]:
# Load the patient–doctor dialogue dataset; it ships a single "train" split
# with 'Description', 'Patient' and 'Doctor' columns.
ds = load_dataset("ruslanmv/ai-medical-chatbot")

README.md:   0%|          | 0.00/863 [00:00<?, ?B/s]

dialogues.parquet:   0%|          | 0.00/142M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/256916 [00:00<?, ? examples/s]

In [9]:
ds

DatasetDict({
    train: Dataset({
        features: ['Description', 'Patient', 'Doctor'],
        num_rows: 256916
    })
})

In [11]:
def contains_keyword(example, keyword_list=None):
    """Return True when the record's description mentions any keyword.

    Args:
        example: Mapping with a 'Description' entry (one dataset row).
        keyword_list: Optional iterable of substrings to look for. When
            omitted, the notebook-level `keywords` list is used, so the
            function can still be passed directly to `Dataset.filter`.

    Returns:
        bool: True if at least one keyword occurs in the lower-cased
        description, otherwise False. A None description never matches.

    Note:
        Only the description is lower-cased; keywords are compared exactly
        as given, so they should already be lowercase.
    """
    if keyword_list is None:
        # Fall back to the list defined in an earlier notebook cell.
        keyword_list = keywords
    # Lower-case once rather than once per keyword; `or ""` guards against
    # records whose description is None.
    description = (example['Description'] or "").lower()
    return any(keyword in description for keyword in keyword_list)

In [12]:
# Keep only the rows accepted by contains_keyword, replacing every split of
# the DatasetDict with its filtered version.
for split, subset in list(ds.items()):
    ds[split] = subset.filter(contains_keyword)

print(ds)

Filter:   0%|          | 0/256916 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['Description', 'Patient', 'Doctor'],
        num_rows: 38164
    })
})


In [15]:
# Hold out 10% of the filtered conversations as a test split. The fixed seed
# makes the partition reproducible across kernel restarts (42 matches the
# random_state used for the sleep-and-fitness split earlier in this notebook).
# NOTE: this rebinds `ds`, so re-running the cell would split the 90% train
# portion again rather than the full filtered set.
ds = ds["train"].train_test_split(test_size=0.1, seed=42)

In [16]:
# Display the DatasetDict summary to check the train/test row counts.
ds

DatasetDict({
    train: Dataset({
        features: ['Description', 'Patient', 'Doctor'],
        num_rows: 34347
    })
    test: Dataset({
        features: ['Description', 'Patient', 'Doctor'],
        num_rows: 3817
    })
})

In [17]:
# Prompt scaffold for the patient-doctor records. The `{description}` and
# `{patient_query}` placeholders are filled in with str.format by
# format_prompt_response.
prompt_template = """You are a compassionate and expert medical advisor specializing in personalized health assistance.
Analyze the context and the patient’s query carefully, considering their background and concerns.

### Patient Context:
{description}

### Patient Query:
{patient_query}

Provide a detailed, evidence-based, and empathetic response that offers practical recommendations.
"""

# The response consists solely of the `{doctor_response}` placeholder; the
# template itself adds no surrounding text.
response_template = """{doctor_response}"""

In [19]:
def format_prompt_response(record, record_index):
    """Turn one dialogue record into a prompt–response row.

    Args:
        record: Mapping with 'Description', 'Patient' and 'Doctor' entries.
        record_index: Zero-based position of the record within its split
            (supplied by `Dataset.map(..., with_indices=True)`).

    Returns:
        dict with the keys 'Category', 'ID', 'Prompt' and 'Response'. 'ID' is
        the one-based position (record_index + 1); 'Prompt' and 'Response'
        come from the module-level prompt_template / response_template.
    """
    prompt_text = prompt_template.format(
        description=record["Description"],
        patient_query=record["Patient"],
    )
    response_text = response_template.format(doctor_response=record["Doctor"])

    formatted = {
        "Category": "patient_doctor_conversation",
        "ID": record_index + 1,  # one-based position within the split
        "Prompt": prompt_text,
        "Response": response_text,
    }
    return formatted

In [20]:
# Map every record of each split to a prompt–response row, then drop the raw
# dialogue columns so only the new fields remain.
converted_train = (
    ds["train"]
    .map(format_prompt_response, with_indices=True)
    .remove_columns(['Description', 'Patient', 'Doctor'])
)
converted_test = (
    ds["test"]
    .map(format_prompt_response, with_indices=True)
    .remove_columns(['Description', 'Patient', 'Doctor'])
)

# Bundle both converted splits; each is appended to the split of the same name.
new_data = DatasetDict({"train": converted_train, "test": converted_test})

Map:   0%|          | 0/34347 [00:00<?, ? examples/s]

Map:   0%|          | 0/3817 [00:00<?, ? examples/s]

In [32]:
# print(new_data["train"][-100])
# print(new_data["test"][-100])

In [30]:
# Fetch the dataset as currently published on the Hugging Face Hub.
existing_ds = load_dataset("johnjehiel/ph-llm-dataset")

# Build the merged DatasetDict directly: each split is the published rows
# followed by the freshly converted prompt–response rows.
updated_ds = DatasetDict({
    "train": concatenate_datasets([existing_ds["train"], new_data["train"]]),
    "test": concatenate_datasets([existing_ds["test"], new_data["test"]]),
})

# Keep the per-split names available for later cells.
updated_train, updated_test = updated_ds["train"], updated_ds["test"]

ph-llm-dataset_train.parquet:   0%|          | 0.00/19.2M [00:00<?, ?B/s]

ph-llm-dataset_test.parquet:   0%|          | 0.00/1.24M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/32954 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2117 [00:00<?, ? examples/s]

In [33]:
# Dataset repository on the Hugging Face Hub that receives the merged splits.
repo_id = "johnjehiel/ph-llm-dataset"

# Local scratch files holding each merged split before it is uploaded.
train_parquet_path, test_parquet_path = "updated_train.parquet", "updated_test.parquet"

# Serialise both merged splits to Parquet.
updated_ds["train"].to_parquet(train_parquet_path)
updated_ds["test"].to_parquet(test_parquet_path)

# exist_ok=True: do not raise when the dataset repo is already there.
create_repo(repo_id=repo_id, repo_type="dataset", token=HF_TOKEN, exist_ok=True)

# Train split: uploaded under the file name already used in the repo.
upload_file(
    repo_id=repo_id,
    repo_type="dataset",
    token=HF_TOKEN,
    path_or_fileobj=train_parquet_path,
    path_in_repo="ph-llm-dataset_train.parquet",
    commit_message="Update train split with patient-doctor conversation data",
)

# Test split: same approach; kept as the last expression so the resulting
# commit info is shown as the cell output.
upload_file(
    repo_id=repo_id,
    repo_type="dataset",
    token=HF_TOKEN,
    path_or_fileobj=test_parquet_path,
    path_in_repo="ph-llm-dataset_test.parquet",
    commit_message="Update test split with patient-doctor conversation data",
)

Creating parquet from Arrow format:   0%|          | 0/68 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/6 [00:00<?, ?ba/s]

updated_train.parquet:   0%|          | 0.00/43.3M [00:00<?, ?B/s]

updated_test.parquet:   0%|          | 0.00/3.89M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/johnjehiel/ph-llm-dataset/commit/53cc68972d85a823e6b6baa8b3c8e6dc61d065ae', commit_message='Update test split with patient-doctor conversation data', commit_description='', oid='53cc68972d85a823e6b6baa8b3c8e6dc61d065ae', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/johnjehiel/ph-llm-dataset', endpoint='https://huggingface.co', repo_type='dataset', repo_id='johnjehiel/ph-llm-dataset'), pr_revision=None, pr_num=None)