In [3]:
from kaggle_secrets import UserSecretsClient
# Fetch the Hugging Face access token from the Kaggle secret named "HF_TOKEN".
# HF_TOKEN is passed as `token=` to the create_repo / upload_file calls below.
user_secrets = UserSecretsClient()
HF_TOKEN = user_secrets.get_secret("HF_TOKEN")

# Create PH-LLM dataset for sleep and fitness

In [4]:
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from huggingface_hub import create_repo, upload_file

In [None]:
# Target dataset repository on the Hugging Face Hub.
repo_id = "johnjehiel/ph-llm-dataset"  # replace with your Hugging Face username/repo name

# Source CSV (assumed to have columns: 'Category', 'ID', 'Prompt', 'Response').
csv_path = "/kaggle/input/sleep-and-fitness-dataset/PH-LLM Custom Dataset.csv"
df = pd.read_csv(csv_path)

# 90% train / 10% test, stratified on 'Category', with a fixed seed.
train_df, test_df = train_test_split(
    df,
    test_size=0.1,
    random_state=42,
    stratify=df["Category"],
)

# One Parquet file per split; the DataFrame index is not written.
train_parquet_path = "ph-llm-dataset_train.parquet"
test_parquet_path = "ph-llm-dataset_test.parquet"
for split_frame, parquet_path in ((train_df, train_parquet_path), (test_df, test_parquet_path)):
    split_frame.to_parquet(parquet_path, index=False)

# Arguments shared by every Hub call in this cell.
# IMPORTANT: repo_type="dataset" makes these calls target a dataset repo.
hub_target = dict(repo_id=repo_id, token=HF_TOKEN, repo_type="dataset")

# Create the dataset repository; exist_ok=True tolerates a repo that already exists.
create_repo(exist_ok=True, **hub_target)

# Upload the train split.
upload_file(
    path_or_fileobj=train_parquet_path,
    path_in_repo="ph-llm-dataset_train.parquet",
    commit_message="Upload train split (90%) for sleep and fitness",
    **hub_target,
)

# Upload the test split; kept as the cell's last expression so its return value is displayed.
upload_file(
    path_or_fileobj=test_parquet_path,
    path_in_repo="ph-llm-dataset_test.parquet",
    commit_message="Upload test split (10%) for sleep and fitness",
    **hub_target,
)

In [4]:
from datasets import load_dataset

# Login using e.g. `huggingface-cli login` to access this dataset
# Load the uploaded dataset from the Hub; no split is requested, so `ds` holds
# all of its splits (train and test, per the output below).
ds = load_dataset("johnjehiel/ph-llm-dataset")

ph-llm-dataset_train.parquet:   0%|          | 0.00/2.19M [00:00<?, ?B/s]

ph-llm-dataset_test.parquet:   0%|          | 0.00/252k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1557 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/173 [00:00<?, ? examples/s]

In [5]:
# Display the loaded dataset: split names, column names, and row counts.
ds

DatasetDict({
    train: Dataset({
        features: ['Category', 'ID', 'Prompt', 'Response'],
        num_rows: 1557
    })
    test: Dataset({
        features: ['Category', 'ID', 'Prompt', 'Response'],
        num_rows: 173
    })
})

# Create MMLU clinical knowledge dataset

In [5]:
from datasets import load_dataset, DatasetDict, concatenate_datasets
import pandas as pd

In [6]:
# Load the MMLU clinical knowledge questions from the Hub into `mmlu_ds`.
# Only its "test" and "validation" splits are used by the conversion cell below.
mmlu_ds = load_dataset("openlifescienceai/mmlu_clinical_knowledge")

In [25]:
# Prompt layout for one multiple-choice question. Placeholders are filled by
# str.format: {specialization}, {question}, and {option_A}..{option_D}.
# The text ends with a newline after option (D).
prompt_template = (
    "You are a medical expert {specialization}. "
    "Answer the following question by selecting the correct option.\n"
    "\n"
    "### Question:\n"
    "{question}\n"
    "\n"
    "### Options:\n"
    "(A) {option_A}\n"
    "(B) {option_B}\n"
    "(C) {option_C}\n"
    "(D) {option_D}\n"
)

# Expected answer layout: the option letter in parentheses, then the answer text.
response_template = "Answer: ({correct_option}) {correct_answer}"

In [55]:
def format_prompt_response(record):
    """Turn one multiple-choice record into a Category/ID/Prompt/Response row.

    Args:
        record: A row with the keys ``"data"``, ``"subject_name"`` and ``"id"``.
            ``record["data"]`` is a mapping read for ``"Question"``,
            ``"Options"`` (a mapping keyed ``"A"`` to ``"D"``),
            ``"Correct Answer"`` and ``"Correct Option"``. A missing key
            falls back to the empty string.

    Returns:
        dict: ``Category`` (the raw ``subject_name``), ``ID`` (the raw ``id``),
        ``Prompt`` (``prompt_template`` filled in) and ``Response``
        (``response_template`` filled in).
    """
    data = record["data"]

    # `or {}` also covers an "Options" key that is present but holds None,
    # which a plain `.get("Options", {})` would hand back unchanged.
    options = data.get("Options") or {}

    # "college_medicine" -> "specialized in college medicine"; empty when the
    # record has no subject.
    subject = record["subject_name"]
    specialization = ("specialized in " + subject.replace("_", " ")) if subject else ""

    prompt = prompt_template.format(
        specialization=specialization,
        question=data.get("Question", ""),
        option_A=options.get("A", ""),
        option_B=options.get("B", ""),
        option_C=options.get("C", ""),
        option_D=options.get("D", ""),
    )
    if not specialization:
        # The template reads "... medical expert {specialization}."; with no
        # subject that renders as "medical expert ." -- remove the stray space.
        prompt = prompt.replace("medical expert .", "medical expert.", 1)

    response = response_template.format(
        correct_option=data.get("Correct Option", ""),
        correct_answer=data.get("Correct Answer", ""),
    )

    return {
        "Category": subject,
        "ID": record["id"],
        "Prompt": prompt,
        "Response": response,
    }

In [27]:
# Source columns that are dropped once the Category/ID/Prompt/Response
# columns have been added by format_prompt_response.
raw_columns = ['subject_name', 'data', 'id']

# Source "test" split -> prompt–response pairs.
converted_test = mmlu_ds["test"].map(format_prompt_response).remove_columns(raw_columns)
# Source "validation" split -> prompt–response pairs.
converted_validation = mmlu_ds["validation"].map(format_prompt_response).remove_columns(raw_columns)

# Re-label the splits for the combined dataset: the source "test" rows go to
# our "train" split and the source "validation" rows go to our "test" split.
new_data = DatasetDict({"train": converted_test, "test": converted_validation})

Map:   0%|          | 0/29 [00:00<?, ? examples/s]

In [32]:
def add_sequential_ids(record, idx):
    """Overwrite the row's ``ID`` with its 1-based position.

    Args:
        record: Mutable mapping for one row; modified in place.
        idx: 0-based row index (supplied by ``map(..., with_indices=True)``).

    Returns:
        The same ``record`` object, with ``record["ID"] == idx + 1``.
    """
    one_based_position = idx + 1
    record["ID"] = one_based_position
    return record

# Renumber each split on its own, train first and then test, so IDs start at 1
# in both splits.
for split_name in ("train", "test"):
    new_data[split_name] = new_data[split_name].map(add_sequential_ids, with_indices=True)

print(new_data)

Map:   0%|          | 0/265 [00:00<?, ? examples/s]

Map:   0%|          | 0/29 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['Category', 'ID', 'Prompt', 'Response'],
        num_rows: 265
    })
    test: Dataset({
        features: ['Category', 'ID', 'Prompt', 'Response'],
        num_rows: 29
    })
})


In [40]:
# Load the existing dataset from Hugging Face
existing_ds = load_dataset("johnjehiel/ph-llm-dataset")

# Categories carried by the rows that are about to be appended.
incoming_categories = set(new_data["train"].unique("Category")) | set(new_data["test"].unique("Category"))


def _drop_incoming(split):
    """Return `split` without the rows whose Category is in `incoming_categories`."""
    return split.filter(lambda row: row["Category"] not in incoming_categories)


# Append (concatenate) the new prompt–response pairs to the existing splits.
# Rows of the same categories that an earlier run of this section already
# pushed to the Hub are dropped first, so re-running the section replaces
# them instead of appending a second copy.
updated_train = concatenate_datasets([_drop_incoming(existing_ds["train"]), new_data["train"]])
updated_test = concatenate_datasets([_drop_incoming(existing_ds["test"]), new_data["test"]])

# Create an updated DatasetDict
updated_ds = DatasetDict({
    "train": updated_train,
    "test": updated_test
})

ph-llm-dataset_train.parquet:   0%|          | 0.00/2.19M [00:00<?, ?B/s]

ph-llm-dataset_test.parquet:   0%|          | 0.00/252k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1557 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/173 [00:00<?, ? examples/s]

In [52]:
# Local scratch files that receive the merged splits.
train_parquet_path = "updated_train.parquet"
test_parquet_path = "updated_test.parquet"

# Write train first, then test, matching the upload order below.
for split_name, parquet_path in (("train", train_parquet_path), ("test", test_parquet_path)):
    updated_ds[split_name].to_parquet(parquet_path)

repo_id = "johnjehiel/ph-llm-dataset"

# Arguments shared by every Hub call in this cell.
hub_target = dict(repo_id=repo_id, token=HF_TOKEN, repo_type="dataset")
create_repo(exist_ok=True, **hub_target)

# Upload the merged train split under the existing train file name in the repo.
upload_file(
    path_or_fileobj=train_parquet_path,
    path_in_repo="ph-llm-dataset_train.parquet",
    commit_message="Update train split with mmlu clinical knowledge data",
    **hub_target,
)

# Upload the merged test split under the existing test file name; kept as the
# cell's last expression so its CommitInfo is displayed.
upload_file(
    path_or_fileobj=test_parquet_path,
    path_in_repo="ph-llm-dataset_test.parquet",
    commit_message="Update test split with mmlu clinical knowledge data",
    **hub_target,
)

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

updated_train.parquet:   0%|          | 0.00/2.23M [00:00<?, ?B/s]

updated_test.parquet:   0%|          | 0.00/257k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/johnjehiel/ph-llm-dataset/commit/6551f0c3667130c7eef6989a97b55c9bc7941925', commit_message='Update test split with mmlu clinical knowledge data', commit_description='', oid='6551f0c3667130c7eef6989a97b55c9bc7941925', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/johnjehiel/ph-llm-dataset', endpoint='https://huggingface.co', repo_type='dataset', repo_id='johnjehiel/ph-llm-dataset'), pr_revision=None, pr_num=None)

In [53]:
# Reload the combined dataset from the Hub after the two uploads above.
ds = load_dataset(repo_id)

ph-llm-dataset_train.parquet:   0%|          | 0.00/2.23M [00:00<?, ?B/s]

ph-llm-dataset_test.parquet:   0%|          | 0.00/257k [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [54]:
# Display the reloaded dataset: split names, column names, and row counts.
ds

DatasetDict({
    train: Dataset({
        features: ['Category', 'ID', 'Prompt', 'Response'],
        num_rows: 1822
    })
    test: Dataset({
        features: ['Category', 'ID', 'Prompt', 'Response'],
        num_rows: 202
    })
})

# Create dataset for MMLU college medicine

In [56]:
# Load the MMLU college medicine questions from the Hub.
# NOTE: this rebinds `ds`, which held the reloaded combined dataset in the cells above.
ds = load_dataset("openlifescienceai/mmlu_college_medicine")

README.md:   0%|          | 0.00/854 [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/63.7k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/14.3k [00:00<?, ?B/s]

dev-00000-of-00001.parquet:   0%|          | 0.00/8.61k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/173 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/22 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/5 [00:00<?, ? examples/s]

In [57]:
# Source columns that are dropped once the Category/ID/Prompt/Response
# columns have been added by format_prompt_response.
raw_columns = ['subject_name', 'data', 'id']

# Source "test" split -> prompt–response pairs.
converted_test = ds["test"].map(format_prompt_response).remove_columns(raw_columns)
# Source "validation" split -> prompt–response pairs.
converted_validation = ds["validation"].map(format_prompt_response).remove_columns(raw_columns)

# Re-label the splits for the combined dataset: the source "test" rows go to
# our "train" split and the source "validation" rows go to our "test" split.
new_data = DatasetDict({"train": converted_test, "test": converted_validation})

Map:   0%|          | 0/173 [00:00<?, ? examples/s]

Map:   0%|          | 0/22 [00:00<?, ? examples/s]

In [61]:
def add_sequential_ids(record, idx):
    """Overwrite the row's ``ID`` with its 1-based position.

    Same definition as in the MMLU clinical knowledge section of this notebook.

    Args:
        record: Mutable mapping for one row; modified in place.
        idx: 0-based row index (supplied by ``map(..., with_indices=True)``).

    Returns:
        The same ``record`` object, with ``record["ID"] == idx + 1``.
    """
    one_based_position = idx + 1
    record["ID"] = one_based_position
    return record

# Renumber each split on its own, train first and then test, so IDs start at 1
# in both splits.
for split_name in ("train", "test"):
    new_data[split_name] = new_data[split_name].map(add_sequential_ids, with_indices=True)

print(new_data)

Map:   0%|          | 0/173 [00:00<?, ? examples/s]

Map:   0%|          | 0/22 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['Category', 'ID', 'Prompt', 'Response'],
        num_rows: 173
    })
    test: Dataset({
        features: ['Category', 'ID', 'Prompt', 'Response'],
        num_rows: 22
    })
})


In [69]:
# Load the existing dataset from Hugging Face
existing_ds = load_dataset("johnjehiel/ph-llm-dataset")

# Categories carried by the rows that are about to be appended.
incoming_categories = set(new_data["train"].unique("Category")) | set(new_data["test"].unique("Category"))


def _drop_incoming(split):
    """Return `split` without the rows whose Category is in `incoming_categories`."""
    return split.filter(lambda row: row["Category"] not in incoming_categories)


# Append (concatenate) the new prompt–response pairs to the existing splits.
# Rows of the same categories that an earlier run of this section already
# pushed to the Hub are dropped first, so re-running the section replaces
# them instead of appending a second copy.
updated_train = concatenate_datasets([_drop_incoming(existing_ds["train"]), new_data["train"]])
updated_test = concatenate_datasets([_drop_incoming(existing_ds["test"]), new_data["test"]])

# Create an updated DatasetDict
updated_ds = DatasetDict({
    "train": updated_train,
    "test": updated_test
})

In [75]:
# Local scratch files that receive the merged splits.
train_parquet_path = "updated_train.parquet"
test_parquet_path = "updated_test.parquet"

# Write train first, then test, matching the upload order below.
for split_name, parquet_path in (("train", train_parquet_path), ("test", test_parquet_path)):
    updated_ds[split_name].to_parquet(parquet_path)

repo_id = "johnjehiel/ph-llm-dataset"

# Arguments shared by every Hub call in this cell.
hub_target = dict(repo_id=repo_id, token=HF_TOKEN, repo_type="dataset")
create_repo(exist_ok=True, **hub_target)

# Upload the merged train split under the existing train file name in the repo.
upload_file(
    path_or_fileobj=train_parquet_path,
    path_in_repo="ph-llm-dataset_train.parquet",
    commit_message="Update train split with mmlu college medicine data",
    **hub_target,
)

# Upload the merged test split under the existing test file name; kept as the
# cell's last expression so its CommitInfo is displayed.
upload_file(
    path_or_fileobj=test_parquet_path,
    path_in_repo="ph-llm-dataset_test.parquet",
    commit_message="Update test split with mmlu college medicine data",
    **hub_target,
)

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

updated_train.parquet:   0%|          | 0.00/2.28M [00:00<?, ?B/s]

updated_test.parquet:   0%|          | 0.00/263k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/johnjehiel/ph-llm-dataset/commit/1db0db814251e686363d33aae201d846149f4839', commit_message='Update test split with mmlu college medicine data', commit_description='', oid='1db0db814251e686363d33aae201d846149f4839', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/johnjehiel/ph-llm-dataset', endpoint='https://huggingface.co', repo_type='dataset', repo_id='johnjehiel/ph-llm-dataset'), pr_revision=None, pr_num=None)

In [76]:
# Reload the combined dataset from the Hub after the two uploads above.
ds = load_dataset(repo_id)

ph-llm-dataset_train.parquet:   0%|          | 0.00/2.28M [00:00<?, ?B/s]

ph-llm-dataset_test.parquet:   0%|          | 0.00/263k [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [81]:
# Display the reloaded dataset: split names, column names, and row counts.
ds

DatasetDict({
    train: Dataset({
        features: ['Category', 'ID', 'Prompt', 'Response'],
        num_rows: 1995
    })
    test: Dataset({
        features: ['Category', 'ID', 'Prompt', 'Response'],
        num_rows: 224
    })
})

# Create dataset for MMLU college biology

In [82]:
# Load the MMLU college biology questions from the Hub.
# NOTE: this rebinds `ds`, which held the reloaded combined dataset in the cells above.
ds = load_dataset("openlifescienceai/mmlu_college_biology")

README.md:   0%|          | 0.00/853 [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/49.5k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/12.0k [00:00<?, ?B/s]

dev-00000-of-00001.parquet:   0%|          | 0.00/7.33k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/144 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/16 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/5 [00:00<?, ? examples/s]

In [83]:
# Source columns that are dropped once the Category/ID/Prompt/Response
# columns have been added by format_prompt_response.
raw_columns = ['subject_name', 'data', 'id']

# Source "test" split -> prompt–response pairs.
converted_test = ds["test"].map(format_prompt_response).remove_columns(raw_columns)
# Source "validation" split -> prompt–response pairs.
converted_validation = ds["validation"].map(format_prompt_response).remove_columns(raw_columns)

# Re-label the splits for the combined dataset: the source "test" rows go to
# our "train" split and the source "validation" rows go to our "test" split.
new_data = DatasetDict({"train": converted_test, "test": converted_validation})

Map:   0%|          | 0/144 [00:00<?, ? examples/s]

Map:   0%|          | 0/16 [00:00<?, ? examples/s]

In [84]:
def add_sequential_ids(record, idx):
    """Overwrite the row's ``ID`` with its 1-based position.

    Same definition as in the earlier sections of this notebook.

    Args:
        record: Mutable mapping for one row; modified in place.
        idx: 0-based row index (supplied by ``map(..., with_indices=True)``).

    Returns:
        The same ``record`` object, with ``record["ID"] == idx + 1``.
    """
    one_based_position = idx + 1
    record["ID"] = one_based_position
    return record

# Renumber each split on its own, train first and then test, so IDs start at 1
# in both splits.
for split_name in ("train", "test"):
    new_data[split_name] = new_data[split_name].map(add_sequential_ids, with_indices=True)

print(new_data)

Map:   0%|          | 0/144 [00:00<?, ? examples/s]

Map:   0%|          | 0/16 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['Category', 'ID', 'Prompt', 'Response'],
        num_rows: 144
    })
    test: Dataset({
        features: ['Category', 'ID', 'Prompt', 'Response'],
        num_rows: 16
    })
})


In [85]:
# Load the existing dataset from Hugging Face
existing_ds = load_dataset("johnjehiel/ph-llm-dataset")

# Categories carried by the rows that are about to be appended.
incoming_categories = set(new_data["train"].unique("Category")) | set(new_data["test"].unique("Category"))


def _drop_incoming(split):
    """Return `split` without the rows whose Category is in `incoming_categories`."""
    return split.filter(lambda row: row["Category"] not in incoming_categories)


# Append (concatenate) the new prompt–response pairs to the existing splits.
# Rows of the same categories that an earlier run of this section already
# pushed to the Hub are dropped first, so re-running the section replaces
# them instead of appending a second copy.
updated_train = concatenate_datasets([_drop_incoming(existing_ds["train"]), new_data["train"]])
updated_test = concatenate_datasets([_drop_incoming(existing_ds["test"]), new_data["test"]])

# Create an updated DatasetDict
updated_ds = DatasetDict({
    "train": updated_train,
    "test": updated_test
})

In [86]:
# Local scratch files that receive the merged splits.
train_parquet_path = "updated_train.parquet"
test_parquet_path = "updated_test.parquet"

# Write train first, then test, matching the upload order below.
for split_name, parquet_path in (("train", train_parquet_path), ("test", test_parquet_path)):
    updated_ds[split_name].to_parquet(parquet_path)

repo_id = "johnjehiel/ph-llm-dataset"

# Arguments shared by every Hub call in this cell.
hub_target = dict(repo_id=repo_id, token=HF_TOKEN, repo_type="dataset")
create_repo(exist_ok=True, **hub_target)

# Upload the merged train split under the existing train file name in the repo.
upload_file(
    path_or_fileobj=train_parquet_path,
    path_in_repo="ph-llm-dataset_train.parquet",
    commit_message="Update train split with mmlu college biology data",
    **hub_target,
)

# Upload the merged test split under the existing test file name; kept as the
# cell's last expression so its CommitInfo is displayed.
upload_file(
    path_or_fileobj=test_parquet_path,
    path_in_repo="ph-llm-dataset_test.parquet",
    commit_message="Update test split with mmlu college biology data",
    **hub_target,
)

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

updated_train.parquet:   0%|          | 0.00/2.32M [00:00<?, ?B/s]

updated_test.parquet:   0%|          | 0.00/267k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/johnjehiel/ph-llm-dataset/commit/65a0fbf0a3cfa3db6f75130e3f6fc045e8119c19', commit_message='Update test split with mmlu college biology data', commit_description='', oid='65a0fbf0a3cfa3db6f75130e3f6fc045e8119c19', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/johnjehiel/ph-llm-dataset', endpoint='https://huggingface.co', repo_type='dataset', repo_id='johnjehiel/ph-llm-dataset'), pr_revision=None, pr_num=None)

In [87]:
# Reload the combined dataset from the Hub after the two uploads above.
ds = load_dataset(repo_id)

ph-llm-dataset_train.parquet:   0%|          | 0.00/2.32M [00:00<?, ?B/s]

ph-llm-dataset_test.parquet:   0%|          | 0.00/267k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2139 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/240 [00:00<?, ? examples/s]

In [88]:
# Display the reloaded dataset: split names, column names, and row counts.
ds

DatasetDict({
    train: Dataset({
        features: ['Category', 'ID', 'Prompt', 'Response'],
        num_rows: 2139
    })
    test: Dataset({
        features: ['Category', 'ID', 'Prompt', 'Response'],
        num_rows: 240
    })
})

# Create dataset for MMLU anatomy

In [89]:
# Load the MMLU anatomy questions from the Hub.
# NOTE: this rebinds `ds`, which held the reloaded combined dataset in the cells above.
ds = load_dataset("openlifescienceai/mmlu_anatomy")

README.md:   0%|          | 0.00/853 [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/36.2k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/9.35k [00:00<?, ?B/s]

dev-00000-of-00001.parquet:   0%|          | 0.00/6.57k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/135 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/14 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/5 [00:00<?, ? examples/s]

In [90]:
# Source columns that are dropped once the Category/ID/Prompt/Response
# columns have been added by format_prompt_response.
raw_columns = ['subject_name', 'data', 'id']

# Source "test" split -> prompt–response pairs.
converted_test = ds["test"].map(format_prompt_response).remove_columns(raw_columns)
# Source "validation" split -> prompt–response pairs.
converted_validation = ds["validation"].map(format_prompt_response).remove_columns(raw_columns)

# Re-label the splits for the combined dataset: the source "test" rows go to
# our "train" split and the source "validation" rows go to our "test" split.
new_data = DatasetDict({"train": converted_test, "test": converted_validation})

Map:   0%|          | 0/135 [00:00<?, ? examples/s]

Map:   0%|          | 0/14 [00:00<?, ? examples/s]

In [91]:
def add_sequential_ids(record, idx):
    """Overwrite the row's ``ID`` with its 1-based position.

    Same definition as in the earlier sections of this notebook.

    Args:
        record: Mutable mapping for one row; modified in place.
        idx: 0-based row index (supplied by ``map(..., with_indices=True)``).

    Returns:
        The same ``record`` object, with ``record["ID"] == idx + 1``.
    """
    one_based_position = idx + 1
    record["ID"] = one_based_position
    return record

# Renumber each split on its own, train first and then test, so IDs start at 1
# in both splits.
for split_name in ("train", "test"):
    new_data[split_name] = new_data[split_name].map(add_sequential_ids, with_indices=True)

print(new_data)

Map:   0%|          | 0/135 [00:00<?, ? examples/s]

Map:   0%|          | 0/14 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['Category', 'ID', 'Prompt', 'Response'],
        num_rows: 135
    })
    test: Dataset({
        features: ['Category', 'ID', 'Prompt', 'Response'],
        num_rows: 14
    })
})


In [93]:
# Load the existing dataset from Hugging Face
existing_ds = load_dataset("johnjehiel/ph-llm-dataset")

# Categories carried by the rows that are about to be appended.
incoming_categories = set(new_data["train"].unique("Category")) | set(new_data["test"].unique("Category"))


def _drop_incoming(split):
    """Return `split` without the rows whose Category is in `incoming_categories`."""
    return split.filter(lambda row: row["Category"] not in incoming_categories)


# Append (concatenate) the new prompt–response pairs to the existing splits.
# Rows of the same categories that an earlier run of this section already
# pushed to the Hub are dropped first, so re-running the section replaces
# them instead of appending a second copy.
updated_train = concatenate_datasets([_drop_incoming(existing_ds["train"]), new_data["train"]])
updated_test = concatenate_datasets([_drop_incoming(existing_ds["test"]), new_data["test"]])

# Create an updated DatasetDict
updated_ds = DatasetDict({
    "train": updated_train,
    "test": updated_test
})

In [94]:
# Local scratch files that receive the merged splits.
train_parquet_path = "updated_train.parquet"
test_parquet_path = "updated_test.parquet"

# Write train first, then test, matching the upload order below.
for split_name, parquet_path in (("train", train_parquet_path), ("test", test_parquet_path)):
    updated_ds[split_name].to_parquet(parquet_path)

repo_id = "johnjehiel/ph-llm-dataset"

# Arguments shared by every Hub call in this cell.
hub_target = dict(repo_id=repo_id, token=HF_TOKEN, repo_type="dataset")
create_repo(exist_ok=True, **hub_target)

# Upload the merged train split under the existing train file name in the repo.
upload_file(
    path_or_fileobj=train_parquet_path,
    path_in_repo="ph-llm-dataset_train.parquet",
    commit_message="Update train split with mmlu anatomy data",
    **hub_target,
)

# Upload the merged test split under the existing test file name; kept as the
# cell's last expression so its CommitInfo is displayed.
upload_file(
    path_or_fileobj=test_parquet_path,
    path_in_repo="ph-llm-dataset_test.parquet",
    commit_message="Update test split with mmlu anatomy data",
    **hub_target,
)

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

updated_train.parquet:   0%|          | 0.00/2.34M [00:00<?, ?B/s]

updated_test.parquet:   0%|          | 0.00/269k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/johnjehiel/ph-llm-dataset/commit/2e412c86b33daa76bdaa54d6b4c6bf1ae734c6d9', commit_message='Update test split with mmlu anatomy data', commit_description='', oid='2e412c86b33daa76bdaa54d6b4c6bf1ae734c6d9', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/johnjehiel/ph-llm-dataset', endpoint='https://huggingface.co', repo_type='dataset', repo_id='johnjehiel/ph-llm-dataset'), pr_revision=None, pr_num=None)

In [95]:
# Reload the combined dataset from the Hub after the two uploads above.
ds = load_dataset(repo_id)

ph-llm-dataset_train.parquet:   0%|          | 0.00/2.34M [00:00<?, ?B/s]

ph-llm-dataset_test.parquet:   0%|          | 0.00/269k [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [96]:
# Display the reloaded dataset: split names, column names, and row counts.
ds

DatasetDict({
    train: Dataset({
        features: ['Category', 'ID', 'Prompt', 'Response'],
        num_rows: 2274
    })
    test: Dataset({
        features: ['Category', 'ID', 'Prompt', 'Response'],
        num_rows: 254
    })
})

# Create dataset for MMLU professional medicine

In [97]:
# Load the MMLU professional medicine questions from the Hub.
# NOTE: this rebinds `ds`, which held the reloaded combined dataset in the cells above.
ds = load_dataset("openlifescienceai/mmlu_professional_medicine")

README.md:   0%|          | 0.00/857 [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/149k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

dev-00000-of-00001.parquet:   0%|          | 0.00/11.4k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/272 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/31 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/5 [00:00<?, ? examples/s]

In [98]:
# Source columns that are dropped once the Category/ID/Prompt/Response
# columns have been added by format_prompt_response.
raw_columns = ['subject_name', 'data', 'id']

# Source "test" split -> prompt–response pairs.
converted_test = ds["test"].map(format_prompt_response).remove_columns(raw_columns)
# Source "validation" split -> prompt–response pairs.
converted_validation = ds["validation"].map(format_prompt_response).remove_columns(raw_columns)

# Re-label the splits for the combined dataset: the source "test" rows go to
# our "train" split and the source "validation" rows go to our "test" split.
new_data = DatasetDict({"train": converted_test, "test": converted_validation})

Map:   0%|          | 0/272 [00:00<?, ? examples/s]

Map:   0%|          | 0/31 [00:00<?, ? examples/s]

In [99]:
def add_sequential_ids(record, idx):
    """Overwrite the row's ``ID`` with its 1-based position.

    Same definition as in the earlier sections of this notebook.

    Args:
        record: Mutable mapping for one row; modified in place.
        idx: 0-based row index (supplied by ``map(..., with_indices=True)``).

    Returns:
        The same ``record`` object, with ``record["ID"] == idx + 1``.
    """
    one_based_position = idx + 1
    record["ID"] = one_based_position
    return record

# Renumber each split on its own, train first and then test, so IDs start at 1
# in both splits.
for split_name in ("train", "test"):
    new_data[split_name] = new_data[split_name].map(add_sequential_ids, with_indices=True)

print(new_data)

Map:   0%|          | 0/272 [00:00<?, ? examples/s]

Map:   0%|          | 0/31 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['Category', 'ID', 'Prompt', 'Response'],
        num_rows: 272
    })
    test: Dataset({
        features: ['Category', 'ID', 'Prompt', 'Response'],
        num_rows: 31
    })
})


In [100]:
# Load the existing dataset from Hugging Face
existing_ds = load_dataset("johnjehiel/ph-llm-dataset")

# Categories carried by the rows that are about to be appended.
incoming_categories = set(new_data["train"].unique("Category")) | set(new_data["test"].unique("Category"))


def _drop_incoming(split):
    """Return `split` without the rows whose Category is in `incoming_categories`."""
    return split.filter(lambda row: row["Category"] not in incoming_categories)


# Append (concatenate) the new prompt–response pairs to the existing splits.
# Rows of the same categories that an earlier run of this section already
# pushed to the Hub are dropped first, so re-running the section replaces
# them instead of appending a second copy.
updated_train = concatenate_datasets([_drop_incoming(existing_ds["train"]), new_data["train"]])
updated_test = concatenate_datasets([_drop_incoming(existing_ds["test"]), new_data["test"]])

# Create an updated DatasetDict
updated_ds = DatasetDict({
    "train": updated_train,
    "test": updated_test
})

In [101]:
# Local scratch files that receive the merged splits.
train_parquet_path = "updated_train.parquet"
test_parquet_path = "updated_test.parquet"

# Write train first, then test, matching the upload order below.
for split_name, parquet_path in (("train", train_parquet_path), ("test", test_parquet_path)):
    updated_ds[split_name].to_parquet(parquet_path)

repo_id = "johnjehiel/ph-llm-dataset"

# Arguments shared by every Hub call in this cell.
hub_target = dict(repo_id=repo_id, token=HF_TOKEN, repo_type="dataset")
create_repo(exist_ok=True, **hub_target)

# Upload the merged train split under the existing train file name in the repo.
upload_file(
    path_or_fileobj=train_parquet_path,
    path_in_repo="ph-llm-dataset_train.parquet",
    commit_message="Update train split with mmlu professional medicine data",
    **hub_target,
)

# Upload the merged test split under the existing test file name; kept as the
# cell's last expression so its CommitInfo is displayed.
upload_file(
    path_or_fileobj=test_parquet_path,
    path_in_repo="ph-llm-dataset_test.parquet",
    commit_message="Update test split with mmlu professional medicine data",
    **hub_target,
)

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

updated_train.parquet:   0%|          | 0.00/2.47M [00:00<?, ?B/s]

updated_test.parquet:   0%|          | 0.00/284k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/johnjehiel/ph-llm-dataset/commit/69e070424ede53ada0e9f23e8930b806bfe3c322', commit_message='Update test split with mmlu professional medicine data', commit_description='', oid='69e070424ede53ada0e9f23e8930b806bfe3c322', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/johnjehiel/ph-llm-dataset', endpoint='https://huggingface.co', repo_type='dataset', repo_id='johnjehiel/ph-llm-dataset'), pr_revision=None, pr_num=None)

In [102]:
# Reload the combined dataset from the Hub after the two uploads above.
ds = load_dataset(repo_id)

ph-llm-dataset_train.parquet:   0%|          | 0.00/2.47M [00:00<?, ?B/s]

ph-llm-dataset_test.parquet:   0%|          | 0.00/284k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2546 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/285 [00:00<?, ? examples/s]

In [107]:
# Display the reloaded dataset: split names, column names, and row counts.
ds

DatasetDict({
    train: Dataset({
        features: ['Category', 'ID', 'Prompt', 'Response'],
        num_rows: 2546
    })
    test: Dataset({
        features: ['Category', 'ID', 'Prompt', 'Response'],
        num_rows: 285
    })
})

# Create dataset for MMLU medical genetics

In [108]:
# Load the MMLU medical genetics questions from the Hub.
# NOTE: this rebinds `ds`, which held the reloaded combined dataset in the cells above.
ds = load_dataset("openlifescienceai/mmlu_medical_genetics")

README.md:   0%|          | 0.00/853 [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/27.4k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/9.74k [00:00<?, ?B/s]

dev-00000-of-00001.parquet:   0%|          | 0.00/6.64k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/100 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/11 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/5 [00:00<?, ? examples/s]

In [109]:
# Source columns that are dropped once the Category/ID/Prompt/Response
# columns have been added by format_prompt_response.
raw_columns = ['subject_name', 'data', 'id']

# Source "test" split -> prompt–response pairs.
converted_test = ds["test"].map(format_prompt_response).remove_columns(raw_columns)
# Source "validation" split -> prompt–response pairs.
converted_validation = ds["validation"].map(format_prompt_response).remove_columns(raw_columns)

# Re-label the splits for the combined dataset: the source "test" rows go to
# our "train" split and the source "validation" rows go to our "test" split.
new_data = DatasetDict({"train": converted_test, "test": converted_validation})

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/11 [00:00<?, ? examples/s]

In [110]:
def add_sequential_ids(record, idx):
    """Overwrite the row's ``ID`` with its 1-based position.

    Same definition as in the earlier sections of this notebook.

    Args:
        record: Mutable mapping for one row; modified in place.
        idx: 0-based row index (supplied by ``map(..., with_indices=True)``).

    Returns:
        The same ``record`` object, with ``record["ID"] == idx + 1``.
    """
    one_based_position = idx + 1
    record["ID"] = one_based_position
    return record

# Renumber each split on its own, train first and then test, so IDs start at 1
# in both splits.
for split_name in ("train", "test"):
    new_data[split_name] = new_data[split_name].map(add_sequential_ids, with_indices=True)

print(new_data)

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/11 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['Category', 'ID', 'Prompt', 'Response'],
        num_rows: 100
    })
    test: Dataset({
        features: ['Category', 'ID', 'Prompt', 'Response'],
        num_rows: 11
    })
})


In [111]:
# Load the existing dataset from Hugging Face
existing_ds = load_dataset("johnjehiel/ph-llm-dataset")

# Categories carried by the rows that are about to be appended.
incoming_categories = set(new_data["train"].unique("Category")) | set(new_data["test"].unique("Category"))


def _drop_incoming(split):
    """Return `split` without the rows whose Category is in `incoming_categories`."""
    return split.filter(lambda row: row["Category"] not in incoming_categories)


# Append (concatenate) the new prompt–response pairs to the existing splits.
# Rows of the same categories that an earlier run of this section already
# pushed to the Hub are dropped first, so re-running the section replaces
# them instead of appending a second copy.
updated_train = concatenate_datasets([_drop_incoming(existing_ds["train"]), new_data["train"]])
updated_test = concatenate_datasets([_drop_incoming(existing_ds["test"]), new_data["test"]])

# Create an updated DatasetDict
updated_ds = DatasetDict({
    "train": updated_train,
    "test": updated_test
})

In [112]:
# Local scratch files that receive the merged splits.
train_parquet_path = "updated_train.parquet"
test_parquet_path = "updated_test.parquet"

# Write train first, then test, matching the upload order below.
for split_name, parquet_path in (("train", train_parquet_path), ("test", test_parquet_path)):
    updated_ds[split_name].to_parquet(parquet_path)

repo_id = "johnjehiel/ph-llm-dataset"

# Arguments shared by every Hub call in this cell.
hub_target = dict(repo_id=repo_id, token=HF_TOKEN, repo_type="dataset")
create_repo(exist_ok=True, **hub_target)

# Upload the merged train split under the existing train file name in the repo.
upload_file(
    path_or_fileobj=train_parquet_path,
    path_in_repo="ph-llm-dataset_train.parquet",
    commit_message="Update train split with mmlu medical genetics data",
    **hub_target,
)

# Upload the merged test split under the existing test file name; kept as the
# cell's last expression so its CommitInfo is displayed.
upload_file(
    path_or_fileobj=test_parquet_path,
    path_in_repo="ph-llm-dataset_test.parquet",
    commit_message="Update test split with mmlu medical genetics data",
    **hub_target,
)

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

updated_train.parquet:   0%|          | 0.00/2.49M [00:00<?, ?B/s]

updated_test.parquet:   0%|          | 0.00/286k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/johnjehiel/ph-llm-dataset/commit/4813e37ca07225a1474a75ad8368b9e2db2af0b0', commit_message='Update test split with mmlu medical genetics data', commit_description='', oid='4813e37ca07225a1474a75ad8368b9e2db2af0b0', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/johnjehiel/ph-llm-dataset', endpoint='https://huggingface.co', repo_type='dataset', repo_id='johnjehiel/ph-llm-dataset'), pr_revision=None, pr_num=None)

In [113]:
# Reload the combined dataset from the Hub after the two uploads above.
ds = load_dataset(repo_id)

ph-llm-dataset_train.parquet:   0%|          | 0.00/2.49M [00:00<?, ?B/s]

ph-llm-dataset_test.parquet:   0%|          | 0.00/286k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2646 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/296 [00:00<?, ? examples/s]

In [114]:
# Display the reloaded dataset: split names, column names, and row counts.
ds

DatasetDict({
    train: Dataset({
        features: ['Category', 'ID', 'Prompt', 'Response'],
        num_rows: 2646
    })
    test: Dataset({
        features: ['Category', 'ID', 'Prompt', 'Response'],
        num_rows: 296
    })
})

# Create dataset for MedQA

In [117]:
# Load the MedQA questions from the Hub; unlike the MMLU sets above, its
# "train" and "test" splits are the ones converted in the next cell.
# NOTE: this rebinds `ds`, which held the reloaded combined dataset in the cells above.
ds = load_dataset("openlifescienceai/medqa")

README.md:   0%|          | 0.00/858 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/5.68M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/739k [00:00<?, ?B/s]

dev-00000-of-00001.parquet:   0%|          | 0.00/720k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/10178 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1273 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/1272 [00:00<?, ? examples/s]

In [119]:
# Source columns that are dropped once the Category/ID/Prompt/Response
# columns have been added by format_prompt_response.
raw_columns = ['subject_name', 'data', 'id']

# Source "train" split -> prompt–response pairs.
# NOTE: despite its name, `converted_test` holds the source "train" rows here.
converted_test = ds["train"].map(format_prompt_response).remove_columns(raw_columns)
# Source "test" split -> prompt–response pairs (held in `converted_validation`).
converted_validation = ds["test"].map(format_prompt_response).remove_columns(raw_columns)

# Source "train" rows go to our "train" split; source "test" rows go to our "test" split.
new_data = DatasetDict({"train": converted_test, "test": converted_validation})

Map:   0%|          | 0/10178 [00:00<?, ? examples/s]

In [120]:
def add_sequential_ids(record, idx):
    """Overwrite the record's "ID" with a 1-based running number.

    Args:
        record: Mutable mapping for one row; it is modified in place.
        idx: Row index, as passed by ``map(..., with_indices=True)``.

    Returns:
        The same ``record`` object, with ``record["ID"] == idx + 1``.
    """
    record.update({"ID": idx + 1})
    return record

# Renumber the rows of each split (train first, then test) so that the IDs
# run 1..N within the split.
for split in ("train", "test"):
    new_data[split] = new_data[split].map(add_sequential_ids, with_indices=True)

print(new_data)

Map:   0%|          | 0/10178 [00:00<?, ? examples/s]

Map:   0%|          | 0/1273 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['Category', 'ID', 'Prompt', 'Response'],
        num_rows: 10178
    })
    test: Dataset({
        features: ['Category', 'ID', 'Prompt', 'Response'],
        num_rows: 1273
    })
})


In [124]:
# Stamp every MedQA row in both splits with the category label "medqa".
for split in ("train", "test"):
    new_data[split] = new_data[split].map(lambda _row: {"Category": "medqa"})

Map:   0%|          | 0/10178 [00:00<?, ? examples/s]

Map:   0%|          | 0/1273 [00:00<?, ? examples/s]

In [130]:
# Load the dataset that is currently published on the Hugging Face Hub.
existing_ds = load_dataset("johnjehiel/ph-llm-dataset")

# Categories carried by the rows about to be appended.
incoming_categories = (
    set(new_data["train"].unique("Category"))
    | set(new_data["test"].unique("Category"))
)

# If this cell is re-run after the upload cell has pushed the result, the Hub
# copy already contains these rows and a plain concatenation would append them
# a second time. Dropping rows of the incoming categories first makes the merge
# "replace this category": a no-op on the first run, idempotent afterwards.
existing_train = existing_ds["train"].filter(
    lambda row: row["Category"] not in incoming_categories
)
existing_test = existing_ds["test"].filter(
    lambda row: row["Category"] not in incoming_categories
)

# Append the new prompt–response pairs to the (de-duplicated) existing splits.
updated_train = concatenate_datasets([existing_train, new_data["train"]])
updated_test = concatenate_datasets([existing_test, new_data["test"]])

# Create an updated DatasetDict
updated_ds = DatasetDict({
    "train": updated_train,
    "test": updated_test
})

In [131]:
# Publish the merged splits: write each one to a local Parquet file, then push
# both files to the dataset repo. Each upload_file call carries its own commit
# message (train first, then test); the CommitInfo returned by the last call is
# what the cell displays.
# Local scratch file names for the two splits
train_parquet_path = "updated_train.parquet"
test_parquet_path = "updated_test.parquet"

# Save the splits to Parquet
updated_ds["train"].to_parquet(train_parquet_path)
updated_ds["test"].to_parquet(test_parquet_path)

repo_id = "johnjehiel/ph-llm-dataset"
# exist_ok=True: do not fail when the dataset repo already exists
create_repo(repo_id=repo_id, token=HF_TOKEN, repo_type="dataset", exist_ok=True)

# Upload the updated train split under the file name already used in the repo,
# so the previously published train file is replaced
upload_file(
    path_or_fileobj=train_parquet_path,
    path_in_repo="ph-llm-dataset_train.parquet",  # using the existing train file name
    repo_id=repo_id,
    token=HF_TOKEN,
    repo_type="dataset", 
    commit_message="Update train split with medqa data"
)

# Upload the updated test split under the file name already used in the repo
upload_file(
    path_or_fileobj=test_parquet_path,
    path_in_repo="ph-llm-dataset_test.parquet",  # using the existing test file name
    repo_id=repo_id,
    token=HF_TOKEN,
    repo_type="dataset",
    commit_message="Update test split with medqa data"
)

Creating parquet from Arrow format:   0%|          | 0/13 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

updated_train.parquet:   0%|          | 0.00/7.88M [00:00<?, ?B/s]

updated_test.parquet:   0%|          | 0.00/979k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/johnjehiel/ph-llm-dataset/commit/5cb0b0a90d31ff73d5f730d14e3121750b908f5e', commit_message='Update test split with medqa data', commit_description='', oid='5cb0b0a90d31ff73d5f730d14e3121750b908f5e', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/johnjehiel/ph-llm-dataset', endpoint='https://huggingface.co', repo_type='dataset', repo_id='johnjehiel/ph-llm-dataset'), pr_revision=None, pr_num=None)

In [132]:
# Reload from the Hub to verify that the MedQA rows were published.
ds = load_dataset(repo_id)

ph-llm-dataset_train.parquet:   0%|          | 0.00/7.88M [00:00<?, ?B/s]

ph-llm-dataset_test.parquet:   0%|          | 0.00/979k [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [133]:
ds

DatasetDict({
    train: Dataset({
        features: ['Category', 'ID', 'Prompt', 'Response'],
        num_rows: 12824
    })
    test: Dataset({
        features: ['Category', 'ID', 'Prompt', 'Response'],
        num_rows: 1569
    })
})

# Create dataset for PubMedQA

In [140]:
# Load PubMedQA from the Hub (rebinds `ds`, which previously held the merged dataset).
ds = load_dataset("openlifescienceai/pubmedqa")

In [141]:
# Fold PubMedQA's "test" split into "train"; the "validation" split is used as
# the held-out set further down. The combined split is built from a fresh load
# (served from the local cache) rather than from ds["train"] itself, so that
# re-running this cell does not append the test rows a second time.
pubmedqa_raw = load_dataset("openlifescienceai/pubmedqa")
ds["train"] = concatenate_datasets([pubmedqa_raw["train"], pubmedqa_raw["test"]])
ds

DatasetDict({
    train: Dataset({
        features: ['id', 'data'],
        num_rows: 950
    })
    test: Dataset({
        features: ['id', 'data'],
        num_rows: 500
    })
    validation: Dataset({
        features: ['id', 'data'],
        num_rows: 50
    })
})

In [143]:
# Templates for turning a PubMedQA record into a prompt/response pair.
# The {placeholders} are filled with str.format() by format_prompt_response.
# prompt_template: instruction, context passages, question and options (A)-(C).
prompt_template = """You are a medical expert specialized in bio-medical research. Analyze the given context and answer the following question by selecting the correct or best option.

### Context:
{context}

### Question:
{question}

### Options:
(A) {option_A}
(B) {option_B}
(C) {option_C}
"""
# response_template: correct option letter and text, then the long-form answer
# as the explanation.
response_template = """Answer: ({correct_option}) {correct_answer}
Explanation: {long_answer}"""

In [144]:
def format_prompt_response(record):
    """Turn one PubMedQA record into a prompt/response row.

    Reads the nested ``record["data"]`` mapping (keys "Context", "Question",
    "Options", "Correct Answer", "Correct Option", "Long Answer") and fills
    the module-level ``prompt_template`` and ``response_template``.

    Args:
        record: Mapping with an "id" entry and a "data" mapping.

    Returns:
        dict with "Category" (always "pubmedqa"), "ID" (the source "id"),
        "Prompt" and "Response".
    """
    data = record["data"]

    # The context is joined line by line, so it has to be a sequence of
    # passages (presumably a list of strings - confirm against the dataset
    # card). A bare string would be joined character by character and an
    # explicit None would raise, so normalise both cases first.
    context = data.get("Context") or []
    if isinstance(context, str):
        context = [context]

    # `or {}` also covers an "Options" key that is present but None.
    options = data.get("Options") or {}

    prompt = prompt_template.format(
        context="\n".join(context),
        question=data.get("Question", ""),
        option_A=options.get("A", ""),
        option_B=options.get("B", ""),
        option_C=options.get("C", ""),
    )
    response = response_template.format(
        correct_option=data.get("Correct Option", ""),
        correct_answer=data.get("Correct Answer", ""),
        long_answer=data.get("Long Answer", ""),
    )
    return {
        "Category": "pubmedqa",
        "ID": record["id"],
        "Prompt": prompt,
        "Response": response,
    }

In [146]:
# PubMedQA "train" (which by now also contains the original test rows)
# -> prompt/response rows, with the raw source columns dropped.
# NOTE: despite its name, `converted_test` holds the converted *train* split.
converted_test = (
    ds["train"]
    .map(format_prompt_response)
    .remove_columns(["data", "id"])
)

# PubMedQA "validation" -> prompt/response rows.
converted_validation = (
    ds["validation"]
    .map(format_prompt_response)
    .remove_columns(["data", "id"])
)

# Bundle both under the split names used by the target dataset:
# converted train rows go to "train", converted validation rows go to "test".
new_data = DatasetDict({"train": converted_test, "test": converted_validation})

Map:   0%|          | 0/950 [00:00<?, ? examples/s]

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

In [147]:
def add_sequential_ids(record, idx):
    """Replace the record's "ID" with ``idx + 1`` and hand the record back.

    The record is modified in place; ``idx`` is the row index that
    ``map(..., with_indices=True)`` passes as the second argument.
    """
    record.update({"ID": idx + 1})
    return record

# Renumber the rows of each split (train first, then test) so that the IDs
# run 1..N within the split.
for split in ("train", "test"):
    new_data[split] = new_data[split].map(add_sequential_ids, with_indices=True)

print(new_data)

Map:   0%|          | 0/950 [00:00<?, ? examples/s]

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['Category', 'ID', 'Prompt', 'Response'],
        num_rows: 950
    })
    test: Dataset({
        features: ['Category', 'ID', 'Prompt', 'Response'],
        num_rows: 50
    })
})


In [153]:
# Load the dataset that is currently published on the Hugging Face Hub.
existing_ds = load_dataset("johnjehiel/ph-llm-dataset")

# Categories carried by the rows about to be appended.
incoming_categories = (
    set(new_data["train"].unique("Category"))
    | set(new_data["test"].unique("Category"))
)

# If this cell is re-run after the upload cell has pushed the result, the Hub
# copy already contains these rows and a plain concatenation would append them
# a second time. Dropping rows of the incoming categories first makes the merge
# "replace this category": a no-op on the first run, idempotent afterwards.
existing_train = existing_ds["train"].filter(
    lambda row: row["Category"] not in incoming_categories
)
existing_test = existing_ds["test"].filter(
    lambda row: row["Category"] not in incoming_categories
)

# Append the new prompt–response pairs to the (de-duplicated) existing splits.
updated_train = concatenate_datasets([existing_train, new_data["train"]])
updated_test = concatenate_datasets([existing_test, new_data["test"]])

# Create an updated DatasetDict
updated_ds = DatasetDict({
    "train": updated_train,
    "test": updated_test
})

In [154]:
# Publish the merged splits: write each one to a local Parquet file, then push
# both files to the dataset repo. Each upload_file call carries its own commit
# message (train first, then test); the CommitInfo returned by the last call is
# what the cell displays.
# Local scratch file names for the two splits
train_parquet_path = "updated_train.parquet"
test_parquet_path = "updated_test.parquet"

# Save the splits to Parquet
updated_ds["train"].to_parquet(train_parquet_path)
updated_ds["test"].to_parquet(test_parquet_path)

repo_id = "johnjehiel/ph-llm-dataset"
# exist_ok=True: do not fail when the dataset repo already exists
create_repo(repo_id=repo_id, token=HF_TOKEN, repo_type="dataset", exist_ok=True)

# Upload the updated train split under the file name already used in the repo,
# so the previously published train file is replaced
upload_file(
    path_or_fileobj=train_parquet_path,
    path_in_repo="ph-llm-dataset_train.parquet",  # using the existing train file name
    repo_id=repo_id,
    token=HF_TOKEN,
    repo_type="dataset", 
    commit_message="Update train split with pubmedqa data"
)

# Upload the updated test split under the file name already used in the repo
upload_file(
    path_or_fileobj=test_parquet_path,
    path_in_repo="ph-llm-dataset_test.parquet",  # using the existing test file name
    repo_id=repo_id,
    token=HF_TOKEN,
    repo_type="dataset",
    commit_message="Update test split with pubmedqa data"
)

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

updated_train.parquet:   0%|          | 0.00/8.83M [00:00<?, ?B/s]

updated_test.parquet:   0%|          | 0.00/1.03M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/johnjehiel/ph-llm-dataset/commit/b75388cb7e69b011c9fae2730d3776d0a070cad8', commit_message='Update test split with pubmedqa data', commit_description='', oid='b75388cb7e69b011c9fae2730d3776d0a070cad8', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/johnjehiel/ph-llm-dataset', endpoint='https://huggingface.co', repo_type='dataset', repo_id='johnjehiel/ph-llm-dataset'), pr_revision=None, pr_num=None)

In [155]:
# Reload from the Hub to verify that the PubMedQA rows were published.
ds = load_dataset(repo_id)

ph-llm-dataset_train.parquet:   0%|          | 0.00/8.83M [00:00<?, ?B/s]

ph-llm-dataset_test.parquet:   0%|          | 0.00/1.03M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/13774 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1619 [00:00<?, ? examples/s]

In [156]:
ds

DatasetDict({
    train: Dataset({
        features: ['Category', 'ID', 'Prompt', 'Response'],
        num_rows: 13774
    })
    test: Dataset({
        features: ['Category', 'ID', 'Prompt', 'Response'],
        num_rows: 1619
    })
})

# Create dataset for MedMCQA

In [197]:
# Load MedMCQA from the Hub (rebinds `ds` once more); it is narrowed down to
# keyword-matching questions in the cells below.
ds = load_dataset("openlifescienceai/medmcqa")

In [199]:
# Substrings used to pick health- and fitness-related questions out of MedMCQA.
# contains_keyword() does a plain substring search on the lower-cased question,
# so every entry is lower-case and the padding spaces in " rem " and " train"
# are significant.
keywords = [
    "sleep", "fitness", "stress", "heart", "health", "health care", "personal",
    "medical", "cardio", "medicine", "exercise", "smoking", "smoker", "alcohol",
    "alcoholic", "bmi", "blood pressure", "steps", "step", "run", "jog", " rem ",
    "circadian", "homeostatic", "injury", " train", "etiology", "etiological",
    "recommend", "advice", "advise", "assistance", "assist", "workout",
    "work out", "lifestyle", "z-score", "z score", "athelete", "athlete",
    "sport", "respiratory", "fatigue", "pressure", "recover", "hydrate", "faint",
    "drowsy", "drowsiness", "gym", "muscle", "sore", "wake", "rest", "relax",
    "insomnia", "physic", "care", "calorie", "fat loss", "weight", "height",
    "mobility", "activity", "active",
]

In [200]:
len(keywords)

65

In [201]:
# Function to check if any keyword is present in a record
def contains_keyword(example):
    """Tell whether a record's question mentions any entry of ``keywords``.

    Matching is a case-insensitive substring search, so a keyword also matches
    inside longer words.

    Args:
        example: Mapping with a "question" entry (a string; a None question is
            treated as empty and never matches).

    Returns:
        True if at least one keyword occurs in the question, otherwise False.
    """
    # Lower-case the question once rather than once per keyword.
    question = (example["question"] or "").lower()
    return any(keyword in question for keyword in keywords)

# Keep only the keyword-matching records. DatasetDict.filter applies the
# predicate to every split and returns the filtered splits under the same names.
ds = ds.filter(contains_keyword)

print(ds)

Filter:   0%|          | 0/182822 [00:00<?, ? examples/s]

Filter:   0%|          | 0/6150 [00:00<?, ? examples/s]

Filter:   0%|          | 0/4183 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'question', 'opa', 'opb', 'opc', 'opd', 'cop', 'choice_type', 'exp', 'subject_name', 'topic_name'],
        num_rows: 19180
    })
    test: Dataset({
        features: ['id', 'question', 'opa', 'opb', 'opc', 'opd', 'cop', 'choice_type', 'exp', 'subject_name', 'topic_name'],
        num_rows: 501
    })
    validation: Dataset({
        features: ['id', 'question', 'opa', 'opb', 'opc', 'opd', 'cop', 'choice_type', 'exp', 'subject_name', 'topic_name'],
        num_rows: 498
    })
})


In [202]:
# Templates for turning a MedMCQA record into a prompt/response pair.
# The {placeholders} are filled with str.format() by format_prompt_response.
# prompt_template: instruction naming the subject, the question and options (A)-(D).
prompt_template = """You are a medical expert specialized in {subject_name}. Answer the following question by selecting the correct or best option.

### Question:
{question}

### Options:
(A) {opa}
(B) {opb}
(C) {opc}
(D) {opd}
"""
# response_template: correct option letter and text on the first line, {exp} on
# the second. format_prompt_response passes either "Explanation: ..." or an
# empty string for {exp}; in the latter case the response ends with the newline
# that follows the answer line.
response_template = """Answer: ({correct_option}) {correct_answer}
{exp}"""

In [203]:
def format_prompt_response(record):
    """Turn one MedMCQA record into a prompt/response row.

    Fills the module-level ``prompt_template`` with the subject, question and
    the four options, and ``response_template`` with the correct option and
    the optional explanation.

    Args:
        record: Mapping with "id", "question", "opa".."opd", "cop" (index of
            the correct option, 0-3), "exp" and "subject_name".

    Returns:
        dict with "Category" (always "MedMCQA"), "ID" (the source "id"),
        "Prompt" and "Response".

    Raises:
        KeyError: if ``record["cop"]`` is not one of 0, 1, 2, 3.
    """
    prompt = prompt_template.format(
        subject_name=record["subject_name"],
        question=record["question"],
        opa=record["opa"],
        opb=record["opb"],
        opc=record["opc"],
        opd=record["opd"],
    )

    # A dict rather than a list on purpose: an index outside 0-3 (for example
    # -1) has to fail with KeyError instead of wrapping around to option D.
    choices = {
        0: ("A", record["opa"]),
        1: ("B", record["opb"]),
        2: ("C", record["opc"]),
        3: ("D", record["opd"]),
    }
    answer_letter, answer_text = choices[record["cop"]]

    # The explanation line is left empty when the record has no explanation.
    explanation = f"Explanation: {record['exp']}" if record["exp"] else ""

    response = response_template.format(
        correct_option=answer_letter,
        correct_answer=answer_text,
        exp=explanation,
    )
    return {
        "Category": "MedMCQA",
        "ID": record["id"],
        "Prompt": prompt,
        "Response": response,
    }

In [204]:
# Filtered MedMCQA "train" -> prompt/response rows; all raw source columns
# are dropped once the four target columns have been produced.
converted_train = (
    ds["train"]
    .map(format_prompt_response)
    .remove_columns([
        "id", "question", "opa", "opb", "opc", "opd",
        "cop", "choice_type", "exp", "subject_name", "topic_name",
    ])
)

# Filtered MedMCQA "validation" -> prompt/response rows.
converted_validation = (
    ds["validation"]
    .map(format_prompt_response)
    .remove_columns([
        "id", "question", "opa", "opb", "opc", "opd",
        "cop", "choice_type", "exp", "subject_name", "topic_name",
    ])
)

# Bundle both under the split names used by the target dataset:
# converted train rows go to "train", converted validation rows go to "test".
new_data = DatasetDict({"train": converted_train, "test": converted_validation})

Map:   0%|          | 0/19180 [00:00<?, ? examples/s]

Map:   0%|          | 0/498 [00:00<?, ? examples/s]

In [205]:
def add_sequential_ids(record, idx):
    """Set ``record["ID"]`` to ``idx + 1`` in place and return the record.

    ``idx`` is the row index that ``map(..., with_indices=True)`` passes as
    the second argument.
    """
    record.update({"ID": idx + 1})
    return record

# Renumber the rows of each split (train first, then test) so that the IDs
# run 1..N within the split.
for split in ("train", "test"):
    new_data[split] = new_data[split].map(add_sequential_ids, with_indices=True)

print(new_data)

Map:   0%|          | 0/19180 [00:00<?, ? examples/s]

Map:   0%|          | 0/498 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['Category', 'ID', 'Prompt', 'Response'],
        num_rows: 19180
    })
    test: Dataset({
        features: ['Category', 'ID', 'Prompt', 'Response'],
        num_rows: 498
    })
})


In [214]:
# Load the dataset that is currently published on the Hugging Face Hub.
existing_ds = load_dataset("johnjehiel/ph-llm-dataset")

# Categories carried by the rows about to be appended.
incoming_categories = (
    set(new_data["train"].unique("Category"))
    | set(new_data["test"].unique("Category"))
)

# If this cell is re-run after the upload cell has pushed the result, the Hub
# copy already contains these rows and a plain concatenation would append them
# a second time. Dropping rows of the incoming categories first makes the merge
# "replace this category": a no-op on the first run, idempotent afterwards.
existing_train = existing_ds["train"].filter(
    lambda row: row["Category"] not in incoming_categories
)
existing_test = existing_ds["test"].filter(
    lambda row: row["Category"] not in incoming_categories
)

# Append the new prompt–response pairs to the (de-duplicated) existing splits.
updated_train = concatenate_datasets([existing_train, new_data["train"]])
updated_test = concatenate_datasets([existing_test, new_data["test"]])

# Create an updated DatasetDict
updated_ds = DatasetDict({
    "train": updated_train,
    "test": updated_test
})

In [215]:
# Publish the merged splits: write each one to a local Parquet file, then push
# both files to the dataset repo. Each upload_file call carries its own commit
# message (train first, then test); the CommitInfo returned by the last call is
# what the cell displays.
# Local scratch file names for the two splits
train_parquet_path = "updated_train.parquet"
test_parquet_path = "updated_test.parquet"

# Save the splits to Parquet
updated_ds["train"].to_parquet(train_parquet_path)
updated_ds["test"].to_parquet(test_parquet_path)

repo_id = "johnjehiel/ph-llm-dataset"
# exist_ok=True: do not fail when the dataset repo already exists
create_repo(repo_id=repo_id, token=HF_TOKEN, repo_type="dataset", exist_ok=True)

# Upload the updated train split under the file name already used in the repo,
# so the previously published train file is replaced
upload_file(
    path_or_fileobj=train_parquet_path,
    path_in_repo="ph-llm-dataset_train.parquet",  # using the existing train file name
    repo_id=repo_id,
    token=HF_TOKEN,
    repo_type="dataset", 
    commit_message="Update train split with medmcqa data"
)

# Upload the updated test split under the file name already used in the repo
upload_file(
    path_or_fileobj=test_parquet_path,
    path_in_repo="ph-llm-dataset_test.parquet",  # using the existing test file name
    repo_id=repo_id,
    token=HF_TOKEN,
    repo_type="dataset",
    commit_message="Update test split with medmcqa data"
)

Creating parquet from Arrow format:   0%|          | 0/33 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

updated_train.parquet:   0%|          | 0.00/19.2M [00:00<?, ?B/s]

updated_test.parquet:   0%|          | 0.00/1.24M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/johnjehiel/ph-llm-dataset/commit/37cc676336376445e3ccb1f1a8717abbb0fff5b3', commit_message='Update test split with medmcqa data', commit_description='', oid='37cc676336376445e3ccb1f1a8717abbb0fff5b3', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/johnjehiel/ph-llm-dataset', endpoint='https://huggingface.co', repo_type='dataset', repo_id='johnjehiel/ph-llm-dataset'), pr_revision=None, pr_num=None)

In [216]:
# Reload from the Hub to verify that the MedMCQA rows were published.
ds = load_dataset(repo_id)

ph-llm-dataset_train.parquet:   0%|          | 0.00/19.2M [00:00<?, ?B/s]

ph-llm-dataset_test.parquet:   0%|          | 0.00/1.24M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [217]:
ds

DatasetDict({
    train: Dataset({
        features: ['Category', 'ID', 'Prompt', 'Response'],
        num_rows: 32954
    })
    test: Dataset({
        features: ['Category', 'ID', 'Prompt', 'Response'],
        num_rows: 2117
    })
})

# Create dataset for Patient-Doctor interaction

In [218]:
# Load the patient-doctor dialogue dataset from the Hub (rebinds `ds`);
# as the output below shows, it ships a single "train" split.
ds = load_dataset("ruslanmv/ai-medical-chatbot")

README.md:   0%|          | 0.00/863 [00:00<?, ?B/s]

dialogues.parquet:   0%|          | 0.00/142M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/256916 [00:00<?, ? examples/s]

In [219]:
ds

DatasetDict({
    train: Dataset({
        features: ['Description', 'Patient', 'Doctor'],
        num_rows: 256916
    })
})

In [220]:
ds["train"][0]

{'Description': 'Q. What does abutment of the nerve root mean?',
 'Patient': 'Hi doctor,I am just wondering what is abutting and abutment of the nerve root means in a back issue. Please explain. What treatment is required for\xa0annular bulging and tear?',
 'Doctor': 'Hi. I have gone through your query with diligence and would like you to know that I am here to help you. For further information consult a neurologist online -->'}