In [1]:
from transformers import AutoTokenizer
from datasets import DatasetDict

from data.q_and_a.train_and_eval import TrainAndEval
from data.q_and_a.eval_with_answers import EvalWithAnswers
from data.q_and_a.target_with_explaination import TargetWithExplanation
from data.q_and_a.target_with_explaination_tokenized import TargetWithExplanationTokenized, to_transformers_dataset, remove_padding_labels

In [2]:
# Tokenizer for the "meta-llama/Llama-3.2-1B" checkpoint; its EOS token is
# reused as the padding token.
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B")
tokenizer.pad_token = tokenizer.eos_token

# Evaluation split, wrapped stage by stage:
# TrainAndEval (JSON file) -> EvalWithAnswers -> TargetWithExplanation -> tokenized.
eval_dataset = TrainAndEval("../../data/pubmed_QA_eval.json")
eval_with_answers = EvalWithAnswers(eval_dataset)
eval_explanation = TargetWithExplanation(eval_with_answers)
eval_tokenized = TargetWithExplanationTokenized(tokenizer, eval_explanation)

# Training split goes through exactly the same chain of wrappers.
# NOTE(review): EvalWithAnswers is applied to the training data as well despite
# its name -- presumably it is split-agnostic; confirm against its definition.
train_dataset = TrainAndEval("../../data/pubmed_QA_train.json")
train_with_answers = EvalWithAnswers(train_dataset)
train_explanation = TargetWithExplanation(train_with_answers)
train_tokenized = TargetWithExplanationTokenized(tokenizer, train_explanation)

In [3]:
# Map each split name to its tokenized source, then convert every source with
# to_transformers_dataset and bundle the results in a single DatasetDict.
split_sources = {"train": train_tokenized, "eval": eval_tokenized}
transformers_dataset = DatasetDict(
    {
        split_name: to_transformers_dataset(tokenized_split)
        for split_name, tokenized_split in split_sources.items()
    }
)

In [4]:
# Show the DatasetDict repr to check the split names, feature columns and row counts.
transformers_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 16890
    })
    eval: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 5000
    })
})

In [5]:
# Number of rows per split, displayed as a (train, eval) tuple.
tuple(len(transformers_dataset[split_name]) for split_name in ("train", "eval"))

(16890, 5000)

In [6]:
import os
from getpass import getpass

from huggingface_hub import login

# SECURITY: an access token used to be hard-coded in this cell. Anything that
# was committed or shared with the notebook must be treated as leaked -- revoke
# that token in the Hugging Face account settings and issue a new one.
#
# The token is now taken from the HF_TOKEN environment variable so that
# "Restart & Run All" works unattended; when the variable is unset or empty the
# user is prompted without the value being echoed or stored in the notebook.
hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face access token: ")
login(token=hf_token)

In [7]:
# Upload every split of the DatasetDict to the Hugging Face Hub dataset repo.
# The call is kept as the last expression so its return value is displayed.
hub_repo_id = "lgmc/pubmed_q_and_a"
transformers_dataset.push_to_hub(hub_repo_id)

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/17 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/431 [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/lgmc/pubmed_q_and_a/commit/aba96148a34ae76dbc9b03511e153427bd3f83a5', commit_message='Upload dataset', commit_description='', oid='aba96148a34ae76dbc9b03511e153427bd3f83a5', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/lgmc/pubmed_q_and_a', endpoint='https://huggingface.co', repo_type='dataset', repo_id='lgmc/pubmed_q_and_a'), pr_revision=None, pr_num=None)

In [8]:
# Token count of the first training example's input_ids.
first_train_example = transformers_dataset["train"][0]
len(first_train_example["input_ids"])

1200

In [9]:
import torch

# Turn the first training example's token ids back into text to inspect the
# prompt that was fed to the tokenizer.
first_input_ids = torch.tensor(transformers_dataset["train"][0]["input_ids"])
tokenizer.decode(first_input_ids)

'<|begin_of_text|>You are an expert in multiple-choice questions. Your task is to select the best answer from the given options based on the provided context.\nContext: The rate of action of calcium on the electrical and mechanical responses of the crayfish muscle fibers. The effects of sudden changes in external Ca concentration on the time courses of the changes in size of the action potential and of the associated contraction in a single crayfish muscle fiber were investigated. Procaine-HCl was added to the bathing solution to make the muscle fiber excitable. The concentration of the divalent cations (Ca and Mg) was high enough to keep the threshold potential constant. In Ca-free solution, neither action potential nor contraction was observed. When the external Ca concentration was suddenly increased from 0 to 14 mM, the full sized action potentials were generated within several seconds, but the tensions recovered slowly in an exponential time course with the time constants of 15-40

In [11]:
# Inspect the label ids of the first training example.
# NOTE(review): this renders the complete label list in the cell output;
# consider slicing it before sharing the notebook.
# NOTE(review): the labels appear to start with the prompt tokens, so the prompt
# does not look masked out of the loss -- confirm this is intended.
first_train_example = transformers_dataset["train"][0]
first_train_example["labels"]

[128000,
 2675,
 527,
 459,
 6335,
 304,
 5361,
 63726,
 4860,
 13,
 4718,
 3465,
 374,
 311,
 3373,
 279,
 1888,
 4320,
 505,
 279,
 2728,
 2671,
 3196,
 389,
 279,
 3984,
 2317,
 627,
 2014,
 25,
 578,
 4478,
 315,
 1957,
 315,
 35719,
 389,
 279,
 20314,
 323,
 22936,
 14847,
 315,
 279,
 90016,
 18668,
 16124,
 49774,
 13,
 578,
 6372,
 315,
 11210,
 4442,
 304,
 9434,
 14751,
 20545,
 389,
 279,
 892,
 14307,
 315,
 279,
 4442,
 304,
 1404,
 315,
 279,
 1957,
 4754,
 323,
 315,
 279,
 5938,
 71895,
 304,
 264,
 3254,
 90016,
 18668,
 16124,
 24722,
 1051,
 27313,
 13,
 42187,
 8511,
 11529,
 5176,
 574,
 3779,
 311,
 279,
 73509,
 6425,
 311,
 1304,
 279,
 16124,
 24722,
 3521,
 15729,
 13,
 578,
 20545,
 315,
 279,
 3512,
 69261,
 272,
 811,
 320,
 23389,
 323,
 73693,
 8,
 574,
 1579,
 3403,
 311,
 2567,
 279,
 12447,
 4754,
 6926,
 13,
 763,
 14751,
 12862,
 6425,
 11,
 14188,
 1957,
 4754,
 6463,
 71895,
 574,
 13468,
 13,
 3277,
 279,
 9434,
 14751,
 20545,
 574,
 15187,
 731