In [1]:
from datasets import load_dataset
from transformers import GPT2LMHeadModel, GPT2Tokenizer, TextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments
import torch


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Fetch the 'processed.en' configuration of the 'medical_dialog' dataset.
dataset = load_dataset(path='medical_dialog', name='processed.en')

# The tokenizer and the language-model weights are taken from the same
# pretrained checkpoint id.
checkpoint = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(checkpoint)
model = GPT2LMHeadModel.from_pretrained(checkpoint)

In [3]:
# Export every split to a plain-text file, one dialogue per line, formatted as
#   "<first utterance> <|endoftext|> <second utterance>".
#
# The files are opened explicitly as UTF-8. Without an explicit encoding,
# open() falls back to the platform default (e.g. cp1252 on Windows), which can
# raise UnicodeEncodeError on non-ASCII characters and would not match the
# UTF-8 decoding used when the files are read back for tokenization.
split_to_file = {
    "train": "train.txt",
    "validation": "val.txt",
    "test": "test.txt",
}

for split_name, file_name in split_to_file.items():
    with open(file_name, "w", encoding="utf-8") as f:
        for entry in dataset[split_name]:
            # Only the first two utterances of each dialogue are used.
            first_utterance = entry['utterances'][0]
            second_utterance = entry['utterances'][1]
            f.write(first_utterance + " <|endoftext|> " + second_utterance + "\n")

In [4]:
# Build one TextDataset per exported text file, all sharing the same tokenizer
# and block_size=256. The generator is consumed in order, so the three names
# are bound to the train, validation and test files respectively.
train_dataset, val_dataset, test_dataset = (
    TextDataset(tokenizer=tokenizer, file_path=text_file, block_size=256)
    for text_file in ("train.txt", "val.txt", "test.txt")
)



In [48]:
# mlm=False turns off masked-language-model masking in the collator, which is
# the setting used for causal models such as GPT-2.
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

In [49]:
# Hyperparameters for the fine-tuning run, collected in one mapping so they
# can be inspected before being handed to TrainingArguments.
training_config = {
    "output_dir": "./output",
    "overwrite_output_dir": True,
    "num_train_epochs": 1,
    "per_device_train_batch_size": 32,
    "save_steps": 10_000,
    "save_total_limit": 2,
}
training_args = TrainingArguments(**training_config)

In [50]:
# Wire the model, its training arguments, the batching logic and the
# train/validation datasets into a single Trainer instance.
trainer = Trainer(
    args=training_args,
    model=model,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
)

In [51]:
# Run the fine-tuning loop with the Trainer built in the previous cell. As the
# cell's last expression, the returned TrainOutput (global step, training loss
# and runtime metrics) is displayed as the cell output.
trainer.train()


100%|██████████| 9/9 [01:59<00:00, 13.26s/it]

{'train_runtime': 119.2985, 'train_samples_per_second': 2.179, 'train_steps_per_second': 0.075, 'train_loss': 4.29523425632053, 'epoch': 1.0}





TrainOutput(global_step=9, training_loss=4.29523425632053, metrics={'train_runtime': 119.2985, 'train_samples_per_second': 2.179, 'train_steps_per_second': 0.075, 'train_loss': 4.29523425632053, 'epoch': 1.0})

In [52]:
# Persist the fine-tuned weights and the tokenizer files into the same
# directory so both can later be reloaded from that single path.
save_dir = "./model"
model.save_pretrained(save_dir)

# Kept as the last expression so the tuple of written tokenizer files is shown.
tokenizer.save_pretrained(save_dir)

('./model\\tokenizer_config.json',
 './model\\special_tokens_map.json',
 './model\\vocab.json',
 './model\\merges.txt',
 './model\\added_tokens.json')

In [8]:
from transformers import AutoModelForCausalLM, AutoTokenizer


def generate_response(lm, tok, query, max_length=64, no_repeat_ngram_size=2):
    """Generate a continuation of ``query`` with ``lm`` and return it as text.

    Args:
        lm: Causal language model exposing ``generate``.
        tok: Tokenizer matching ``lm``.
        query: Prompt text to continue.
        max_length: Maximum total sequence length passed to ``generate``
            (prompt tokens included).
        no_repeat_ngram_size: Size of n-grams that may not repeat in the output.

    Returns:
        The decoded first generated sequence with special tokens removed.
        The decoded text starts with the prompt itself, because the whole
        output sequence is decoded.
    """
    # Calling the tokenizer yields both input_ids and an integer attention
    # mask, instead of hand-building a float mask with torch.ones(shape).
    encoded = tok(query, return_tensors='pt')

    output_ids = lm.generate(
        input_ids=encoded["input_ids"],
        attention_mask=encoded["attention_mask"],
        # GPT-2 has no dedicated pad token; pass EOS explicitly (the value
        # generate() would otherwise fall back to, with a warning on each call).
        pad_token_id=tok.eos_token_id,
        no_repeat_ngram_size=no_repeat_ngram_size,
        max_length=max_length,
    )
    return tok.decode(output_ids[0], skip_special_tokens=True)


# Load trained model and tokenizer
model = AutoModelForCausalLM.from_pretrained("./model")
tokenizer = AutoTokenizer.from_pretrained("./model", padding_side='left')

# Load base GPT-2 model and tokenizer
base_model = AutoModelForCausalLM.from_pretrained("gpt2")
base_tokenizer = AutoTokenizer.from_pretrained("gpt2", padding_side='left')

# Prepare a test query. It ends with the EOS token, mirroring the separator
# placed between the two utterances in the training text files.
# NOTE(review): because the prompt ends with EOS and EOS doubles as the pad id,
# generate() presumably still reports "right-padding was detected" even though
# nothing is padded here - confirm before treating that warning as a bug.
test_query = f"patient: Can headaches be caused by stress? {tokenizer.eos_token}"

# Fine-tuned model
response = generate_response(model, tokenizer, test_query)
print(f"Fine-tuned Model Query: {test_query}")
print(f"Fine-tuned Model Response: {response}")

# Base GPT-2 model
response_base = generate_response(base_model, base_tokenizer, test_query)
print(f"Base Model Query: {test_query}")
print(f"Base Model Response: {response_base}")


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Fine-tuned Model Query: patient: Can headaches be caused by stress? <|endoftext|>
Fine-tuned Model Response: patient: Can headaches be caused by stress? The following is a list of the most common symptoms of migraine.

Symptoms of migraines:
...
. are usually mild, but can be severe. 
The most commonly reported symptom is headache. The most frequently reported symptoms are
Base Model Query: patient: Can headaches be caused by stress? <|endoftext|>
Base Model Response: patient: Can headaches be caused by stress? The first time I saw the new "The Walking Dead" trailer, I was so excited. I had never seen a zombie movie before, and I knew that I wanted to see a new one.

I was excited to get my hands on the first trailer
