In [None]:
# Load the pretrained tokenizer and causal language model for one checkpoint id
from transformers import AutoTokenizer, AutoModelForCausalLM

# Single source of truth so tokenizer and model always come from the same checkpoint
MODEL_ID = "distilbert/distilgpt2"

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForCausalLM.from_pretrained(MODEL_ID)

In [None]:
# Register dedicated BOS/EOS/PAD tokens and keep the model in sync with them
BOS_TOKEN = "<bos>"
EOS_TOKEN = "<eos>"
PAD_TOKEN = "<pad>"

special_tokens = {"bos_token": BOS_TOKEN, "eos_token": EOS_TOKEN, "pad_token": PAD_TOKEN}
num_added_tokens = tokenizer.add_special_tokens(special_tokens)

# Adding tokens to the tokenizer does not update the model: its config and
# generation config keep the old BOS/EOS ids and have no PAD id, so generation
# would neither stop on the new EOS token nor know how to pad. Point both
# configs at the ids the tokenizer now reports.
for _cfg in (model.config, model.generation_config):
    _cfg.bos_token_id = tokenizer.bos_token_id
    _cfg.eos_token_id = tokenizer.eos_token_id
    _cfg.pad_token_id = tokenizer.pad_token_id

# Grow the embedding matrix to cover the new token ids. Kept as the last
# expression so the cell still displays the resized embedding layer.
model.resize_token_embeddings(len(tokenizer))

Embedding(50260, 768)

In [None]:
# Sanity check: show the pad token the tokenizer currently reports
tokenizer.pad_token

'<pad>'

In [None]:
# Sanity check: list every special token the tokenizer currently knows about
tokenizer.all_special_tokens

['<bos>', '<eos>', '<|endoftext|>', '<pad>']

In [None]:
# Inspect the model's generation defaults (the BOS/EOS token ids used by generate)
model.generation_config

GenerationConfig {
  "bos_token_id": 50256,
  "eos_token_id": 50256
}

In [None]:
# make the dataset
import torch  # imported here because this cell runs before the data-loading cell


class MedicalDataset(torch.utils.data.Dataset):
    """Causal-LM fine-tuning dataset over instruction/input/output records.

    Every record is rendered as one sequence::

        bos + instruction + input + eos + output + eos

    (plain string concatenation, no separators added). The prompt part
    ``bos + instruction + input + eos`` is the same string the inference
    helper feeds to the model, so training and generation share a format.

    Args:
        dataset: indexable collection of dicts with the string keys
            ``"instruction"``, ``"input"`` and ``"output"``.
        tokenizer: callable tokenizer exposing ``bos_token`` and ``eos_token``
            and returning an encoding with ``input_ids`` and ``attention_mask``
            tensors of shape ``(1, seq_len)`` when called with
            ``return_tensors="pt"``.
        max_length: length every example is padded/truncated to. ``None``
            (default) leaves the choice to the tokenizer.
    """

    def __init__(self, dataset, tokenizer, max_length=None):
        self.dataset = dataset
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of records."""
        return len(self.dataset)

    def __getitem__(self, idx):
        """Tokenize record ``idx`` into ``input_ids``, ``attention_mask`` and ``labels``.

        ``labels`` is a copy of ``input_ids`` with padding positions set to
        -100. A causal LM shifts the labels internally, so they must be the same
        sequence as the inputs; -100 is the index the loss ignores, which keeps
        padding from contributing to it.
        """
        record = self.dataset[idx]
        bos = self.tokenizer.bos_token
        eos = self.tokenizer.eos_token

        prompt = bos + record["instruction"] + record["input"] + eos
        text = prompt + record["output"] + eos

        # NOTE: truncation cuts from the right, so an over-long record loses
        # the tail of its answer (including the closing eos).
        enc = self.tokenizer(
            text,
            return_tensors="pt",
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
        )

        # squeeze(0) drops only the batch axis added by return_tensors="pt"
        input_ids = enc.input_ids.squeeze(0)
        attention_mask = enc.attention_mask.squeeze(0)

        labels = input_ids.clone()
        labels[attention_mask == 0] = -100

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': labels,
        }

In [None]:
# Mount Google Drive into the Colab filesystem so its files can be read by path
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# json dataset file path
# Points inside the Google Drive mount, so the drive must already be mounted
# at /content/drive before the file is opened.
dataset_path = '/content/drive/MyDrive/GenMedGPT-5k.json'

In [None]:
# load the dataset
import json
import torch

from sklearn.model_selection import train_test_split
from torch.utils.data import  DataLoader

# the format of the dataset is
# [{"instruction": "instruction to the chat bot", "input": "input text", "output": "output text"}]
# JSON text is UTF-8, so decode it explicitly instead of relying on the
# platform's default encoding.
with open(dataset_path, 'r', encoding='utf-8') as f:
    dataset = json.load(f)

# split the dataset: 80% train / 20% eval, fixed seed so the split is reproducible
train_dataset, eval_dataset = train_test_split(dataset, test_size=0.2, random_state=1341)
assert len(train_dataset) + len(eval_dataset) == len(dataset), "split lost or duplicated records"

# create training and evaluation dataset
train_med_dataset = MedicalDataset(train_dataset, tokenizer)
eval_med_dataset = MedicalDataset(eval_dataset, tokenizer)

# Optional manual batching (not needed when training through Trainer):
# dataloader = DataLoader(train_med_dataset, batch_size=16, shuffle=True)

In [None]:
# training argument and trainer
from transformers import TrainingArguments, Trainer, DataCollatorWithPadding

training_argument = TrainingArguments(
    output_dir="./",                 # checkpoints are written as ./checkpoint-<step>
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,   # effective batch size per device: 2 * 4 = 8
    save_steps=1000,                 # write a checkpoint every 1000 optimizer steps
    save_total_limit=3               # keep at most 3 checkpoints
)

# Train on the training split built earlier in the notebook (the previous
# `med_dataset` name is not defined anywhere) and register the held-out
# split so `trainer.evaluate()` can be called afterwards.
trainer = Trainer(
    model=model,
    args=training_argument,
    train_dataset=train_med_dataset,
    eval_dataset=eval_med_dataset,
)

trainer.train()

Step,Training Loss
500,0.5011
1000,0.3649


TrainOutput(global_step=1363, training_loss=0.4116586193999908, metrics={'train_runtime': 1222.3592, 'train_samples_per_second': 4.46, 'train_steps_per_second': 1.115, 'total_flos': 1424589884227584.0, 'train_loss': 0.4116586193999908, 'epoch': 1.0})

In [None]:
# generate text using the model
def generate_text(text, tokenizer, max_new_tokens=100):
    """Generate a reply to a patient's description with the module-level ``model``.

    The prompt is ``BOS_TOKEN + instruction + text + EOS_TOKEN``.

    Args:
        text: the patient's question/description.
        tokenizer: tokenizer matching ``model``.
        max_new_tokens: maximum number of tokens to generate after the prompt.

    Returns:
        The decoded sequence (prompt followed by the generated continuation)
        with special tokens stripped.

    Raises:
        ValueError: if ``max_new_tokens`` leaves no room for a prompt inside
            the model's context window.
    """
    # create the prompt
    instruction = "If you are a doctor, please answer the medical questions based on the patient's description."
    text = BOS_TOKEN + instruction + text + EOS_TOKEN

    # Prompt plus generated tokens must fit in the position-embedding window.
    # Padding the prompt to the full window (padding='max_length') leaves no
    # room for even one new token and makes generate() fail with a
    # tensor-size mismatch, so a single prompt is encoded unpadded and capped
    # so that max_new_tokens always fits.
    context_window = model.config.max_position_embeddings
    max_prompt_tokens = context_window - max_new_tokens
    if max_prompt_tokens < 1:
        raise ValueError(
            f"max_new_tokens={max_new_tokens} leaves no room for a prompt "
            f"in a context window of {context_window} tokens"
        )

    input_tokens = tokenizer(text, return_tensors="pt", truncation=True, max_length=max_prompt_tokens)
    input_enc = input_tokens.input_ids.to(model.device)
    attention_mask = input_tokens.attention_mask.to(model.device)

    # Pass the pad id explicitly so generate() does not have to guess one
    pad_token_id = tokenizer.pad_token_id
    if pad_token_id is None:
        pad_token_id = tokenizer.eos_token_id

    output = model.generate(
        input_enc,
        attention_mask=attention_mask,
        num_return_sequences=1,
        max_new_tokens=max_new_tokens,
        pad_token_id=pad_token_id,
    )
    return tokenizer.decode(output[0], skip_special_tokens=True)

# generate_text requires the tokenizer as its second argument
generate_text("Doctor, I have been experiencing a hoarse voice for a few weeks now and it's not getting any better despite taking medication. What could be the problem?", tokenizer)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
This is a friendly reminder - the current text generation call will exceed the model's predefined maximum length (1024). Depending on the model, you may observe exceptions, performance degradation, or nothing at all.


RuntimeError: The size of tensor a (1024) must match the size of tensor b (1025) at non-singleton dimension 3

In [None]:
# Checkpoint directory to restore from; presumably the step-1000 checkpoint
# written by the training run (verify the path exists before running).
checkpoint_file_path = '/content/checkpoint-1000'

# Rebind `model` to the weights stored in that checkpoint
model = AutoModelForCausalLM.from_pretrained(
    checkpoint_file_path
)

In [None]:
# Ask the reloaded model the same question; generate_text needs the tokenizer too
prompt = "Doctor, I have been experiencing a hoarse voice for a few weeks now and it's not getting any better despite taking medication. What could be the problem?"
generate_text(prompt, tokenizer)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


RuntimeError: The size of tensor a (1024) must match the size of tensor b (1025) at non-singleton dimension 3