In [12]:
from datasets import load_dataset
from transformers import GPT2LMHeadModel, GPT2Tokenizer, TextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments, GPTNeoForCausalLM
import torch
from transformers import EarlyStoppingCallback, AutoTokenizer, AutoModelForCausalLM, TextDataset, DataCollatorForLanguageModeling, TrainingArguments, Trainer, LineByLineTextDataset

In [3]:
from datasets import load_dataset

# Base URLs for the specific JSON files from GitHub or any other hosting
base_url = "../data/"

# Load the dataset
dataset = load_dataset(
    "json", 
    data_files={
        "train": base_url + "train_en_1.json", 
        "validation": base_url + "valid_en_1.json",
        "test": base_url + "test_en_1.json"
    }, 
)

# Now, 'dataset' will contain the train, validation, and test splits
train_data = dataset["train"]
validation_data = dataset["validation"]
test_data = dataset["test"]

In [13]:
# Initialize the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("gpt2", padding_side='left', add_prefix_space=True)
tokenizer.pad_token = tokenizer.eos_token

In [11]:


# Prepare the dataset
with open("train.txt", "w") as f:
    for entry in dataset["train"]:
        f.write("[PATIENT]: " + entry['input'] + "  [DOCTOR]:" + entry['output'] + "\n")



model = AutoModelForCausalLM.from_pretrained("gpt2")

# Prepare the dataset and data collator
train_dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path="train.txt",
    block_size=256
)


data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

# Initialize the Trainer with TrainingArguments
training_args = TrainingArguments(
    output_dir="./output",
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=32,
    gradient_accumulation_steps=2,
    fp16=True,
    save_steps=100,
    save_total_limit=2,
    save_strategy="steps",  # Add this line
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
)

# Train the model
trainer.train()




KeyboardInterrupt: 

In [14]:
# Load the fine-tuned model
fine_tuned_model = GPT2LMHeadModel.from_pretrained("./output/checkpoint-1800/")

# Load the baseline model
baseline_model = GPT2LMHeadModel.from_pretrained("gpt2")

In [19]:
text = """
Answer concisely, give medical advice. 
[PATIENT]: 
I am 24 years old. What are common causes of stomach pain? 
[DOCTOR]:
""" 


In [20]:
# Preprocess the text
input_ids = tokenizer.encode(text, return_tensors='pt')

# Generate outputs from the fine-tuned model
fine_tuned_output = fine_tuned_model.generate(input_ids, max_length=150, temperature=0.5, no_repeat_ngram_size=2)

# Generate outputs from the baseline model
baseline_output = baseline_model.generate(input_ids, max_length=150, temperature=0.5, no_repeat_ngram_size=2)

# Decode the outputs
fine_tuned_text = tokenizer.decode(fine_tuned_output[0], skip_special_tokens=True)
baseline_text = tokenizer.decode(baseline_output[0], skip_special_tokens=True)

# Print the outputs
print("Fine-tuned model output: ", fine_tuned_text.replace(text, ''))
print("Baseline model output: ", baseline_text.replace(text, ''))


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Fine-tuned model output:   Hello, I have studied your case. The most common cause of pain in the stomach is due to gastritis. It is a chronic gastric problem. You should consult your doctor and get done clinical examination of abdomen and pelvis. If the pain is not relieved by medicines, then you should go for gastrostomy. This will give you relief. In case of gastroparesis, you can take tablet cetirizine for pain relief and cefuroxime for stomach ulcer. I would advise you to take tab
Baseline model output:   The stomach is a small, round, open, and narrow space. It is the most important part of the body. The stomach has a large, narrow, flat, or flat stomach.
It is also the stomach's main source of energy. When you eat, your stomach will produce energy, which is why you feel good. You will feel better when you are eating. If you have a stomach problem, you will need to take a medicine. This medicine will help you to feel more comfortable. I have been told that if you take
