# In [None]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments, DataCollatorForLanguageModeling
import torch
from torch.utils.data import Dataset
import json
import random

# Fetch the pre-trained GPT-2 tokenizer and language-model weights
model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# Register a '[PAD]' token when the tokenizer does not define a padding token
if tokenizer.pad_token_id is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# Keep the embedding matrix in step with the tokenizer's vocabulary size
# (covers the pad token that may have just been added)
model.resize_token_embeddings(len(tokenizer))

# Custom dataset class to handle training data
class ChatDataset(Dataset):
    """Causal-LM training examples built from an intents dictionary.

    Every pattern of every intent is expanded with ``create_variations``;
    each variation is paired with one randomly chosen response of the same
    intent and rendered as two lines::

        User: <variation>
        Bot: <response><eos>

    The text is then tokenized and padded / truncated to ``max_length``.

    Args:
        tokenizer: Callable tokenizer. It is invoked with ``truncation=True``,
            ``padding="max_length"``, ``max_length=max_length`` and
            ``return_tensors="pt"`` and must return ``input_ids`` and
            ``attention_mask`` whose first element is the encoded example.
            Its ``eos_token`` attribute, when set, is appended to each text.
        intents: Mapping with an ``"intents"`` list; each item provides
            ``"patterns"`` and ``"responses"`` lists of strings.
        max_length: Number of tokens every example is padded / truncated to.

    Raises:
        IndexError: If an intent has patterns but an empty ``responses`` list
            (raised by ``random.choice``).
    """

    # Label value ignored by the cross-entropy loss (PyTorch's default
    # ``ignore_index``); used to exclude padding from the training signal.
    IGNORE_INDEX = -100

    def __init__(self, tokenizer, intents, max_length=128):
        self.input_ids = []
        self.attn_masks = []

        # Terminate every example with the end-of-sequence token so the model
        # can learn where a reply stops. Tokenizers without one add nothing.
        # NOTE: if the text exceeds max_length, truncation drops this marker.
        eos = getattr(tokenizer, "eos_token", None) or ""

        for intent in intents['intents']:
            for pattern in intent['patterns']:
                # Add variations by slightly modifying the pattern
                for variation in self.create_variations(pattern):
                    # A response is drawn independently for each variation.
                    response = random.choice(intent['responses'])
                    # Provide clear format for training with direct Q&A format
                    input_text = f"User: {variation}\nBot: {response}{eos}"

                    encodings = tokenizer(
                        input_text,
                        truncation=True,
                        padding="max_length",
                        max_length=max_length,
                        return_tensors="pt"
                    )
                    # Drop the leading batch dimension of size one.
                    self.input_ids.append(encodings['input_ids'][0])
                    self.attn_masks.append(encodings['attention_mask'][0])

    def __len__(self):
        """Return the number of tokenized examples."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the example at ``idx`` as a dict of tensors.

        ``labels`` is a separate tensor equal to ``input_ids`` except that
        positions where ``attention_mask`` is 0 (padding) are set to
        ``IGNORE_INDEX`` so they do not contribute to the loss.
        """
        ids = self.input_ids[idx]
        mask = self.attn_masks[idx]
        # masked_fill returns a new tensor, so input_ids is left untouched.
        labels = ids.masked_fill(mask == 0, self.IGNORE_INDEX)
        return {
            'input_ids': ids,
            'attention_mask': mask,
            'labels': labels,
        }

    def create_variations(self, text):
        """Return ``text`` followed by copies ending in "?" and "!".

        A punctuated copy is skipped when ``text`` already ends with that
        mark, so the result holds two or three strings.
        """
        # Simple augmentation - Adding exclamations, question marks, etc.
        variations = [text]
        variations.extend(
            text + mark for mark in ("?", "!") if not text.endswith(mark)
        )
        # Add more variations if needed (like swapping synonyms, rephrasing)
        return variations

# Load intents data
# JSON text is UTF-8; pin the encoding so parsing does not depend on the
# platform's locale default (which would garble non-ASCII patterns/responses).
with open('intents.json', encoding='utf-8') as file:
    intents = json.load(file)

# Build the training examples and the collator that batches them
dataset = ChatDataset(tokenizer, intents)
# mlm=False: causal language modeling, no masked-token objective
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Hyperparameters for the fine-tuning run
training_args = TrainingArguments(
    # Optimisation schedule
    num_train_epochs=5,
    per_device_train_batch_size=2,
    learning_rate=5e-5,
    weight_decay=0.01,
    # Checkpointing: write to ./results every 500 steps, keep the latest two
    output_dir='./results',
    save_steps=500,
    save_total_limit=2,
    # Reporting: log every 100 steps, no evaluation pass
    logging_steps=100,
    eval_strategy="no",
    prediction_loss_only=True,
)

# Wire model, data and configuration together
trainer = Trainer(
    args=training_args,
    model=model,
    data_collator=data_collator,
    train_dataset=dataset,
)

# Run fine-tuning
trainer.train()

# Persist the fine-tuned weights, then the tokenizer, side by side
for artifact in (model, tokenizer):
    artifact.save_pretrained("./fine-tuned-gpt2")
