## Installing & importing libraries

In [None]:
# Install the libraries the notebook needs: accelerate (upgraded via -U) and
# transformers with its torch extra. No versions are pinned here.
!pip install accelerate -U
!pip install transformers[torch]

In [None]:
import os
import re
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AutoTokenizer, Trainer, TrainingArguments, DataCollatorForLanguageModeling, TextDataset, EvalPrediction
from scipy.special import softmax
from sklearn.metrics import log_loss
import numpy as np

## Mounting Google Drive to load the dataset and save the model

In [None]:
# Mount Google Drive at /content/drive. The training and generation cells below
# read the dataset from, and write the model to, /content/drive/MyDrive.
from google.colab import drive
drive.mount("/content/drive", force_remount=True)

## Defining `compute_metrics` to calculate perplexity

In [None]:
def compute_metrics(p: "EvalPrediction"):
    """Compute token-level perplexity for a causal language model.

    Args:
        p: Object exposing ``predictions`` (logits whose last axis is the
            vocabulary and whose second-to-last axis is the sequence position)
            and ``label_ids`` (target token ids with the same leading shape).
            If ``predictions`` is a tuple, its first element is used as the
            logits.

    Returns:
        dict: ``{"perplexity": float}``, i.e. ``exp`` of the mean negative
        log-likelihood over all scored tokens. ``nan`` when no token can be
        scored (every label ignored, or sequences shorter than two tokens).
    """
    logits = p.predictions
    if isinstance(logits, tuple):
        logits = logits[0]
    logits = np.asarray(logits)
    labels = np.asarray(p.label_ids)

    vocab_size = logits.shape[-1]

    # Causal LM: the logits at position t score the token at position t + 1,
    # so drop the last prediction and the first label before comparing.
    shifted_logits = logits[..., :-1, :].reshape(-1, vocab_size)
    targets = labels[..., 1:].reshape(-1)

    # -100 is the ignore index used for positions that must not be scored.
    keep = targets != -100
    if not keep.any():
        return {"perplexity": float("nan")}
    shifted_logits = shifted_logits[keep]
    targets = targets[keep].astype(np.int64)

    # Numerically stable log-sum-exp; avoids materialising a full softmax and
    # a one-hot label matrix over the whole vocabulary.
    row_max = shifted_logits.max(axis=-1, keepdims=True)
    log_norm = row_max[:, 0] + np.log(
        np.exp(shifted_logits - row_max).sum(axis=-1, dtype=np.float64)
    )
    target_logits = shifted_logits[np.arange(targets.shape[0]), targets]

    mean_nll = float(np.mean(log_norm - target_logits))
    return {"perplexity": float(np.exp(mean_nll))}

## Training the model with Hugging Face's `Trainer` class

In [None]:
def train_chatbot(
    model_output_path='/content/drive/MyDrive/data/models',
    train_file="/content/drive/MyDrive/data/training.txt",
    val_file="/content/drive/MyDrive/data/validation.txt",
    base_model="gpt2-medium",
):
    """Fine-tune a GPT-2 checkpoint on a plain-text corpus and save it.

    Args:
        model_output_path: Directory that receives checkpoints, the final
            model and the tokenizer.
        train_file: Path of the training text file.
        val_file: Path of the validation text file.
        base_model: Name or path of the pretrained GPT-2 weights to start from.

    Side effects:
        Writes checkpoints every 100 steps (keeping the last two), the final
        model and the tokenizer under ``model_output_path``, then prints the
        validation loss and the perplexity derived from it.

    NOTE(review): ``generate_response`` in this notebook loads from a
    ``gpt2-large-ft`` sub-directory of the default output path, which is not
    where this function saves by default -- confirm which location is intended.
    """
    import inspect

    # Set up the tokenizer and model
    tokenizer = GPT2Tokenizer.from_pretrained(base_model)
    model = GPT2LMHeadModel.from_pretrained(base_model)

    # Prepare the dataset: both files are cut into blocks of 128 tokens.
    # NOTE(review): TextDataset is flagged as deprecated by recent transformers
    # releases -- consider migrating to the `datasets` library.
    train_dataset = TextDataset(tokenizer=tokenizer, file_path=train_file, block_size=128)
    val_dataset = TextDataset(tokenizer=tokenizer, file_path=val_file, block_size=128)
    # mlm=False -> causal language modelling (labels are a copy of the inputs).
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

    # `evaluation_strategy` was renamed to `eval_strategy` in newer transformers
    # releases. The notebook installs an unpinned version, so use whichever
    # keyword the installed TrainingArguments accepts.
    accepted = inspect.signature(TrainingArguments.__init__).parameters
    strategy_key = "eval_strategy" if "eval_strategy" in accepted else "evaluation_strategy"

    # Set up the training arguments
    training_args = TrainingArguments(
        output_dir=model_output_path,
        overwrite_output_dir=True,
        num_train_epochs=5,
        per_device_train_batch_size=4,
        save_strategy='steps',
        save_steps=100,
        eval_steps=100,
        save_total_limit=2,
        report_to='none',
        # Only the loss is gathered during evaluation, so the full
        # (tokens x vocabulary) logits are never accumulated in memory.
        prediction_loss_only=True,
        learning_rate=5e-5,
        **{strategy_key: 'steps'},
    )

    # No `compute_metrics` hook is passed: with prediction_loss_only=True the
    # Trainer has no predictions to hand to it, so it would never be called.
    # Perplexity is derived from the evaluation loss below instead.
    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
    )

    trainer.train()
    trainer.save_model(model_output_path)

    # Save the tokenizer
    tokenizer.save_pretrained(model_output_path)

    # Evaluate after saving so a failure here cannot lose the trained weights.
    eval_metrics = trainer.evaluate()
    eval_loss = eval_metrics.get("eval_loss")
    if eval_loss is not None:
        # eval_loss is a mean cross-entropy, so exp(eval_loss) is the perplexity.
        print(f"Validation loss: {eval_loss:.4f} | perplexity: {np.exp(eval_loss):.2f}")


In [None]:
# Run the fine-tuning job; train_chatbot saves the model and tokenizer to Drive.
train_chatbot()

## Generating response

In [None]:
# Cache of loaded (tokenizer, model) pairs keyed by their paths, so repeated
# calls do not reload the weights from Drive. Re-running this cell clears it.
_GENERATION_CACHE = {}


def _load_generation_assets(tokenizer_path, model_path):
    """Return the (tokenizer, model) pair for the given paths, loading it once."""
    key = (tokenizer_path, model_path)
    if key not in _GENERATION_CACHE:
        tokenizer = GPT2Tokenizer.from_pretrained(tokenizer_path)
        model = GPT2LMHeadModel.from_pretrained(model_path)
        _GENERATION_CACHE[key] = (tokenizer, model)
    return _GENERATION_CACHE[key]


def generate_response(
    prompt,
    tokenizer_path='/content/drive/MyDrive/data/models/gpt2-large-ft',
    model_path='/content/drive/MyDrive/data/models/gpt2-large-ft/checkpoint-400',
):
    """Generate a sampled continuation of ``prompt`` with the fine-tuned model.

    Args:
        prompt: Text the model should continue.
        tokenizer_path: Directory holding the saved tokenizer.
        model_path: Directory holding the model checkpoint to load.

    Returns:
        str: The decoded sequence (prompt followed by the generated text) with
        special tokens removed.

    Raises:
        ValueError: If ``prompt`` tokenizes to zero tokens, since there is
            nothing for the model to condition on.
    """
    tokenizer, model = _load_generation_assets(tokenizer_path, model_path)
    input_ids = tokenizer.encode(prompt, return_tensors="pt")
    if input_ids.shape[-1] == 0:
        raise ValueError("prompt must contain at least one token")

    # Single un-padded sequence, so every position is attended to.
    attention_mask = torch.ones_like(input_ids)

    output = model.generate(
        input_ids,
        max_length=256,         # Total length, prompt tokens included
        attention_mask=attention_mask,
        num_return_sequences=1, # Generate a single sequence
        do_sample=True,         # Required for temperature/top_k to take effect
        temperature=0.85,       # Controls randomness (higher for more diversity)
        top_k=50,               # Sample only from the 50 most likely tokens
        repetition_penalty=1.2, # Controls repetition (higher for less repetition)
        pad_token_id=tokenizer.eos_token_id,  # EOS token id reused as the padding id
        # early_stopping is omitted: it only affects beam search, which is not used here.
    )

    return tokenizer.decode(output[0], skip_special_tokens=True)