In [None]:
!pip install transformers torch sentencepiece datasets accelerate scikit-learn nltk


In [None]:
# --- 2. Import Core Libraries ---
import math
import os
import re # Used for potential text cleaning, though minimal in this version
import shutil # Used to copy the fine-tuned model directory to Google Drive
from getpass import getpass # Used to prompt for secrets instead of hard-coding them

import nltk
import torch
from nltk.corpus import gutenberg
from sklearn.model_selection import train_test_split # Used for splitting data for evaluation
from torch.utils.data import Dataset # Base class for custom datasets

# Hugging Face Transformers specific imports
from transformers import (
    GPT2LMHeadModel,
    GPT2Tokenizer,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling,
    TextDataset # Note: TextDataset is a legacy class, CustomTextDataset is preferred for flexibility
)


In [None]:
# --- 3. Hardware Setup: Verify GPU Availability ---
# Pick the compute device once: 'cuda' when PyTorch can see a CUDA GPU,
# otherwise 'cpu'. Later cells move models and input tensors to this device.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(f"Using device: {device}")


In [None]:
# --- 4. Data Collection and Preparation ---
# Fetch NLTK's 'gutenberg' corpus; a later cell reads 'austen-emma.txt' from it.
# The call is the last expression in the cell, so its return value is shown as the cell output.
print("Downloading NLTK 'gutenberg' corpus...")
nltk.download('gutenberg')


In [None]:
# Read the complete, unmodified text of 'austen-emma.txt' (Emma) from the NLTK
# Gutenberg corpus. No cleaning happens here; clean_gutenberg_text() is applied
# to `data` in a later cell.
data = gutenberg.raw('austen-emma.txt')


In [None]:
# Define a simple cleaning function to remove Project Gutenberg boilerplate.
# This helps the model focus on the actual narrative.
def clean_gutenberg_text(text,
                         start_marker="*** START OF THE PROJECT GUTENBERG EBOOK EMMA ***",
                         end_marker="*** END OF THE PROJECT GUTENBERG EBOOK EMMA ***"):
    """Strip Gutenberg boilerplate and bracketed notes, then normalize whitespace.

    Args:
        text: Raw book text.
        start_marker: Everything up to and including the first occurrence of
            this string is dropped. If it is absent, nothing is cut at the start.
        end_marker: Everything from the first occurrence of this string (searched
            after the start cut) is dropped. If it is absent, nothing is cut.

    Returns:
        The text as a single line: bracketed notes removed, every run of
        whitespace (including newlines) collapsed to one space, and no leading
        or trailing whitespace.
    """
    # Keep only the content between the two markers (when they are present).
    start_idx = text.find(start_marker)
    if start_idx != -1:
        text = text[start_idx + len(start_marker):]

    end_idx = text.find(end_marker)
    if end_idx != -1:
        text = text[:end_idx]

    # Remove bracketed artifacts BEFORE normalizing whitespace. Doing it afterwards
    # leaves the spaces on both sides of a removed note behind (doubled spaces, or a
    # leading space when the text starts with a bracketed header).
    # re.DOTALL lets a note that spans several lines be matched as one unit.
    text = re.sub(r'\[Illustration:.*?\]', '', text, flags=re.DOTALL)
    text = re.sub(r'\[.*?\]', '', text, flags=re.DOTALL) # General catch-all for bracketed notes

    # str.split() with no argument splits on any whitespace run ('\r\n', '\n',
    # repeated spaces, ...), so a single join standardizes all spacing at once.
    return ' '.join(text.split())

# Clean the raw Emma text, then report how much shorter the cleaned version is.
cleaned_emma_text = clean_gutenberg_text(data)
print(
    f"Original Emma text length: {len(data)} characters",
    f"Cleaned Emma text length: {len(cleaned_emma_text)} characters",
    sep="\n",
)


In [None]:
# Persist the cleaned text to disk. The training cells pass this path to
# load_dataset_for_training(), which hands it to TextDataset.
train_file_path = 'emma_cleaned.txt' # Relative path: written to the current working directory
with open(train_file_path, 'w', encoding='utf-8') as file:
    file.write(cleaned_emma_text)
print(f"Cleaned text saved to: {train_file_path}")

In [None]:
# --- 5. Custom Dataset for Evaluation ---
# Define a custom PyTorch Dataset for handling text data.
# This is more flexible than TextDataset, especially for splitting data or
# when data is already in memory.
class CustomTextDataset(Dataset):
    """In-memory dataset that serves a text as fixed-size blocks of token ids.

    The whole text (plus one trailing EOS token) is tokenized once; the token
    stream is then cut into consecutive, non-overlapping blocks of
    ``block_size`` tokens. Leftover tokens that do not fill a block are dropped.

    Args:
        tokenizer: Callable tokenizer exposing ``eos_token``. It is called with
            ``return_tensors='pt'`` and must return a mapping whose
            ``'input_ids'`` entry has the token ids in row 0.
        text_data: The text to tokenize.
        block_size: Number of tokens per block; must be positive.

    Raises:
        ValueError: If ``block_size`` is not positive.
    """

    def __init__(self, tokenizer, text_data, block_size):
        # Fail early with a clear message instead of a ZeroDivisionError (0)
        # or a silently nonsensical block count (negative values).
        if block_size <= 0:
            raise ValueError(f"block_size must be a positive integer, got {block_size}")

        self.tokenizer = tokenizer
        self.block_size = block_size

        # Tokenize the entire text. Add EOS token at the very end for consistent sequence ending.
        # truncation=False and padding=False mean we handle chunking manually.
        tokenized_data = tokenizer(text_data + tokenizer.eos_token, return_tensors='pt', truncation=False, padding=False)
        self.input_ids = tokenized_data['input_ids'][0] # Get the tensor of token IDs

        # Ensure we only work with full blocks. Discard any leftover tokens at the end.
        self.num_blocks = len(self.input_ids) // self.block_size
        self.input_ids = self.input_ids[:self.num_blocks * self.block_size]

        print(f"Dataset initialized with {len(self.input_ids)} total tokens, split into {self.num_blocks} blocks of size {self.block_size}.")

    def __len__(self):
        """Return the number of full blocks."""
        return self.num_blocks

    def __getitem__(self, idx):
        """Return block ``idx`` as ``{"input_ids": ..., "labels": ...}``.

        For causal language modeling the labels are a copy of the inputs.
        Negative indices count from the end, like a list.

        Raises:
            IndexError: If ``idx`` is outside ``[-len(self), len(self))``.
        """
        # Without this bounds check an out-of-range slice just yields an empty
        # tensor, and plain iteration (`for item in dataset`) would never stop
        # because it relies on IndexError to detect the end.
        if idx < 0:
            idx = idx + self.num_blocks
        if not 0 <= idx < self.num_blocks:
            raise IndexError(f"block index out of range for dataset with {self.num_blocks} blocks")

        start = idx * self.block_size
        chunk = self.input_ids[start : start + self.block_size]
        # clone() so that editing the labels can never alter the stored token ids.
        return {"input_ids": chunk, "labels": chunk.clone()}


In [None]:
# --- 6. Data Loading and Collator Functions for Training ---
# These functions are used by the Hugging Face Trainer.

# Build the training dataset with transformers' legacy TextDataset helper.
def load_dataset_for_training(file_path, tokenizer, block_size=128):
    """Create a TextDataset for ``file_path``.

    All three arguments are forwarded unchanged to ``TextDataset``.
    """
    dataset_kwargs = {
        "tokenizer": tokenizer,
        "file_path": file_path,
        "block_size": block_size,
    }
    return TextDataset(**dataset_kwargs)

# Build the batching helper used by the Trainer.
# With the default mlm=False the collator is configured for Causal Language
# Modeling (CLM), i.e. GPT-2 style next-token prediction.
def load_data_collator(tokenizer, mlm=False):
    """Return a DataCollatorForLanguageModeling for ``tokenizer``; ``mlm`` is passed through."""
    return DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=mlm)


In [None]:
# Base checkpoint for the exploratory cells that follow. It is defined here because
# the parameter cell that also sets `model_name` comes later in the notebook, so a
# fresh "Restart & Run All" would otherwise stop on this line with a NameError.
model_name = 'gpt2'
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

In [None]:
# Exploratory: build the training dataset from the cleaned text file (default block size).
train_dataset = load_dataset_for_training(file_path=train_file_path, tokenizer=tokenizer)

In [None]:
# Exploratory: build the causal-LM data collator (mlm defaults to False).
data_collator = load_data_collator(tokenizer=tokenizer)

In [None]:
# Bare expression: Jupyter shows the collator's repr as this cell's output.
data_collator

In [None]:
# --- 7. Training Function ---
# Encapsulates the entire fine-tuning process.
def train_model(train_file_path, model_name,
              output_dir,
              overwrite_output_dir,
              per_device_train_batch_size,
              num_train_epochs,
              save_steps,
              report_to="wandb"):
    """Fine-tune a pre-trained GPT-2 checkpoint on a text file and save the result.

    Relies on the notebook-level ``device`` variable and on the helpers
    ``load_dataset_for_training`` and ``load_data_collator``.

    Args:
        train_file_path: Path of the plain-text training file.
        model_name: Name or path of the pre-trained GPT-2 checkpoint to start from.
        output_dir: Directory that receives the tokenizer, checkpoints and final model.
        overwrite_output_dir: Forwarded to ``TrainingArguments``.
        per_device_train_batch_size: Forwarded to ``TrainingArguments``.
        num_train_epochs: Forwarded to ``TrainingArguments``.
        save_steps: Forwarded to ``TrainingArguments`` (checkpoint frequency in steps).
        report_to: Forwarded to ``TrainingArguments``. Defaults to ``"wandb"``,
            the value this function previously hard-coded; pass ``"none"`` to
            train without an external logger.

    Returns:
        None. The fine-tuned model is written to ``output_dir``.
    """
    # Initialize tokenizer from pre-trained model
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)

    # GPT-2 has no pad token; reuse EOS. This is done BEFORE the tokenizer is
    # saved or handed to the collator so that both see the final configuration
    # (previously the tokenizer was written to disk without its pad token).
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    # Save the tokenizer next to the model so both can be loaded from output_dir later.
    tokenizer.save_pretrained(output_dir)

    # Load the training dataset and the data collator
    train_dataset = load_dataset_for_training(train_file_path, tokenizer)
    data_collator = load_data_collator(tokenizer)

    # Load the pre-trained GPT-2 model and move it to the specified device (GPU/CPU)
    model = GPT2LMHeadModel.from_pretrained(model_name).to(device)
    # Ensure model's pad_token_id matches tokenizer's
    model.config.pad_token_id = tokenizer.pad_token_id

    # NOTE: the untrained model is intentionally no longer written to output_dir
    # before training. trainer.save_model() below writes the final weights there;
    # saving beforehand only meant that an interrupted run left *untrained*
    # weights in output_dir for later cells to load by mistake.

    # Configure training arguments (no evaluation dataset is used during training)
    training_args = TrainingArguments(
        output_dir=output_dir,
        overwrite_output_dir=overwrite_output_dir,
        per_device_train_batch_size=per_device_train_batch_size,
        num_train_epochs=num_train_epochs,
        logging_steps=100, # Log every 100 steps
        save_steps=save_steps, # Checkpoint frequency supplied by the caller
        report_to=report_to, # Metrics destination (default: Weights & Biases)
    )

    # Initialize the Hugging Face Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
    )

    # Start the training process
    print("Starting model training...")
    trainer.train()
    print("Training complete.")

    # Save the final trained model
    trainer.save_model()
    print(f"Fine-tuned model saved to {output_dir}")

In [None]:
# --- 8. Set Training Parameters and Execute Training ---
# All knobs for this training run live in this cell.

# Which checkpoint to start from and where the fine-tuned result is written.
# Later cells hard-code the absolute form of this directory, so keep them in sync.
model_name = 'gpt2'
output_dir = './fine_tuned_gpt2_alice'
overwrite_output_dir = False

# Batch size per device, number of epochs, and checkpoint interval (in steps).
per_device_train_batch_size, num_train_epochs, save_steps = 4, 5, 500


In [None]:
# Weights & Biases credentials must never be stored in the notebook itself.
# Use WANDB_API_KEY from the environment when it is already set; otherwise ask
# for it interactively (getpass does not echo the value into the cell output).
# NOTE(review): the key that used to be hard-coded in this cell has been exposed
# to everyone with access to the notebook and should be revoked.
if not os.environ.get("WANDB_API_KEY"):
    os.environ["WANDB_API_KEY"] = getpass("Weights & Biases API key: ")

# Execute the training function
train_model(
    train_file_path=train_file_path, # Path to the cleaned Emma text file
    model_name=model_name,
    output_dir=output_dir,
    overwrite_output_dir=overwrite_output_dir,
    per_device_train_batch_size=per_device_train_batch_size,
    num_train_epochs=num_train_epochs,
    save_steps=save_steps # Pass save_steps to the function
)


In [None]:
# --- 9. Text Generation Functionality ---
# Functions to load the model and tokenizer for text generation.
def load_model_for_generation(model_path):
    """Load a GPT2LMHeadModel from ``model_path`` and return it."""
    return GPT2LMHeadModel.from_pretrained(model_path)

def load_tokenizer_for_generation(tokenizer_path):
    """Load a GPT2Tokenizer from ``tokenizer_path`` and return it."""
    return GPT2Tokenizer.from_pretrained(tokenizer_path)

# Function to generate text using the fine-tuned model.
def generate_text(model_path, sequence, max_length):
    """Sample a continuation of ``sequence`` from the model at ``model_path`` and print it.

    Relies on the notebook-level ``device`` variable.

    Args:
        model_path: Directory holding both the saved model and its tokenizer.
        sequence: Prompt text; it is converted to ``str`` before encoding.
        max_length: Maximum total length in tokens of the output, prompt included.

    Returns:
        None. The decoded text is printed.
    """
    # Load model and tokenizer within the function (less efficient for multiple calls)
    model = load_model_for_generation(model_path)
    tokenizer = load_tokenizer_for_generation(model_path)

    # Ensure the tokenizer has a pad token and that model.config.pad_token_id agrees with it
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    model.config.pad_token_id = tokenizer.pad_token_id

    # Move model to GPU if available and set to evaluation mode
    model.to(device)
    model.eval()

    # Encode the prompt. Calling the tokenizer (instead of .encode) also yields the
    # attention mask; passing it explicitly matters because pad and EOS share one
    # id here, so generate() cannot reliably infer the mask on its own.
    encoded = tokenizer(f'{sequence}', return_tensors='pt').to(device)

    # Generate text using top-k / nucleus sampling
    final_outputs = model.generate(
        input_ids=encoded['input_ids'],
        attention_mask=encoded['attention_mask'],
        do_sample=True, # Enable sampling for more diverse outputs
        max_length=max_length, # Maximum length of the generated sequence (including prompt)
        pad_token_id=model.config.eos_token_id, # Token to use for padding
        top_k=50, # Consider only the top 50 most likely tokens
        top_p=0.95, # Nucleus sampling: consider tokens whose cumulative probability is 0.95
    )

    # Decode the generated token IDs back to human-readable text
    print(tokenizer.decode(final_outputs[0], skip_special_tokens=True))


In [None]:
# --- 10. Example Text Generation ---
# Generate from the directory the fine-tuned model was saved to.
model_path_for_generation = output_dir
sequence_to_generate = "Emma, in her confusion, declared that"
max_generation_length = 50 # Token budget for the whole output, prompt included

# Sample one continuation of the prompt and print it.
print("\n--- Generating Text Sample ---")
generate_text(
    model_path=model_path_for_generation,
    sequence=sequence_to_generate,
    max_length=max_generation_length,
)


In [None]:
# --- 11. Save Model to Google Drive ---
# Mount Google Drive at /content/drive so the trained model can be copied there.
# The google.colab import is kept in this cell (not with the top-level imports)
# because the package is only importable inside Colab.
# Mounting requires user authentication in Colab.
print("\n--- Mounting Google Drive ---")
from google.colab import drive
drive.mount('/content/drive')


In [None]:
# Define paths for copying the model to Google Drive.
# The source is derived from `output_dir` (where training saved the model) instead
# of repeating it as a hard-coded absolute path, so the two can never drift apart.
# With Colab's default working directory (/content) the value is unchanged.
colab_model_output_dir = os.path.abspath(output_dir)
# Destination directory in Google Drive. Using a date for unique naming.
drive_save_dir = '/content/drive/MyDrive/MyNLPModels/EmmaGPT2_model_10July2025'


In [None]:
# Copy the entire fine-tuned model directory from Colab's local storage to Google Drive.
# This is best-effort: a failed copy is reported but does not stop the notebook.
print(f"Copying model from {colab_model_output_dir} to {drive_save_dir}...")
try:
    # Create the target directory in Google Drive if it doesn't already exist.
    os.makedirs(drive_save_dir, exist_ok=True)
    # `dirs_exist_ok=True` prevents error if directory already exists.
    shutil.copytree(colab_model_output_dir, drive_save_dir, dirs_exist_ok=True)
    print(f"Model successfully saved to Google Drive at: {drive_save_dir}")
except OSError as e:
    # Only filesystem failures are reported here (shutil.Error is an OSError too);
    # programming errors such as a wrong argument type still surface as a traceback.
    print(f"Error saving model to Google Drive: {e}")


In [None]:
# --- 12. Quantitative Evaluation: Perplexity Calculation ---
# This section evaluates the fine-tuned model on the full Emma text and reports
# the loss and the perplexity derived from it.
# NOTE(review): the evaluation text is the same book the model was fine-tuned on
# (and in its raw, uncleaned form), so the number measures fit to the training
# text, not performance on held-out data.

# Load the raw Emma text data for evaluation.
# To evaluate on the cleaned text instead, use `cleaned_emma_text` here.
emma_raw_text_for_eval = gutenberg.raw('austen-emma.txt')

# Load the model and tokenizer from the directory training saved them to.
# The path is derived from `output_dir` rather than hard-coded so it always
# matches; with Colab's default working directory (/content) it is unchanged.
model_load_path_eval = os.path.abspath(output_dir)

tokenizer_eval = GPT2Tokenizer.from_pretrained(model_load_path_eval)
model_eval = GPT2LMHeadModel.from_pretrained(model_load_path_eval)

# Ensure tokenizer has pad_token defined and model.config.pad_token_id is set
if tokenizer_eval.pad_token is None:
    tokenizer_eval.pad_token = tokenizer_eval.eos_token
model_eval.config.pad_token_id = tokenizer_eval.pad_token_id

model_eval.to(device) # Move model to GPU if available
model_eval.eval() # Set model to evaluation mode

print(f"\nModel and tokenizer loaded for evaluation from: {model_load_path_eval}")

# Prepare the Evaluation Dataset using the CustomTextDataset.
# 128 is the default block_size of load_dataset_for_training(), which training used.
block_size_eval = 128 # Must match training block_size

# Create the evaluation dataset using the entire raw text.
eval_dataset = CustomTextDataset(tokenizer=tokenizer_eval, text_data=emma_raw_text_for_eval, block_size=block_size_eval)

# Data Collator for Language Modeling (necessary for batching during evaluation)
data_collator_eval = DataCollatorForLanguageModeling(
    tokenizer=tokenizer_eval,
    mlm=False # Set to False for Causal Language Modeling (like GPT-2)
)

# Set up Trainer for Evaluation.
# We need minimal TrainingArguments for this evaluation pass.
eval_args = TrainingArguments(
    output_dir="./evaluation_results", # A temporary directory for evaluation logs
    per_device_eval_batch_size=4,       # Use a suitable batch size for evaluation
    do_eval=True,                       # Indicate that we are doing evaluation
    report_to="none",                   # No external reporting needed for this simple evaluation
)

trainer_eval = Trainer(
    model=model_eval,
    args=eval_args,
    eval_dataset=eval_dataset, # Pass the evaluation dataset
    data_collator=data_collator_eval,
)

# Run Evaluation and Calculate Perplexity.
print("\n--- Starting Model Evaluation (Perplexity Calculation) ---")
# The `evaluate` method will compute the loss on the eval_dataset
eval_results = trainer_eval.evaluate()

# Extract the evaluation loss from the results
eval_loss = eval_results["eval_loss"]

# Calculate Perplexity: Perplexity = exp(loss)
perplexity = math.exp(eval_loss)

print(f"Evaluation Loss on 'emma.txt' (raw): {eval_loss:.4f}")
print(f"Perplexity on 'emma.txt' (raw): {perplexity:.2f}")
print("Full Evaluation results:", eval_results)
print("\n--- Evaluation Complete ---")