In [None]:
!pip install torch pandas numpy datasets transformers peft mlflow

In [2]:
# GPT-2 QuoteBot Fine-tuning with PEFT (LoRA)
# Here, we are fine-tuning GPT-2 to generate quote-style outputs using the english_quotes dataset
import torch
import pandas as pd
import numpy as np
from datasets import load_dataset, Dataset
from transformers import (
    AutoTokenizer, AutoModelForCausalLM, 
    TrainingArguments, Trainer,
    DataCollatorForLanguageModeling, pipeline)
from peft import LoraConfig, get_peft_model, TaskType, PeftModel
import json
import os
from dataclasses import dataclass
from typing import Any, Dict, List, Union
import warnings
warnings.filterwarnings('ignore')
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
print(f"Using device: {device}.")
import os
os.environ["WANDB_DISABLED"] = "true"
os.environ["MLFLOW_TRACKING_DISABLED"] = "true"
MODEL_NAME = "gpt2"
DATASET_NAME = "Abirate/english_quotes"
OUTPUT_DIR = "./quotebot-gpt2-lora"
MAX_LENGTH = 128
BATCH_SIZE = 4
GRADIENT_ACCUMULATION_STEPS = 4
LEARNING_RATE = 2e-4
NUM_EPOCHS = 3
SAVE_STEPS = 500
print("\nGPT-2 QuoteBot Fine-Tuning Setup:\n")
print("Loading the dataset...")
dataset = load_dataset(DATASET_NAME)
print(f"Dataset loaded: {dataset}")
print(f"Train samples: {len(dataset['train'])}")
sample = dataset['train'][0]
print(f"\nSample data structure: {sample.keys()}")
print(f"Sample quote: {sample}")
def check_existing_model():
    model_files = ['adapter_config.json', 'adapter_model.bin'] 
    if os.path.exists(OUTPUT_DIR):
        existing_files = [f for f in model_files if os.path.exists(os.path.join(OUTPUT_DIR, f))]
        return len(existing_files) > 0
    return False
if check_existing_model():
    print(f"\nFound existing fine-tuned model at: {OUTPUT_DIR}.")
    print("Loading fine-tuned QuoteBot...")    
    tokenizer = AutoTokenizer.from_pretrained(OUTPUT_DIR)
    base_model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        torch_dtype=torch.float16,
        device_map="auto")
    model = PeftModel.from_pretrained(base_model, OUTPUT_DIR)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    print(f"Fine-tuned model loaded successfully.")
    print(f"Model loaded on device: {next(model.parameters()).device}")    
    SKIP_TRAINING = True
else:
    print("\nNo existing fine-tuned model found. Loading base model for training...")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        torch_dtype=torch.float16,
        device_map="auto")
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token    
    print(f"Model loaded on device: {next(model.parameters()).device}")
    print(f"Model parameters: {model.num_parameters():,}")
    SKIP_TRAINING = False
if not SKIP_TRAINING:
    def preprocess_function(examples):
        texts = []
        for quote, author, tags in zip(examples['quote'], examples['author'], examples['tags']):
            if tags and len(tags) > 0:
                topic = tags[0] if isinstance(tags, list) else str(tags)
            else:
                topic = "wisdom"
            topic = str(topic).strip()[:50]
            quote = str(quote).strip()
            text = f"Topic: {topic}\nQuote: \"{quote}\"<|endoftext|>"
            texts.append(text)
        all_input_ids = []
        all_attention_masks = []        
        for text in texts:
            encoded = tokenizer(
                text,
                truncation=True,
                max_length=MAX_LENGTH,
                padding=False,
                return_tensors=None)
            if len(encoded['input_ids']) >= 10:
                all_input_ids.append(encoded['input_ids'])
                all_attention_masks.append(encoded['attention_mask'])        
        return {
            'input_ids': all_input_ids,
            'attention_mask': all_attention_masks,
            'labels': [ids.copy() for ids in all_input_ids]}
    print("\nPreprocessing the dataset...")
    tokenized_dataset = dataset.map(
        preprocess_function,
        batched=True,
        batch_size=100, 
        remove_columns=dataset['train'].column_names,
        desc="Tokenizing",
        num_proc=1)
    print(f"Tokenized dataset: {tokenized_dataset}")
    print(f"Dataset size: {len(tokenized_dataset['train'])}")
    sample_tokenized = tokenized_dataset['train'][0]
    sample_text = tokenizer.decode(sample_tokenized['input_ids'])
    print(f"\nSample tokenized text: {sample_text}")
    print(f"Input IDs length: {len(sample_tokenized['input_ids'])}")
    print(f"Labels length: {len(sample_tokenized['labels'])}")
    print(f"Input IDs type: {type(sample_tokenized['input_ids'])}")
    print(f"Labels type: {type(sample_tokenized['labels'])}")
    print(f"First few input IDs: {sample_tokenized['input_ids'][:10]}")
    print("\nSetting up PEFT (LoRA) configuration...")
    peft_config = LoraConfig(
        task_type=TaskType.CAUSAL_LM,
        inference_mode=False,
        r=16, lora_alpha=32, lora_dropout=0.1,
        target_modules=["c_attn", "c_proj", "c_fc"])
    model = get_peft_model(model, peft_config)
    model.print_trainable_parameters()
    @dataclass
    class DataCollatorForCausalLM:
        tokenizer: Any
        max_length: int = MAX_LENGTH
        def __call__(self, examples: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
            input_ids = [example['input_ids'] for example in examples]
            labels = [example['labels'] for example in examples]            
            max_len = min(max(len(ids) for ids in input_ids), self.max_length)            
            padded_input_ids = []
            padded_labels = []
            attention_masks = []            
            for ids, lbls in zip(input_ids, labels):
                if len(ids) > max_len:
                    ids = ids[:max_len]
                    lbls = lbls[:max_len]                
                padding_length = max_len - len(ids)                
                padded_ids = ids + [self.tokenizer.pad_token_id] * padding_length
                padded_lbls = lbls + [-100] * padding_length  
                attention_mask = [1] * len(ids) + [0] * padding_length                
                padded_input_ids.append(padded_ids)
                padded_labels.append(padded_lbls)
                attention_masks.append(attention_mask)
            return {
                'input_ids': torch.tensor(padded_input_ids, dtype=torch.long),
                'attention_mask': torch.tensor(attention_masks, dtype=torch.long),
                'labels': torch.tensor(padded_labels, dtype=torch.long)}
    data_collator = DataCollatorForCausalLM(tokenizer=tokenizer, max_length=MAX_LENGTH)
    training_args = TrainingArguments(
        output_dir=OUTPUT_DIR,
        overwrite_output_dir=True,
        num_train_epochs=NUM_EPOCHS,
        per_device_train_batch_size=BATCH_SIZE,
        gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
        learning_rate=LEARNING_RATE,
        weight_decay=0.01,
        logging_steps=50,
        save_steps=SAVE_STEPS,
        save_total_limit=2,
        prediction_loss_only=True,
        remove_unused_columns=False,
        dataloader_pin_memory=False,  
        fp16=False,  
        bf16=True,
        dataloader_num_workers=0,  
        report_to=[],  
        load_best_model_at_end=False,
        warmup_steps=100,
        lr_scheduler_type="cosine",
        disable_tqdm=False, 
        log_level="warning",)
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset['train'],
        data_collator=data_collator,
        tokenizer=tokenizer,)
    print("\nStarting training...")
    try:
        import mlflow
        mlflow.end_run()
    except:
        pass
    trainer.train()
    print("\nSaving the model...")
    trainer.save_model()
    tokenizer.save_pretrained(OUTPUT_DIR)
    print(f"Model saved to: {OUTPUT_DIR}.")
else:
    print("\nSkipping training, using existing fine-tuned model...")
def generate_quote(topic: str, max_length: int = 100, temperature: float = 0.8):
    prompt = f"Topic: {topic}\nQuote:"
    inputs = tokenizer.encode(prompt, return_tensors="pt").to(device)
    with torch.no_grad():
        outputs = model.generate(
            inputs,
            max_length=max_length,
            temperature=temperature,
            do_sample=True,
            top_p=0.9,
            top_k=50,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id,
            num_return_sequences=1)
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return generated_text
def interactive_quote_generator():
    print("\nQuoteBot is set up and ready. Enter topics to generate quotes.")
    print("Type 'quit' to exit.")
    while True:
        topic = input("\nEnter a topic (type 'quit' or 'exit' to end the system): ").strip()
        if topic.lower() in ['quit', 'exit', 'q']:
            print("The system ends.")
            break
        if topic:
            try:
                quote = generate_quote(topic, max_length=100, temperature=0.8)
                print(f"\nQuoteBot says:")
                print(f"{quote}")
            except Exception as e:
                print(f"Error generating quote: {e}.")
        else:
            print("Please enter a valid topic.")
def evaluate_model_perplexity():
    print("\nEvaluating model perplexity...")
    dataset = load_dataset(DATASET_NAME)
    def preprocess_function(examples):
        texts = []
        for quote, author, tags in zip(examples['quote'], examples['author'], examples['tags']):
            if tags and len(tags) > 0:
                topic = tags[0] if isinstance(tags, list) else str(tags)
            else:
                topic = "wisdom"
            topic = str(topic).strip()[:50]
            quote = str(quote).strip()
            text = f"Topic: {topic}\nQuote: \"{quote}\"<|endoftext|>"
            texts.append(text)
        all_input_ids = []
        all_attention_masks = []
        for text in texts:
            encoded = tokenizer(
                text,
                truncation=True,
                max_length=MAX_LENGTH,
                padding=False,
                return_tensors=None)
            if len(encoded['input_ids']) >= 10:
                all_input_ids.append(encoded['input_ids'])
                all_attention_masks.append(encoded['attention_mask'])
        return {
            'input_ids': all_input_ids,
            'attention_mask': all_attention_masks,
            'labels': [ids.copy() for ids in all_input_ids]}
    tokenized_dataset = dataset.map(
        preprocess_function,
        batched=True,
        batch_size=100,
        remove_columns=dataset['train'].column_names,
        desc="Tokenizing for evaluation",
        num_proc=1)
    @dataclass
    class DataCollatorForCausalLM:
        tokenizer: Any
        max_length: int = MAX_LENGTH
        def __call__(self, examples: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
            input_ids = [example['input_ids'] for example in examples]
            labels = [example['labels'] for example in examples]
            max_len = min(max(len(ids) for ids in input_ids), self.max_length)
            padded_input_ids = []
            padded_labels = []
            attention_masks = []
            for ids, lbls in zip(input_ids, labels):
                if len(ids) > max_len:
                    ids = ids[:max_len]
                    lbls = lbls[:max_len]
                padding_length = max_len - len(ids)
                padded_ids = ids + [self.tokenizer.pad_token_id] * padding_length
                padded_lbls = lbls + [-100] * padding_length
                attention_mask = [1] * len(ids) + [0] * padding_length
                padded_input_ids.append(padded_ids)
                padded_labels.append(padded_lbls)
                attention_masks.append(attention_mask)
            return {
                'input_ids': torch.tensor(padded_input_ids, dtype=torch.long),
                'attention_mask': torch.tensor(attention_masks, dtype=torch.long),
                'labels': torch.tensor(padded_labels, dtype=torch.long)}
    data_collator = DataCollatorForCausalLM(tokenizer=tokenizer, max_length=MAX_LENGTH)
    eval_dataset = tokenized_dataset['train'].select(range(100))
    eval_dataloader = torch.utils.data.DataLoader(
        eval_dataset, 
        batch_size=2, 
        collate_fn=data_collator)
    model.eval()
    total_loss = 0
    total_tokens = 0
    with torch.no_grad():
        for batch in eval_dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            loss = outputs.loss
            total_loss += loss.item() * batch['input_ids'].numel()
            total_tokens += batch['input_ids'].numel()
    avg_loss = total_loss / total_tokens
    perplexity = torch.exp(torch.tensor(avg_loss))
    print(f"Average Loss: {avg_loss:.4f}")
    print(f"Perplexity: {perplexity:.4f}")
    return perplexity
perplexity = evaluate_model_perplexity()
if not SKIP_TRAINING:
    def make_json_serializable(obj):
        if isinstance(obj, dict):
            return {k: make_json_serializable(v) for k, v in obj.items()}
        elif isinstance(obj, (list, tuple)):
            return [make_json_serializable(item) for item in obj]
        elif isinstance(obj, set):
            return list(obj) 
        elif hasattr(obj, '__dict__'):
            return make_json_serializable(obj.__dict__)
        elif isinstance(obj, (torch.dtype, type)):
            return str(obj)
        else:
            try:
                json.dumps(obj)  
                return obj
            except TypeError:
                return str(obj)
    model_info = {
        "model_name": MODEL_NAME,
        "dataset": DATASET_NAME,
        "peft_config": make_json_serializable(peft_config.to_dict()),
        "training_config": {
            "batch_size": BATCH_SIZE,
            "learning_rate": LEARNING_RATE,
            "num_epochs": NUM_EPOCHS,
            "max_length": MAX_LENGTH,
            "gradient_accumulation_steps": GRADIENT_ACCUMULATION_STEPS,},
        "perplexity": float(perplexity),
        "max_length": MAX_LENGTH,
        "num_epochs": NUM_EPOCHS}
    with open(f"{OUTPUT_DIR}/model_info.json", "w") as f:
        json.dump(model_info, f, indent=2)
    print(f"\nModel info saved to: {OUTPUT_DIR}/model_info.json.")
if SKIP_TRAINING:
    print("QuoteBot loaded from existing model.")
else:
    print("QuoteBot training completed.")
interactive_quote_generator()

Using device: mps.

GPT-2 QuoteBot Fine-Tuning Setup:

Loading the dataset...
Dataset loaded: DatasetDict({
    train: Dataset({
        features: ['quote', 'author', 'tags'],
        num_rows: 2508
    })
})
Train samples: 2508

Sample data structure: dict_keys(['quote', 'author', 'tags'])
Sample quote: {'quote': '“Be yourself; everyone else is already taken.”', 'author': 'Oscar Wilde', 'tags': ['be-yourself', 'gilbert-perreira', 'honesty', 'inspirational', 'misattributed-oscar-wilde', 'quote-investigator']}

Found existing fine-tuned model at: ./quotebot-gpt2-lora.
Loading fine-tuned QuoteBot...
Fine-tuned model loaded successfully.
Model loaded on device: mps:0

Skipping training, using existing fine-tuned model...

Evaluating model perplexity...
Average Loss: 2.5214
Perplexity: 12.4466
QuoteBot loaded from existing model.

QuoteBot is set up and ready. Enter topics to generate quotes.
Type 'quit' to exit.

QuoteBot says:
Topic: love
Quote: "“Love can be the most beautiful thing you