In [3]:
import torch
import os
os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
print("PyTorch version:", torch.__version__)
print("MPS available:", torch.backends.mps.is_available())

PyTorch version: 2.5.1
MPS available: True


In [4]:
from datasets import load_dataset
from transformers import (
    AutoTokenizer, LlamaForCausalLM, Trainer, TrainingArguments,
    DataCollatorForLanguageModeling
)
from peft import LoraConfig, get_peft_model, TaskType

# Enable fallback for unsupported MPS operations
os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"

# --- Load Dataset ---
print("Loading dataset...")
dataset = load_dataset("facebook/empathetic_dialogues")

# Fix for batched=True preprocessing
def preprocess_function(examples):
    input_texts = [
        f"Contest: {context}\nQuestion: {prompt}\nAnswer: {utterance}"
        for context, prompt, utterance in zip(examples['context'], examples['prompt'], examples['utterance'])
    ]
    return {"input_text": input_texts}


# Apply preprocessing
print("Preprocessing dataset...")
dataset = dataset.map(preprocess_function, batched=True)
dataset = dataset.remove_columns(['conv_id', 'utterance_idx', 'speaker_idx', 
                                  'selfeval', 'tags'])

#['conv_id', 'utterance_idx', 'context', 'prompt', 'speaker_idx', 'utterance', 'selfeval', 'tags
# Load tokenizer
print("Loading tokenizer...")
model_name = "meta-llama/Llama-3.2-1B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# Tokenize dataset
def tokenize_function(examples):
    return tokenizer(
        examples["input_text"], 
        truncation=True, 
        padding="max_length", 
        max_length=128
    )

print("Tokenizing dataset...")
tokenized_dataset = dataset.map(tokenize_function, batched=True)
tokenized_dataset = tokenized_dataset.remove_columns(["input_text"])

# Split dataset into train and evaluation
split_datasets = tokenized_dataset["train"].train_test_split(test_size=0.1)
train_dataset = split_datasets["train"]
eval_dataset = split_datasets["test"]

# Data collator for causal LM
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False, pad_to_multiple_of=8
)

# --- Load Model with LoRA ---
print("Loading model with LoRA...")
model = LlamaForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    torch_dtype=torch.float32  # Use float32 for MPS
)

# Configure LoRA
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=8,                          # Low rank for LoRA
    lora_alpha=16,                # Alpha scaling
    lora_dropout=0.1,             # Dropout for LoRA layers
    bias="none"
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

# --- Training Arguments ---
training_args = TrainingArguments(
    output_dir="./results_empathetic_dialogues",
    per_device_train_batch_size=2,         # Small batch size for MPS
    gradient_accumulation_steps=8,         # Simulates larger batch size
    num_train_epochs=1,
    learning_rate=5e-5,
    logging_steps=10,
    save_steps=500,
    save_total_limit=1,
    evaluation_strategy="steps",
    eval_steps=500,
    bf16=False,                            # Use float32 instead of bf16 on MPS
    fp16=False,                            # FP16 not supported on MPS
    report_to="none",
)

# --- Trainer ---
print("Initializing Trainer...")
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
)

# --- Train Model ---
print("Starting training...")
trainer.train()

# Save the fine-tuned model
print("Saving the model...")
model.save_pretrained("./llama-lora-empathetic_dialogues")
tokenizer.save_pretrained("./llama-lora-empathetic_dialogues")
print("Training complete!")

Loading dataset...
Preprocessing dataset...
Loading tokenizer...
Tokenizing dataset...


Map: 100%|██████████████████████| 10943/10943 [00:00<00:00, 21333.42 examples/s]


Loading model with LoRA...
trainable params: 851,968 || all params: 1,236,666,368 || trainable%: 0.0689
Initializing Trainer...
Starting training...


  trainer = Trainer(


Step,Training Loss,Validation Loss
500,2.2588,2.343128
1000,2.3246,2.300753
1500,2.2163,2.277542
2000,2.3072,2.261853
2500,2.135,2.24684
3000,2.2358,2.233457
3500,2.2348,2.22284
4000,2.1615,2.215492


Saving the model...
Training complete!


In [20]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from datasets import load_dataset
import evaluate
import numpy as np

# -------------------------------
# Configuration
# -------------------------------

MODEL_NAME = "./llama-lora-empathetic_dialogues"  # Replace with your trained model name
DATASET_NAME = "facebook/empathetic_dialogues"  # Replace with your evaluation dataset
TEXT_COLUMN = "prompt"  # Column containing the input text
REFERENCE_COLUMN = "utterance"  # Column containing the reference responses
MAX_LENGTH = 128  # Max token length
DEVICE = torch.device("mps" if torch.backends.mps.is_available() else "cpu")

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME).to(DEVICE)

# -------------------------------
# Metric 1: Perplexity
# -------------------------------
def calculate_perplexity(model, tokenizer, dataset, column, max_length):
    """Calculate perplexity of the model on the dataset."""
    model.eval()
    losses = []
    for sample in dataset[column]:
        inputs = tokenizer(sample, return_tensors="pt", truncation=True, max_length=max_length).to(DEVICE)
        with torch.no_grad():
            outputs = model(**inputs, labels=inputs["input_ids"])
            loss = outputs.loss
            losses.append(loss.item())
    perplexity = np.exp(np.mean(losses))
    return perplexity

# -------------------------------
# Metric 2: ROUGE
# -------------------------------
def calculate_rouge(model, tokenizer, dataset, text_column, ref_column, max_length):
    """Calculate ROUGE score for summarization or response generation tasks."""
    rouge = evaluate.load("rouge")
    predictions, references = [], []
    generator = pipeline("text-generation", model=model, tokenizer=tokenizer, device=0 if DEVICE == "cuda" else -1)
    
    for row in dataset:
        input_text = row[text_column]
        reference = row[ref_column]
        generated = generator(input_text, max_length=max_length, truncation=True, do_sample=False)[0]['generated_text']
        predictions.append(generated)
        references.append(reference)
    
    rouge_results = rouge.compute(predictions=predictions, references=references)
    return rouge_results

# -------------------------------
# Metric 3: Diversity
# -------------------------------

def calculate_diversity(model, tokenizer, dataset, text_column, max_length):
    """Calculate diversity by measuring unique n-grams."""
    generator = pipeline("text-generation", model=model, tokenizer=tokenizer, device=0 if DEVICE == "cuda" else -1)
    all_outputs = []
    
    prompts = [dataset[text_column][i] for i in range(min(10, len(dataset[text_column])))]
    
    for prompt in prompts:
        outputs = generator(prompt, max_length=max_length, num_return_sequences=5, do_sample=True)
        texts = [output["generated_text"] for output in outputs]
        all_outputs.extend(texts)
    
    # Calculate unique n-grams
    def get_ngrams(text, n=2):
        tokens = text.split()
        return set(zip(*[tokens[i:] for i in range(n)]))
    
    ngrams = [get_ngrams(output) for output in all_outputs]
    total_ngrams = set.union(*ngrams)
    diversity_score = len(total_ngrams) / len(all_outputs)
    return diversity_score

# -------------------------------
# Metric 4: Human Evaluation (Placeholder)
# -------------------------------
def placeholder_human_evaluation():
    """Placeholder for human evaluation results."""
    print("Human evaluation should be performed manually by assessing relevance, coherence, and empathy.")
    return "Pending human feedback"

# -------------------------------
# Main Evaluation Script
# -------------------------------
if __name__ == "__main__":
    print("Loading dataset...")
    dataset = load_dataset(DATASET_NAME, split="test")
    dataset = dataset.select(range(100))  # Use a subset for quick evaluation

    print("\nCalculating Perplexity...")
    perplexity = calculate_perplexity(model, tokenizer, dataset, TEXT_COLUMN, MAX_LENGTH)
    print(f"Perplexity: {perplexity:.2f}")

    print("\nCalculating ROUGE Scores...")
    rouge_scores = calculate_rouge(model, tokenizer, dataset, TEXT_COLUMN, REFERENCE_COLUMN, MAX_LENGTH)
    print(f"ROUGE Scores: {rouge_scores}")

    print("\nCalculating Diversity...")
    prompts = [dataset[i][TEXT_COLUMN] for i in range(min(10, len(dataset)))]
    diversity = calculate_diversity(model, tokenizer, dataset, text_column=TEXT_COLUMN, max_length=MAX_LENGTH)
    print(f"Diversity Score: {diversity:.4f}")

    print("\nHuman Evaluation...")
    human_evaluation = placeholder_human_evaluation()
    print(f"Human Evaluation: {human_evaluation}")


Loading dataset...

Calculating Perplexity...
Perplexity: 34.63

Calculating ROUGE Scores...


Device set to use mps:0
Device set to use mps:0
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


ROUGE Scores: {'rouge1': 0.10824971452340112, 'rouge2': 0.037257243294264844, 'rougeL': 0.08381821645458397, 'rougeLsum': 0.08472251331115399}

Calculating Diversity...
Diversity Score: 19.9600

Human Evaluation...
Human evaluation should be performed manually by assessing relevance, coherence, and empathy.
Human Evaluation: Pending human feedback
