In [1]:
import os
import torch
from datasets import load_dataset
from transformers import (
    AutoTokenizer, LlamaForCausalLM, Trainer, TrainingArguments,
    DataCollatorForLanguageModeling
)
from peft import LoraConfig, get_peft_model, TaskType

# Enable fallback for unsupported MPS operations
os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"

# -------------------------------
# Load and Preprocess Dataset
# -------------------------------
print("Loading dataset...")
dataset = load_dataset("go_emotions")

# Preprocess the dataset
def preprocess_function(examples):
    input_texts = [
        f"Emotion Analysis\nText: {text}\nLabel: {label}" 
        for text, label in zip(examples["text"], examples["labels"])
    ]
    return {"input_text": input_texts}

print("Preprocessing dataset...")
dataset = dataset.map(preprocess_function, batched=True)
dataset = dataset.remove_columns(['text', 'labels', 'id'])

# Load tokenizer
print("Loading tokenizer...")
model_name = "./llama-lora-empathetic_dialogues-finetuned-with-counsel-chat"
tokenizer = AutoTokenizer.from_pretrained(model_name)
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# Tokenize dataset
def tokenize_function(examples):
    return tokenizer(
        examples["input_text"], 
        truncation=True, 
        padding="max_length", 
        max_length=128
    )

print("Tokenizing dataset...")
tokenized_dataset = dataset.map(tokenize_function, batched=True)
tokenized_dataset = tokenized_dataset.remove_columns(["input_text"])

# Split dataset into train and evaluation
split_datasets = tokenized_dataset["train"].train_test_split(test_size=0.1)
train_dataset = split_datasets["train"]
eval_dataset = split_datasets["test"]

# Data collator for causal LM
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False, pad_to_multiple_of=8
)

# -------------------------------
# Load Model with LoRA
# -------------------------------
print("Loading model with LoRA...")
model = LlamaForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    torch_dtype=torch.float32  # Use float32 for MPS
)

# Configure LoRA
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=8,                          # Low rank for LoRA
    lora_alpha=16,                # Alpha scaling
    lora_dropout=0.1,             # Dropout for LoRA layers
    bias="none"
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

# -------------------------------
# Training Arguments
# -------------------------------
training_args = TrainingArguments(
    output_dir="./llama-lora-empathetic_dialogues-and-counsel-chat-finetuned-with-go-emotions",
    per_device_train_batch_size=2,         # Small batch size for MPS
    gradient_accumulation_steps=8,         # Simulates larger batch size
    num_train_epochs=1,
    learning_rate=5e-5,
    logging_steps=10,
    save_steps=500,
    save_total_limit=1,
    evaluation_strategy="steps",
    eval_steps=500,
    bf16=False,                            # Use float32 instead of bf16 on MPS
    fp16=False,                            # FP16 not supported on MPS
    report_to="none",
)

# -------------------------------
# Trainer
# -------------------------------
print("Initializing Trainer...")
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
)

# -------------------------------
# Train Model
# -------------------------------
print("Starting training...")
trainer.train()

# Save the fine-tuned model
print("Saving the model...")
model.save_pretrained("./llama-lora-empathetic_dialogues-and-counsel-chat-finetuned-with-go-emotionss")
tokenizer.save_pretrained("./llama-lora-empathetic_dialogues-and-counsel-chat-finetuned-with-go-emotions")
print("Training complete!")


  from .autonotebook import tqdm as notebook_tqdm


Loading dataset...
Preprocessing dataset...
Loading tokenizer...
Tokenizing dataset...


Map: 100%|██████████████████████| 43410/43410 [00:01<00:00, 35944.28 examples/s]
Map: 100%|████████████████████████| 5426/5426 [00:00<00:00, 38110.31 examples/s]
Map: 100%|████████████████████████| 5427/5427 [00:00<00:00, 37592.01 examples/s]


Loading model with LoRA...


  warn("The installed version of bitsandbytes was compiled without GPU support. "
  trainer = Trainer(


'NoneType' object has no attribute 'cadam32bit_grad_fp32'
trainable params: 851,968 || all params: 1,236,666,368 || trainable%: 0.0689
Initializing Trainer...
Starting training...


Step,Training Loss,Validation Loss
500,2.6701,2.781192
1000,2.6107,2.738399
1500,2.6293,2.713769
2000,2.7503,2.717334


Saving the model...
Training complete!


In [3]:
import os
import torch
from transformers import AutoTokenizer, LlamaForCausalLM, pipeline
import evaluate
import numpy as np
from datasets import load_dataset

# -------------------------------
# Configuration for Evaluation
# -------------------------------
MODEL_NAME = "./llama-lora-empathetic_dialogues-and-counsel-chat-finetuned-with-go-emotionss" # Path to the trained model
Tokenizer = "./llama-lora-empathetic_dialogues-and-counsel-chat-finetuned-with-go-emotions"
TEXT_COLUMN = "input_text"  # Column in the dataset with input text
REFERENCE_COLUMN = "reference"  # Column in the dataset with reference text
MAX_LENGTH = 128  # Max token length
MAX_NEW_TOKENS = 50  # Max tokens for text generation
DEVICE = torch.device("mps" if torch.backends.mps.is_available() else "cpu")

# -------------------------------
# Load Dataset for Evaluation
# -------------------------------
print("Loading dataset...")
dataset = load_dataset("go_emotions")

# Preprocess the dataset to match the training structure
def preprocess_function(examples):
    input_texts = [
        f"Emotion Analysis\nText: {text}\nLabel: {label}" 
        for text, label in zip(examples["text"], examples["labels"])
    ]
    return {"input_text": input_texts}

print("Preprocessing dataset for evaluation...")
dataset = dataset.map(preprocess_function, batched=True)
dataset = dataset.rename_column("text", "reference")
dataset = dataset.remove_columns(["labels", "id"])

# Subset for quick evaluation
eval_dataset = dataset["train"].select(range(100))

print("Processed dataset columns:", eval_dataset.column_names)

# -------------------------------
# Load Model and Tokenizer
# -------------------------------
print("Loading model for evaluation...")
tokenizer = AutoTokenizer.from_pretrained(Tokenizer)
model = LlamaForCausalLM.from_pretrained(MODEL_NAME).to(DEVICE)

# -------------------------------
# Metric 1: Perplexity
# -------------------------------
def calculate_perplexity(model, tokenizer, dataset, column, max_length):
    """Calculate perplexity of the model on the dataset."""
    model.eval()
    losses = []
    for sample in dataset[column]:
        inputs = tokenizer(sample, return_tensors="pt", truncation=True, max_length=max_length).to(DEVICE)
        with torch.no_grad():
            outputs = model(**inputs, labels=inputs["input_ids"])
            losses.append(outputs.loss.item())
    return np.exp(np.mean(losses))

# -------------------------------
# Metric 2: ROUGE
# -------------------------------
def calculate_rouge(model, tokenizer, dataset, text_column, ref_column, max_new_tokens):
    """Calculate ROUGE score for summarization or response generation tasks."""
    rouge = evaluate.load("rouge")
    predictions, references = [], []
    generator = pipeline("text-generation", model=model, tokenizer=tokenizer, device=0 if DEVICE == "cuda" else -1)
    
    for row in dataset:
        input_text = row[text_column]
        reference = row[ref_column]
        generated = generator(input_text, max_new_tokens=max_new_tokens, truncation=True, do_sample=False)[0]['generated_text']
        predictions.append(generated)
        references.append(reference)
    
    return rouge.compute(predictions=predictions, references=references)

# -------------------------------
# Metric 3: Diversity
# -------------------------------
def calculate_diversity(model, tokenizer, prompts, max_new_tokens):
    """Calculate diversity by measuring unique n-grams."""
    generator = pipeline("text-generation", model=model, tokenizer=tokenizer, device=0 if DEVICE == "cuda" else -1)
    all_outputs = []
    
    for prompt in prompts:
        outputs = generator(prompt, max_new_tokens=max_new_tokens, num_return_sequences=5, do_sample=True)
        texts = [output["generated_text"] for output in outputs]
        all_outputs.extend(texts)
    
    ngrams = [set(zip(*[text.split()[i:] for i in range(2)])) for text in all_outputs]
    total_ngrams = set.union(*ngrams)
    return len(total_ngrams) / len(all_outputs)

# -------------------------------
# Evaluation
# -------------------------------
if __name__ == "__main__":
    print("\nCalculating Perplexity...")
    perplexity = calculate_perplexity(model, tokenizer, eval_dataset, TEXT_COLUMN, MAX_LENGTH)
    print(f"Perplexity: {perplexity:.2f}")

    print("\nCalculating ROUGE Scores...")
    rouge_scores = calculate_rouge(model, tokenizer, eval_dataset, TEXT_COLUMN, REFERENCE_COLUMN, max_new_tokens=MAX_NEW_TOKENS)
    print(f"ROUGE Scores: {rouge_scores}")

    print("\nCalculating Diversity...")
    prompts = eval_dataset["input_text"][:10]  # Correctly extract input_text column
    diversity = calculate_diversity(model, tokenizer, prompts, max_new_tokens=MAX_NEW_TOKENS)
    print(f"Diversity Score: {diversity:.4f}")


Loading dataset...
Preprocessing dataset for evaluation...
Processed dataset columns: ['reference', 'input_text']
Loading model for evaluation...

Calculating Perplexity...
Perplexity: 12.89

Calculating ROUGE Scores...


Device set to use mps:0
Device set to use mps:0


ROUGE Scores: {'rouge1': 0.4609488227129698, 'rouge2': 0.43026926848917024, 'rougeL': 0.46070892177875267, 'rougeLsum': 0.46029475381189044}

Calculating Diversity...
Diversity Score: 5.1200
