In [2]:
import os
import torch
from datasets import load_dataset
from transformers import (
    AutoTokenizer, LlamaForCausalLM, Trainer, TrainingArguments,
    DataCollatorForLanguageModeling
)
from peft import LoraConfig, get_peft_model, TaskType

# Enable fallback for unsupported MPS operations
os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"

# --- Load Dataset ---
print("Loading dataset...")
dataset = load_dataset("nbertagnolli/counsel-chat")

# Fix for batched=True preprocessing
def preprocess_function(examples):
    input_texts = [
        f"Question Title: {title}\nQuestion: {text}\nAnswer: {answer}"
        for title, text, answer in zip(examples['questionTitle'], examples['questionText'], examples['answerText'])
    ]
    return {"input_text": input_texts}


# Apply preprocessing
print("Preprocessing dataset...")
dataset = dataset.map(preprocess_function, batched=True)
dataset = dataset.remove_columns(['questionID', 'questionTitle', 'questionText', 
                                  'questionLink', 'topic', 'therapistInfo', 
                                  'therapistURL', 'answerText', 'upvotes', 'views'])

# Load tokenizer
print("Loading tokenizer...")
model_name = "meta-llama/Llama-3.2-1B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# Tokenize dataset
def tokenize_function(examples):
    return tokenizer(
        examples["input_text"], 
        truncation=True, 
        padding="max_length", 
        max_length=128
    )

print("Tokenizing dataset...")
tokenized_dataset = dataset.map(tokenize_function, batched=True)
tokenized_dataset = tokenized_dataset.remove_columns(["input_text"])

# Split dataset into train and evaluation
split_datasets = tokenized_dataset["train"].train_test_split(test_size=0.1)
train_dataset = split_datasets["train"]
eval_dataset = split_datasets["test"]

# Data collator for causal LM
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False, pad_to_multiple_of=8
)

# --- Load Model with LoRA ---
print("Loading model with LoRA...")
model = LlamaForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    torch_dtype=torch.float32  # Use float32 for MPS
)

# Configure LoRA
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=8,                          # Low rank for LoRA
    lora_alpha=16,                # Alpha scaling
    lora_dropout=0.1,             # Dropout for LoRA layers
    bias="none"
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

# --- Training Arguments ---
training_args = TrainingArguments(
    output_dir="./counsel_chat_prueba_LlamaForCausalLM",
    per_device_train_batch_size=2,         # Small batch size for MPS
    gradient_accumulation_steps=8,         # Simulates larger batch size
    num_train_epochs=4,
    learning_rate=5e-5,
    logging_steps=10,
    save_steps=500,
    save_total_limit=1,
    evaluation_strategy="steps",
    eval_steps=500,
    bf16=False,                            # Use float32 instead of bf16 on MPS
    fp16=False,                            # FP16 not supported on MPS
    report_to="none",
)

# --- Trainer ---
print("Initializing Trainer...")
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
)

# --- Train Model ---
print("Starting training...")
trainer.train()

# Save the fine-tuned model
print("Saving the model...")
model.save_pretrained("./llama-lora-counsel-chat-LLamaForCausalLM")
tokenizer.save_pretrained("./llama-lora-counsel-chat-LLamaForCausalLM")
print("Training complete!")


Loading dataset...


Repo card metadata block was not found. Setting CardData to empty.


Preprocessing dataset...
Loading tokenizer...
Tokenizing dataset...
Loading model with LoRA...


  warn("The installed version of bitsandbytes was compiled without GPU support. "
  trainer = Trainer(


'NoneType' object has no attribute 'cadam32bit_grad_fp32'
trainable params: 851,968 || all params: 1,236,666,368 || trainable%: 0.0689
Initializing Trainer...
Starting training...


Step,Training Loss,Validation Loss
500,2.3443,2.411849


Saving the model...
Training complete!


In [3]:
metrics = trainer.evaluate()

In [4]:
metrics

{'eval_loss': 2.4064571857452393,
 'eval_runtime': 14.6764,
 'eval_samples_per_second': 18.942,
 'eval_steps_per_second': 2.385,
 'epoch': 3.979983987189752}

In [45]:
import os
import torch
from datasets import load_dataset
from transformers import (
    AutoTokenizer, AutoModelForCausalLM, pipeline
)
import evaluate
import numpy as np

# Enable fallback for unsupported MPS operations
os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"

# -------------------------------
# Configuration
# -------------------------------
MODEL_NAME = "./llama-lora-counsel-chat-LLamaForCausalLM"  # Replace with your trained model name
DATASET_NAME = "nbertagnolli/counsel-chat"  # Example dataset for evaluation
TEXT_COLUMN = "input_text"  # Column in dataset containing input text
REFERENCE_COLUMN = "reference"  # Column for reference text
MAX_NEW_TOKENS = 50  # Max tokens to generate
MAX_LENGTH = 128  # Max token length for evaluation
DEVICE = torch.device("mps" if torch.backends.mps.is_available() else "cpu")

# -------------------------------
# Preprocessing Dataset
# -------------------------------
def preprocess_function(examples):
    """Preprocess dataset to create input_text."""
    input_texts = [
        f"Question Title: {title}\nQuestion: {text}"
        for title, text in zip(examples['questionTitle'], examples['questionText'])
    ]
    return {"input_text": input_texts}

print("Loading dataset...")
dataset = load_dataset(DATASET_NAME)

# Preprocess and rename columns
print("Preprocessing dataset for evaluation...")
dataset = dataset.map(preprocess_function, batched=True)
dataset = dataset.rename_column("answerText", "reference")
dataset = dataset.remove_columns(['questionID', 'questionTitle', 'questionText', 
                                  'questionLink', 'topic', 'therapistInfo', 
                                  'therapistURL', 'upvotes', 'views'])

print("Processed dataset columns:", dataset["train"].column_names)

# -------------------------------
# Load Model and Tokenizer
# -------------------------------
print("Loading tokenizer and model...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME).to(DEVICE)

# -------------------------------
# Metric 1: Perplexity
# -------------------------------
def calculate_perplexity(model, tokenizer, dataset, column, max_length):
    """Calculate perplexity of the model on the dataset."""
    model.eval()
    losses = []
    for sample in dataset[column]:
        inputs = tokenizer(sample, return_tensors="pt", truncation=True, max_length=max_length).to(DEVICE)
        with torch.no_grad():
            outputs = model(**inputs, labels=inputs["input_ids"])
            loss = outputs.loss
            losses.append(loss.item())
    perplexity = np.exp(np.mean(losses))
    return perplexity

# -------------------------------
# Metric 2: ROUGE
# -------------------------------
def calculate_rouge(model, tokenizer, dataset, text_column, ref_column, max_new_tokens):
    """Calculate ROUGE score for summarization or response generation tasks."""
    rouge = evaluate.load("rouge")
    predictions, references = [], []
    generator = pipeline("text-generation", model=model, tokenizer=tokenizer, device=0 if DEVICE == "cuda" else -1)
    
    for row in dataset:
        input_text = row[text_column]
        reference = row[ref_column]
        generated = generator(input_text, max_new_tokens=max_new_tokens, truncation=True, do_sample=False)[0]['generated_text']
        predictions.append(generated)
        references.append(reference)
    
    rouge_results = rouge.compute(predictions=predictions, references=references)
    return rouge_results

# -------------------------------
# Metric 3: Diversity
# -------------------------------
def calculate_diversity(model, tokenizer, prompts, max_new_tokens):
    """Calculate diversity by measuring unique n-grams."""
    generator = pipeline("text-generation", model=model, tokenizer=tokenizer, device=0 if DEVICE == "cuda" else -1)
    all_outputs = []
    
    for prompt in prompts:
        outputs = generator(prompt, max_new_tokens=max_new_tokens, num_return_sequences=5, do_sample=True)
        texts = [output["generated_text"] for output in outputs]
        all_outputs.extend(texts)
    
    # Calculate unique n-grams
    def get_ngrams(text, n=2):
        tokens = text.split()
        return set(zip(*[tokens[i:] for i in range(n)]))
    
    ngrams = [get_ngrams(output) for output in all_outputs]
    total_ngrams = set.union(*ngrams)
    diversity_score = len(total_ngrams) / len(all_outputs)
    return diversity_score

# -------------------------------
# Metric 4: Human Evaluation (Placeholder)
# -------------------------------
def placeholder_human_evaluation():
    """Placeholder for human evaluation results."""
    print("Human evaluation should be performed manually by assessing relevance, coherence, and empathy.")
    return "Pending human feedback"

# -------------------------------
# Main Evaluation Script
# -------------------------------
if __name__ == "__main__":
    print("Using a subset for evaluation...")
    eval_dataset = dataset["train"].select(range(100))  # Use a small subset for quick evaluation

    print("\nCalculating Perplexity...")
    perplexity = calculate_perplexity(model, tokenizer, eval_dataset, TEXT_COLUMN, MAX_LENGTH)
    print(f"Perplexity: {perplexity:.2f}")

    print("\nCalculating ROUGE Scores...")
    rouge_scores = calculate_rouge(model, tokenizer, eval_dataset, TEXT_COLUMN, REFERENCE_COLUMN, max_new_tokens=MAX_NEW_TOKENS)
    print(f"ROUGE Scores: {rouge_scores}")

    print("\nCalculating Diversity...")
    prompts = [row[TEXT_COLUMN] for row in eval_dataset]
    diversity = calculate_diversity(model, tokenizer, prompts, MAX_NEW_TOKENS)
    print(f"Diversity Score: {diversity:.4f}")

    print("\nHuman Evaluation...")
    human_evaluation = placeholder_human_evaluation()
    print(f"Human Evaluation: {human_evaluation}")



Loading dataset...


Repo card metadata block was not found. Setting CardData to empty.


Preprocessing dataset for evaluation...
Processed dataset columns: ['reference', 'input_text']
Loading tokenizer and model...
Using a subset for evaluation...

Calculating Perplexity...
Perplexity: 4.66

Calculating ROUGE Scores...


Device set to use mps:0
Device set to use mps:0


ROUGE Scores: {'rouge1': 0.2360869375992431, 'rouge2': 0.04786254003324687, 'rougeL': 0.14089765223275297, 'rougeLsum': 0.16715064775597616}

Calculating Diversity...
Diversity Score: 9.9140

Human Evaluation...
Human evaluation should be performed manually by assessing relevance, coherence, and empathy.
Human Evaluation: Pending human feedback
