In [None]:
!jupyter notebook --NotebookApp.iopub_data_rate_limit=1.0e10

In [1]:
import torch

print("PyTorch version:", torch.__version__)
print("MPS available:", torch.backends.mps.is_available())

PyTorch version: 2.5.1
MPS available: True


In [2]:
import os
import torch
from datasets import load_dataset, concatenate_datasets
from transformers import (
    AutoTokenizer, LlamaForCausalLM, Trainer, TrainingArguments,
    DataCollatorForLanguageModeling
)
from peft import LoraConfig, get_peft_model, TaskType

# Enable fallback for unsupported MPS operations
os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"

# -------------------------------
# Preprocess Functions for Datasets
# -------------------------------
def preprocess_go_emotions(examples):
    input_texts = [
        f"Emotion Analysis\nText: {text}\nLabel: {label}" 
        for text, label in zip(examples["text"], examples["labels"])
    ]
    return {"input_text": input_texts}

def preprocess_counsel_chat(examples):
    input_texts = [
        f"Question Title: {title}\nQuestion: {text}\nAnswer: {answer}" 
        for title, text, answer in zip(examples['questionTitle'], examples['questionText'], examples['answerText'])
    ]
    return {"input_text": input_texts}

def preprocess_empathetic_dialogues(examples):
    input_texts = [
        f"Contest: {context}\nQuestion: {prompt}\nAnswer: {utterance}" 
        for context, prompt, utterance in zip(examples['context'], examples['prompt'], examples['utterance'])
    ]
    return {"input_text": input_texts}

# -------------------------------
# Load and Preprocess Datasets
# -------------------------------
print("Loading and preprocessing datasets...")

# Load datasets
dataset_go_emotions = load_dataset("go_emotions").map(preprocess_go_emotions, batched=True).remove_columns(['text', 'labels', 'id'])
dataset_counsel_chat = load_dataset("nbertagnolli/counsel-chat").map(preprocess_counsel_chat, batched=True).remove_columns(['questionID', 'questionTitle', 'questionText', 'questionLink', 'topic', 'therapistInfo', 'therapistURL', 'answerText', 'upvotes', 'views'])
dataset_empathetic_dialogues = load_dataset("facebook/empathetic_dialogues").map(preprocess_empathetic_dialogues, batched=True).remove_columns(['conv_id', 'utterance_idx', 'speaker_idx', 'selfeval', 'tags'])

# Concatenate datasets
combined_dataset = concatenate_datasets([
    dataset_go_emotions["train"],
    dataset_counsel_chat["train"],
    dataset_empathetic_dialogues["train"]
])

# Split into training and evaluation
split_datasets = combined_dataset.train_test_split(test_size=0.1)
train_dataset = split_datasets["train"]
eval_dataset = split_datasets["test"]

print("Datasets combined and split.")

# -------------------------------
# Load Tokenizer and Model
# -------------------------------
print("Loading tokenizer...")
model_name = "meta-llama/Llama-3.2-1B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# Tokenize datasets
def tokenize_function(examples):
    return tokenizer(
        examples["input_text"], 
        truncation=True, 
        padding="max_length", 
        max_length=128
    )

print("Tokenizing datasets...")
tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True).remove_columns(["input_text"])
tokenized_eval_dataset = eval_dataset.map(tokenize_function, batched=True).remove_columns(["input_text"])

# Data collator for causal LM
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False, pad_to_multiple_of=8
)

# -------------------------------
# Load Model with LoRA
# -------------------------------
print("Loading model with LoRA...")
model = LlamaForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    torch_dtype=torch.float32  # Use float32 for MPS
)

# Configure LoRA
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=8,                          # Low rank for LoRA
    lora_alpha=16,                # Alpha scaling
    lora_dropout=0.1,             # Dropout for LoRA layers
    bias="none"
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

# -------------------------------
# Training Arguments
# -------------------------------
training_args = TrainingArguments(
    output_dir="./combined_model_Llama_3_datasets_together",
    per_device_train_batch_size=2,         # Small batch size for MPS
    gradient_accumulation_steps=8,         # Simulates larger batch size
    num_train_epochs=1,
    learning_rate=5e-5,
    logging_steps=10,
    save_steps=500,
    save_total_limit=1,
    evaluation_strategy="steps",
    eval_steps=500,
    bf16=False,                            # Use float32 instead of bf16 on MPS
    fp16=False,                            # FP16 not supported on MPS
    report_to="none",
)

# -------------------------------
# Trainer
# -------------------------------
print("Initializing Trainer...")
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_eval_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
)

# -------------------------------
# Train Model
# -------------------------------
print("Starting training...")
trainer.train()

# Save the fine-tuned model
print("Saving the model...")
model.save_pretrained("./combined_model_Llama_3_datasets_together")
tokenizer.save_pretrained("./combined_model_Llama_3_datasets_together")
print("Training complete!")


  from .autonotebook import tqdm as notebook_tqdm


Loading and preprocessing datasets...


Repo card metadata block was not found. Setting CardData to empty.


Datasets combined and split.
Loading tokenizer...
Tokenizing datasets...


Map: 100%|████████████████████| 110572/110572 [00:05<00:00, 20280.08 examples/s]
Map: 100%|██████████████████████| 12286/12286 [00:00<00:00, 19705.12 examples/s]


Loading model with LoRA...


  warn("The installed version of bitsandbytes was compiled without GPU support. "
  trainer = Trainer(


'NoneType' object has no attribute 'cadam32bit_grad_fp32'
trainable params: 851,968 || all params: 1,236,666,368 || trainable%: 0.0689
Initializing Trainer...
Starting training...


Step,Training Loss,Validation Loss
500,2.4508,2.5035
1000,2.3696,2.45635
1500,2.3868,2.434157
2000,2.3703,2.417534
2500,2.3266,2.404863
3000,2.3363,2.394717
3500,2.3491,2.387337
4000,2.3798,2.377759
4500,2.337,2.370509
5000,2.235,2.363772


Saving the model...
Training complete!


In [None]:
import os
import torch
from datasets import load_dataset, concatenate_datasets
from transformers import (
    AutoTokenizer, LlamaForCausalLM, pipeline
)
import evaluate
import numpy as np

# -------------------------------
# Configuration for Evaluation
# -------------------------------
MODEL_NAME = "./combined_model_Llama_3_datasets_together"  # Path to the trained combined model
TEXT_COLUMN = "input_text"  # Column in the dataset with input text
REFERENCE_COLUMN = "reference"  # Column in the dataset with reference text
MAX_LENGTH = 128  # Max token length
MAX_NEW_TOKENS = 50  # Max tokens for text generation
DEVICE = torch.device("mps" if torch.backends.mps.is_available() else "cpu")

# -------------------------------
# Preprocess Dataset
# -------------------------------
def preprocess_function_combined(examples):
    """Preprocess the dataset to create input_text."""
    if "labels" in examples:  # go_emotions
        input_texts = [f"Emotion Analysis\nText: {text}\nLabel: {label}" 
                       for text, label in zip(examples["text"], examples["labels"])]
    elif "questionText" in examples:  # counsel-chat
        input_texts = [f"Question Title: {title}\nQuestion: {text}" 
                       for title, text in zip(examples["questionTitle"], examples["questionText"])]
    elif "context" in examples:  # empathetic_dialogues
        input_texts = [f"Context: {context}\nQuestion: {prompt}" 
                       for context, prompt in zip(examples["context"], examples["prompt"])]
    else:
        input_texts = [""] * len(examples)  # Default empty strings for safety
    return {"input_text": input_texts}

print("Loading datasets...")
datasets = [
    {"name": "go_emotions", "data": load_dataset("go_emotions")},
    {"name": "counsel_chat", "data": load_dataset("nbertagnolli/counsel-chat")},
    {"name": "empathetic_dialogues", "data": load_dataset("facebook/empathetic_dialogues")},
]

# Preprocess and combine datasets
processed_datasets = []
for dataset_info in datasets:
    dataset_name = dataset_info["name"]
    dataset = dataset_info["data"]
    print(f"Preprocessing dataset: {dataset_name}")
    dataset = dataset["train"].map(preprocess_function_combined, batched=True)
    if "text" in dataset.column_names:
        dataset = dataset.rename_column("text", "reference")
    elif "answerText" in dataset.column_names:
        dataset = dataset.rename_column("answerText", "reference")
    elif "utterance" in dataset.column_names:
        dataset = dataset.rename_column("utterance", "reference")
    dataset = dataset.remove_columns([col for col in dataset.column_names if col not in ["input_text", "reference"]])
    processed_datasets.append(dataset)

# Concatenate all processed datasets
combined_dataset = concatenate_datasets(processed_datasets)
eval_dataset = combined_dataset  # Using the entire dataset as eval, modify as needed

print("Dataset combined and preprocessed.")

# -------------------------------
# Load Model and Tokenizer
# -------------------------------
print("Loading model and tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = LlamaForCausalLM.from_pretrained(MODEL_NAME).to(DEVICE)

# -------------------------------
# Metric 1: Perplexity
# -------------------------------
def calculate_perplexity(model, tokenizer, dataset, column, max_length):
    """Calculate perplexity of the model on the dataset."""
    model.eval()
    losses = []
    for sample in dataset[column]:
        inputs = tokenizer(sample, return_tensors="pt", truncation=True, max_length=max_length).to(DEVICE)
        with torch.no_grad():
            outputs = model(**inputs, labels=inputs["input_ids"])
            losses.append(outputs.loss.item())
    return np.exp(np.mean(losses))

# -------------------------------
# Metric 2: ROUGE
# -------------------------------
def calculate_rouge(model, tokenizer, dataset, text_column, ref_column, max_new_tokens):
    """Calculate ROUGE score for summarization or response generation tasks."""
    rouge = evaluate.load("rouge")
    predictions, references = [], []
    generator = pipeline("text-generation", model=model, tokenizer=tokenizer, device=0 if DEVICE == "cuda" else -1)
    
    for row in dataset:
        input_text = row[text_column]
        reference = row[ref_column]
        generated = generator(input_text, max_new_tokens=max_new_tokens, truncation=True, do_sample=False)[0]['generated_text']
        predictions.append(generated)
        references.append(reference)
    
    return rouge.compute(predictions=predictions, references=references)

# -------------------------------
# Metric 3: Diversity
# -------------------------------
def calculate_diversity(model, tokenizer, prompts, max_new_tokens):
    """Calculate diversity by measuring unique n-grams."""
    generator = pipeline("text-generation", model=model, tokenizer=tokenizer, device=0 if DEVICE == "cuda" else -1)
    all_outputs = []
    
    for prompt in prompts:
        outputs = generator(prompt, max_new_tokens=max_new_tokens, num_return_sequences=5, do_sample=True)
        texts = [output["generated_text"] for output in outputs]
        all_outputs.extend(texts)
    
    ngrams = [set(zip(*[text.split()[i:] for i in range(2)])) for text in all_outputs]
    total_ngrams = set.union(*ngrams)
    return len(total_ngrams) / len(all_outputs)

# -------------------------------
# Evaluation
# -------------------------------
if __name__ == "__main__":
    print("\nCalculating Perplexity...")
    perplexity = calculate_perplexity(model, tokenizer, eval_dataset, TEXT_COLUMN, MAX_LENGTH)
    print(f"Perplexity: {perplexity:.2f}")

    print("\nCalculating ROUGE Scores...")
    rouge_scores = calculate_rouge(model, tokenizer, eval_dataset, TEXT_COLUMN, REFERENCE_COLUMN, max_new_tokens=MAX_NEW_TOKENS)
    print(f"ROUGE Scores: {rouge_scores}")

    print("\nCalculating Diversity...")
    prompts = eval_dataset["input_text"][:10]  # Adjust the range as needed
    diversity = calculate_diversity(model, tokenizer, prompts, max_new_tokens=MAX_NEW_TOKENS)
    print(f"Diversity Score: {diversity:.4f}")


  from .autonotebook import tqdm as notebook_tqdm


Loading datasets...


Repo card metadata block was not found. Setting CardData to empty.


Preprocessing dataset: go_emotions
Preprocessing dataset: counsel_chat
Preprocessing dataset: empathetic_dialogues
Dataset combined and preprocessed.
Loading model and tokenizer...


  warn("The installed version of bitsandbytes was compiled without GPU support. "


'NoneType' object has no attribute 'cadam32bit_grad_fp32'

Calculating Perplexity...
Perplexity: 18.83

Calculating ROUGE Scores...


Device set to use mps:0


In [1]:
import os
import torch
from datasets import load_dataset, concatenate_datasets
from transformers import (
    AutoTokenizer, LlamaForCausalLM, pipeline
)
import evaluate
import numpy as np

# -------------------------------
# Configuration for Evaluation
# -------------------------------
MODEL_NAME = "./combined_model_Llama_3_datasets_together"  # Path to the trained combined model
TEXT_COLUMN = "input_text"  # Column in the dataset with input text
REFERENCE_COLUMN = "reference"  # Column in the dataset with reference text
MAX_LENGTH = 128  # Max token length
MAX_NEW_TOKENS = 50  # Max tokens for text generation
DEVICE = torch.device("mps" if torch.backends.mps.is_available() else "cpu")

# -------------------------------
# Preprocess Dataset
# -------------------------------
def preprocess_function_combined(examples):
    """Preprocess the dataset to create input_text."""
    if "labels" in examples:  # go_emotions
        input_texts = [f"Emotion Analysis\nText: {text}\nLabel: {label}" 
                       for text, label in zip(examples["text"], examples["labels"])]
    elif "questionText" in examples:  # counsel-chat
        input_texts = [f"Question Title: {title}\nQuestion: {text}" 
                       for title, text in zip(examples["questionTitle"], examples["questionText"])]
    elif "context" in examples:  # empathetic_dialogues
        input_texts = [f"Context: {context}\nQuestion: {prompt}" 
                       for context, prompt in zip(examples["context"], examples["prompt"])]
    else:
        input_texts = [""] * len(examples)  # Default empty strings for safety
    return {"input_text": input_texts}

print("Loading datasets...")
datasets = [
    {"name": "go_emotions", "data": load_dataset("go_emotions")},
    {"name": "counsel_chat", "data": load_dataset("nbertagnolli/counsel-chat")},
    {"name": "empathetic_dialogues", "data": load_dataset("facebook/empathetic_dialogues")},
]

# Preprocess and combine datasets
processed_datasets = []
for dataset_info in datasets:
    dataset_name = dataset_info["name"]
    dataset = dataset_info["data"]
    print(f"Preprocessing dataset: {dataset_name}")
    dataset = dataset["train"].map(preprocess_function_combined, batched=True)
    if "text" in dataset.column_names:
        dataset = dataset.rename_column("text", "reference")
    elif "answerText" in dataset.column_names:
        dataset = dataset.rename_column("answerText", "reference")
    elif "utterance" in dataset.column_names:
        dataset = dataset.rename_column("utterance", "reference")
    dataset = dataset.remove_columns([col for col in dataset.column_names if col not in ["input_text", "reference"]])
    processed_datasets.append(dataset)

# Concatenate all processed datasets
combined_dataset = concatenate_datasets(processed_datasets)

# Use a subset for quick evaluation
eval_dataset = combined_dataset.select(range(100))  # Select the first 100 samples for evaluation

print("Dataset combined and preprocessed.")

# -------------------------------
# Load Model and Tokenizer
# -------------------------------
print("Loading model and tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = LlamaForCausalLM.from_pretrained(MODEL_NAME).to(DEVICE)

# -------------------------------
# Metric 1: Perplexity
# -------------------------------
def calculate_perplexity(model, tokenizer, dataset, column, max_length):
    """Calculate perplexity of the model on the dataset."""
    model.eval()
    losses = []
    for sample in dataset[column]:
        inputs = tokenizer(sample, return_tensors="pt", truncation=True, max_length=max_length).to(DEVICE)
        with torch.no_grad():
            outputs = model(**inputs, labels=inputs["input_ids"])
            losses.append(outputs.loss.item())
    return np.exp(np.mean(losses))

# -------------------------------
# Metric 2: ROUGE
# -------------------------------
def calculate_rouge(model, tokenizer, dataset, text_column, ref_column, max_new_tokens):
    """Calculate ROUGE score for summarization or response generation tasks."""
    rouge = evaluate.load("rouge")
    predictions, references = [], []
    generator = pipeline("text-generation", model=model, tokenizer=tokenizer, device=0 if DEVICE == "cuda" else -1)
    
    for row in dataset:
        input_text = row[text_column]
        reference = row[ref_column]
        generated = generator(input_text, max_new_tokens=max_new_tokens, truncation=True, do_sample=False)[0]['generated_text']
        predictions.append(generated)
        references.append(reference)
    
    return rouge.compute(predictions=predictions, references=references)

# -------------------------------
# Metric 3: Diversity
# -------------------------------
def calculate_diversity(model, tokenizer, prompts, max_new_tokens):
    """Calculate diversity by measuring unique n-grams."""
    generator = pipeline("text-generation", model=model, tokenizer=tokenizer, device=0 if DEVICE == "cuda" else -1)
    all_outputs = []
    
    for prompt in prompts:
        outputs = generator(prompt, max_new_tokens=max_new_tokens, num_return_sequences=5, do_sample=True)
        texts = [output["generated_text"] for output in outputs]
        all_outputs.extend(texts)
    
    ngrams = [set(zip(*[text.split()[i:] for i in range(2)])) for text in all_outputs]
    total_ngrams = set.union(*ngrams)
    return len(total_ngrams) / len(all_outputs)

# -------------------------------
# Evaluation
# -------------------------------
if __name__ == "__main__":
    print("\nCalculating Perplexity...")
    perplexity = calculate_perplexity(model, tokenizer, eval_dataset, TEXT_COLUMN, MAX_LENGTH)
    print(f"Perplexity: {perplexity:.2f}")

    print("\nCalculating ROUGE Scores...")
    rouge_scores = calculate_rouge(model, tokenizer, eval_dataset, TEXT_COLUMN, REFERENCE_COLUMN, max_new_tokens=MAX_NEW_TOKENS)
    print(f"ROUGE Scores: {rouge_scores}")

    print("\nCalculating Diversity...")
    prompts = eval_dataset["input_text"][:10]  # Adjust the range as needed
    diversity = calculate_diversity(model, tokenizer, prompts, max_new_tokens=MAX_NEW_TOKENS)
    print(f"Diversity Score: {diversity:.4f}")


  from .autonotebook import tqdm as notebook_tqdm


Loading datasets...


Repo card metadata block was not found. Setting CardData to empty.


Preprocessing dataset: go_emotions
Preprocessing dataset: counsel_chat
Preprocessing dataset: empathetic_dialogues
Dataset combined and preprocessed.
Loading model and tokenizer...


  warn("The installed version of bitsandbytes was compiled without GPU support. "


'NoneType' object has no attribute 'cadam32bit_grad_fp32'

Calculating Perplexity...
Perplexity: 12.95

Calculating ROUGE Scores...


Device set to use mps:0
Device set to use mps:0


ROUGE Scores: {'rouge1': 0.4565373122239609, 'rouge2': 0.42365657314540317, 'rougeL': 0.4558910100805962, 'rougeLsum': 0.45601490239235676}

Calculating Diversity...
Diversity Score: 7.5400
