In [4]:
!pip install datasets
!pip install transformers
!pip install torch
!pip install scikit-learn
!pip install wandb

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (

In [9]:
import torch
import numpy as np
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    TrainingArguments,
    Trainer
)
from datasets import load_dataset
from peft import get_peft_config, get_peft_model, LoraConfig, TaskType
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Seed the RNGs before any weights are created. roberta-base ships without a
# classification head, so that head is randomly initialised; without a seed
# the "before fine-tuning" metrics change on every run.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# Check GPU availability
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load AG News dataset
dataset = load_dataset("ag_news")

# Initialize model and tokenizer
model_name = "roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=4,
    torch_dtype=torch.float32  # full precision; mixed precision is disabled in the training arguments as well
).to(device)

# LoRA hyper-parameters: rank-8 adapters (alpha 32, dropout 0.1) attached to
# the query/key/value modules, configured for sequence classification.
peft_config = LoraConfig(
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    target_modules=["query", "key", "value"],
    task_type=TaskType.SEQ_CLS,
    inference_mode=False,
)

# Wrap the base model with the adapters and print the trainable-parameter count.
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

def tokenize_function(examples):
    """Tokenize the ``text`` field of a batch.

    Every sequence is padded to, and truncated at, 128 tokens. The result of
    the module-level ``tokenizer`` call is returned unchanged.
    """
    encode_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 128,
    }
    return tokenizer(examples["text"], **encode_options)

# Columns exposed as torch tensors once the splits are formatted
model_columns = ["input_ids", "attention_mask", "label"]

# Small, reproducibly shuffled subsets: 1000 training rows and 200 test rows
train_dataset = dataset["train"].shuffle(seed=42).select(range(1000))
eval_dataset = dataset["test"].shuffle(seed=42).select(range(200))

# Batched tokenization of both subsets
tokenized_train = train_dataset.map(tokenize_function, batched=True)
tokenized_eval = eval_dataset.map(tokenize_function, batched=True)

# Switch both splits to PyTorch output, restricted to the model columns
for tokenized_split in (tokenized_train, tokenized_eval):
    tokenized_split.set_format("torch", columns=model_columns)

def compute_metrics(eval_pred):
    """Compute accuracy and weighted precision/recall/F1 for an evaluation pass.

    Args:
        eval_pred: A ``(logits, labels)`` pair; the predicted class is the
            argmax of ``logits`` over the last axis.

    Returns:
        dict with the keys ``accuracy``, ``f1``, ``precision`` and ``recall``.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    # zero_division=0 yields the same scores as the default ("warn") but does
    # not emit UndefinedMetricWarning when a class is never predicted, which
    # happens when evaluating the model before it has been trained.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels,
        predictions,
        average='weighted',
        zero_division=0,
    )
    accuracy = accuracy_score(labels, predictions)
    return {
        'accuracy': accuracy,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

# Training arguments
# Evaluation, checkpointing and logging all happen once per epoch so that
# load_best_model_at_end can pick the checkpoint with the highest accuracy.
training_args = TrainingArguments(
    output_dir="./peft_results",
    learning_rate=1e-3,
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    logging_dir='./peft_logs',
    logging_strategy="epoch",
    # `evaluation_strategy` is the deprecated spelling of this argument;
    # `eval_strategy` is the supported name (requires transformers >= 4.41).
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    push_to_hub=False,
    fp16=False,  # Disabled mixed precision training
    remove_unused_columns=False
)

# Build the Trainer from the PEFT model, the tokenized splits and the metric callback
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    eval_dataset=tokenized_eval,
    train_dataset=tokenized_train,
)

# Baseline: evaluate once before any training step has run
print("Metrics before fine-tuning:")
initial_metrics = trainer.evaluate()
print(initial_metrics)

# Run the LoRA fine-tuning
print("\nFine-tuning with LoRA...")
trainer.train()

# Evaluate again after training
print("\nMetrics after fine-tuning:")
final_metrics = trainer.evaluate()
print(final_metrics)

# Per-metric delta between the two evaluation runs
print("\nImprovement Summary:")
for metric in ('accuracy', 'f1', 'precision', 'recall'):
    metric_key = f'eval_{metric}'
    before = initial_metrics[metric_key]
    after = final_metrics[metric_key]
    print(f"{metric}: {after - before:.4f} improvement (from {before:.4f} to {after:.4f})")

# Memory footprint of the parameters, in MB: all of them vs. only the trainable ones
bytes_per_mb = 1024 * 1024
param_bytes = [
    (p.numel() * p.element_size(), p.requires_grad)
    for p in model.parameters()
]
original_size = sum(n_bytes for n_bytes, _ in param_bytes) / bytes_per_mb
trainable_size = sum(n_bytes for n_bytes, is_trainable in param_bytes if is_trainable) / bytes_per_mb

print(f"\nModel Size Comparison:")
print(f"Full model size: {original_size:.2f} MB")
print(f"Trainable parameters size: {trainable_size:.2f} MB")
print(f"Size reduction: {(1 - trainable_size/original_size)*100:.2f}%")

Using device: cuda


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 1,036,036 || all params: 125,684,744 || trainable%: 0.8243
Metrics before fine-tuning:




  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 1.3974658250808716, 'eval_model_preparation_time': 0.0337, 'eval_accuracy': 0.225, 'eval_f1': 0.12270571346540338, 'eval_precision': 0.11164750957854405, 'eval_recall': 0.225, 'eval_runtime': 0.9513, 'eval_samples_per_second': 210.244, 'eval_steps_per_second': 13.666}

Fine-tuning with LoRA...


Epoch,Training Loss,Validation Loss,Model Preparation Time,Accuracy,F1,Precision,Recall
1,0.6999,0.704658,0.0337,0.79,0.792573,0.845637,0.79
2,0.3348,0.511589,0.0337,0.85,0.850346,0.866818,0.85
3,0.2175,0.500315,0.0337,0.86,0.859845,0.871126,0.86



Metrics after fine-tuning:


{'eval_loss': 0.5003151297569275, 'eval_model_preparation_time': 0.0337, 'eval_accuracy': 0.86, 'eval_f1': 0.8598447695959499, 'eval_precision': 0.8711260269025911, 'eval_recall': 0.86, 'eval_runtime': 0.6462, 'eval_samples_per_second': 309.491, 'eval_steps_per_second': 20.117, 'epoch': 3.0}

Improvement Summary:
accuracy: 0.6350 improvement (from 0.2250 to 0.8600)
f1: 0.7371 improvement (from 0.1227 to 0.8598)
precision: 0.7595 improvement (from 0.1116 to 0.8711)
recall: 0.6350 improvement (from 0.2250 to 0.8600)

Model Size Comparison:
Full model size: 479.45 MB
Trainable parameters size: 3.95 MB
Size reduction: 99.18%
