# Full Fine-tuning: DistilBERT on Financial PhraseBank

This notebook implements full fine-tuning of DistilBERT for sentiment classification on financial news.

In [1]:
import json

import numpy as np
import torch
from datasets import load_from_disk
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Read the three preprocessed splits back from disk (train, validation, test order)
train_dataset, val_dataset, test_dataset = (
    load_from_disk(split_path)
    for split_path in (
        'processed_data/train',
        'processed_data/validation',
        'processed_data/test',
    )
)

# Read the label <-> id mappings stored next to the splits
with open('processed_data/label_mappings.json', 'r') as mappings_file:
    label_mappings = json.load(mappings_file)

# JSON object keys are always strings, so the ids are cast back to int here
id2label = {
    int(raw_id): label_name
    for raw_id, label_name in label_mappings['id2label'].items()
}
label2id = label_mappings['label2id']

In [4]:
# Load the pretrained DistilBERT encoder with a fresh classification head.
# The number of output classes is derived from the label mapping instead of
# being hard-coded, so the head size can never disagree with id2label/label2id.
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id
)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
# Verify ALL parameters require gradients (full fine-tuning)
# Collect (element count, requires_grad) once per parameter tensor.
param_counts = [(p.numel(), p.requires_grad) for p in model.parameters()]
total = sum(count for count, _ in param_counts)
trainable = sum(count for count, needs_grad in param_counts if needs_grad)
print(f"Trainable params: {trainable:,} / {total:,} ({100*trainable/total:.1f}%)")

# Actually enforce the claim printed below: fail loudly if any parameter is frozen.
assert trainable == total, (
    f"{total - trainable:,} parameters are frozen; full fine-tuning expects all to be trainable"
)
print("All parameters will be updated during training (Full Fine-tuning)")

Trainable params: 66,955,779 / 66,955,779 (100.0%)
All parameters will be updated during training (Full Fine-tuning)


In [7]:
# Define metrics computation
def compute_metrics(eval_pred):
    """Compute classification metrics from an evaluation prediction pair.

    Args:
        eval_pred: A 2-item iterable ``(predictions, labels)``. The class
            scores in ``predictions`` are reduced with ``argmax`` over axis 1
            to obtain the predicted class id for each example.

    Returns:
        dict with keys ``accuracy``, ``f1_macro``, ``f1_weighted``,
        ``precision`` (macro-averaged) and ``recall`` (macro-averaged).
    """
    scores, labels = eval_pred
    predicted = np.argmax(scores, axis=1)

    # zero_division=0 pins the score to 0 for a class that is never predicted
    # (or never present). The value matches sklearn's default, but without the
    # UndefinedMetricWarning flooding the training log.
    return {
        'accuracy': accuracy_score(labels, predicted),
        'f1_macro': f1_score(labels, predicted, average='macro', zero_division=0),
        'f1_weighted': f1_score(labels, predicted, average='weighted', zero_division=0),
        'precision': precision_score(labels, predicted, average='macro', zero_division=0),
        'recall': recall_score(labels, predicted, average='macro', zero_division=0)
    }

In [8]:
# Training configuration, collected in one mapping and then expanded into
# TrainingArguments.
training_config = {
    # Where checkpoints and trainer outputs go
    "output_dir": "./full_finetuning_results",
    # Evaluate and checkpoint once per epoch
    "eval_strategy": "epoch",
    "save_strategy": "epoch",
    # Optimisation hyperparameters
    "learning_rate": 2e-5,
    "per_device_train_batch_size": 16,
    "per_device_eval_batch_size": 16,
    "num_train_epochs": 3,
    "weight_decay": 0.01,
    "logging_steps": 50,
    # Model selection: the best checkpoint is chosen by macro F1
    "load_best_model_at_end": True,
    "metric_for_best_model": "f1_macro",
    "save_total_limit": 2,
    "seed": 42,
}
training_args = TrainingArguments(**training_config)

In [9]:
# Wire the model, training arguments, datasets and metric function into a Trainer.
# NOTE(review): no tokenizer or data collator is passed, so batching relies on the
# Trainer's defaults; presumably the datasets on disk are already tokenized and
# padded. Confirm against the preprocessing notebook.
trainer_kwargs = {
    "model": model,
    "args": training_args,
    "train_dataset": train_dataset,
    "eval_dataset": val_dataset,
    "compute_metrics": compute_metrics,
}
trainer = Trainer(**trainer_kwargs)

In [10]:
# Full fine-tuning: run the training loop on the Trainer built above and keep
# the object returned by train() in train_result.
train_result = trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro,F1 Weighted,Precision,Recall
1,0.4786,0.412099,0.82268,0.809718,0.826223,0.787439,0.844868
2,0.2716,0.374618,0.849485,0.843277,0.850128,0.829772,0.859047
3,0.1695,0.409224,0.847423,0.839691,0.847168,0.829762,0.851422




In [11]:
# Score the model on the Trainer's default eval dataset (the validation split)
val_results = trainer.evaluate()

# Build the whole report first, then emit it with a single print
val_report = ["Validation Results:"]
val_report.extend(f"  {metric}: {score:.4f}" for metric, score in val_results.items())
print("\n".join(val_report))



Validation Results:
  eval_loss: 0.3746
  eval_accuracy: 0.8495
  eval_f1_macro: 0.8433
  eval_f1_weighted: 0.8501
  eval_precision: 0.8298
  eval_recall: 0.8590
  eval_runtime: 10.1373
  eval_samples_per_second: 47.8430
  eval_steps_per_second: 3.0580
  epoch: 3.0000


In [12]:
# Score the model on the held-out test split
test_results = trainer.evaluate(test_dataset)

# One print call: header first, then one indented "name: value" line per metric
print(
    "\nTest Results:",
    *(f"  {metric}: {score:.4f}" for metric, score in test_results.items()),
    sep="\n",
)




Test Results:
  eval_loss: 0.3490
  eval_accuracy: 0.8639
  eval_f1_macro: 0.8465
  eval_f1_weighted: 0.8651
  eval_precision: 0.8278
  eval_recall: 0.8710
  eval_runtime: 10.2386
  eval_samples_per_second: 47.3700
  eval_steps_per_second: 3.0280
  epoch: 3.0000


In [13]:
# Get predictions for confusion matrix
predictions = trainer.predict(test_dataset)
pred_labels = np.argmax(predictions.predictions, axis=1)
true_labels = predictions.label_ids

# Fix the class order explicitly. Relying on list(id2label.values()) only works
# when the dict happens to be inserted in ascending id order; building the names
# from sorted ids keeps each name attached to the right class id.
label_ids = sorted(id2label)
label_names = [id2label[label_id] for label_id in label_ids]

# Confusion matrix (sklearn convention: rows are true labels, columns are
# predicted labels, both in label_ids order)
cm = confusion_matrix(true_labels, pred_labels, labels=label_ids)
print("\nConfusion Matrix:")
print(cm)

print("\nClassification Report:")
print(classification_report(
    true_labels,
    pred_labels,
    labels=label_ids,
    target_names=label_names,
))




Confusion Matrix:
[[ 55   1   4]
 [ 14 251  23]
 [  4  20 113]]

Classification Report:
              precision    recall  f1-score   support

    negative       0.75      0.92      0.83        60
     neutral       0.92      0.87      0.90       288
    positive       0.81      0.82      0.82       137

    accuracy                           0.86       485
   macro avg       0.83      0.87      0.85       485
weighted avg       0.87      0.86      0.87       485



In [14]:
# Persist the fine-tuned model
trainer.save_model("./full_finetuning_model")

# Bundle the validation and test metrics and write them as indented JSON
results_summary = dict(validation=val_results, test=test_results)

with open('./full_finetuning_results/results_summary.json', 'w') as summary_file:
    json.dump(results_summary, summary_file, indent=2)