# Neural Nets on Financial Time Seris Forecasting on Text with JAX

In [7]:
#!pip install datasets
from datasets import load_dataset

In [8]:
acl_train = load_dataset("TheFinAI/flare-sm-acl", split="train")
acl_test = load_dataset("TheFinAI/flare-sm-acl", split="test")
acl_valid = load_dataset("TheFinAI/flare-sm-acl", split="valid")

acl_train_df = acl_train.to_pandas()[['gold', 'text']]
acl_valid_df = acl_valid.to_pandas()[['gold', 'text']]
acl_test_df = acl_test.to_pandas()[['gold', 'text']]

## Preprocess data

In [None]:
from sentence_transformers import SentenceTransformer
import numpy as np
model = SentenceTransformer('all-MiniLM-L6-v2') # load pre-trained model for text embedding

# Text embedding with batch processing
def batch_encode(texts, batch_size=32):
    def get_sbert_embeddings(texts):
        embeddings = model.encode(texts, convert_to_numpy=True)
        return embeddings
    embeddings_list = []
    for i in range(0, len(texts), batch_size):
        batch = texts[i:i + batch_size]
        batch_embeddings = get_sbert_embeddings(batch)
        embeddings_list.append(batch_embeddings)
    return np.vstack(embeddings_list)

In [None]:
import jax
import jax.numpy as jnp

# training data processing
texts = acl_train_df['text'].tolist()
X_train_embeddings = batch_encode(texts)
X_train_embeddings = np.array(X_train_embeddings, dtype=np.float32)
acl_train_embedded = np.concatenate([acl_train_df[['gold']], X_train_embeddings], axis=1)
acl_train_embedded_jax = jnp.array(acl_train_embedded, dtype=jnp.float32)

In [None]:
# validation data processing
valid_texts = acl_valid_df['text'].tolist()
X_valid_embeddings = batch_encode(valid_texts)
X_valid_embeddings = np.array(X_valid_embeddings, dtype=np.float32)
acl_valid_embedded = np.concatenate([acl_valid_df[['gold']], X_valid_embeddings], axis=1)
acl_valid_embedded_jax = jnp.array(acl_valid_embedded, dtype=jnp.float32)

# test data processing
test_texts = acl_test_df['text'].tolist()
X_test_embeddings = batch_encode(test_texts)
X_test_embeddings = np.array(X_test_embeddings, dtype=np.float32)
acl_test_embedded = np.concatenate([acl_test_df[['gold']], X_test_embeddings], axis=1)
acl_test_embedded_jax = jnp.array(acl_test_embedded, dtype=jnp.float32)

In [None]:
print(acl_train_embedded[:3])
print(acl_train_embedded_jax[:3])
print(acl_train_embedded.shape)
print(acl_valid_embedded.shape)

[[ 1.         -0.00104391 -0.0153452  ... -0.06044361 -0.04032699
   0.04581903]
 [ 1.         -0.01484502 -0.01614808 ... -0.05998765 -0.04143057
   0.04037762]
 [ 0.         -0.00950441 -0.01646534 ... -0.06522585 -0.03886713
   0.03963047]]
[[ 1.         -0.00104391 -0.0153452  ... -0.06044361 -0.04032699
   0.04581903]
 [ 1.         -0.01484502 -0.01614808 ... -0.05998765 -0.04143057
   0.04037762]
 [ 0.         -0.00950441 -0.01646534 ... -0.06522585 -0.03886713
   0.03963047]]
(20781, 385)
(2555, 385)


## Train the models

### 1. Logistic regression

Prepare data for training. Split training data into X and response Y.

In [None]:
y_train = acl_train_embedded[:, 0].reshape(-1, 1)
X_train = acl_train_embedded[:, 1:]
y_valid = acl_valid_embedded[:, 0].reshape(-1, 1)
X_valid = acl_valid_embedded[:, 1:]

Define functions

In [None]:
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression

# Train logistic regression model
maxiter = 30
lr_model = LogisticRegression(max_iter=maxiter, random_state=42)
lr_model.fit(X_train, y_train)

# Evaluate on validation set
y_pred = lr_model.predict(X_valid)

# Print performance metrics
print("\nLogistic Regression Results:")
print(f"Validation Accuracy: {lr_model.score(X_valid, y_valid):.10f}")

  y = column_or_1d(y, warn=True)



Logistic Regression Results:
Validation Accuracy: 0.4845401174


### 2. Train a MLP with JAX

In [None]:
import jax
import jax.numpy as jnp
from jax import random
from typing import List, Tuple, Any
import optax  # For Adam optimizer
from functools import partial

# Prepare data for modelling
y_train = acl_train_embedded_jax[:, 0].reshape(-1, 1)
X_train = acl_train_embedded_jax[:, 1:]
y_valid = acl_valid_embedded_jax[:, 0].reshape(-1, 1)
X_valid = acl_valid_embedded_jax[:, 1:]
input_dim = X_train.shape[1]  # Get input dimension from training data

In [None]:
# Define MLP in JAX
def init_mlp_params(layer_sizes: List[int], key: Any) -> List[Tuple[jnp.ndarray, jnp.ndarray]]:
    params = []
    keys = random.split(key, len(layer_sizes))
    for in_dim, out_dim, k in zip(layer_sizes[:-1], layer_sizes[1:], keys):
        w_key, b_key = random.split(k)
        W = random.normal(w_key, (in_dim, out_dim)) * jnp.sqrt(2. / in_dim)
        b = jnp.zeros((out_dim,))
        params.append((W, b))
    return params

def mlp_forward(params: List[Tuple[jnp.ndarray, jnp.ndarray]], x: jnp.ndarray, dropout_rate: float = 0.0,
               train: bool = False, key: Any = None) -> jnp.ndarray:
    """Forward pass with dropout support"""
    for i, (W, b) in enumerate(params[:-1]):
        x = jnp.dot(x, W) + b
        x = jax.nn.relu(x)

        # Apply dropout during training
        if train and dropout_rate > 0:
            if key is None:
                raise ValueError("Random key required for dropout")
            dropout_key = random.fold_in(key, i)  # Different key for each layer
            mask = random.bernoulli(dropout_key, p=1-dropout_rate, shape=x.shape)
            x = x * mask / (1 - dropout_rate)  # Scale to maintain expected value

    W_last, b_last = params[-1]
    logits = jnp.dot(x, W_last) + b_last
    return logits

# Loss and training step
def binary_cross_entropy_loss(logits: jnp.ndarray, labels: jnp.ndarray) -> jnp.ndarray:
    preds = jax.nn.sigmoid(logits)
    return -jnp.mean(labels * jnp.log(preds + 1e-7) + (1 - labels) * jnp.log(1 - preds + 1e-7))

# Improved training step with Adam optimizer
@partial(jax.jit, static_argnums=(4, 5))
def train_step(params, X_batch, y_batch, opt_state, dropout_rate=0.2, train=True):
    """Single training step with Adam optimizer and dropout"""
    key = random.PRNGKey(0)  # For reproducibility

    def loss_fn(p):
        logits = mlp_forward(p, X_batch, dropout_rate=dropout_rate, train=train, key=key)
        return binary_cross_entropy_loss(logits, y_batch)

    loss, grads = jax.value_and_grad(loss_fn)(params)
    updates, new_opt_state = optimizer.update(grads, opt_state)
    new_params = optax.apply_updates(params, updates)
    return new_params, new_opt_state, loss

In [None]:
# Hyperparameter tuning
def tune_hyperparameters():
    best_accuracy = 0.0
    best_params = None
    best_config = {}

    # Define hyperparameter search space
    learning_rates = [1e-4]
    hidden_layer_configs = [
        [128, 64]
    ]
    dropout_rates = [0.0, 0.2, 0.3, 0.4, 0.5, 0.6, .7, .8, .9]
    batch_sizes = [32]  # For mini-batch training

    results = []

    for lr in learning_rates:
        for hidden_layers in hidden_layer_configs:
            for dropout_rate in dropout_rates:
                for batch_size in batch_sizes:
                    print(f"\nTrying: lr={lr}, layers={hidden_layers}, dropout={dropout_rate}, batch_size={batch_size}")

                    # Initialize model
                    key = random.PRNGKey(42)
                    layer_sizes = [input_dim] + hidden_layers + [1]
                    params = init_mlp_params(layer_sizes, key)

                    # Initialize optimizer
                    global optimizer  # Make it accessible in train_step
                    optimizer = optax.adam(learning_rate=lr)
                    opt_state = optimizer.init(params)

                    # Mini-batch training
                    num_batches = max(1, len(X_train) // batch_size)

                    for epoch in range(50):  # Fewer epochs for tuning
                        # Shuffle data
                        perm = random.permutation(key, len(X_train))
                        key = random.fold_in(key, epoch)  # Update key for next epoch

                        # Mini-batch updates
                        total_loss = 0.0
                        for i in range(num_batches):
                            batch_idx = perm[i * batch_size:(i + 1) * batch_size]
                            X_batch = X_train[batch_idx]
                            y_batch = y_train[batch_idx]

                            params, opt_state, loss = train_step(
                                params, X_batch, y_batch, opt_state,
                                dropout_rate=dropout_rate, train=True
                            )
                            total_loss += loss

                        avg_loss = total_loss / num_batches
                        if epoch % 10 == 0:
                            print(f"Epoch {epoch+1} | Avg Loss: {avg_loss:.4f}")

                    # Evaluate on validation set
                    val_accuracy = evaluate(params, X_valid, y_valid, dropout_rate=0.0, train=False)
                    print(f"Validation Accuracy: {val_accuracy:.4f}")

                    # Record result
                    config = {
                        'learning_rate': lr,
                        'hidden_layers': hidden_layers,
                        'dropout_rate': dropout_rate,
                        'batch_size': batch_size,
                        'val_accuracy': val_accuracy
                    }
                    results.append(config)

                    # Update best model
                    if val_accuracy > best_accuracy:
                        best_accuracy = val_accuracy
                        best_params = params
                        best_config = config

    print("\n=== Hyperparameter Tuning Results ===")
    for i, res in enumerate(sorted(results, key=lambda x: x['val_accuracy'], reverse=True)):
        print(f"{i+1}. Accuracy: {res['val_accuracy']:.4f} - LR: {res['learning_rate']}, "
              f"Layers: {res['hidden_layers']}, Dropout: {res['dropout_rate']}, "
              f"Batch Size: {res['batch_size']}")

    print(f"\nBest Configuration: {best_config}")
    return best_params, best_config

# Modified evaluation function for hyperparameter tuning
def evaluate(params, X, y, dropout_rate=0.0, train=False) -> float:
    """Evaluate model accuracy with optional dropout"""
    key = random.PRNGKey(99) if train else None
    logits = mlp_forward(params, X, dropout_rate=dropout_rate, train=train, key=key)
    preds = jax.nn.sigmoid(logits)
    binary_preds = (preds > 0.5).astype(jnp.float32)
    accuracy = jnp.mean(binary_preds == y)
    return float(accuracy)

In [None]:
# Run hyperparameter tuning
print("\n=== Starting Hyperparameter Tuning ===")
best_params, best_config = tune_hyperparameters()

# Train final model with best hyperparameters
print("\n=== Training Final Model with Best Hyperparameters ===")
key = random.PRNGKey(0)
layer_sizes = [input_dim] + best_config['hidden_layers'] + [1]
params = init_mlp_params(layer_sizes, key)

# Initialize optimizer with best learning rate
optimizer = optax.adam(learning_rate=best_config['learning_rate'])
opt_state = optimizer.init(params)


=== Starting Hyperparameter Tuning ===

Trying: lr=0.0001, layers=[128, 64], dropout=0.0, batch_size=32
Epoch 1 | Avg Loss: 0.6912
Epoch 11 | Avg Loss: 0.6760
Epoch 21 | Avg Loss: 0.6739
Epoch 31 | Avg Loss: 0.6727
Epoch 41 | Avg Loss: 0.6715
Validation Accuracy: 0.4975

Trying: lr=0.0001, layers=[128, 64], dropout=0.2, batch_size=32
Epoch 1 | Avg Loss: 0.6920
Epoch 11 | Avg Loss: 0.6777
Epoch 21 | Avg Loss: 0.6752
Epoch 31 | Avg Loss: 0.6739
Epoch 41 | Avg Loss: 0.6730
Validation Accuracy: 0.5045

Trying: lr=0.0001, layers=[128, 64], dropout=0.3, batch_size=32
Epoch 1 | Avg Loss: 0.6922
Epoch 11 | Avg Loss: 0.6781
Epoch 21 | Avg Loss: 0.6756
Epoch 31 | Avg Loss: 0.6745
Epoch 41 | Avg Loss: 0.6734
Validation Accuracy: 0.5072

Trying: lr=0.0001, layers=[128, 64], dropout=0.4, batch_size=32
Epoch 1 | Avg Loss: 0.6926
Epoch 11 | Avg Loss: 0.6788
Epoch 21 | Avg Loss: 0.6768
Epoch 31 | Avg Loss: 0.6752
Epoch 41 | Avg Loss: 0.6738
Validation Accuracy: 0.5025

Trying: lr=0.0001, layers=[128,

In [None]:
# Training with mini-batches
num_epochs = 100
batch_size = best_config['batch_size']
num_batches = max(1, len(X_train) // batch_size)

for epoch in range(num_epochs):
    # Shuffle data
    perm = random.permutation(key, len(X_train))
    key = random.fold_in(key, epoch)

    # Mini-batch updates
    total_loss = 0.0
    for i in range(num_batches):
        batch_idx = perm[i * batch_size:(i + 1) * batch_size]
        X_batch = X_train[batch_idx]
        y_batch = y_train[batch_idx]

        params, opt_state, loss = train_step(
            params, X_batch, y_batch, opt_state,
            dropout_rate=best_config['dropout_rate'], train=True
        )
        total_loss += loss

    avg_loss = total_loss / num_batches
    if epoch % 10 == 0 or epoch == num_epochs - 1:
        train_acc = evaluate(params, X_train, y_train, dropout_rate=0.0, train=False)
        val_acc = evaluate(params, X_valid, y_valid, dropout_rate=0.0, train=False)
        print(f"Epoch {epoch+1} | Loss: {avg_loss:.4f} | Train Acc: {train_acc:.4f} | Val Acc: {val_acc:.4f}")

Epoch 1 | Loss: 0.6973 | Train Acc: 0.5278 | Val Acc: 0.4669
Epoch 11 | Loss: 0.6830 | Train Acc: 0.5286 | Val Acc: 0.4908
Epoch 21 | Loss: 0.6801 | Train Acc: 0.5248 | Val Acc: 0.5311
Epoch 31 | Loss: 0.6792 | Train Acc: 0.5377 | Val Acc: 0.5119
Epoch 41 | Loss: 0.6784 | Train Acc: 0.5381 | Val Acc: 0.5292
Epoch 51 | Loss: 0.6776 | Train Acc: 0.5284 | Val Acc: 0.5432
Epoch 61 | Loss: 0.6767 | Train Acc: 0.5268 | Val Acc: 0.5495
Epoch 71 | Loss: 0.6763 | Train Acc: 0.5474 | Val Acc: 0.5374
Epoch 81 | Loss: 0.6756 | Train Acc: 0.5254 | Val Acc: 0.5487
Epoch 91 | Loss: 0.6751 | Train Acc: 0.5418 | Val Acc: 0.5374
Epoch 100 | Loss: 0.6749 | Train Acc: 0.5270 | Val Acc: 0.5499


In [None]:
# Final evaluation
final_accuracy = evaluate(params, X_valid, y_valid, dropout_rate=0.0, train=False)
print(f"\nFinal Validation Accuracy: {final_accuracy:.10f}")


Final Validation Accuracy: 0.5499022007


### 3. Bagging

In [None]:
y_train = acl_train_embedded[:, 0].reshape(-1, 1)
X_train = acl_train_embedded[:, 1:]
y_valid = acl_valid_embedded[:, 0].reshape(-1, 1)
X_valid = acl_valid_embedded[:, 1:]

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.utils import resample
import numpy as np

# List of classifiers to use
classifiers = [
    LogisticRegression(),
    SVC(kernel='linear'),
    RandomForestClassifier(n_estimators=100),
    KNeighborsClassifier(n_neighbors=5)
]

In [None]:
# Number of bootstrap samples
n_bootstrap = 10

# Create an empty list to store models
trained_classifiers = []

# Generate bootstrap samples and train each model
for clf in classifiers:
    clf_bootstrap_models = []
    for _ in range(n_bootstrap):
        # Generate bootstrap sample (with replacement)
        X_resampled, y_resampled = resample(X_train, y_train, n_samples=X_train.shape[0], random_state=1000)
        clf_clone = clf.__class__()  # Create a fresh clone of the classifier
        clf_clone.fit(X_resampled, y_resampled)
        clf_bootstrap_models.append(clf_clone)

    # Append the trained bootstrap models to the list of classifiers
    trained_classifiers.append(clf_bootstrap_models)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  re

In [None]:
# Function to aggregate predictions using majority voting
def bagging_predict(X):
    predictions = []

    for clf_bootstrap_models in trained_classifiers:
        clf_preds = np.zeros((X.shape[0], len(clf_bootstrap_models)))

        for idx, model in enumerate(clf_bootstrap_models):
            clf_preds[:, idx] = model.predict(X)

        # Average predictions for each classifier group
        avg_pred = np.mean(clf_preds, axis=1)
        # Convert to binary predictions using threshold
        binary_pred = (avg_pred >= 0.5).astype(int)
        predictions.append(binary_pred)

    # Final ensemble prediction
    final_pred = np.mean(predictions, axis=0)
    return (final_pred >= 0.5).astype(int)

# Apply bagging prediction
final_predictions = bagging_predict(X_valid)

# Evaluate final predictions (accuracy)
accuracy = np.mean(final_predictions == y_valid)
print(f"Bagging Accuracy: {accuracy:.10f}")

Bagging Accuracy: 0.5353250026


### 4. Fine-tuned LLM *FinBert* with DoRA

In [1]:
!pip install datasets scikit-learn peft numpy torch
!pip install accelerate==0.17.0

Collecting accelerate>=0.21.0 (from peft)
  Downloading accelerate-1.6.0-py3-none-any.whl.metadata (19 kB)
Downloading accelerate-1.6.0-py3-none-any.whl (354 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m354.7/354.7 kB[0m [31m21.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: accelerate
  Attempting uninstall: accelerate
    Found existing installation: accelerate 0.17.0
    Uninstalling accelerate-0.17.0:
      Successfully uninstalled accelerate-0.17.0
Successfully installed accelerate-1.6.0
Collecting accelerate==0.17.0
  Using cached accelerate-0.17.0-py3-none-any.whl.metadata (16 kB)
Using cached accelerate-0.17.0-py3-none-any.whl (212 kB)
Installing collected packages: accelerate
  Attempting uninstall: accelerate
    Found existing installation: accelerate 1.6.0
    Uninstalling accelerate-1.6.0:
      Successfully uninstalled accelerate-1.6.0
[31mERROR: pip's dependency resolver does not currently take into account all the packages th

In [None]:
!pip install transformers==4.37.2
!pip install peft==0.10.0
# version debugging referring to https://stackoverflow.com/questions/79273647/cannot-import-name-encoderdecodercache-from-transformers

In [9]:
from transformers import BertTokenizer, BertForSequenceClassification
from peft import LoraConfig, get_peft_model, TaskType
import torch
from torch import nn

In [42]:
# Load the pre-trained 'ProsusAI/finbert' model and tokenizer
model_name = 'ProsusAI/finbert'
tokenizer = BertTokenizer.from_pretrained(model_name)

finbert = BertForSequenceClassification.from_pretrained(model_name)  # Binary classification
finbert.config.num_labels = 2
finbert.num_labels = 2

original_finbert = finbert



In [48]:
# Add dropout to the classifier for better regularization

# First apply the base model modifications
dropout_prob = 0.5
finbert.classifier = nn.Sequential(
    nn.Dropout(dropout_prob),
    nn.Linear(finbert.config.hidden_size, finbert.config.num_labels)
)

# Configure PEFT with LoRA specifically for binary classification
peft_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    r=8,
    lora_alpha=32,
    lora_dropout=0.5,
    bias="none",
    target_modules=["query", "key", "value", "output.dense"],
)

# Wrap the model with LoRA
finbert = get_peft_model(finbert, peft_config)

# Manually ensure the classifier parameters are trainable
for param in finbert.classifier.parameters():
    param.requires_grad = True

finbert.print_trainable_parameters()

trainable params: 963,078 || all params: 110,446,856 || trainable%: 0.8719831735183118


### Setup data loader

In [49]:
from datasets import Dataset

acl_train_ds = Dataset.from_pandas(acl_train_df)
acl_valid_ds = Dataset.from_pandas(acl_valid_df)

def tokenize_function(example):
    return tokenizer(
        example["text"],
        padding=True,            # Padding handled dynamically by the DataCollator
        truncation=True,
        max_length=128
    )

acl_train_token = acl_train_ds.map(tokenize_function, batched=True)
acl_valid_token = acl_valid_ds.map(tokenize_function, batched=True)

acl_train_token = acl_train_token.rename_column("gold", "label")
acl_valid_token = acl_valid_token.rename_column("gold", "label")

acl_train_token.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])
acl_valid_token.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])

Map:   0%|          | 0/20781 [00:00<?, ? examples/s]

Map:   0%|          | 0/2555 [00:00<?, ? examples/s]

In [50]:
from torch.utils.data import DataLoader
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="pt")

train_dataloader = DataLoader(acl_train_token, batch_size=16, shuffle=True, collate_fn=data_collator)
valid_dataloader = DataLoader(acl_valid_token, batch_size=16, shuffle=False, collate_fn=data_collator)

### Train with torch

In [51]:
import torch
from torch import nn
from transformers import AdamW
from tqdm import tqdm
from transformers import get_scheduler

# setup device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
finbert.to(device)
print(device)

# set up optimizer
optimizer = AdamW(finbert.parameters(), lr=2e-5, weight_decay=0.1)

# set up loss function
class_weights = torch.tensor([1.0, 1.0], device=device)  # adjust per class stats
loss_fn = nn.CrossEntropyLoss(weight=class_weights)

# Add early stopping mechanism
best_val_accuracy = 0
patience = 2
early_stop_counter = 0
best_model_state = None

# Add learning rate scheduler for better convergence
num_epochs = 5  # Increase epochs since we have early stopping
lr_scheduler = get_scheduler(
    "cosine",  # Use cosine schedule for better convergence
    optimizer=optimizer,
    num_warmup_steps=int(0.1 * len(train_dataloader) * num_epochs),  # 10% warmup
    num_training_steps=len(train_dataloader) * num_epochs
)

cuda




In [52]:
# torch training loop with regularization techniques
for epoch in range(num_epochs):
    finbert.train()
    total_loss = 0

    for batch in tqdm(train_dataloader, desc=f"Epoch {epoch + 1}"):
        batch = {k: v.to(device) for k, v in batch.items()}

        # Enable dropout during training (it's enabled by default in train mode)
        outputs = finbert(**batch)
        loss = outputs.loss

        # Add L2 regularization if needed
        # for param in finbert.parameters():
        #    loss += 0.01 * torch.sum(param ** 2)

        total_loss += loss.item()

        loss.backward()

        # Gradient clipping to prevent exploding gradients
        torch.nn.utils.clip_grad_norm_(finbert.parameters(), max_norm=1.0)

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

    avg_train_loss = total_loss / len(train_dataloader)
    print(f"Epoch {epoch + 1} - Avg training loss: {avg_train_loss:.4f}")

    # Validation
    finbert.eval()
    correct, total = 0, 0
    val_loss = 0

    with torch.no_grad():
        for batch in valid_dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = finbert(**batch)
            val_loss += outputs.loss.item()
            predictions = torch.argmax(outputs.logits, dim=-1)
            correct += (predictions == batch["labels"]).sum().item()
            total += batch["labels"].size(0)

    val_accuracy = correct / total
    avg_val_loss = val_loss / len(valid_dataloader)
    print(f"Validation Loss: {avg_val_loss:.4f}, Accuracy: {val_accuracy:.4f}")

    # Early stopping check
    if val_accuracy > best_val_accuracy:
        best_val_accuracy = val_accuracy
        early_stop_counter = 0
        # Save best model state
        best_model_state = {k: v.cpu().clone() for k, v in finbert.state_dict().items()}
    else:
        early_stop_counter += 1

    if early_stop_counter >= patience:
        print(f"Early stopping triggered after {epoch + 1} epochs")
        break

Epoch 1: 100%|██████████| 1299/1299 [03:10<00:00,  6.81it/s]


Epoch 1 - Avg training loss: 0.7119
Validation Loss: 0.6930, Accuracy: 0.5182


Epoch 2: 100%|██████████| 1299/1299 [03:10<00:00,  6.82it/s]


Epoch 2 - Avg training loss: 0.6973
Validation Loss: 0.6953, Accuracy: 0.4869


Epoch 3: 100%|██████████| 1299/1299 [03:10<00:00,  6.82it/s]


Epoch 3 - Avg training loss: 0.6887
Validation Loss: 0.6970, Accuracy: 0.4814
Early stopping triggered after 3 epochs


In [53]:
# Load the best model state before saving
if best_model_state is not None:
    finbert.load_state_dict(best_model_state)
    print(f"Loaded best model with validation accuracy: {best_val_accuracy:.10f}")

# save the fine-tuned model
finbert.save_pretrained("finbert-lora-semantic")
tokenizer.save_pretrained("finbert-lora-semantic")

Loaded best model with validation accuracy: 0.5181996086


('finbert-lora-semantic/tokenizer_config.json',
 'finbert-lora-semantic/special_tokens_map.json',
 'finbert-lora-semantic/vocab.txt',
 'finbert-lora-semantic/added_tokens.json')

In [None]:
# Evaluate the original FinBERT (before LoRA fine-tuning)

# Evaluation function for models
def evaluate_model(model, dataloader, device):
    model.eval()
    correct, total = 0, 0
    val_loss = 0

    with torch.no_grad():
        for batch in dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            val_loss += outputs.loss.item()
            predictions = torch.argmax(outputs.logits, dim=-1)
            correct += (predictions == batch["labels"]).sum().item()
            total += batch["labels"].size(0)

    accuracy = correct / total
    avg_loss = val_loss / len(dataloader)
    return accuracy, avg_loss

In [56]:
# Evaluate original FinBERT
print("\n=== Evaluating Original FinBERT Model ===")
original_finbert.to(device)
original_accuracy, original_loss = evaluate_model(original_finbert, valid_dataloader, device)
print(f"Original FinBERT - Validation Loss: {original_loss:.4f}, Accuracy: {original_accuracy:.10f}")


=== Evaluating Original FinBERT Model ===
Original FinBERT - Validation Loss: 0.6930, Accuracy: 0.5181996086
