# Stock price forecasting

Load three benchmark datasets, [BigData22](https://huggingface.co/datasets/TheFinAI/flare-sm-bigdata), [ACL18](https://huggingface.co/datasets/TheFinAI/flare-sm-acl), and [CIKM18](https://huggingface.co/datasets/TheFinAI/flare-sm-cikm) from [PIXIU](https://github.com/The-FinAI/PIXIU?tab=readme-ov-file) the for stock price forecasting.

In [3]:
#!pip install datasets
#!pip install transformers
#!pip install scikit-learn

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m26.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.12.0-py3-none-any.

In [4]:
from datasets import load_dataset
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report, confusion_matrix

In [5]:
bigdata_train = load_dataset("TheFinAI/flare-sm-bigdata", split="train")
acl_train = load_dataset("TheFinAI/flare-sm-acl", split="train")
cikm_train = load_dataset("TheFinAI/flare-sm-cikm", split="train")

bigdata_valid = load_dataset("TheFinAI/flare-sm-bigdata", split="validation")
acl_valid = load_dataset("TheFinAI/flare-sm-acl", split="valid")
cikm_valid = load_dataset("TheFinAI/flare-sm-cikm", split="valid")

bigdata_test = load_dataset("TheFinAI/flare-sm-bigdata", split="test")
acl_test = load_dataset("TheFinAI/flare-sm-acl", split="test")
cikm_test = load_dataset("TheFinAI/flare-sm-cikm", split="test")

(…)-00000-of-00001-4c97651cf23a4342.parquet:   0%|          | 0.00/9.78M [00:00<?, ?B/s]

(…)-00000-of-00001-7ba518568ea39642.parquet:   0%|          | 0.00/433k [00:00<?, ?B/s]

(…)-00000-of-00001-bcbe082671cc1fdb.parquet:   0%|          | 0.00/792k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/4897 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/798 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1472 [00:00<?, ? examples/s]

README.md:   0%|          | 0.00/653 [00:00<?, ?B/s]

(…)-00000-of-00001-24d52140a30ef03c.parquet:   0%|          | 0.00/34.8M [00:00<?, ?B/s]

(…)-00000-of-00001-9e63b9de85b2453a.parquet:   0%|          | 0.00/6.92M [00:00<?, ?B/s]

(…)-00000-of-00001-7ec206eb036ab81e.parquet:   0%|          | 0.00/4.64M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/20781 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3720 [00:00<?, ? examples/s]

Generating valid split:   0%|          | 0/2555 [00:00<?, ? examples/s]

README.md:   0%|          | 0.00/651 [00:00<?, ?B/s]

(…)-00000-of-00001-f71a7dda3fae0889.parquet:   0%|          | 0.00/13.3M [00:00<?, ?B/s]

(…)-00000-of-00001-e1663a0932037903.parquet:   0%|          | 0.00/4.15M [00:00<?, ?B/s]

(…)-00000-of-00001-b105ab56855808e4.parquet:   0%|          | 0.00/1.70M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/3396 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1143 [00:00<?, ? examples/s]

Generating valid split:   0%|          | 0/431 [00:00<?, ? examples/s]

Preprocess data.

In [6]:
bigdata_train_df = bigdata_train.to_pandas()[['gold', 'text']] # 0: rise, 1: fall
acl_train_df = acl_train.to_pandas()[['gold', 'text']]
cikm_train_df = cikm_train.to_pandas()[['gold', 'text']]

bigdata_valid_df = bigdata_valid.to_pandas()[['gold', 'text']]
acl_valid_df = acl_valid.to_pandas()[['gold', 'text']]
cikm_valid_df = cikm_valid.to_pandas()[['gold', 'text']]

bigdata_test_df = bigdata_test.to_pandas()[['gold', 'text']]
acl_test_df = acl_test.to_pandas()[['gold', 'text']]
cikm_test_df = cikm_test.to_pandas()[['gold', 'text']]

In [7]:
print('BigData training set is of shape: ', bigdata_train_df.shape)
print('ACL training set is of shape: ', acl_train_df.shape)
print('CIKM training set is of shape: ', cikm_train_df.shape)

print('BigData validation set is of shape: ', bigdata_valid_df.shape)
print('ACL validation set is of shape: ', acl_valid_df.shape)
print('CIKM validation set is of shape: ', cikm_valid_df.shape)

print('BigData test set is of shape: ', bigdata_test_df.shape)
print('ACL test set is of shape: ', acl_test_df.shape)
print('CIKM test set is of shape: ', cikm_test_df.shape)
bigdata_train_df.head()

BigData training set is of shape:  (4897, 2)
ACL training set is of shape:  (20781, 2)
CIKM training set is of shape:  (3396, 2)
BigData validation set is of shape:  (798, 2)
ACL validation set is of shape:  (2555, 2)
CIKM validation set is of shape:  (431, 2)
BigData test set is of shape:  (1472, 2)
ACL test set is of shape:  (3720, 2)
CIKM test set is of shape:  (1143, 2)


Unnamed: 0,gold,text
0,1,"date,open,high,low,close,adj-close,inc-5,inc-1..."
1,1,"date,open,high,low,close,adj-close,inc-5,inc-1..."
2,1,"date,open,high,low,close,adj-close,inc-5,inc-1..."
3,1,"date,open,high,low,close,adj-close,inc-5,inc-1..."
4,1,"date,open,high,low,close,adj-close,inc-5,inc-1..."


## Financial LLM fine-tuning

In [None]:
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from torch import optim
from torch.optim import AdamW
from sklearn.metrics import accuracy_score
from transformers import BertTokenizer, BertForSequenceClassification, get_linear_schedule_with_warmup
from peft import get_peft_model, LoraConfig, TaskType

# Load the pre-trained FinBERT model and tokenizer with proper binary classification setup
model_name = 'ProsusAI/finbert'
tokenizer = BertTokenizer.from_pretrained(model_name)

# Initialize the model with binary classification (2 labels)
finbert = BertForSequenceClassification.from_pretrained(model_name)
finbert.config.num_labels = 2
finbert.num_labels = 2
finbert.classifier = nn.Linear(finbert.config.hidden_size, finbert.config.num_labels)
# Print model configuration for debugging
print("Updated model configuration:")
print(f"Number of labels: {finbert.config.num_labels}")
print(f"Classifier output shape: {finbert.classifier.out_features if hasattr(finbert.classifier, 'out_features') else 'Custom classifier'}")
original_finbert = finbert

# Configure PEFT with LoRA specifically for binary classification
peft_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    r=8,                       # Reduced rank for efficiency
    lora_alpha=16,             # Adjusted alpha for better stability
    lora_dropout=0.1,
    bias="none",               # No bias adaptation for classification tasks
    target_modules=["query", "key", "value", "output.dense"],  # Target more modules for better adaptation
    modules_to_save=["classifier"]  # Save the classifier layer which is crucial for the task
)

# Apply PEFT to the model
finbert = get_peft_model(finbert, peft_config)
finbert.print_trainable_parameters()

In [None]:
# Define a custom dataset to handle the text and labels
class FinancialDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=256):  # Reduced max_len for efficiency
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        text = str(self.texts[item])  # Ensure text is a string
        label = int(self.labels[item])  # Ensure label is an integer

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

In [105]:
# Sample data (financial texts with labels for rise or fall)
train_texts = bigdata_train_df['text'].tolist()
train_labels = bigdata_train_df['gold'].tolist()  # 1: Rise, 0: Fall
valid_texts = bigdata_valid_df['text'].tolist()
valid_labels = bigdata_valid_df['gold'].tolist()
test_texts = bigdata_test_df['text'].tolist()
test_labels = bigdata_test_df['gold'].tolist()

print(f"Number of training examples: {len(train_texts)}")
print(f"Number of validation examples: {len(valid_texts)}")
print(f"Class distribution in training data: {np.bincount(train_labels)}")
print(f"Class distribution in validation data: {np.bincount(valid_labels)}")

# Prepare the dataset and dataloaders
train_dataset = FinancialDataset(train_texts, train_labels, tokenizer)
val_dataset = FinancialDataset(valid_texts, valid_labels, tokenizer)
test_dataset = FinancialDataset(test_texts, test_labels, tokenizer)

batch_size = 8  # Increased for faster training
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

# Define the optimizer and loss function
optimizer = AdamW(finbert.parameters(), lr=2e-5, eps=1e-8, weight_decay=0.01)

# Add a learning rate scheduler
total_steps = len(train_loader) * 5  # 5 epochs
warmup_steps = int(total_steps * 0.1)  # 10% of total steps for warmup
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=warmup_steps,
    num_training_steps=total_steps
)

Number of training examples: 4897
Number of validation examples: 798
Class distribution in training data: [2585 2312]
Class distribution in validation data: [390 408]


In [88]:
if torch.cuda.is_available():
    print(f"CUDA Device Name: {torch.cuda.get_device_name(0)}")
    print(f"CUDA Device Count: {torch.cuda.device_count()}")
    print(f"CUDA Compute Capability: {torch.cuda.get_device_capability(0)}")
else:
    print("CUDA is not available.")

# Move the model to the correct device (GPU if available)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
original_finbert = original_finbert.to(device)
finbert = finbert.to(device)
print(f"Using device: {device}")

CUDA Device Name: NVIDIA A100-SXM4-40GB
CUDA Device Count: 1
CUDA Compute Capability: (8, 0)
Using device: cuda


In [89]:
# Loss function with class weights if needed (if dataset is imbalanced)
if np.bincount(train_labels)[0] != np.bincount(train_labels)[1]:
    # Calculate class weights for balanced loss
    class_counts = np.bincount(train_labels)
    class_weights = 1. / torch.tensor(class_counts, dtype=torch.float)
    class_weights = class_weights / class_weights.sum()
    class_weights = class_weights.to(device)
    loss_fn = nn.CrossEntropyLoss(weight=class_weights)
    print(f"Using weighted loss with weights: {class_weights}")
else:
    loss_fn = nn.CrossEntropyLoss()
    print("Using standard unweighted loss")

Using weighted loss with weights: tensor([0.4721, 0.5279], device='cuda:0')


In [90]:
from datetime import datetime
import pytz
import matplotlib.pyplot as plt
import seaborn as sns

# Training loop
def train(model, train_loader, optimizer, scheduler, device, epoch):
    """
    Training function for one epoch.

    Args:
        model: The model to train
        train_loader: DataLoader for training data
        optimizer: The optimizer to use
        scheduler: Learning rate scheduler
        device: Device to train on (cuda/cpu)
        epoch: Current epoch number

    Returns:
        Tuple of (average loss, accuracy) for this epoch
    """
    print(f"\nStarting training epoch {epoch+1}...")
    print(f"Total batches: {len(train_loader)}")

    model.train()
    epoch_loss = 0
    correct_predictions = 0
    total_predictions = 0

    for batch_idx, batch in enumerate(train_loader):
        if batch_idx % 100 == 0:
            local_tz = pytz.timezone('America/Vancouver')
            local_time = datetime.now(local_tz)
            print(f"Processing batch {batch_idx}/{len(train_loader)}, "
                  f"Loss: {epoch_loss/(batch_idx+1) if batch_idx > 0 else 0:.4f}, "
                  f"Accuracy: {correct_predictions/total_predictions*100 if total_predictions > 0 else 0:.2f}%, "
                  f"Time: {local_time.strftime('%H:%M:%S')}")

        try:
            optimizer.zero_grad()

            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            # Print shapes for debugging on first batch of first epoch
            if epoch == 0 and batch_idx == 0:
                print(f"Input shapes - input_ids: {input_ids.shape}, attention_mask: {attention_mask.shape}, labels: {labels.shape}")

            # Forward pass - ensure we're passing the correct inputs
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )

            loss = outputs.loss
            logits = outputs.logits

            # Print shapes for debugging on first batch of first epoch
            if epoch == 0 and batch_idx == 0:
                print(f"Output shapes - logits: {logits.shape}")
                print(f"Unique labels: {torch.unique(labels)}")  # Verify we have binary labels

            # Calculate batch accuracy
            preds = torch.argmax(logits, dim=1)
            correct_predictions += (preds == labels).sum().item()
            total_predictions += labels.size(0)

            # Backward pass and optimization
            loss.backward()

            # Gradient clipping to prevent exploding gradients
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

            optimizer.step()
            scheduler.step()

            epoch_loss += loss.item()

        except RuntimeError as e:
            print(f"Error in batch {batch_idx}: {e}")
            # Print more debugging info
            print(f"Input shapes: input_ids: {input_ids.shape}, attention_mask: {attention_mask.shape}, labels: {labels.shape}")
            print(f"Label values: {labels}")
            raise  # Re-raise the exception after printing debug info

    # Calculate epoch metrics
    avg_loss = epoch_loss / len(train_loader)
    accuracy = correct_predictions / total_predictions if total_predictions > 0 else 0

    return avg_loss, accuracy

# Evaluation loop
def evaluate(model, val_loader, device):
    model.eval()
    val_loss = 0
    predictions = []
    true_labels = []
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            # Forward pass
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            logits = outputs.logits

            val_loss += loss.item()

            preds = torch.argmax(logits, dim=1)
            predictions.extend(preds.cpu().numpy())
            true_labels.extend(labels.cpu().numpy())

    # Calculate overall metrics
    avg_val_loss = val_loss / len(val_loader)
    accuracy = accuracy_score(true_labels, predictions)

    return {
        'loss': avg_val_loss,
        'accuracy': accuracy
    }


# Plotting function to visualize training progress
def plot_training_progress(train_losses, train_accs, val_losses, val_accs):
    plt.figure(figsize=(12, 5))

    plt.subplot(1, 2, 1)
    plt.plot(train_losses, label='Training Loss')
    plt.plot(val_losses, label='Validation Loss')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.legend()
    plt.title('Training and Validation Loss')

    plt.subplot(1, 2, 2)
    plt.plot(train_accs, label='Training Accuracy')
    plt.plot(val_accs, label='Validation Accuracy')
    plt.xlabel('Epoch')
    plt.ylabel('Accuracy')
    plt.legend()
    plt.title('Training and Validation Accuracy')

    plt.tight_layout()
    plt.savefig('training_progress.png')
    plt.close()

# Function to plot confusion matrix
def plot_confusion_matrix(cm, epoch):
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False)
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')
    plt.title(f'Confusion Matrix - Epoch {epoch+1}')
    plt.savefig(f'confusion_matrix_epoch_{epoch+1}.png')
    plt.close()


### Pre-trained model

In [91]:
# Apply original FinBERT on validation set for baseline comparison
print("\nEvaluating original FinBERT model on validation set...")
original_metrics = evaluate(original_finbert, val_loader, device)


Evaluating original FinBERT model on validation set...


In [101]:
print("\nOriginal FinBERT Performance on Validation Set:")
print(f"Loss: {original_metrics['loss']:.10f}")
print(f"Accuracy: {original_metrics['accuracy']:.10f}")


Original FinBERT Performance on Validation Set:
Loss: 0.6936957473
Accuracy: 0.5112781955


### Fine tuning

In [93]:
# Try a single batch first to debug
print("Testing a single batch through the model for debugging...")
for batch in train_loader:
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    labels = batch['labels'].to(device)

    print(f"Input shapes - input_ids: {input_ids.shape}, attention_mask: {attention_mask.shape}, labels: {labels.shape}")
    print(f"Unique labels: {torch.unique(labels)}")

    outputs = finbert(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
    print(f"Output shapes - logits: {outputs.logits.shape}")

    # Check if the number of classes matches our expectations
    if outputs.logits.shape[1] != 2:
        print(f"WARNING: Model is outputting {outputs.logits.shape[1]} classes, but we need 2 for binary classification.")

    break  # Just test one batch

Testing a single batch through the model for debugging...
Input shapes - input_ids: torch.Size([8, 512]), attention_mask: torch.Size([8, 512]), labels: torch.Size([8])
Unique labels: tensor([0, 1], device='cuda:0')
Output shapes - logits: torch.Size([8, 2])


In [109]:
# Fine-tune and evaluate
epochs = 1
train_losses, train_accs = [], []
val_losses, val_accs = [], []
best_acc = 0
best_model_path = "best_finbert_model.pt"

for epoch in range(epochs):
    # Training phase - ensure all arguments are passed in the correct order
    train_loss, train_acc = train(
        model=finbert,
        train_loader=train_loader,
        optimizer=optimizer,
        scheduler=scheduler,
        device=device,
        epoch=epoch
    )
    train_losses.append(train_loss)
    train_accs.append(train_acc)

    # Evaluation phase
    val_metrics = evaluate(finbert, val_loader, device)
    val_losses.append(val_metrics['loss'])
    val_accs.append(val_metrics['accuracy'])

    # evaluate on test_loader
    test_metrics = evaluate(finbert, test_loader, device)

    # Print metrics
    print(f"Epoch {epoch + 1}/{epochs}")
    print(f"Train Loss: {train_loss:.10f}, Train Accuracy: {train_acc:.10f}")
    print(f"Val Loss: {val_metrics['loss']:.10f}, Val Accuracy: {val_metrics['accuracy']:.10f}")
    print(f"Test Loss: {test_metrics['loss']:.10f}, Val Accuracy: {test_metrics['accuracy']:.10f}")

    # Save the best model based on accuracy
    if val_metrics['accuracy'] > best_acc:
        best_acc = val_metrics['accuracy']
        torch.save({
            'model_state_dict': finbert.state_dict(),
            'peft_config': peft_config
        }, best_model_path)
        print(f"New best model saved with Accuracy: {best_acc:.4f}")


Starting training epoch 1...
Total batches: 613
Processing batch 0/613, Loss: 0.0000, Accuracy: 0.00%, Time: 21:28:26
Input shapes - input_ids: torch.Size([8, 512]), attention_mask: torch.Size([8, 512]), labels: torch.Size([8])
Output shapes - logits: torch.Size([8, 2])
Unique labels: tensor([0, 1], device='cuda:0')
Processing batch 100/613, Loss: 0.6880, Accuracy: 51.00%, Time: 21:28:45
Processing batch 200/613, Loss: 0.6923, Accuracy: 50.94%, Time: 21:29:04
Processing batch 300/613, Loss: 0.6935, Accuracy: 50.33%, Time: 21:29:22
Processing batch 400/613, Loss: 0.6938, Accuracy: 50.88%, Time: 21:29:40
Processing batch 500/613, Loss: 0.6945, Accuracy: 50.52%, Time: 21:29:59
Processing batch 600/613, Loss: 0.6938, Accuracy: 51.12%, Time: 21:30:17
Epoch 1/1
Train Loss: 0.6951328761, Train Accuracy: 0.5113334695
Val Loss: 0.6991507250, Val Accuracy: 0.4887218045
Test Loss: 0.6897112076, Val Accuracy: 0.5523097826
New best model saved with Accuracy: 0.4887


In [102]:
## Load the best model for final evaluation
#finbert.load_state_dict(torch.load(best_model_path))
#finbert.eval()

# Load the best model for final evaluation
checkpoint = torch.load(best_model_path, weights_only=False)
finbert.load_state_dict(checkpoint['model_state_dict'])  # Load model state
peft_config = checkpoint['peft_config']  # Reload PEFT configuration if needed
finbert.eval()

PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): BertForSequenceClassification(
      (bert): BertModel(
        (embeddings): BertEmbeddings(
          (word_embeddings): Embedding(30522, 768, padding_idx=0)
          (position_embeddings): Embedding(512, 768)
          (token_type_embeddings): Embedding(2, 768)
          (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (encoder): BertEncoder(
          (layer): ModuleList(
            (0-11): 12 x BertLayer(
              (attention): BertAttention(
                (self): BertSdpaSelfAttention(
                  (query): lora.Linear(
                    (base_layer): Linear(in_features=768, out_features=768, bias=True)
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.1, inplace=False)
                    )
                    (lora_A): ModuleDict(
                      (default

Evaluate on test set.

In [103]:
# Evaluate on test set
test_texts = bigdata_test_df['text'].tolist()
test_labels = bigdata_test_df['gold'].tolist()
test_dataset = FinancialDataset(test_texts, test_labels, tokenizer)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

original_test_metrics = evaluate(original_finbert, test_loader, device)
print("\nOriginal FinBERT Performance on Test Set:")
print(f"Loss: {original_test_metrics['loss']:.10f}")
print(f"Accuracy: {original_test_metrics['accuracy']:.10f}")

test_metrics = evaluate(finbert, test_loader, device)
print("\nTest Set Evaluation:")
print(f"Loss: {test_metrics['loss']:.10f}")
print(f"Accuracy: {test_metrics['accuracy']:.10f}")


Original FinBERT Performance on Test Set:
Loss: 0.6902687349
Accuracy: 0.5523097826

Test Set Evaluation:
Loss: 0.6902687349
Accuracy: 0.5523097826
