# Fine-tuning FinBERT

In [18]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Load the libraries and data

In [19]:
import os
import pandas as pd
import torch

from transformers import AutoModelForSequenceClassification, AutoTokenizer
from google.colab import drive


MODEL_NAME = "ProsusAI/finbert"
EXPORT_DIR = "./model"

## Load the model and tokenizer

In [20]:
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

## Load the preprocessed data

In [21]:
# df = pd.read_csv("data/preprocessed_cryptonews.csv")
# df = df.sample(1037)
# df.head()

absolute_path = "/content/drive/My Drive/Colab Notebooks"
dataset_file_path = os.path.join(absolute_path, "Datasets", "preprocessed_cryptonews.csv")

df = pd.read_csv(dataset_file_path)
df_sample = df.sample(5)

df_sample.head()

Unnamed: 0,text,sentiment,polarity,subjectivity
23951,Short-term hodlers may be done with the bulk o...,2,0.27,0.47
29515,YouTube has doubled down on its future NFT pla...,0,-0.08,0.21
3035,The circulating supply of WLD tokens has stead...,2,0.08,0.62
27926,Grayscale’s new fund gives weighted exposure t...,2,0.14,0.45
21125,Laura Mercurio will replace Steve Vallas as Bl...,1,0.0,0.0


## Tokenize the data

In [22]:
from torch.utils.data import Dataset, DataLoader

class TorchDataset(torch.utils.data.Dataset):
    """
    Create a PyTorch dataset for text classification using BERT models.
    """
    def __init__(self, inputs, labels, max_length=512):
        self.inputs = inputs
        self.labels = labels
        self.max_length = max_length

    def __len__(self):
        return len(self.inputs['input_ids'])

    def __getitem__(self, idx):
         # text, label = self.inputs[idx], self.labels[idx] # This line is not needed

        # Convert input_ids and attention_mask to PyTorch tensors
        input_ids = torch.tensor(self.inputs['input_ids'][idx], dtype=torch.long)
        attention_mask = torch.tensor(self.inputs['attention_mask'][idx], dtype=torch.long)

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': torch.tensor(self.labels[idx], dtype=torch.long)
        }

X_train, y_train = list(df_sample['text']), list(df_sample['sentiment'])
X_train_encoded = tokenizer(
    X_train,
    truncation=True,
    padding=True
)
dataloader = DataLoader(
    TorchDataset(X_train_encoded, y_train),
    batch_size=5,
    shuffle=True,
)

## Fine-tune the model

### Training

### Inference

Improving the performance of model inference:
- Use batch processing through the `datasets` library which integrates well with PyTorch's `DataLoader` option.
- If available, use move both the model and its inputs to the GPU to speed up the predictions.

In [23]:
import torch
from sklearn.metrics import accuracy_score, precision_recall_fscore_support


def compute_metrics(labels, preds, average='weighted'):
    """
    Compute accuracy, precision, recall, and F1 score based on predictions.
    """
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average=average
    )
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }

def predict_sentiment(model, dataloader, batch_size=128):
    # Enable evaluation mode
    model.eval()

    all_labels, all_predictions = [], []
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    with torch.no_grad():
        for batch in dataloader:
            inputs = {key: value.to(device) for key, value in batch.items()}
            labels = inputs["labels"]

            # Get predictions for this batch
            outputs = model(**inputs)
            logits = outputs.logits

            # Get probabilities for this batch
            batch_probabilities = torch.softmax(logits, dim=1)
            batch_classes = torch.argmax(batch_probabilities, dim=1)

            # Add to aggregated results (move back to CPU for list conversion)
            # all_predictions.extend(batch_classes.cpu().tolist())
            all_labels.extend(labels.cpu().numpy())
            all_predictions.extend(batch_classes.cpu().numpy())

    return all_labels, all_predictions


# Move model to GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

# Get predictions based on the loaded data
labels, predictions = predict_sentiment(model, dataloader)
print(labels, predictions)
results = compute_metrics(labels, predictions)
# results = compute_metrics(df['sentiment'], torch.tensor(predictions))
results

[np.int64(2), np.int64(2), np.int64(1), np.int64(2), np.int64(0)] [np.int64(2), np.int64(0), np.int64(2), np.int64(2), np.int64(2)]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'accuracy': 0.4, 'precision': 0.3, 'recall': 0.4, 'f1': 0.34285714285714286}

## old inference

In [None]:
import torch
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(preds, labels, average='weighted'):
    """
    Compute accuracy, precision, recall, and F1 score based on predictions.
    """
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average=average
    )
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }

def predict_with_model(model, encodings, batch_size=128):
    model.eval()

    with torch.no_grad():
        # Move model to GPU if available
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        model = model.to(device)

        input_ids = torch.tensor(encodings['input_ids'])
        attention_mask = torch.tensor(encodings['attention_mask'])

        # Create a DataLoader for batching
        dataset = torch.utils.data.TensorDataset(input_ids, attention_mask)
        dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size)

        all_predictions = []

        for batch_input_ids, batch_attention_mask in dataloader:
            # Move batch to GPU if available
            batch_input_ids = batch_input_ids.to(device)
            batch_attention_mask = batch_attention_mask.to(device)

            # Get predictions for this batch
            outputs = model(
                input_ids=batch_input_ids,
                attention_mask=batch_attention_mask
            )
            logits = outputs.logits

            # Get probabilities for this batch
            batch_probabilities = torch.softmax(logits, dim=1)
            batch_classes = torch.argmax(batch_probabilities, dim=1)

            # Add to aggregated results (move back to CPU for list conversion)
            all_predictions.extend(batch_classes.cpu().tolist())

        return all_predictions



predictions = predict_with_model(model, encodings)
results = compute_metrics(df['sentiment'], torch.tensor(predictions))
results

KeyboardInterrupt: 

## Export outputs

In [None]:
model.save_pretrained(EXPORT_DIR)
tokenizer.save_pretrained(EXPORT_DIR)

('./model/tokenizer_config.json',
 './model/special_tokens_map.json',
 './model/vocab.txt',
 './model/added_tokens.json',
 './model/tokenizer.json')