# Fine-tuning FinBERT 

## Load the libraries and data 

In [3]:
import pandas as pd

import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

from utils import compute_metrics

MODEL_NAME = "ProsusAI/finbert"
EXPORT_DIR = "./model"

## Load the model and tokenizer

In [4]:
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

## Load the preprocessed data 

In [5]:
df = pd.read_csv("data/preprocessed_cryptonews.csv")
df = df.sample(1037)

df.head()

Unnamed: 0,text,sentiment,polarity,subjectivity
7709,South Korean commercial banking giant Shinhan ...,1,0.0,0.5
13440,Signature Bank’s deposit exodus had already se...,0,-0.16,0.29
28340,"Two weeks after the war in Ukraine started, th...",1,0.0,0.0
8189,The European Union's Markets in Crypto-Assets ...,1,0.0,0.0
8888,Institutional flows into Ripple's XRP failed t...,0,-0.5,0.3


## Tokenize the data

In [6]:
text = df['text'].tolist()

encodings = tokenizer(
    text, 
    truncation=True,
    padding=True
)

encodings['input_ids'], encodings['attention_mask']

([[101,
   2148,
   4759,
   3293,
   8169,
   5016,
   12277,
   4819,
   2924,
   2949,
   2019,
   7861,
   2615,
   1011,
   11892,
   24010,
   3231,
   2005,
   6540,
   3597,
   2378,
   2128,
   22930,
   26897,
   10504,
   2006,
   1996,
   2002,
   4063,
   2050,
   2897,
   1012,
   102,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0],
  [101,
   8085,
   2924,
   1521,
   1055,
   12816,
   16388,
   2018,
   2525,
   3876,
   2091,
   2011,
   1996,
   2051,
   25644,
   3706,
   1999,
   1010,
   2429,
   2000,
   15377,
   3581,
   1012,
   1032,
   1060,
   2050,
   2692,
   102,
   0,
   0,
   0,
   0,
   0,
   0,
   0,


## Fine-tune the model

## Training

### Inference

Improving the performance of model inference:
- Use batch processing through the `datasets` library which integrates well with PyTorch's `DataLoader` option.
- If available, use move both the model and its inputs to the GPU to speed up the predictions.

In [None]:
def predict_with_model(model, encodings, batch_size=128):
    with torch.no_grad():
        # Move model to GPU if available
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        model = model.to(device)
        
        input_ids = torch.tensor(encodings['input_ids'])
        attention_mask = torch.tensor(encodings['attention_mask'])
        
        # Create a DataLoader for batching
        dataset = torch.utils.data.TensorDataset(input_ids, attention_mask)
        dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size)
        
        all_predictions = []
        
        for batch_input_ids, batch_attention_mask in dataloader:
            # Move batch to GPU if available
            batch_input_ids = batch_input_ids.to(device)
            batch_attention_mask = batch_attention_mask.to(device)
            
            # Get predictions for this batch
            outputs = model(
                input_ids=batch_input_ids,
                attention_mask=batch_attention_mask
            )
            logits = outputs.logits
            
            # Get probabilities for this batch
            batch_probabilities = torch.softmax(logits, dim=1)
            batch_classes = torch.argmax(batch_probabilities, dim=1)
            
            # Add to aggregated results (move back to CPU for list conversion)
            all_predictions.extend(batch_classes.cpu().tolist())
        
        return all_predictions

predictions = predict_with_model(model, encodings)
results = compute_metrics(df['sentiment'], torch.tensor(predictions))
results

{'accuracy': 0.3114754098360656,
 'precision': 0.30708164559127216,
 'recall': 0.3114754098360656,
 'f1': 0.3084313205325629}

## Export outputs

In [3]:
model.save_pretrained(EXPORT_DIR)
tokenizer.save_pretrained(EXPORT_DIR)

('./model/tokenizer_config.json',
 './model/special_tokens_map.json',
 './model/vocab.txt',
 './model/added_tokens.json',
 './model/tokenizer.json')