# Fine-tuning FinBERT 

## Load the libraries and data 

In [36]:
import pandas as pd

import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

from utils import compute_metrics

MODEL_NAME = "ProsusAI/finbert"
EXPORT_DIR = "./model"

## Load the model and tokenizer

In [37]:
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

## Load the preprocessed data 

In [38]:
df = pd.read_csv("data/preprocessed_cryptonews.csv")
df = df.sample(20)

df.head()

Unnamed: 0,text,sentiment,polarity,subjectivity
4858,Holešky testnet has a 1.6-billion ETH supply a...,2,0.35,0.4
7888,Bitcoin's rally on Thursday following a judge'...,0,-0.03,0.33
13805,"The largest and most popular meme coin, DOGE, ...",2,0.23,0.63
7557,A Bitcoin impersonator has suddenly risen to p...,1,0.0,0.5
24590,Ethereum’s co-founder appears to have been awa...,1,0.0,0.25


## Tokenize the data

In [39]:
text = df['text'].tolist()

encodings = tokenizer(
    text, 
    truncation=True,
    padding=True
)

encodings['input_ids'], encodings['attention_mask']

([[101,
   8198,
   4801,
   3231,
   7159,
   2038,
   1037,
   1015,
   1012,
   1020,
   1011,
   4551,
   3802,
   2232,
   4425,
   2012,
   4888,
   1010,
   2437,
   2009,
   9280,
   2062,
   7801,
   2084,
   1996,
   3080,
   2175,
   2121,
   3669,
   2897,
   1012,
   102,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0],
  [101,
   2978,
   3597,
   2378,
   1005,
   1055,
   8320,
   2006,
   9432,
   2206,
   1037,
   3648,
   1005,
   1055,
   6996,
   1999,
   10819,
   1058,
   1012,
   24644,
   2038,
   10003,
   2000,
   2022,
   2460,
   2973,
   1012,
   102,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0

In [40]:
with torch.no_grad(): # disable gradient calculations
    input_ids = torch.tensor(encodings['input_ids'])
    attention_mask = torch.tensor(encodings['attention_mask'])
    
    # Get predictions
    outputs = model(
        input_ids=input_ids,
        attention_mask=attention_mask
    )
    logits = outputs.logits

    # Get probabilities
    probabilities = torch.softmax(logits, dim=1)
    classes = torch.argmax(probabilities, dim=1)

df['sentiment'].tolist(), classes.tolist()

([2, 0, 2, 1, 1, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 0, 1, 1, 1],
 [0, 1, 1, 2, 2, 0, 0, 1, 2, 2, 0, 1, 2, 1, 0, 0, 2, 0, 2, 2])

In [45]:
compute_metrics(df['sentiment'].tolist(), classes.tolist())

{'accuracy': 0.2,
 'precision': 0.14166666666666666,
 'recall': 0.2,
 'f1': 0.16545454545454547}

## Export outputs

In [3]:
model.save_pretrained(EXPORT_DIR)
tokenizer.save_pretrained(EXPORT_DIR)

('./model/tokenizer_config.json',
 './model/special_tokens_map.json',
 './model/vocab.txt',
 './model/added_tokens.json',
 './model/tokenizer.json')