In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import transformers
from transformers import BertTokenizer, Trainer, BertForSequenceClassification, TrainingArguments
from datasets import Dataset
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Environment report: library versions and GPU availability for this run.
print(f"{torch.__version__}   {transformers.__version__}")
print(torch.cuda.is_available())

# Seed the Python, NumPy and PyTorch RNGs *before* the model is instantiated so
# that any weights initialised at load time are the same on every
# Restart & Run All. 42 mirrors the `random_state` used for the data splits.
# NOTE(review): the checkpoint name suggests a pre-training-only model, in which
# case the 3-way classification head is freshly (randomly) initialised - confirm.
SEED = 42
transformers.set_seed(SEED)

# load FinBERT (single source of truth for the checkpoint id, shared by model and tokenizer)
FINBERT_CHECKPOINT = 'yiyanghkust/finbert-pretrain'

model = BertForSequenceClassification.from_pretrained(
    FINBERT_CHECKPOINT,
    num_labels=3,
)

tokenizer = BertTokenizer.from_pretrained(FINBERT_CHECKPOINT)

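## Sentiment mapping
Each headline's sentiment is encoded as an integer class label, which is in turn read as a risk level:
- positive: 1 -> low risk
- neutral: 2 -> medium risk
- negative: 0 -> high risk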

### Data Preprocessing

In [None]:
# Load the labelled headlines. The column rename below requires the CSV to have
# exactly two columns (text first, label second).
df_raw = pd.read_csv('labeled_headlines_specific.csv')
df_raw.columns = ['sentence', 'label']  ## use your own customized dataset

# Drop incomplete rows, then force the labels back to integers: a column that
# contained NaN is parsed by pandas as float64 and stays float after dropna().
# NOTE(review): float labels are expected to break single-label classification
# training downstream - confirm against the model's `labels` handling.
df = (
    df_raw
    .dropna(subset=['sentence', 'label'])
    .astype({'label': int})
)

# Stratified 75/25 split into train+val / test, then a second stratified 75/25
# split of the remainder into train / val. Fixed random_state keeps it repeatable.
df_train, df_test = train_test_split(df, stratify=df['label'], test_size=0.25, random_state=42)
df_train, df_val = train_test_split(df_train, stratify=df_train['label'], test_size=0.25, random_state=42)

print(df_train.shape, df_test.shape, df_val.shape)

### Data setup

In [None]:

def _tokenize_batch(batch):
    """Tokenize a batch of sentences, truncating/padding each one to 128 tokens."""
    return tokenizer(batch['sentence'], truncation=True, padding='max_length', max_length=128)


def _to_torch_dataset(frame):
    """Convert one pandas split into a tokenized Dataset that yields torch tensors."""
    encoded = Dataset.from_pandas(frame).map(_tokenize_batch, batched=True)
    # Expose only the model inputs and the target column, formatted as torch tensors.
    encoded.set_format(type='torch', columns=['input_ids', 'token_type_ids', 'attention_mask', 'label'])
    return encoded


# Split name -> pandas frame, and split name -> tokenized dataset.
train_test_val = {'train': df_train, 'test': df_test, 'val': df_val}
dataset_dict = {tag: _to_torch_dataset(frame) for tag, frame in train_test_val.items()}


### Model training

In [None]:
def compute_metrics(eval_pred):
    """Compute accuracy for one evaluation pass.

    Args:
        eval_pred: A ``(predictions, labels)`` pair. ``predictions`` holds one
            row of per-class scores per example; ``labels`` holds the true
            class ids.

    Returns:
        dict: ``{'accuracy': <fraction of examples classified correctly>}``.
    """
    scores, labels = eval_pred
    # The predicted class is the index of the highest score in each row.
    predicted_classes = np.argmax(scores, axis=1)
    # scikit-learn metrics take (y_true, y_pred) in that order. Accuracy is
    # symmetric, but keeping the documented order avoids silent errors if
    # precision/recall/F1 are added here later.
    return {'accuracy': accuracy_score(labels, predicted_classes)}

# Hyper-parameters for fine-tuning. Evaluation and checkpointing both happen once
# per epoch, and the checkpoint with the best 'accuracy' is restored when training ends.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` - confirm against the installed version printed above.
training_config = dict(
    output_dir='temp/',
    evaluation_strategy='epoch',
    save_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=5,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model='accuracy',
)
args = TrainingArguments(**training_config)

# Wire the model, the training arguments, the train/validation splits and the
# metric function together.
trainer = Trainer(
    model=model,                           # the instantiated Transformers model to be trained
    args=args,                             # training arguments, defined above
    train_dataset=dataset_dict['train'],   # training dataset
    eval_dataset=dataset_dict['val'],      # evaluation dataset
    compute_metrics=compute_metrics,       # accuracy, computed at every evaluation
)

trainer.train()

In [None]:
# Switch the network to evaluation mode, then run the held-out test split through the trainer.
model.eval()
prediction_output = trainer.predict(test_dataset=dataset_dict['test'])

# Metrics the trainer computed for this prediction run.
print(prediction_output.metrics)

In [None]:
import warnings
# NOTE(review): this silences *every* warning for the rest of the session, not
# only the metric warnings from this cell - consider narrowing the filter.
warnings.filterwarnings('ignore')

# Class ids and display names, in the order of the label encoding
# (0 = negative, 1 = positive, 2 = neutral).
class_ids = [0, 1, 2]
class_names = ['Negative', 'Positive', 'Neutral']

# Predicted class = index of the highest score; ground truth comes from the same
# prediction output so the report and the matrix share one source of labels.
predictions = np.argmax(prediction_output.predictions, axis=1)
labels = prediction_output.label_ids

print(classification_report(labels, predictions, labels=class_ids, target_names=class_names))

# Generate the confusion matrix. Passing `labels=` pins it to 3x3 in a fixed
# order, so the tick labels stay aligned even if a class is absent from the test fold.
cm = confusion_matrix(labels, predictions, labels=class_ids)

# Plot the confusion matrix (rows = true class, columns = predicted class).
ax = sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
)
ax.set_xlabel('Predicted labels')
ax.set_ylabel('True labels')
ax.set_title('Confusion Matrix');


### Save model

In [None]:
# Write the trainer's current model to disk.
MODEL_DIR = 'models/'
trainer.save_model(output_dir=MODEL_DIR)