In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd 
import seaborn as sns
import torch
import transformers
from datasets import Dataset
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, Trainer, BertForSequenceClassification, TrainingArguments

# Record the library versions and whether a CUDA device is visible, so the
# run environment is documented alongside the results.
lib_versions = (torch.__version__, ' ', transformers.__version__)
print(*lib_versions)
print(torch.cuda.is_available())

## Sentiment-to-risk label mapping
- positive (label 1) -> low risk
- neutral (label 2) -> medium risk
- negative (label 0) -> high risk

### Load FinBERT

In [None]:
# The same Hub checkpoint supplies both the model weights and the tokenizer.
finbert_checkpoint = 'yiyanghkust/finbert-pretrain'
# One output class per sentiment label (0, 1, 2).
n_classes = 3

model = BertForSequenceClassification.from_pretrained(finbert_checkpoint, num_labels=n_classes)

tokenizer = BertTokenizer.from_pretrained(finbert_checkpoint)

## Data Preprocessing

In [None]:
# Load the labelled headlines. The file is expected to hold exactly two
# columns (text, class id); assigning the column names raises if it does not.
df_raw = pd.read_csv('labeled_headlines_specific.csv')
df_raw.columns = ['sentence', 'label'] ## use your own customized dataset

# Drop incomplete rows, then force integer class ids. A single missing label
# makes pandas read the whole column as float64, and float labels would make
# the sequence-classification model pick a multi-label (BCE) loss instead of
# cross-entropy.
df = df_raw.dropna(subset=['sentence', 'label']).astype({'label': 'int64'})

# The model is created with num_labels=3, so only class ids 0, 1 and 2 are valid.
unexpected_labels = set(df['label'].unique()) - {0, 1, 2}
if unexpected_labels:
    raise ValueError(f'Unexpected label values: {sorted(unexpected_labels)}')

# Stratified 75/25 split into (train + validation) / test, followed by a
# second stratified 75/25 split of the remainder into train / validation.
df_train_val, df_test = train_test_split(
    df, stratify=df['label'], test_size=0.25, random_state=42
)
df_train, df_val = train_test_split(
    df_train_val, stratify=df_train_val['label'], test_size=0.25, random_state=42
)

print(df_train.shape, df_test.shape, df_val.shape)

### Data setup

In [None]:
def tokenize_split(frame, tokenizer, max_length=128):
    """Convert one DataFrame split into a tokenized, torch-formatted Dataset.

    Args:
        frame: DataFrame with a 'sentence' column (text) and a 'label' column.
        tokenizer: tokenizer applied to the 'sentence' column in batches.
        max_length: every sequence is truncated / padded to this many tokens.

    Returns:
        A `datasets.Dataset` whose visible columns are 'input_ids',
        'token_type_ids', 'attention_mask' and 'label', returned as torch
        tensors.
    """
    dataset = Dataset.from_pandas(frame)
    dataset = dataset.map(
        lambda batch: tokenizer(
            batch['sentence'],
            truncation=True,
            padding='max_length',
            max_length=max_length,
        ),
        batched=True,
    )
    # set_format works in place: it restricts the exposed columns and makes
    # indexing return torch tensors.
    dataset.set_format(
        type='torch',
        columns=['input_ids', 'token_type_ids', 'attention_mask', 'label'],
    )
    return dataset


dataset_train = tokenize_split(df_train, tokenizer)
dataset_val = tokenize_split(df_val, tokenizer)
dataset_test = tokenize_split(df_test, tokenizer)

### Model training

In [None]:
def compute_metrics(eval_pred):
    """Compute classification accuracy for one evaluation pass.

    Args:
        eval_pred: a pair ``(predictions, labels)``. ``predictions`` is
            reduced with an argmax over axis 1, so it is presumably a
            (n_samples, n_classes) array of per-class scores; ``labels``
            holds the true class ids.

    Returns:
        dict with a single key, 'accuracy'.
    """
    scores, labels = eval_pred
    predicted_classes = np.argmax(scores, axis=1)
    # scikit-learn metrics take (y_true, y_pred). Accuracy is symmetric, but
    # keeping the documented order avoids silent errors if an asymmetric
    # metric (precision, recall, ...) is added here later.
    return {'accuracy': accuracy_score(labels, predicted_classes)}

# Fine-tuning hyperparameters. Evaluation and checkpointing both run once per
# epoch, and the checkpoint with the best validation accuracy is reloaded
# when training finishes.
training_options = dict(
    output_dir='temp/',
    evaluation_strategy='epoch',
    save_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=5,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model='accuracy',
)
args = TrainingArguments(**training_options)

# Train on the training split and score each epoch on the validation split.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=dataset_train,
    eval_dataset=dataset_val,
    compute_metrics=compute_metrics,
)

trainer.train()

In [None]:
# Put the model in inference mode, then score the held-out test split.
# The returned object carries the raw predictions, the true label ids and
# the metrics produced by compute_metrics.
model.eval()
prediction_output = trainer.predict(test_dataset=dataset_test)

print(prediction_output.metrics)

In [None]:
# Confusion matrix of the test-set predictions.

# Class ids follow this notebook's label convention:
# 0 = negative, 1 = positive, 2 = neutral.
class_ids = [0, 1, 2]
class_names = ['negative', 'positive', 'neutral']

predictions = np.argmax(prediction_output.predictions, axis=1)
labels = prediction_output.label_ids

# Show a short preview rather than dumping the whole prediction array.
print(predictions[:20])

# Passing `labels` pins the matrix to 3x3 in a fixed class order, even if a
# class never occurs in the test split or in the predictions.
cm = confusion_matrix(labels, predictions, labels=class_ids)

# Plot the confusion matrix with readable class names on both axes.
fig, ax = plt.subplots()
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.set_xlabel('Predicted labels')
ax.set_ylabel('True labels')
ax.set_title('Confusion Matrix')
plt.show()


### Save model

In [None]:
# Persist the fine-tuned weights and config.
trainer.save_model('models/')
# The Trainer was created without a tokenizer, so save_model does not write
# the tokenizer files. Save them next to the weights so 'models/' can be
# loaded on its own with from_pretrained.
tokenizer.save_pretrained('models/')