<a href="https://colab.research.google.com/github/harshil0217/BERT_headline_classifier_v2/blob/main/model_training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')
!pwd

Mounted at /content/drive
/content


In [2]:
!git clone https://github.com/harshil0217/BERT_headline_classifier_v2.git
import os
os.chdir('/content/BERT_headline_classifier_v2')

Cloning into 'BERT_headline_classifier_v2'...
remote: Enumerating objects: 99, done.[K
remote: Counting objects: 100% (99/99), done.[K
remote: Compressing objects: 100% (81/81), done.[K
remote: Total 99 (delta 52), reused 40 (delta 16), pack-reused 0 (from 0)[K
Receiving objects: 100% (99/99), 1.04 MiB | 8.26 MiB/s, done.
Resolving deltas: 100% (52/52), done.


In [3]:
#import needed libraries

import pandas as pd
import numpy as np
import torch

from torch.utils.data.dataset import Dataset
from transformers import TrainingArguments, Trainer, AutoTokenizer, AutoModelForSequenceClassification
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from itertools import product
from huggingface_hub import notebook_login

In [4]:
train = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test.csv')

In [5]:
train.head()

Unnamed: 0,text,sentiment
0,Finnish plumbing and heating systems supplier ...,negative
1,Finnish Talvivaara Mining Co HEL : TLV1V said ...,neutral
2,It comes complete with an LCD touch screen sys...,neutral
3,"MADISON , Wis. , Feb. 6 - PRNewswire - -- Fisk...",positive
4,In February the Elcoteq group sold its St Pete...,negative


In [6]:
train_labels = train['sentiment']
test_labels = test['sentiment']

In [7]:
#encode labels with get dummies
train_labels = pd.get_dummies(train_labels)
test_labels = pd.get_dummies(test_labels)

In [8]:
train_labels

Unnamed: 0,negative,neutral,positive
0,True,False,False
1,False,True,False
2,False,True,False
3,False,False,True
4,True,False,False
...,...,...,...
3871,False,True,False
3872,False,True,False
3873,False,True,False
3874,False,True,False


In [9]:
#convert train and test labels to float
train_labels = train_labels.astype(float)
test_labels = test_labels.astype(float)
train_labels

Unnamed: 0,negative,neutral,positive
0,1.0,0.0,0.0
1,0.0,1.0,0.0
2,0.0,1.0,0.0
3,0.0,0.0,1.0
4,1.0,0.0,0.0
...,...,...,...
3871,0.0,1.0,0.0
3872,0.0,1.0,0.0
3873,0.0,1.0,0.0
3874,0.0,1.0,0.0


In [10]:
train_labels = train_labels.values.tolist()
test_labels = test_labels.values.tolist()

In [11]:
train_texts = train['text'].to_list()
test_texts = test['text'].to_list()

In [12]:
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

In [13]:
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
test_encodings = tokenizer(test_texts, truncation=True, padding=True)

In [14]:
#create dataset for headline classifier data

class HeadlineDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

In [15]:
train_dataset = HeadlineDataset(train_encodings, train_labels)
test_dataset = HeadlineDataset(test_encodings, test_labels)

In [16]:
# add compute metrics
def compute_metrics(pred):
# Convert logits to probabilities
    logits = pred.predictions
    probs = torch.sigmoid(torch.tensor(logits))

    # Convert probabilities to binary predictions
    preds = np.where(probs >= 0.5, 1, 0)

    # True labels
    labels = pred.label_ids

    # Calculate accuracy for multi-label classification
    accuracy = accuracy_score(labels, preds)

    # Calculate precision, recall, and F1-score
    precision = precision_score(labels, preds, average='weighted')
    recall = recall_score(labels, preds, average='weighted')
    f1 = f1_score(labels, preds, average='weighted')

    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }

In [17]:
#load model

model = AutoModelForSequenceClassification.from_pretrained('bert-base-cased',
                                                           problem_type = 'multi_label_classification',
                                                           num_labels=3)

training_args = TrainingArguments(
    output_dir='.',
    evaluation_strategy="epoch",
    per_device_train_batch_size=16,
    per_device_eval_batch_size = 16,
    num_train_epochs=5
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics
)

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [18]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,0.259107,0.842105,0.856338,0.854489,0.853446
2,No log,0.277986,0.827657,0.838792,0.832817,0.831495
3,0.258200,0.339369,0.852425,0.854547,0.855521,0.85503
4,0.258200,0.339101,0.854489,0.86309,0.859649,0.861243
5,0.076800,0.396555,0.861713,0.863155,0.863777,0.863115
6,0.076800,0.444269,0.862745,0.866032,0.865841,0.865439
7,0.022800,0.450214,0.864809,0.868309,0.864809,0.866541
8,0.022800,0.4598,0.864809,0.867603,0.867905,0.867715


TrainOutput(global_step=1944, training_loss=0.0936235361138489, metrics={'train_runtime': 854.0057, 'train_samples_per_second': 36.309, 'train_steps_per_second': 2.276, 'total_flos': 2469894985978560.0, 'train_loss': 0.0936235361138489, 'epoch': 8.0})

In [19]:
#check accuracy with testing data

results = trainer.evaluate()
results

{'eval_loss': 0.4597996771335602,
 'eval_accuracy': 0.8648090815273478,
 'eval_precision': 0.8676032438390421,
 'eval_recall': 0.8679050567595459,
 'eval_f1': 0.8677153890757997,
 'eval_runtime': 4.5332,
 'eval_samples_per_second': 213.755,
 'eval_steps_per_second': 13.456,
 'epoch': 8.0}

In [27]:
notebook_login()
repo_name = 'financial_headline_classifier'
trainer.push_to_hub(repo_name)
tokenizer.push_to_hub(repo_name)

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

model.safetensors:   0%|          | 0.00/433M [00:00<?, ?B/s]

events.out.tfevents.1723835896.6a27384c82e5.1279.1:   0%|          | 0.00/560 [00:00<?, ?B/s]

events.out.tfevents.1723835037.6a27384c82e5.1279.0:   0%|          | 0.00/9.74k [00:00<?, ?B/s]

Upload 4 LFS files:   0%|          | 0/4 [00:00<?, ?it/s]

training_args.bin:   0%|          | 0.00/5.11k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/harshil0217/financial_headline_classifier/commit/5c7bd263288fc1543936e9eb499b103c3e6160de', commit_message='Upload tokenizer', commit_description='', oid='5c7bd263288fc1543936e9eb499b103c3e6160de', pr_url=None, pr_revision=None, pr_num=None)