<a href="https://colab.research.google.com/github/harshil0217/BERT_headline_classifier_v2/blob/main/model_training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')
# Print the current working directory so it is clear where relative paths resolve from.
!pwd

Mounted at /content/drive
/content


In [None]:
import os

# Switch into the project folder on the mounted Drive. The path is absolute (rooted
# at the /content/drive mount point) so that re-running this cell after the working
# directory has already changed still succeeds instead of failing on a relative path.
os.chdir('/content/drive/MyDrive/GitHub/BERT_headline_classifier_v2')

In [None]:
#import needed libraries

import pandas as pd
import numpy as np
import torch

from torch.utils.data.dataset import Dataset
from transformers import TrainingArguments, Trainer, AutoTokenizer, AutoModelForSequenceClassification
from sklearn.model_selection import train_test_split


In [None]:
# Read the training and test splits from ./data (relative to the working directory).
train, test = map(pd.read_csv, ('./data/train.csv', './data/test.csv'))

In [None]:
# Preview the first rows of the training frame.
train.head()

Unnamed: 0,headline,dominant_emotion
0,India’s oldest e-commerce firm’s IPO is a rare...,pessimism
1,5-Year-Old Girl Forms Unbreakable Bond With A ...,other
2,India floods: Hundreds of passengers rescued f...,optimism
3,S&P 500 Hits Fresh Record High on Trade Optimism,other
4,3M stock price target cut to $195 from $205 at...,pessimism


In [None]:
# Pull the target column out of each split.
train_labels, test_labels = train['dominant_emotion'], test['dominant_emotion']

In [None]:
#encode labels with get dummies
# One-hot encode each split; the training columns define the class order used for both.
train_labels = pd.get_dummies(train_labels)
test_labels = pd.get_dummies(test_labels)

# Encoding the splits independently only yields matching columns when both contain
# exactly the same label values. Reject labels the training data never saw, then
# align the test frame to the training columns so a class that is absent from the
# test split becomes an all-False column instead of changing the frame's layout.
unseen_labels = test_labels.columns.difference(train_labels.columns)
if len(unseen_labels) > 0:
    raise ValueError(f'test labels not present in training data: {list(unseen_labels)}')
test_labels = test_labels.reindex(columns=train_labels.columns, fill_value=False)

In [None]:
# Display the encoded training labels.
train_labels

Unnamed: 0,optimism,other,pessimism
0,False,False,True
1,False,True,False
2,True,False,False
3,False,True,False
4,False,False,True
...,...,...,...
1724,True,False,False
1725,True,False,False
1726,True,False,False
1727,False,False,True


In [None]:
# Cast the label frames of both splits to float, then show the training frame.
train_labels, test_labels = (
    frame.astype(float) for frame in (train_labels, test_labels)
)
train_labels

Unnamed: 0,optimism,other,pessimism
0,0.0,0.0,1.0
1,0.0,1.0,0.0
2,1.0,0.0,0.0
3,0.0,1.0,0.0
4,0.0,0.0,1.0
...,...,...,...
1724,1.0,0.0,0.0
1725,1.0,0.0,0.0
1726,1.0,0.0,0.0
1727,0.0,0.0,1.0


In [None]:
# Turn each label frame into a plain list of per-row lists.
train_labels, test_labels = (
    frame.values.tolist() for frame in (train_labels, test_labels)
)

In [None]:
# Collect the headline strings of each split as Python lists.
train_texts, test_texts = (
    split['headline'].to_list() for split in (train, test)
)

In [None]:
# Load the tokenizer for the 'bert-base-cased' checkpoint.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path='bert-base-cased',
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

In [None]:
# Tokenize each split with truncation and padding enabled.
train_encodings, test_encodings = (
    tokenizer(texts, truncation=True, padding=True)
    for texts in (train_texts, test_texts)
)

In [None]:
#create dataset for headline classifier data

class HeadlineDataset(Dataset):
    """Torch dataset that serves encoded headlines together with their labels.

    Args:
        encodings: Mapping from feature name to a per-example indexable sequence.
        labels: Indexable sequence holding one label entry per example.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        """Return the number of examples, taken from the label sequence."""
        n_examples = len(self.labels)
        return n_examples

    def __getitem__(self, idx):
        """Return a dict of tensors for example ``idx``: each encoding field plus ``labels``."""
        sample = {}
        for field, column in self.encodings.items():
            sample[field] = torch.tensor(column[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

In [None]:
# Wrap each split's encodings and labels in a HeadlineDataset.
train_dataset, test_dataset = (
    HeadlineDataset(train_encodings, train_labels),
    HeadlineDataset(test_encodings, test_labels),
)

In [None]:
#load model

# NOTE(review): the labels built earlier in this notebook are one-hot rows with a
# single active class, while problem_type='multi_label_classification' treats every
# class as an independent yes/no target. It is kept because the float one-hot label
# format depends on it; consider 'single_label_classification' with integer class
# ids — confirm before changing.
model = AutoModelForSequenceClassification.from_pretrained('bert-base-cased',
                                                           problem_type = 'multi_label_classification',
                                                           num_labels=3)

# Evaluate once per epoch; batch size 16 for both training and evaluation; 8 epochs.
# NOTE(review): newer transformers releases call this argument `eval_strategy` —
# verify against the installed version.
training_args = TrainingArguments(
    output_dir='.',
    evaluation_strategy="epoch",
    per_device_train_batch_size=16,
    per_device_eval_batch_size = 16,
    num_train_epochs=8
)


def compute_accuracy(eval_pred):
    """Compute accuracy for the Trainer from logits and one-hot label rows.

    Args:
        eval_pred: Object exposing ``predictions`` (model logits) and ``label_ids``
            (one-hot label rows), as passed by the Trainer to ``compute_metrics``.

    Returns:
        dict: ``{'accuracy': fraction of rows whose highest-scoring class matches
        the active label column}``.
    """
    logits = eval_pred.predictions
    # Some models return several outputs; the logits are the first entry in that case.
    if isinstance(logits, tuple):
        logits = logits[0]
    predicted = np.asarray(logits).argmax(axis=-1)
    expected = np.asarray(eval_pred.label_ids).argmax(axis=-1)
    return {'accuracy': float((predicted == expected).mean())}


# Without a compute_metrics callback the evaluation results contain only the loss,
# so accuracy is reported explicitly at every evaluation.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_accuracy,
)

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Run fine-tuning; the per-epoch loss table and the final TrainOutput are shown as output.
trainer.train()

Epoch,Training Loss,Validation Loss
1,No log,0.559685
2,No log,0.575097
3,No log,0.67796
4,No log,0.954465
5,0.340700,1.177684
6,0.340700,1.323043
7,0.340700,1.402607
8,0.340700,1.399966


TrainOutput(global_step=872, training_loss=0.2042633595816586, metrics={'train_runtime': 54.84, 'train_samples_per_second': 252.225, 'train_steps_per_second': 15.901, 'total_flos': 376733191567824.0, 'train_loss': 0.2042633595816586, 'epoch': 8.0})

In [None]:
# Evaluate the trained model on the trainer's eval_dataset (the test split).
# The returned dict holds the evaluation loss and runtime statistics, plus any extra
# metrics if the trainer was constructed with a compute_metrics callback.

results = trainer.evaluate()
results

{'eval_loss': 1.3999660015106201,
 'eval_runtime': 0.4763,
 'eval_samples_per_second': 1135.776,
 'eval_steps_per_second': 71.38,
 'epoch': 8.0}