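# BERT Fine-tuning

This notebook fine-tunes `bert-base-uncased` to classify 20 Newsgroups posts by topic using the Hugging Face `Trainer`, saves the resulting model and tokenizer, and then runs a few sample predictions.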

In [3]:
import random
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split

import torch
from transformers.file_utils import is_tf_available, is_torch_available
from transformers import BertTokenizerFast, BertForSequenceClassification
from transformers import Trainer, TrainingArguments

In [10]:
is_torch_available()

True

In [6]:
def set_seed(seed: int):
    """
    Seed every random number generator this notebook may touch.

    ``random`` and ``numpy`` are always seeded; ``torch`` and ``tensorflow``
    are seeded only when the corresponding availability check succeeds.

    Args:
        seed (:obj:`int`): Value handed to each library's seeding function.
    """
    # Standard-library and NumPy generators are unconditional.
    for seed_fn in (random.seed, np.random.seed):
        seed_fn(seed)

    if is_torch_available():
        # CPU generator first, then all CUDA devices; the latter call is
        # harmless when no GPU is present.
        for seed_fn in (torch.manual_seed, torch.cuda.manual_seed_all):
            seed_fn(seed)

    if is_tf_available():
        # Imported lazily so TensorFlow is only required when it is installed.
        import tensorflow as tf

        tf.random.set_seed(seed)

set_seed(1)

In [4]:
# Checkpoint identifier shared by the tokenizer and the classification model
# loaded later in the notebook.
model_name = 'bert-base-uncased'
# Upper bound on tokens per example; passed to the tokenizer together with
# truncation=True wherever texts are encoded.
max_length = 512

## Loading the Dataset

In [5]:
# Load the fast tokenizer that belongs to `model_name`. do_lower_case=True is
# passed explicitly; presumably it matches the uncased checkpoint's own
# default, so it is likely redundant — confirm against the tokenizer config.
tokenizer = BertTokenizerFast.from_pretrained(model_name, 
                                              do_lower_case=True)

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [17]:
def read_20newsgroups(test_size=0.2, random_state=None):
    """Fetch the 20 Newsgroups corpus and split it into train/validation sets.

    The complete corpus (``subset='all'``) is downloaded with headers, footers
    and quoted replies removed, then partitioned with ``train_test_split``.

    Args:
        test_size: Fraction (or absolute number) of documents placed in the
            validation split; forwarded to ``train_test_split``.
        random_state: Seed forwarded to ``train_test_split``. The default
            ``None`` keeps the previous behaviour, where the split relies on
            NumPy's global random state and is therefore only reproducible
            if that state was seeded earlier in the same kernel session.
            Pass an integer to make the split independent of cell execution
            order.

    Returns:
        A 2-tuple ``(splits, target_names)`` where ``splits`` is the list
        ``[train_texts, valid_texts, train_labels, valid_labels]`` produced by
        ``train_test_split`` and ``target_names`` holds the newsgroup names
        indexed by label id.
    """
    dataset = fetch_20newsgroups(subset='all',
                                 shuffle=True,
                                 remove=('headers', 'footers', 'quotes'))

    splits = train_test_split(dataset.data,
                              dataset.target,
                              test_size=test_size,
                              random_state=random_state)

    return splits, dataset.target_names

In [19]:
# read_20newsgroups returns (splits, target_names); unpack the four split
# pieces and the list of class names in one step.
ret = read_20newsgroups()
(train_texts, valid_texts, train_labels, valid_labels), target_names = ret

In [21]:
train_labels

array([17, 16, 10, ...,  9, 15,  4])

In [41]:
valid_labels

array([ 5, 12,  5, ..., 18,  6, 10])

In [27]:
# Tokenize both splits up front. truncation=True together with max_length
# caps every example at `max_length` tokens. Each split is encoded (and
# padded, via padding=True) in its own call, independently of the other.
train_encodings = tokenizer(train_texts, truncation=True,
                           padding=True, max_length=max_length)
valid_encodings = tokenizer(valid_texts, truncation=True,
                           padding=True, max_length=max_length)

In [31]:
class NewsGroupsDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with integer labels.

    ``encodings`` is a mapping from field name to a per-example sequence and
    ``labels`` is an indexable collection of class ids of the same length.
    """

    def __init__(self, encodings, labels):
        # Keep references only; tensors are built lazily per item.
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # The number of examples is defined by the label collection.
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, including ``'labels'``."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        # The label is wrapped in a list, so the resulting tensor has shape (1,).
        sample['labels'] = torch.tensor([self.labels[idx]])
        return sample

In [34]:
train_dataset = NewsGroupsDataset(train_encodings, train_labels)
valid_dataset = NewsGroupsDataset(valid_encodings, valid_labels)

## Training the Model

In [35]:
# Use the GPU when one is available; otherwise fall back to the CPU so this
# cell does not crash on machines without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# num_labels sizes the classification head to one output per newsgroup name.
model = BertForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(target_names),
).to(device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [42]:
from sklearn.metrics import accuracy_score

def compute_metrics(pred):
    """Compute accuracy for an evaluation pass.

    Args:
        pred: Object exposing ``predictions`` (per-class scores) and
            ``label_ids`` (reference labels).

    Returns:
        A dict with the single key ``'accuracy'``.
    """
    # The predicted class is the arg-max over the last axis of the scores.
    predicted_classes = pred.predictions.argmax(-1)
    return {'accuracy': accuracy_score(pred.label_ids, predicted_classes)}

In [43]:
# Hyper-parameters and bookkeeping options handed to the Trainer.
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=20,   # batch size per device during evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    load_best_model_at_end=True,     # load the best model when finished training (default metric is loss)
    # but you can specify `metric_for_best_model` argument to change to accuracy or other metric
    logging_steps=200,               # log every 200 steps
    # NOTE(review): whether checkpoints are also written every `logging_steps`
    # depends on the installed transformers version's save settings — confirm.
    evaluation_strategy="steps",     # evaluate on a step schedule; presumably every `logging_steps` since no eval_steps is given — confirm
)

In [None]:
# Wire the model, arguments, datasets and metric callback into a Trainer;
# nothing is trained until `trainer.train()` is called.
trainer = Trainer(
    model=model,                         # the instantiated Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=valid_dataset,          # evaluation dataset
    compute_metrics=compute_metrics,     # the callback that computes metrics of interest
)

In [45]:
trainer.train()

Step,Training Loss,Validation Loss,Accuracy,Runtime,Samples Per Second
200,1.9398,1.63082,0.571353,21.2473,177.434
400,1.4044,1.201034,0.655438,21.2678,177.264
600,1.1561,1.079928,0.67878,21.27,177.245
800,1.0889,0.987148,0.693369,21.2547,177.373
1000,0.9321,0.901242,0.729973,21.2348,177.539
1200,0.7338,0.96496,0.707958,21.2336,177.549
1400,0.7406,0.854132,0.743236,21.2246,177.624
1600,0.6905,0.862933,0.746684,21.2355,177.533
1800,0.7019,0.855739,0.744032,21.242,177.478
2000,0.5337,0.835247,0.754907,21.2469,177.438


TrainOutput(global_step=2829, training_loss=0.8166770830555575, metrics={'train_runtime': 1154.2138, 'train_samples_per_second': 2.451, 'total_flos': 15213644873809920, 'epoch': 3.0})

In [46]:
trainer.evaluate()

{'eval_loss': 0.8043307065963745,
 'eval_accuracy': 0.773474801061008,
 'eval_runtime': 21.2142,
 'eval_samples_per_second': 177.711,
 'epoch': 3.0}

In [47]:
# Write the model and the tokenizer to the same directory (relative to the
# current working directory) so both are stored together under `model_path`.
model_path = '20newsgroups-bert-base-uncased'
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)

('20newsgroups-bert-base-uncased/tokenizer_config.json',
 '20newsgroups-bert-base-uncased/special_tokens_map.json',
 '20newsgroups-bert-base-uncased/vocab.txt',
 '20newsgroups-bert-base-uncased/added_tokens.json')

## Performing Inference

In [53]:
def get_prediction(text):
    """Predict the newsgroup name for ``text`` with the notebook's model.

    Relies on the notebook-level ``tokenizer``, ``model``, ``max_length`` and
    ``target_names``. Calling this function switches ``model`` to evaluation
    mode.

    Args:
        text: A single string, or a list of strings classified as one batch.

    Returns:
        For a string input, the predicted entry of ``target_names``. For a
        list input, a list with one predicted entry per element.
    """
    # Place the inputs on whatever device the model currently lives on
    # instead of assuming a CUDA device exists.
    inputs = tokenizer(text, padding=True,
                       truncation=True, max_length=max_length,
                       return_tensors='pt').to(model.device)

    # Inference only: disable dropout and skip autograd bookkeeping.
    model.eval()
    with torch.no_grad():
        outputs = model(**inputs)

    # outputs[0] is treated as the per-class scores. Take the arg-max per row
    # (dim=1) so a multi-example batch is not flattened into a single index.
    probs = outputs[0].softmax(1)
    predicted = probs.argmax(dim=1).tolist()

    if isinstance(text, str):
        return target_names[predicted[0]]
    return [target_names[label_id] for label_id in predicted]


In [54]:
text = """
The first thing is first. 
If you purchase a Macbook, you should not encounter performance issues that will prevent you from learning to code efficiently.
However, in the off chance that you have to deal with a slow computer, you will need to make some adjustments. 
Having too many background apps running in the background is one of the most common causes. 
The same can be said about a lack of drive storage. 
For that, it helps if you uninstall xcode and other unnecessary applications, as well as temporary system junk like caches and old backups.
"""
get_prediction(text)

'comp.sys.mac.hardware'

In [55]:
text = """
The first thing is first. 
If you purchase a Macbook, you should not encounter performance issues that will prevent you from learning to code efficiently.
However, in the off chance that you have to deal with a slow computer, you will need to make some adjustments. 
Having too many background apps running in the background is one of the most common causes. 
The same can be said about a lack of drive storage. 
For that, it helps if you uninstall xcode and other unnecessary applications, as well as temporary system junk like caches and old backups.
"""
print(get_prediction(text))

comp.sys.mac.hardware


In [56]:
text = """
A black hole is a place in space where gravity pulls so much that even light can not get out. 
The gravity is so strong because matter has been squeezed into a tiny space. This can happen when a star is dying.
Because no light can get out, people can't see black holes. 
They are invisible. Space telescopes with special tools can help find black holes. 
The special tools can see how stars that are very close to black holes act differently than other stars.
"""
print(get_prediction(text))

sci.space


In [57]:
text = """
Coronavirus disease (COVID-19) is an infectious disease caused by a newly discovered coronavirus.
Most people infected with the COVID-19 virus will experience mild to moderate respiratory illness and recover without requiring special treatment.  
Older people, and those with underlying medical problems like cardiovascular disease, diabetes, chronic respiratory disease, and cancer are more likely to develop serious illness.
"""
print(get_prediction(text))


sci.med
