# Baseline RoBERTa classifier on the Yelp polarity dataset using Hugging Face Transformers


In [1]:
import os
from typing import List

%pip install datasets
%pip install transformers
import torch
from datasets import load_dataset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from torch.utils.data import DataLoader
from transformers import RobertaForSequenceClassification, RobertaTokenizer, Trainer, TrainingArguments

Collecting datasets
[?25l  Downloading https://files.pythonhosted.org/packages/f0/f4/2a3d6aee93ae7fce6c936dda2d7f534ad5f044a21238f85e28f0b205adf0/datasets-1.1.2-py3-none-any.whl (147kB)
[K     |████████████████████████████████| 153kB 11.7MB/s 
[?25hCollecting pyarrow>=0.17.1
[?25l  Downloading https://files.pythonhosted.org/packages/d7/e1/27958a70848f8f7089bff8d6ebe42519daf01f976d28b481e1bfd52c8097/pyarrow-2.0.0-cp36-cp36m-manylinux2014_x86_64.whl (17.7MB)
[K     |████████████████████████████████| 17.7MB 198kB/s 
Collecting xxhash
[?25l  Downloading https://files.pythonhosted.org/packages/f7/73/826b19f3594756cb1c6c23d2fbd8ca6a77a9cd3b650c9dec5acc85004c38/xxhash-2.0.0-cp36-cp36m-manylinux2010_x86_64.whl (242kB)
[K     |████████████████████████████████| 245kB 49.5MB/s 
Installing collected packages: pyarrow, xxhash, datasets
  Found existing installation: pyarrow 0.14.1
    Uninstalling pyarrow-0.14.1:
      Successfully uninstalled pyarrow-0.14.1
Successfully installed datasets-1

In [5]:
# --- Configuration ---------------------------------------------------------
RANDOM_SEED = 42
# Taking only subset of data (faster training, fine-tuning the whole dataset takes ~20 hours per epoch)
TRAIN_SIZE = 5_000
VALID_SIZE = 1_000
TEST_SIZE = 1_000

# Seed torch *before* the model is built: the classification head is not part of the
# roberta-base checkpoint and is randomly initialised, so without this its starting
# weights (and therefore the final metrics) change on every kernel restart.
torch.manual_seed(RANDOM_SEED)

# --- Tokenizer and model ---------------------------------------------------
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = RobertaForSequenceClassification.from_pretrained('roberta-base', return_dict=True)

# --- Data ------------------------------------------------------------------
# Only the "train" split is loaded; the train / validation / test subsets used in this
# notebook are all carved out of it.
dataset = load_dataset("yelp_polarity", split="train")

# First cut: TRAIN_SIZE examples for training, the remainder is kept for the second cut.
train_test_split = dataset.train_test_split(train_size=TRAIN_SIZE, seed=RANDOM_SEED)
train_dataset = train_test_split["train"]

# Second cut: sample the validation and test subsets from the remainder.
test_val_dataset = train_test_split["test"].train_test_split(train_size=VALID_SIZE, test_size=TEST_SIZE, seed=RANDOM_SEED)
val_dataset, test_dataset = test_val_dataset["train"], test_val_dataset["test"]

# Fail fast if the subsets do not have the configured sizes.
assert (len(train_dataset), len(val_dataset), len(test_dataset)) == (TRAIN_SIZE, VALID_SIZE, TEST_SIZE)

print(f"Train size: {len(train_dataset)}, Validation size: {len(val_dataset)}, Test size: {len(test_dataset)}")

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifie

Train size: 5000, Validation size: 1000, Test size: 1000


In [6]:
class DataCollator:
    """Turn a list of raw examples into a padded tensor batch.

    Tokenization happens here, per batch, so every batch is padded only up to
    the length of its own longest (possibly truncated) sequence.
    """

    def __init__(self, tokenizer):
        # Any callable accepting (texts, truncation=..., padding=...) and returning a
        # mapping with 'input_ids' and 'attention_mask' entries.
        self.tokenizer = tokenizer

    def __call__(self, examples: List[dict]):
        """Collate ``examples`` (dicts with 'label' and 'text' keys) into one batch.

        Returns a dict with the tensors 'labels', 'input_ids' and 'attention_mask'.
        """
        targets = [item['label'] for item in examples]
        sentences = [item['text'] for item in examples]

        encoded = self.tokenizer(sentences, truncation=True, padding=True)

        batch = {'labels': torch.tensor(targets)}
        for field in ('input_ids', 'attention_mask'):
            batch[field] = torch.tensor(encoded[field])
        return batch
    
# Single collator instance, shared by training and evaluation via the Trainer.
data_collator = DataCollator(tokenizer=tokenizer)

I thought that using my own DataCollator would slow things down. However, it turns out that it runs at ~1.20 s/it, compared to 1.46 s/it with the default data collator, so the speed of data loading is not an issue here. The speedup may be due to the shorter sequence length of some batches (with the `padding='max_length'` strategy, the speed is the same as with the default collator).


In [7]:
def compute_metrics(pred):
    """Compute accuracy, F1, precision and recall for an evaluation run.

    ``pred`` must expose ``label_ids`` (the reference labels) and ``predictions``
    (per-class scores; the predicted class is the argmax over the last axis).
    Precision, recall and F1 use scikit-learn's ``average='binary'`` mode.

    Returns a dict with the keys 'accuracy', 'f1', 'precision' and 'recall'.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)

    # The fourth element (support) is not reported.
    prec, rec, f_score, _support = precision_recall_fscore_support(y_true, y_pred, average='binary')
    accuracy = accuracy_score(y_true, y_pred)

    return dict(accuracy=accuracy, f1=f_score, precision=prec, recall=rec)

In [9]:
training_args = TrainingArguments(
    # --- Optimisation ---
    learning_rate=3e-5,
    weight_decay=0.01,
    num_train_epochs=2,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=2,  # actual batch size: 16 (as suggested in Bert paper)
    warmup_steps=250,  # don't have any intuition for the right value here
    # Tie the training seed to the same constant that drives the data split, so that
    # changing RANDOM_SEED changes every stochastic step instead of only the sampling.
    seed=RANDOM_SEED,
    # --- Logging, checkpoints and evaluation ---
    output_dir='./results',
    logging_dir='./logs',
    logging_steps=25,
    save_steps=150,
    eval_steps=150,
    evaluation_strategy='steps',  # evaluation every eval_steps (without it no evaluation is done)
    # --- Data handling / hardware ---
    remove_unused_columns=False,  # the custom collator reads the raw 'text' column itself
    no_cuda=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,  # used for the periodic evaluation during training
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Kept as the last expression of the cell so the returned TrainOutput is displayed.
trainer.train()

Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
150,0.247186,0.17742,0.959,0.957425,0.954451,0.960417
300,0.232007,0.166232,0.946,0.94375,0.94375,0.94375
450,0.081021,0.212733,0.951,0.950455,0.923379,0.979167
600,0.149918,0.172889,0.964,0.962733,0.95679,0.96875


TrainOutput(global_step=624, training_loss=0.2184932415301983)

In [10]:
# Final evaluation on the held-out test subset (not used during training or validation).
trainer.evaluate(eval_dataset=test_dataset)

{'epoch': 1.9984,
 'eval_accuracy': 0.957,
 'eval_f1': 0.9582929194956353,
 'eval_loss': 0.20396965742111206,
 'eval_precision': 0.9610894941634242,
 'eval_recall': 0.9555125725338491,
 'total_flos': 2913644550765120}