# Roberta Classifier on Twitter: baseline


In [1]:
import json
import os
from typing import List

%pip install datasets
%pip install transformers
import torch
from datasets import load_dataset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from torch.utils.data import DataLoader
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments


ROOT_DIR = "drive/My Drive/Colab Notebooks/nlp/results/twitter_baseline"
if not os.path.exists(ROOT_DIR):
    os.mkdir(ROOT_DIR)



In [2]:
def get_datasets(dataset_name, train_size, val_size=5_000, test_size=20_000, random_seed: int = 42):
    """Returns a tuple of train, validation and test datasets of sizes determined by arguments."""
    dataset = load_dataset(dataset_name, split="train")
    # We want test and validation data to be the same for every experiment
    test_split = dataset.train_test_split(test_size=test_size, seed=random_seed)
    test_dataset = test_split["test"]
    train_val_split = test_split["train"].train_test_split(test_size=val_size, seed=random_seed)
    # Validation and test sets
    train_dataset = train_val_split["train"].train_test_split(train_size=train_size, seed=random_seed)["train"]
    val_dataset = train_val_split["test"]
    return train_dataset, val_dataset, test_dataset


class DataCollator:
    def __init__(self, tokenizer):
        self.tokenizer = tokenizer
        
    def __call__(self, examples: List[dict]):
        labels = [0 if example['sentiment'] == 0 else 1 for example in examples]
        texts = [example['text'] for example in examples]
        tokenizer_output = self.tokenizer(texts, truncation=True, padding=True)
        return {
            'labels': torch.tensor(labels), 
            'input_ids': torch.tensor(tokenizer_output['input_ids']), 
            'attention_mask': torch.tensor(tokenizer_output['attention_mask'])
            }


def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

In [3]:
tokenizer = AutoTokenizer.from_pretrained('roberta-base', use_fast=True)
model = AutoModelForSequenceClassification.from_pretrained('roberta-base', return_dict=True)
data_collator = DataCollator(tokenizer)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifie

In [4]:
dataset = load_dataset("sentiment140", split="train")
dataset[45]  # this one is confusing (45)
# In general, it would be worth it to describe them in my master thesis

Reusing dataset sentiment140 (/root/.cache/huggingface/datasets/sentiment140/sentiment140/1.0.0/9fe1c0ce3319c47cc65ff7e49aac6c34d9c050ab1432988c104b3b275e360f3f)


{'date': 'Mon Apr 06 22:22:05 PDT 2009',
 'query': 'NO_QUERY',
 'sentiment': 0,
 'text': "@Viennah Yay! I'm happy for you with your job! But that also means less time for me and you... ",
 'user': 'antzpantz'}

In [5]:
TRAIN_SIZES = [20, 100, 1_000, 10_000, 100_000]
for train_size in TRAIN_SIZES:
    train_dataset, val_dataset, test_dataset = get_datasets("sentiment140", train_size, val_size=5_000, test_size=20_000)
    print(f"Train size: {len(train_dataset)}, Validation size: {len(val_dataset)}, Test size: {len(test_dataset)}")
    print(train_dataset[0])
    print(val_dataset[0])
    print(test_dataset[0])
    

    output_dir = os.path.join(ROOT_DIR, f"train_size_{train_size}")

    num_train_epochs = 6 if train_size <= 10_000 else 3  # don't want to wait too long for a converged model

    # https://huggingface.co/transformers/main_classes/trainer.html#trainingarguments
    training_args = TrainingArguments(
        learning_rate=3e-5,
        weight_decay=0.01,
        output_dir=output_dir,
        num_train_epochs=num_train_epochs,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        warmup_steps=0,  # don't have any intuition for the right value here
        logging_dir=output_dir,
        logging_steps=10,
        load_best_model_at_end=True,
        evaluation_strategy='epoch',
        remove_unused_columns=False,
        no_cuda=False,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        data_collator=data_collator,
        compute_metrics=compute_metrics
        
    )

    trainer.train()

    test_result = trainer.evaluate(test_dataset)

    print(test_result)

    with open(os.path.join(output_dir, 'test_result.json'), 'w') as f:
        json.dump(test_result, f, indent=4)

Reusing dataset sentiment140 (/root/.cache/huggingface/datasets/sentiment140/sentiment140/1.0.0/9fe1c0ce3319c47cc65ff7e49aac6c34d9c050ab1432988c104b3b275e360f3f)
Loading cached split indices for dataset at /root/.cache/huggingface/datasets/sentiment140/sentiment140/1.0.0/9fe1c0ce3319c47cc65ff7e49aac6c34d9c050ab1432988c104b3b275e360f3f/cache-fbbd2e12a01297cb.arrow and /root/.cache/huggingface/datasets/sentiment140/sentiment140/1.0.0/9fe1c0ce3319c47cc65ff7e49aac6c34d9c050ab1432988c104b3b275e360f3f/cache-f68c56688e800ed5.arrow
Loading cached split indices for dataset at /root/.cache/huggingface/datasets/sentiment140/sentiment140/1.0.0/9fe1c0ce3319c47cc65ff7e49aac6c34d9c050ab1432988c104b3b275e360f3f/cache-d7a51a012ad6e2b7.arrow and /root/.cache/huggingface/datasets/sentiment140/sentiment140/1.0.0/9fe1c0ce3319c47cc65ff7e49aac6c34d9c050ab1432988c104b3b275e360f3f/cache-ad4bdc05c1b33993.arrow
Loading cached split indices for dataset at /root/.cache/huggingface/datasets/sentiment140/sentiment14

Train size: 20, Validation size: 5000, Test size: 20000
{'date': 'Fri May 29 07:29:46 PDT 2009', 'query': 'NO_QUERY', 'sentiment': 4, 'text': 'Packing ', 'user': 'hannamelaa'}
{'date': 'Wed Jun 24 23:21:53 PDT 2009', 'query': 'NO_QUERY', 'sentiment': 0, 'text': "yea!!!! I think I have someone to watch the dogs while we're away. just have to kennel for 2 days instead of 11. no one wanted Zhu ", 'user': 'lindentreephoto'}
{'date': 'Sun Jun 07 07:43:33 PDT 2009', 'query': 'NO_QUERY', 'sentiment': 0, 'text': 'why am i awake so early?  damn projects. super nervous for the science one. mines gunna be so lame i dont wanna fail ', 'user': '_stacey_rae'}


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.696513,0.5032,0.669505,0.5032,1.0
2,No log,0.709801,0.5032,0.669505,0.5032,1.0
3,No log,0.72591,0.5032,0.669505,0.5032,1.0
4,No log,0.748271,0.5032,0.669505,0.5032,1.0
5,0.659558,0.761789,0.5032,0.669505,0.5032,1.0
6,0.659558,0.766794,0.5032,0.669505,0.5032,1.0


{'eval_loss': 0.6970205903053284, 'eval_accuracy': 0.49975, 'eval_f1': 0.6664444074012336, 'eval_precision': 0.49975, 'eval_recall': 1.0, 'epoch': 6.0, 'total_flos': 3592830028080}


Reusing dataset sentiment140 (/root/.cache/huggingface/datasets/sentiment140/sentiment140/1.0.0/9fe1c0ce3319c47cc65ff7e49aac6c34d9c050ab1432988c104b3b275e360f3f)
Loading cached split indices for dataset at /root/.cache/huggingface/datasets/sentiment140/sentiment140/1.0.0/9fe1c0ce3319c47cc65ff7e49aac6c34d9c050ab1432988c104b3b275e360f3f/cache-fbbd2e12a01297cb.arrow and /root/.cache/huggingface/datasets/sentiment140/sentiment140/1.0.0/9fe1c0ce3319c47cc65ff7e49aac6c34d9c050ab1432988c104b3b275e360f3f/cache-f68c56688e800ed5.arrow
Loading cached split indices for dataset at /root/.cache/huggingface/datasets/sentiment140/sentiment140/1.0.0/9fe1c0ce3319c47cc65ff7e49aac6c34d9c050ab1432988c104b3b275e360f3f/cache-d7a51a012ad6e2b7.arrow and /root/.cache/huggingface/datasets/sentiment140/sentiment140/1.0.0/9fe1c0ce3319c47cc65ff7e49aac6c34d9c050ab1432988c104b3b275e360f3f/cache-ad4bdc05c1b33993.arrow


Train size: 100, Validation size: 5000, Test size: 20000
{'date': 'Fri Jun 05 08:43:16 PDT 2009', 'query': 'NO_QUERY', 'sentiment': 0, 'text': 'So wants to visit the Himalayas NOW! I hate this heat - it is driving me crazy, no Meru in sight too ', 'user': 'NerdIndian'}
{'date': 'Wed Jun 24 23:21:53 PDT 2009', 'query': 'NO_QUERY', 'sentiment': 0, 'text': "yea!!!! I think I have someone to watch the dogs while we're away. just have to kennel for 2 days instead of 11. no one wanted Zhu ", 'user': 'lindentreephoto'}
{'date': 'Sun Jun 07 07:43:33 PDT 2009', 'query': 'NO_QUERY', 'sentiment': 0, 'text': 'why am i awake so early?  damn projects. super nervous for the science one. mines gunna be so lame i dont wanna fail ', 'user': '_stacey_rae'}


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.693654,0.5098,0.611999,0.50855,0.768283
2,0.713399,0.690712,0.5176,0.248598,0.574928,0.158585
3,0.655428,0.683038,0.5466,0.389442,0.60401,0.287361
4,0.655428,0.667072,0.585,0.611496,0.578053,0.649046
5,0.557447,0.657749,0.6024,0.620321,0.597059,0.645469
6,0.451040,0.658409,0.6062,0.598409,0.614579,0.583068


{'eval_loss': 0.6618354916572571, 'eval_accuracy': 0.60035, 'eval_f1': 0.6113677249963534, 'eval_precision': 0.5946840711312902, 'eval_recall': 0.6290145072536268, 'epoch': 6.0, 'total_flos': 18927423470160}


Reusing dataset sentiment140 (/root/.cache/huggingface/datasets/sentiment140/sentiment140/1.0.0/9fe1c0ce3319c47cc65ff7e49aac6c34d9c050ab1432988c104b3b275e360f3f)
Loading cached split indices for dataset at /root/.cache/huggingface/datasets/sentiment140/sentiment140/1.0.0/9fe1c0ce3319c47cc65ff7e49aac6c34d9c050ab1432988c104b3b275e360f3f/cache-fbbd2e12a01297cb.arrow and /root/.cache/huggingface/datasets/sentiment140/sentiment140/1.0.0/9fe1c0ce3319c47cc65ff7e49aac6c34d9c050ab1432988c104b3b275e360f3f/cache-f68c56688e800ed5.arrow
Loading cached split indices for dataset at /root/.cache/huggingface/datasets/sentiment140/sentiment140/1.0.0/9fe1c0ce3319c47cc65ff7e49aac6c34d9c050ab1432988c104b3b275e360f3f/cache-d7a51a012ad6e2b7.arrow and /root/.cache/huggingface/datasets/sentiment140/sentiment140/1.0.0/9fe1c0ce3319c47cc65ff7e49aac6c34d9c050ab1432988c104b3b275e360f3f/cache-ad4bdc05c1b33993.arrow


Train size: 1000, Validation size: 5000, Test size: 20000
{'date': 'Tue Jun 16 18:06:27 PDT 2009', 'query': 'NO_QUERY', 'sentiment': 0, 'text': "I have seen the most awesome couch, but it's too big and too far away to pick up with alana's car. But it's soooo nice. ", 'user': 'rileyo'}
{'date': 'Wed Jun 24 23:21:53 PDT 2009', 'query': 'NO_QUERY', 'sentiment': 0, 'text': "yea!!!! I think I have someone to watch the dogs while we're away. just have to kennel for 2 days instead of 11. no one wanted Zhu ", 'user': 'lindentreephoto'}
{'date': 'Sun Jun 07 07:43:33 PDT 2009', 'query': 'NO_QUERY', 'sentiment': 0, 'text': 'why am i awake so early?  damn projects. super nervous for the science one. mines gunna be so lame i dont wanna fail ', 'user': '_stacey_rae'}


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.536299,0.469682,0.7814,0.806446,0.727244,0.905008
2,0.379594,0.456506,0.8024,0.809561,0.785928,0.834658
3,0.277192,0.598859,0.8126,0.808658,0.831583,0.786963
4,0.156042,0.863906,0.8,0.803922,0.793344,0.814785
5,0.100813,1.016466,0.7978,0.813089,0.760111,0.874006
6,0.041074,0.992444,0.8136,0.815153,0.813539,0.816773


{'eval_loss': 0.44245052337646484, 'eval_accuracy': 0.81135, 'eval_f1': 0.8168713294180459, 'eval_precision': 0.7932692307692307, 'eval_recall': 0.8419209604802401, 'epoch': 6.0, 'total_flos': 188580199259040}


Reusing dataset sentiment140 (/root/.cache/huggingface/datasets/sentiment140/sentiment140/1.0.0/9fe1c0ce3319c47cc65ff7e49aac6c34d9c050ab1432988c104b3b275e360f3f)
Loading cached split indices for dataset at /root/.cache/huggingface/datasets/sentiment140/sentiment140/1.0.0/9fe1c0ce3319c47cc65ff7e49aac6c34d9c050ab1432988c104b3b275e360f3f/cache-fbbd2e12a01297cb.arrow and /root/.cache/huggingface/datasets/sentiment140/sentiment140/1.0.0/9fe1c0ce3319c47cc65ff7e49aac6c34d9c050ab1432988c104b3b275e360f3f/cache-f68c56688e800ed5.arrow
Loading cached split indices for dataset at /root/.cache/huggingface/datasets/sentiment140/sentiment140/1.0.0/9fe1c0ce3319c47cc65ff7e49aac6c34d9c050ab1432988c104b3b275e360f3f/cache-d7a51a012ad6e2b7.arrow and /root/.cache/huggingface/datasets/sentiment140/sentiment140/1.0.0/9fe1c0ce3319c47cc65ff7e49aac6c34d9c050ab1432988c104b3b275e360f3f/cache-ad4bdc05c1b33993.arrow


Train size: 10000, Validation size: 5000, Test size: 20000
{'date': 'Fri May 29 17:32:59 PDT 2009', 'query': 'NO_QUERY', 'sentiment': 4, 'text': "Had a faboo workout today... now I'm chillaxin' and sending luv to my Myspace Peeeeoples! ", 'user': 'Tani122'}
{'date': 'Wed Jun 24 23:21:53 PDT 2009', 'query': 'NO_QUERY', 'sentiment': 0, 'text': "yea!!!! I think I have someone to watch the dogs while we're away. just have to kennel for 2 days instead of 11. no one wanted Zhu ", 'user': 'lindentreephoto'}
{'date': 'Sun Jun 07 07:43:33 PDT 2009', 'query': 'NO_QUERY', 'sentiment': 0, 'text': 'why am i awake so early?  damn projects. super nervous for the science one. mines gunna be so lame i dont wanna fail ', 'user': '_stacey_rae'}


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.393051,0.368648,0.8462,0.845923,0.852929,0.83903
2,0.333301,0.509522,0.8456,0.838898,0.883128,0.798887
3,0.269244,0.517283,0.8508,0.851631,0.852309,0.850954
4,0.134808,0.696503,0.85,0.848607,0.862182,0.835453
5,0.130817,0.786245,0.8548,0.855148,0.858574,0.851749
6,0.051373,0.89313,0.8512,0.852906,0.848544,0.857313


{'eval_loss': 0.35943934321403503, 'eval_accuracy': 0.84785, 'eval_f1': 0.8473998294970163, 'eval_precision': 0.8494872310476573, 'eval_recall': 0.8453226613306654, 'epoch': 6.0, 'total_flos': 1912869374850240}


Reusing dataset sentiment140 (/root/.cache/huggingface/datasets/sentiment140/sentiment140/1.0.0/9fe1c0ce3319c47cc65ff7e49aac6c34d9c050ab1432988c104b3b275e360f3f)
Loading cached split indices for dataset at /root/.cache/huggingface/datasets/sentiment140/sentiment140/1.0.0/9fe1c0ce3319c47cc65ff7e49aac6c34d9c050ab1432988c104b3b275e360f3f/cache-fbbd2e12a01297cb.arrow and /root/.cache/huggingface/datasets/sentiment140/sentiment140/1.0.0/9fe1c0ce3319c47cc65ff7e49aac6c34d9c050ab1432988c104b3b275e360f3f/cache-f68c56688e800ed5.arrow
Loading cached split indices for dataset at /root/.cache/huggingface/datasets/sentiment140/sentiment140/1.0.0/9fe1c0ce3319c47cc65ff7e49aac6c34d9c050ab1432988c104b3b275e360f3f/cache-d7a51a012ad6e2b7.arrow and /root/.cache/huggingface/datasets/sentiment140/sentiment140/1.0.0/9fe1c0ce3319c47cc65ff7e49aac6c34d9c050ab1432988c104b3b275e360f3f/cache-ad4bdc05c1b33993.arrow


Train size: 100000, Validation size: 5000, Test size: 20000
{'date': 'Tue Jun 16 06:33:39 PDT 2009', 'query': 'NO_QUERY', 'sentiment': 0, 'text': '@ArtyTheCat @SirBarley @smokeythedog @PantherQueen @hanseebundee we will have to try our sleepover again! Sorry. ', 'user': 'SadiePetunia'}
{'date': 'Wed Jun 24 23:21:53 PDT 2009', 'query': 'NO_QUERY', 'sentiment': 0, 'text': "yea!!!! I think I have someone to watch the dogs while we're away. just have to kennel for 2 days instead of 11. no one wanted Zhu ", 'user': 'lindentreephoto'}
{'date': 'Sun Jun 07 07:43:33 PDT 2009', 'query': 'NO_QUERY', 'sentiment': 0, 'text': 'why am i awake so early?  damn projects. super nervous for the science one. mines gunna be so lame i dont wanna fail ', 'user': '_stacey_rae'}


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.431152,0.31328,0.8726,0.871702,0.883626,0.860095
2,0.245532,0.345563,0.8742,0.877124,0.862466,0.892289
3,0.155908,0.454297,0.8776,0.87833,0.878679,0.877981


{'eval_loss': 0.32321497797966003, 'eval_accuracy': 0.8652, 'eval_f1': 0.8624630139781655, 'eval_precision': 0.8798792547101072, 'eval_recall': 0.8457228614307154, 'epoch': 3.0, 'total_flos': 9749845795467840}
