In [2]:
!pip install transformers
!pip install datasets
!pip install torch
!pip install sklearn

import torch
from torch import nn
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score, recall_score
from transformers import Trainer, AutoModelForSequenceClassification, AutoTokenizer, TrainingArguments, DataCollatorWithPadding

from scipy.special import softmax
from sklearn.model_selection import train_test_split
from datasets import load_metric, load_dataset, DatasetDict
import csv

from os.path import exists

import urllib.request

from IPython.display import clear_output
# Clear this cell's output so the pip install logs above do not clutter the notebook.
clear_output()

In [3]:
# Experiment variants this notebook is concerned with.
TASKS = ["finetune", "from-scratch"]

# Directory where the tokenized dataset is cached between kernel sessions.
DATA_SAVE_LOC = "./.cache/tokenized_sentiment140"

# Raw sentiment140 dataset (all splits) from the Hugging Face hub / local cache.
senti = load_dataset("sentiment140")

# Tokenizer matching the roberta-base checkpoint used for the model below.
tokenizer = AutoTokenizer.from_pretrained("roberta-base")

Reusing dataset sentiment140 (/root/.cache/huggingface/datasets/sentiment140/sentiment140/1.0.0/f81c014152931b776735658d8ae493b181927de002e706c4d5244ecb26376997)


  0%|          | 0/2 [00:00<?, ?it/s]

In [4]:
# Load the tokenized dataset from the on-disk cache when it exists; otherwise
# encode the raw `senti` dataset and write the result to the cache.
# NOTE: the cache is keyed only by its path. After changing the preprocessing
# below, delete DATA_SAVE_LOC so the dataset is actually re-encoded.
if exists(DATA_SAVE_LOC):
    print(f'Reusing tokenized dataset in {DATA_SAVE_LOC}.')
    tokenized_senti = DatasetDict.load_from_disk(DATA_SAVE_LOC)

else:
    print(f"Couldn't find cached tokenized dataset in {DATA_SAVE_LOC}. Starting encoding.")

    def preprocess(text):
        """Normalize one tweet: mask user mentions and links.

        The text is split on single spaces. Any token that starts with '@'
        and is longer than one character becomes '@user'; any token that
        starts with 'http' becomes 'http'. Tokens are re-joined with spaces.
        """
        new_text = []

        for t in text.split(" "):
            t = '@user' if t.startswith('@') and len(t) > 1 else t
            t = 'http' if t.startswith('http') else t
            new_text.append(t)

        return " ".join(new_text)

    def preprocess2(examples):
        """Tokenize one batch and attach integer class labels.

        `examples` is a batch in column form (column name -> list of values),
        as passed by `Dataset.map(..., batched=True)`. Returns the tokenizer
        encodings plus a "label" column derived from "sentiment".
        """
        # Normalize EVERY text in the batch. Iterating over
        # range(len(examples)) would only cover as many rows as the batch has
        # columns, because len() of the batch mapping counts its keys.
        # Building a new list also avoids mutating the input batch.
        texts = [preprocess(text) for text in examples["text"]]

        encodings = tokenizer(texts, padding=True, truncation=True, max_length=512)

        # Map the raw sentiment values (0, 2, 4) onto contiguous class ids.
        label_dict = {0: 0, 2: 1, 4: 2}
        encodings["label"] = [label_dict[lab] for lab in examples["sentiment"]]

        return encodings

    tokenized_senti = senti.map(preprocess2, batched=True)
    tokenized_senti.save_to_disk(DATA_SAVE_LOC)

def compute_metrics(p):
    """Compute evaluation metrics for the Trainer.

    Args:
        p: a (predictions, labels) pair. Predictions are per-class scores
           with classes along axis 1; labels are the integer class ids.

    Returns:
        dict with "accuracy", "recall" and "f1". Recall and F1 are
        macro-averaged over classes: the model is a 3-class classifier, and
        scikit-learn's default average="binary" raises ValueError on
        multiclass targets.
    """
    scores, labels = p
    pred = np.argmax(scores, axis=1)

    return {
        "accuracy": accuracy_score(y_true=labels, y_pred=pred),
        # zero_division=0 scores a class with no predicted/true samples as 0
        # without emitting an UndefinedMetricWarning on every evaluation.
        "recall": recall_score(y_true=labels, y_pred=pred, average="macro", zero_division=0),
        "f1": f1_score(y_true=labels, y_pred=pred, average="macro", zero_division=0),
    }

Reusing tokenized dataset in ./.cache/tokenized_sentiment140.


In [5]:
# Seed used for the freshly initialised classification head. The head is
# created before Trainer(...) is constructed, so seeding here keeps its
# initial weights identical across kernel restarts.
HEAD_SEED = 42
# Width of the bottleneck layer inside the replacement classification head.
HEAD_HIDDEN_SIZE = 32

# Load roberta-base as a 3-class sequence classifier, keeping only the first
# three encoder layers of the checkpoint.
# NOTE(review): roberta-base was pretrained with 12 attention heads. Overriding
# num_attention_heads to 3 still loads (the projection matrices keep their
# shape) but regroups the pretrained projections into different heads. Since
# the encoder is frozen below it cannot adapt to that - confirm this override
# is intended.
model = AutoModelForSequenceClassification.from_pretrained(
    "roberta-base",
    num_labels=3,
    max_position_embeddings=514,
    num_hidden_layers=3,
    num_attention_heads=3,
)

# Freeze the embeddings and every encoder layer so that only the
# classification head is trained. Iterating over the parameters avoids a
# hand-written list tied to a hard-coded layer count.
for param in model.roberta.parameters():
    param.requires_grad = False

# Replace the classification head with a narrower one (hidden -> 32 -> 3),
# created on the same device and with the same dtype as the layers it replaces.
torch.manual_seed(HEAD_SEED)

old_dense_weight = model.classifier.dense.weight
model.classifier.dense = nn.Linear(
    model.config.hidden_size,
    HEAD_HIDDEN_SIZE,
    device=old_dense_weight.device,
    dtype=old_dense_weight.dtype,
)

old_out_proj_weight = model.classifier.out_proj.weight
model.classifier.out_proj = nn.Linear(
    HEAD_HIDDEN_SIZE,
    model.config.num_labels,
    device=old_out_proj_weight.device,
    dtype=old_out_proj_weight.dtype,
)

# Pads each batch to a common length at collation time.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# NOTE(review): `metric` is not used by compute_metrics or the Trainer below;
# kept because later cells may rely on it.
metric = load_metric("accuracy")

training_args = TrainingArguments(
    output_dir="./output",
    evaluation_strategy="epoch",
    learning_rate=6e-4,
    per_device_train_batch_size=50,
    per_device_eval_batch_size=50,
    num_train_epochs=5,
    weight_decay=0.01,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_senti["train"],
    eval_dataset=tokenized_senti["test"],
    tokenizer=tokenizer,
    # Pass the collator explicitly instead of leaving it as an unused variable.
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)
 

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['roberta.encoder.layer.7.output.LayerNorm.bias', 'roberta.encoder.layer.4.intermediate.dense.bias', 'roberta.encoder.layer.4.output.dense.weight', 'roberta.encoder.layer.4.intermediate.dense.weight', 'roberta.encoder.layer.11.attention.output.dense.bias', 'roberta.encoder.layer.11.intermediate.dense.weight', 'roberta.encoder.layer.8.attention.output.LayerNorm.bias', 'roberta.encoder.layer.6.attention.self.value.bias', 'roberta.encoder.layer.4.attention.self.value.weight', 'roberta.encoder.layer.6.attention.output.dense.bias', 'roberta.encoder.layer.9.attention.self.query.bias', 'roberta.encoder.layer.6.output.LayerNorm.bias', 'roberta.encoder.layer.11.attention.self.value.weight', 'roberta.encoder.layer.8.attention.self.query.bias', 'roberta.encoder.layer.8.attention.output.dense.weight', 'roberta.encoder.layer.11.attention.output.LayerNorm.bias', 'roberta.pooler.dens

In [None]:
# Start the training run on the `trainer` object built in the previous cell.
trainer.train()

The following columns in the training set  don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: date, sentiment, query, user, text. If date, sentiment, query, user, text are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 1600000
  Num Epochs = 5
  Instantaneous batch size per device = 50
  Total train batch size (w. parallel, distributed & accumulation) = 150
  Gradient Accumulation steps = 1
  Total optimization steps = 53335
