In [1]:
# !pip install transformers datasets evaluate scikit-learn tqdm

In [81]:
%%capture
# Use this only after you check everything is being loaded properly

import torch
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification,\
    TrainingArguments, Trainer, pipeline, DataCollatorWithPadding, set_seed
import evaluate
import numpy as np
import torch.nn as nn
import math
import time
from tqdm import tqdm

classifier = pipeline("sentiment-analysis")
# classifier(['this is a bad idea', 'I hate this so much'])

In [82]:
set_seed(42)
device = "cuda:0" if torch.cuda.is_available() else "cpu"
device

'cuda:0'

In [83]:
r1_dataset = load_dataset("dynabench/dynasent", "dynabench.dynasent.r1.all")

r2_dataset = load_dataset("dynabench/dynasent", "dynabench.dynasent.r2.all")

Found cached dataset dynasent (C:/Users/Jeonghoon Kim/.cache/huggingface/datasets/dynabench___dynasent/dynabench.dynasent.r1.all/1.1.0/ab89971d9ae1aacc59ed44d6855bf0e89167417257e2c2666f38e532148f2967)


  0%|          | 0/3 [00:00<?, ?it/s]

Found cached dataset dynasent (C:/Users/Jeonghoon Kim/.cache/huggingface/datasets/dynabench___dynasent/dynabench.dynasent.r2.all/1.1.0/ab89971d9ae1aacc59ed44d6855bf0e89167417257e2c2666f38e532148f2967)


  0%|          | 0/3 [00:00<?, ?it/s]

In [84]:
# string label to int
r1_dataset = r1_dataset.class_encode_column('gold_label')

Casting to class labels:   0%|          | 0/81 [00:00<?, ?ba/s]

Casting to class labels:   0%|          | 0/4 [00:00<?, ?ba/s]

Casting to class labels:   0%|          | 0/4 [00:00<?, ?ba/s]

In [85]:
r1_dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'hit_ids', 'sentence', 'indices_into_review_text', 'model_0_label', 'model_0_probs', 'text_id', 'review_id', 'review_rating', 'label_distribution', 'gold_label', 'metadata'],
        num_rows: 80488
    })
    validation: Dataset({
        features: ['id', 'hit_ids', 'sentence', 'indices_into_review_text', 'model_0_label', 'model_0_probs', 'text_id', 'review_id', 'review_rating', 'label_distribution', 'gold_label', 'metadata'],
        num_rows: 3600
    })
    test: Dataset({
        features: ['id', 'hit_ids', 'sentence', 'indices_into_review_text', 'model_0_label', 'model_0_probs', 'text_id', 'review_id', 'review_rating', 'label_distribution', 'gold_label', 'metadata'],
        num_rows: 3600
    })
})

In [86]:
r1_dataset['test'][0]

{'id': 'r1-0098060',
 'hit_ids': ['y14605', 'y14606'],
 'sentence': 'Had to remind him to toast the sandwich.',
 'indices_into_review_text': [700, 740],
 'model_0_label': 'positive',
 'model_0_probs': {'negative': 0.011094148270785809,
  'positive': 0.7560697793960571,
  'neutral': 0.2328360378742218},
 'text_id': 'r1-0098060',
 'review_id': 'ebgPQgQJx5Al2CC-aNMU5A',
 'review_rating': 1,
 'label_distribution': {'positive': [],
  'negative': ['w114', 'w380', 'w40', 'w516'],
  'neutral': ['w1269'],
  'mixed': []},
 'gold_label': 0,
 'metadata': {'split': 'test',
  'round': 1,
  'subset': 'all',
  'model_in_the_loop': 'RoBERTa'}}

In [87]:
r2_dataset['test'][0]

{'id': 'r2-0019256',
 'hit_ids': ['y21512', 'y21524'],
 'sentence': 'The art exhibit has a lot to offer.',
 'sentence_author': 'w262',
 'has_prompt': True,
 'prompt_data': {'indices_into_review_text': [242, 356],
  'review_rating': 5,
  'prompt_sentence': "They're currently under construction for a new exhibit, but there is still enough art to enjoy for around 2 hours.",
  'review_id': '0cJld_mdcScG6zZtoPEFTA'},
 'model_1_label': 'positive',
 'model_1_probs': {'negative': 0.010349077172577381,
  'positive': 0.8954706788063049,
  'neutral': 0.09418027848005295},
 'text_id': 'r2-0019256',
 'label_distribution': {'positive': ['w148', 'w358', 'w4', 'w423', 'w139'],
  'negative': [],
  'neutral': [],
  'mixed': []},
 'gold_label': 'positive',
 'metadata': {'split': 'test',
  'round': 2,
  'subset': 'all',
  'model_in_the_loop': 'RoBERTa'}}

In [88]:
r1_dataset = r1_dataset.rename_column('gold_label', 'label')
r2_dataset = r2_dataset.rename_column('gold_label', 'label')


In [89]:
r1_dataset['train'][:10]

{'id': ['r1-0000001',
  'r1-0000002',
  'r1-0000003',
  'r1-0000004',
  'r1-0000006',
  'r1-0000007',
  'r1-0000008',
  'r1-0000010',
  'r1-0000011',
  'r1-0000012'],
 'hit_ids': [['y5238'],
  ['y11155'],
  ['y14984', 'y17992'],
  ['y1167', 'y297'],
  ['y11412', 'y19170', 'y4073'],
  ['y2002'],
  ['y14885', 'y16948'],
  ['y18243', 'y4069', 'y4712'],
  ['y19430', 'y2698'],
  ['y3416']],
 'sentence': ['Roto-Rooter is always good when you need someone right away.',
  "It's so worth the price of cox service over headaches of not being able to publish anything in a short amount of time on the Internet or frankly just stream in different rooms without buffering constantly.",
  'I placed my order of "sticky ribs" as an appetizer and the "angry chicken" as my entree.',
  'There is mandatory valet parking, so make sure you get everything you need from the car!',
  "My wife and I couldn't finish it.",
  'I went with a revised quote and they set an appointment for the next week.',
  'I found out 

There are two fields in this dataset: 

- 'hit_ids': List of Amazon Mechanical Turk Human Interface Tasks (HITs) in which this example appeared during validation. The values are anonymized but used consistently throughout the dataset.
- 'sentence': The example text.
- 'indices_into_review_text': indices of 'sentence' into the original review in the Yelp Academic Dataset.
- 'model_0_label': prediction of Model 0 as described in the paper. The possible values are 'positive', 'negative', and 'neutral'.
- 'model_0_probs': probability distribution predicted by Model 0. The keys are ('positive', 'negative', 'neutral') and the values are floats.
- 'text_id': unique identifier for this entry.
- 'review_id': review-level identifier for the review from the Yelp Academic Dataset containing 'sentence'.
- 'review_rating': review-level star-rating for the review containing 'sentence' in the Yelp Academic Dataset. The possible values are 1, 2, 3, 4, and 5.
- 'label_distribution': response distribution from the MTurk validation task. The keys are ('positive', 'negative', 'neutral') and the values are lists of anonymized MTurk ids, which are used consistently throughout the dataset.
- 'gold_label': the label chosen by at least three of the five workers if there is one (possible values: 'positive', 'negative', 'neutral', and 'mixed'), else None.

- `sentence`: example sentence.
- `gold_label`: string output `positive`, `negative`, `neutral` or `mixed`.

### 2. Preprocess

The next step is to load a tokenizer to preprocess the `sentence` field.
A tokenizer converts text to a sequence of tokens and creates numerical representation.
Notice how there are multiple ways to tokenize text. Make sure to use the right tokenizer for your model.

In [90]:
distilbert_tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
bert_cased_tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
roberta_tokenizer = AutoTokenizer.from_pretrained("roberta-base")


text = "Hello Mr.Kim!"

for tokenizer in [distilbert_tokenizer, bert_cased_tokenizer, roberta_tokenizer]:
  print(f"\n\n{tokenizer.name_or_path}")
  vocab = {v: k for k, v in tokenizer.vocab.items()}
  tokenized_text = tokenizer(text)
  print([vocab[id] for id in tokenized_text['input_ids']])
    
tokenizer = distilbert_tokenizer



distilbert-base-uncased
['[CLS]', 'hello', 'mr', '.', 'kim', '!', '[SEP]']


bert-base-cased
['[CLS]', 'Hello', 'Mr', '.', 'Kim', '!', '[SEP]']


roberta-base
['<s>', 'Hello', 'ĠMr', '.', 'Kim', '!', '</s>']


In [91]:
def preprocess_function(examples):
    return tokenizer(examples["sentence"], truncation=True, padding=True)

In [92]:
tokenized_r1_dataset = r1_dataset.map(preprocess_function, batched=True)

  0%|          | 0/81 [00:00<?, ?ba/s]

  0%|          | 0/4 [00:00<?, ?ba/s]

  0%|          | 0/4 [00:00<?, ?ba/s]

In [93]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [94]:
data_collator

DataCollatorWithPadding(tokenizer=DistilBertTokenizerFast(name_or_path='distilbert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}), padding=True, max_length=None, pad_to_multiple_of=None, return_tensors='pt')

### 3. Create evaluation method

Including a metric during training is often helpful for evaluating your model's performance (otherwise, it just prints the loss). You can quickly load a evaluation method with the 🤗 [Evaluate](https://huggingface.co/docs/evaluate/index) library. For this task, load the [accuracy](https://huggingface.co/spaces/evaluate-metric/accuracy) metric (see the 🤗 Evaluate [quick tour](https://huggingface.co/docs/evaluate/a_quick_tour) to learn more about how to load and compute a metric):

In [95]:
# Proportion of correct predictions among the total number of cases processed
accuracy = evaluate.load("accuracy")

In [96]:
import numpy as np

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
#     print(predicitions, labels)
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

In [97]:
id2label = {0: "negative", 1: "positive", 2:"neutral", 3:"mixed"}
label2id = {"negative": 0, "positive": 1, "neutral":2, "mixed": 3}

In [98]:
# This automodel class gives us the model with pretrained weights + a sequence classification head
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=4, id2label=id2label, label2id=label2id
)
class CustomTrainer(Trainer):
    def _inner_training_loop(
        self, batch_size=None, args=None, resume_from_checkpoint=None, trial=None, ignore_keys_for_eval=None
    ):
        number_of_epochs = args.num_train_epochs
        start = time.time()
        train_loss=[]
        train_acc=[]
        eval_acc=[]

        criterion = torch.nn.CrossEntropyLoss().to(device)
        self.optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate)
        self.scheduler = torch.optim.lr_scheduler.StepLR(self.optimizer, 1, gamma=0.9)
        
        train_dataloader = self.get_train_dataloader()
        eval_dataloader = self.get_eval_dataloader()

        for epoch in range(number_of_epochs):
            train_loss_per_epoch = 0
            train_acc_per_epoch = 0
            with tqdm(train_dataloader, unit="batch") as training_epoch:
                training_epoch.set_description(f"Training Epoch {epoch}")
                for step, inputs in enumerate(training_epoch):
                    inputs = inputs.to(device)
                    labels = inputs['labels']
                    
                    # forward pass
                    self.optimizer.zero_grad()
                    output = model(inputs['input_ids'])
                    # get the loss
                    loss = criterion(output['logits'], labels)
                    train_loss_per_epoch += loss.item()
                    #calculate gradients
                    loss.backward()
                    #update weights
                    self.optimizer.step()
                    train_acc_per_epoch += (output['logits'].argmax(1) == labels).sum().item()
            # adjust the learning rate
            self.scheduler.step()
            train_loss_per_epoch /= len(train_dataloader)
            train_acc_per_epoch /= (len(train_dataloader)*batch_size)
            
            
            eval_loss_per_epoch = 0
            eval_acc_per_epoch = 0
            with tqdm(eval_dataloader, unit="batch") as eval_epoch:
                eval_epoch.set_description(f"Evaluation Epoch {epoch}")
                for step, inputs in enumerate(eval_epoch):
                    inputs = inputs.to(device)
                    labels = inputs['labels']
                    with torch.no_grad():
                        output = model(inputs['input_ids'])
                        loss = criterion(output['logits'], labels)
                        eval_loss_per_epoch += loss.item()
                        eval_acc_per_epoch += (output['logits'].argmax(1) == labels).sum().item()
            eval_loss_per_epoch /= (len(eval_dataloader))
            eval_acc_per_epoch /= (len(eval_dataloader)*batch_size)
        
            
            print(f'\tTrain Loss: {train_loss_per_epoch:.3f} | Train Acc: {train_acc_per_epoch*100:.2f}%')
            print(f'\tEval Loss: {eval_loss_per_epoch:.3f} | Eval Acc: {eval_acc_per_epoch*100:.2f}%')
    
        print(f'Time: {(time.time()-start)/60:.3f} minutes')


In [100]:
# https://huggingface.co/transformers/v4.4.2/main_classes/trainer.html#trainingarguments
training_args = TrainingArguments(
    output_dir="my_awesome_model",
    learning_rate=1e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=1,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)
# print(training_args)
# https://huggingface.co/transformers/v4.4.2/main_classes/trainer.html#id1
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_r1_dataset["train"],
    eval_dataset=tokenized_r1_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

Training Epoch 0:   0%|                                                                    | 0/2516 [00:09<?, ?batch/s]


OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB (GPU 0; 4.00 GiB total capacity; 3.27 GiB already allocated; 0 bytes free; 3.50 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF