# Dependencies

In [1]:
%%capture
!pip install transformers datasets

In [2]:
import torch
import os

from datasets import *

# Data

https://huggingface.co/datasets/super_glue

In [3]:
ds = load_dataset('super_glue', 'rte')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading data:   0%|          | 0.00/586k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/69.8k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/622k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2490 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/277 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3000 [00:00<?, ? examples/s]

In [4]:
print(ds)

DatasetDict({
    train: Dataset({
        features: ['premise', 'hypothesis', 'idx', 'label'],
        num_rows: 2490
    })
    validation: Dataset({
        features: ['premise', 'hypothesis', 'idx', 'label'],
        num_rows: 277
    })
    test: Dataset({
        features: ['premise', 'hypothesis', 'idx', 'label'],
        num_rows: 3000
    })
})


Features:
- premise: First piece of text representing the premise.
- hypothesis: Second piece of text which is the hypothesis. We check whether it can be entailed (inferred) from the premise.
- idx: Row index.
- label: 0=entailment, 1=not entailment.

In [5]:
train_entry = ds['train'][0]
print(train_entry)

{'premise': 'No Weapons of Mass Destruction Found in Iraq Yet.', 'hypothesis': 'Weapons of Mass Destruction Found in Iraq.', 'idx': 0, 'label': 1}


In [6]:
val_entry = ds['validation'][0]
print(val_entry)

{'premise': 'Dana Reeve, the widow of the actor Christopher Reeve, has died of lung cancer at age 44, according to the Christopher Reeve Foundation.', 'hypothesis': 'Christopher Reeve had an accident.', 'idx': 0, 'label': 1}


The test data does not provide the labels as it's used to evaluate entries on the public leaderboard for the SuperGLUE benchmark. So the labels for the test data are all set to -1.

In [7]:
test_entry = ds['test'][0]
print(test_entry)

{'premise': "Mangla was summoned after Madhumita's sister Nidhi Shukla, who was the first witness in the case.", 'hypothesis': 'Shukla is related to Mangla.', 'idx': 0, 'label': -1}


### Tokenize

In [8]:
from transformers import BertTokenizer

In [9]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [10]:
print("Tokenizer input max length:", tokenizer.model_max_length)
print("Tokenizer vocab size:", tokenizer.vocab_size)

Tokenizer input max length: 512
Tokenizer vocab size: 30522


In [11]:
print(tokenizer.all_special_tokens)

['[UNK]', '[SEP]', '[PAD]', '[CLS]', '[MASK]']


For each entry in the dataset, the tokenizer will combine the premise and hypothesis strings as one string with the structure: `[CLS] + premise + [SEP] + hypothesis + [SEP]`.

`token_type_ids` will be set to 0 for tokens corresponding to the premise, and 1 for the tokens corresponding to the hypothesis.

In [12]:
enc = tokenizer.encode_plus(text=train_entry['premise'], text_pair=train_entry['hypothesis'])

In [44]:
print(train_entry['premise'])
print(train_entry['hypothesis'])

No Weapons of Mass Destruction Found in Iraq Yet.
Weapons of Mass Destruction Found in Iraq.


In [43]:
print(enc['input_ids'])
print(enc['token_type_ids'])
print(enc['attention_mask'])

[101, 2053, 4255, 1997, 3742, 6215, 2179, 1999, 5712, 2664, 1012, 102, 4255, 1997, 3742, 6215, 2179, 1999, 5712, 1012, 102]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


In [14]:
max_length = 320 # 512

In [15]:
def tokenize_text(batch):
  return tokenizer(text=batch["premise"], text_pair=batch["hypothesis"], truncation=True, padding='max_length', max_length=max_length)

In [16]:
ds_tokenized = ds.map(tokenize_text, batched=True)

Map:   0%|          | 0/2490 [00:00<?, ? examples/s]

Map:   0%|          | 0/277 [00:00<?, ? examples/s]

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

In [17]:
# Clear up memory
del ds

In [18]:
print(ds_tokenized)

DatasetDict({
    train: Dataset({
        features: ['premise', 'hypothesis', 'idx', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 2490
    })
    validation: Dataset({
        features: ['premise', 'hypothesis', 'idx', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 277
    })
    test: Dataset({
        features: ['premise', 'hypothesis', 'idx', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 3000
    })
})


In [19]:
ds_tokenized.set_format("torch", columns=["premise", "hypothesis", "input_ids", "token_type_ids", "attention_mask", "label"])

In [21]:
os.environ["TOKENIZERS_PARALLELISM"] = "false"

### Set up DataLoaders

In [22]:
from torch.utils.data import DataLoader, Dataset

In [23]:
class RteDataset(Dataset):
  def __init__(self, dataset_dict, partition_key):
    self.partition = dataset_dict[partition_key]

  def __getitem__(self, index):
    return self.partition[index]

  def __len__(self):
    return self.partition.num_rows

In [24]:
train_dataset = RteDataset(ds_tokenized, partition_key="train")
val_dataset = RteDataset(ds_tokenized, partition_key="validation")
test_dataset = RteDataset(ds_tokenized, partition_key="test")

In [25]:
BATCH_SIZE = 32

In [26]:
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    #num_workers=4
)

val_loader = DataLoader(
    dataset=val_dataset,
    batch_size=BATCH_SIZE,
    #num_workers=4
)

test_loader = DataLoader(
    dataset=test_dataset,
    batch_size=BATCH_SIZE,
    #num_workers=4
)

# Model

### Pre-trained BERT model
(https://arxiv.org/pdf/1810.04805.pdf)

In [27]:
from transformers import BertForSequenceClassification

In [28]:
model = BertForSequenceClassification.from_pretrained("bert-base-uncased",
                                                      num_labels=2,
                                                      output_attentions=False,
                                                      output_hidden_states=False)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [29]:
model

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

# Finetuning

In [30]:
from transformers import AdamW

In [31]:
device = "cuda" if torch.cuda.is_available() else "cpu"

In [32]:
model.resize_token_embeddings(len(tokenizer))

Embedding(30522, 768, padding_idx=0)

In [None]:
model.to(device)

In [34]:
lr = 2e-5
EPOCHS = 5

In [35]:
import time
from transformers import get_linear_schedule_with_warmup

In [36]:
optimizer = AdamW(model.parameters(), lr=lr, correct_bias=False)
total_steps = len(train_loader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=0,
                                            num_training_steps=total_steps)



### Training

In [37]:
def get_accuracy(y_pred, y_test):
  predictions = torch.log_softmax(y_pred, dim=1).argmax(dim=1)
  accuracy = (predictions == y_test).sum() / len(y_test)
  return accuracy

In [38]:
def train(model, train_loader, val_loader):
  final_val_loss = float("inf")
  final_val_acc = 0

  for epoch in range(EPOCHS):
    start = time.time()

    # Training
    model.train()
    interval = len(train_loader) // 10

    total_train_loss = 0
    total_train_acc = 0

    for batch_idx, batch in enumerate(train_loader):
      input_ids = batch["input_ids"].to(device)
      token_type_ids = batch["token_type_ids"].to(device)
      attention_mask = batch["attention_mask"].to(device)
      labels = batch["label"].to(device)

      optimizer.zero_grad()

      loss, prediction = model(input_ids,
                               token_type_ids=token_type_ids,
                               attention_mask=attention_mask,
                               labels=labels).values()
      acc = get_accuracy(prediction, labels)

      total_train_loss += loss.item()
      total_train_acc += acc.item()

      loss.backward()

      optimizer.step()
      scheduler.step()

      if batch_idx % 20 == 0:
        print("Training loss: %.4f" % loss)

    train_loss = total_train_loss / len(train_loader)
    train_acc = total_train_acc / len(train_loader)

    # Validation
    model.eval()

    total_val_loss = 0
    total_val_acc = 0

    with torch.no_grad():
      for batch_idx, batch in enumerate(val_loader):
        input_ids = batch["input_ids"].to(device)
        token_type_ids = batch["token_type_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["label"].to(device)

        loss, prediction = model(input_ids,
                                token_type_ids=token_type_ids,
                                attention_mask=attention_mask,
                                labels=labels).values()
        acc = get_accuracy(prediction, labels)

        total_val_loss += loss.item()
        total_val_acc += acc.item()

    val_loss = total_val_loss / len(val_loader)
    val_acc = total_val_acc / len(val_loader)

    final_val_loss = val_loss
    final_val_acc = val_acc

    end = time.time()
    hours, remainder = divmod(end - start, 3600) # returns: (seconds / 3600, seconds % 3600)
    minutes, seconds = divmod(remainder, 60) # returns: (remainder / 60, remainder % 60)

    print(f"Epoch: {epoch+1} train loss: {train_loss:.4f} train acc: {train_acc:.4f} | val loss: {val_loss:.4f} val acc: {val_acc:.4f}")
    print("Epoch time elapsed: {:0>2}:{:0>2}:{:05.2f}".format(int(hours), int(minutes), seconds))
    print("")

  return (final_val_loss, final_val_acc)

In [39]:
loss, acc = train(model, train_loader, val_loader)

Training loss: 1.0845
Training loss: 0.7121
Training loss: 0.6988
Training loss: 0.6828
Epoch: 1 train loss: 0.7106 train acc: 0.4955 | val loss: 0.6914 val acc: 0.5288
Epoch time elapsed: 00:02:02.61

Training loss: 0.6551
Training loss: 0.6876
Training loss: 0.6900
Training loss: 0.6972
Epoch: 2 train loss: 0.6710 train acc: 0.5872 | val loss: 0.6362 val acc: 0.6400
Epoch time elapsed: 00:02:07.94

Training loss: 0.5368
Training loss: 0.4689
Training loss: 0.3867
Training loss: 0.4785
Epoch: 3 train loss: 0.5166 train acc: 0.7560 | val loss: 0.6544 val acc: 0.6766
Epoch time elapsed: 00:02:09.93

Training loss: 0.3628
Training loss: 0.3440
Training loss: 0.3912
Training loss: 0.2723
Epoch: 4 train loss: 0.3132 train acc: 0.8669 | val loss: 0.7426 val acc: 0.6746
Epoch time elapsed: 00:02:10.22

Training loss: 0.1697
Training loss: 0.2298
Training loss: 0.2087
Training loss: 0.1256
Epoch: 5 train loss: 0.1716 train acc: 0.9345 | val loss: 0.9021 val acc: 0.6817
Epoch time elapsed: 00:

# Results on some data

### Training set

In [45]:
model.eval()

with torch.no_grad():
  training_entry = next(iter(train_loader))

  input_ids = training_entry["input_ids"].to(device)
  token_type_ids = training_entry["token_type_ids"].to(device)
  attention_mask = training_entry["attention_mask"].to(device)
  label = training_entry["label"].to(device)

  logits = model(input_ids,
                token_type_ids=token_type_ids,
                attention_mask=attention_mask,
                labels=label).logits

  probabilities = torch.softmax(logits, dim=1).detach().cpu().tolist()

  for i in range(5):
    prob_entailment = round(probabilities[i][0], 3)
    prob_not_entailment = round(probabilities[i][1], 3)
    print("premise:", training_entry["premise"][i])
    print("hypothesis:", training_entry["hypothesis"][i])
    print("label:", training_entry["label"][i].item())
    print("Predictions:")
    print("Probability entailment:", prob_entailment)
    print("Probability not entailment:", prob_not_entailment)
    print("")

premise: Ordonez Reyes accused Jose Jesus Pena, alleged chief of security for the Nicaraguan embassy in Tegucigalpa, of masterminding the January 7th assassination of contra-commander Manuel Antonio Rugama.
hypothesis: Jose Jesus Pena is accused of the assassination of Manuel Antonio Rugama.
label: 0
Predictions:
Probability entailment: 0.988
Probability not entailment: 0.012

premise: Some 55 percent of the German public are opposed to the euro, less than 150 days before its introduction on January 1, 2002, a poll by research group Wahlen showed.
hypothesis: The introduction of the euro has been opposed.
label: 0
Predictions:
Probability entailment: 0.99
Probability not entailment: 0.01

premise: To the world, M. Larry Lawrence, the new U.S. emissary to Switzerland who hosted President Clinton on his Southern California vacation, will be known as Mr. Ambassador.
hypothesis: Larry Lawrence is the head of the U.S. Embassy in Switzerland.
label: 0
Predictions:
Probability entailment: 0.9

### Validation set

In [46]:
model.eval()

with torch.no_grad():
  val_entry = next(iter(val_loader))

  input_ids = val_entry["input_ids"].to(device)
  token_type_ids = val_entry["token_type_ids"].to(device)
  attention_mask = val_entry["attention_mask"].to(device)
  label = val_entry["label"].to(device)

  logits = model(input_ids,
                token_type_ids=token_type_ids,
                attention_mask=attention_mask,
                labels=label).logits

  probabilities = torch.softmax(logits, dim=1).detach().cpu().tolist()

  for i in range(5):
    prob_entailment = round(probabilities[i][0], 3)
    prob_not_entailment = round(probabilities[i][1], 3)
    print("premise:", val_entry["premise"][i])
    print("hypothesis:", val_entry["hypothesis"][i])
    print("label:", val_entry["label"][i].item())
    print("Predictions:")
    print("Probability entailment:", prob_entailment)
    print("Probability not entailment:", prob_not_entailment)
    print("")

premise: Dana Reeve, the widow of the actor Christopher Reeve, has died of lung cancer at age 44, according to the Christopher Reeve Foundation.
hypothesis: Christopher Reeve had an accident.
label: 1
Predictions:
Probability entailment: 0.055
Probability not entailment: 0.945

premise: Yet, we now are discovering that antibiotics are losing their effectiveness against illness. Disease-causing bacteria are mutating faster than we can come up with new antibiotics to fight the new variations.
hypothesis: Bacteria is winning the war against antibiotics.
label: 0
Predictions:
Probability entailment: 0.019
Probability not entailment: 0.981

premise: Cairo is now home to some 15 million people - a burgeoning population that produces approximately 10,000 tonnes of rubbish per day, putting an enormous strain on public services. In the past 10 years, the government has tried hard to encourage private investment in the refuse sector, but some estimate 4,000 tonnes of waste is left behind every d

### Test set

In [42]:
model.eval()

with torch.no_grad():
  test_entry = next(iter(test_loader))

  input_ids = test_entry["input_ids"].to(device)
  token_type_ids = test_entry["token_type_ids"].to(device)
  attention_mask = test_entry["attention_mask"].to(device)

  logits = model(input_ids,
                token_type_ids=token_type_ids,
                attention_mask=attention_mask).logits

  probabilities = torch.softmax(logits, dim=1).detach().cpu().tolist()

  for i in range(5):
    prob_entailment = round(probabilities[i][0], 3)
    prob_not_entailment = round(probabilities[i][1], 3)
    print("premise:", test_entry["premise"][i])
    print("hypothesis:", test_entry["hypothesis"][i])
    # print("label:", test_entry["label"][i])
    print("Predictions:")
    print("Probability entailment:", prob_entailment)
    print("Probability not entailment:", prob_not_entailment)
    print("")

premise: Mangla was summoned after Madhumita's sister Nidhi Shukla, who was the first witness in the case.
hypothesis: Shukla is related to Mangla.
Predictions:
Probability entailment: 0.826
Probability not entailment: 0.174

premise: Authorities in Brazil say that more than 200 people are being held hostage in a prison in the country's remote, Amazonian-jungle state of Rondonia.
hypothesis: Authorities in Brazil hold 200 people as hostage.
Predictions:
Probability entailment: 0.97
Probability not entailment: 0.03

premise: A mercenary group faithful to the warmongering policy of former Somozist colonel Enrique Bermudez attacked an IFA truck belonging to the interior ministry at 0900 on 26 March in El Jicote, wounded and killed an interior ministry worker and wounded five others.
hypothesis: An interior ministry worker was killed by a mercenary group.
Predictions:
Probability entailment: 0.042
Probability not entailment: 0.958

premise: The British ambassador to Egypt, Derek Plumbly, t