<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Imports" data-toc-modified-id="Imports-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Imports</a></span></li><li><span><a href="#Dataset" data-toc-modified-id="Dataset-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Dataset</a></span></li><li><span><a href="#Tokenizing" data-toc-modified-id="Tokenizing-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Tokenizing</a></span></li><li><span><a href="#Formatting" data-toc-modified-id="Formatting-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Formatting</a></span></li><li><span><a href="#Data-Loader" data-toc-modified-id="Data-Loader-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Data Loader</a></span></li></ul></div>

# Imports
<hr style = "border:2px solid black" ></hr>

In [29]:
import datasets, pandas
import pandas as pd
import transformers
from datasets import load_dataset
import torch
from transformers import AutoTokenizer
import pytorch_lightning
import sklearn

In [2]:
datasets.__version__

'2.9.0'

In [3]:
pandas.__version__

'1.5.3'

In [4]:
transformers.__version__

'4.26.1'

In [5]:
torch.__version__

'1.13.1'

In [28]:
pytorch_lightning.__version__

'1.9.2'

In [30]:
sklearn.__version__

'1.2.1'

# Dataset
<hr style = "border:2px solid black" ></hr>

- The CoLA (Corpus of Linguistic Acceptability) dataset poses a binary classification task: given a sentence, classify it into one of two classes:
    - Unacceptable if it is not grammatically correct.
    - Acceptable if it is grammatically correct.

In [6]:
# Load the 'cola' configuration of the 'glue' benchmark (a cached copy is reused
# when one is found locally).
cola_dataset = load_dataset(path='glue', name='cola')

Found cached dataset glue (/Users/gm_main/.cache/huggingface/datasets/glue/cola/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


  0%|          | 0/3 [00:00<?, ?it/s]

In [7]:
cola_dataset

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 8551
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1043
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1063
    })
})

In [8]:
# Bind each of the three splits to its own name in a single unpacking step.
train_dataset, val_dataset, test_dataset = (
    cola_dataset[split_name] for split_name in ('train', 'validation', 'test')
)

In [9]:
len(train_dataset), len(val_dataset), len(test_dataset)

(8551, 1043, 1063)

In [10]:
train_dataset[0]

{'sentence': "Our friends won't buy this analysis, let alone the next one we propose.",
 'label': 1,
 'idx': 0}

In [11]:
val_dataset[0]

{'sentence': 'The sailors rode the breeze clear of the rocks.',
 'label': 1,
 'idx': 0}

In [12]:
test_dataset[0]

{'sentence': 'Bill whistled past the house.', 'label': -1, 'idx': 0}

In [13]:
train_dataset.features

{'sentence': Value(dtype='string', id=None),
 'label': ClassLabel(names=['unacceptable', 'acceptable'], id=None),
 'idx': Value(dtype='int32', id=None)}

In [14]:
# Resolve the integer id of the 'acceptable' class once, instead of repeating the
# lookup inside the predicate for every example, then show the first five matches.
acceptable_id = train_dataset.features['label'].str2int('acceptable')
train_dataset.filter(lambda example: example['label'] == acceptable_id)[:5]

Loading cached processed dataset at /Users/gm_main/.cache/huggingface/datasets/glue/cola/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-1c79cbc9d38c1fa5.arrow


{'idx': [0, 1, 2, 3, 4],
 'label': [1, 1, 1, 1, 1],
 'sentence': ["Our friends won't buy this analysis, let alone the next one we propose.",
  "One more pseudo generalization and I'm giving up.",
  "One more pseudo generalization or I'm giving up.",
  'The more we study verbs, the crazier they get.',
  'Day by day the facts are getting murkier.']}

In [15]:
# Resolve the integer id of the 'unacceptable' class once, instead of repeating the
# lookup inside the predicate for every example, then show the first five matches.
unacceptable_id = train_dataset.features['label'].str2int('unacceptable')
train_dataset.filter(lambda example: example['label'] == unacceptable_id)[:5]

Loading cached processed dataset at /Users/gm_main/.cache/huggingface/datasets/glue/cola/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-45b6bd4a632a56d6.arrow


{'idx': [18, 20, 22, 23, 25],
 'label': [0, 0, 0, 0, 0],
 'sentence': ['They drank the pub.',
  'The professor talked us.',
  'We yelled ourselves.',
  'We yelled Harry hoarse.',
  'Harry coughed himself.']}

# Tokenizing
<hr style = "border:2px solid black" ></hr>

In [16]:
# Load the tokenizer published under the checkpoint name given below.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path="google/bert_uncased_L-2_H-128_A-2"
)

In [17]:
# Begin the tokenizing section from the splits held in cola_dataset.
train_dataset, val_dataset, test_dataset = [
    cola_dataset[name] for name in ('train', 'validation', 'test')
]

In [18]:
tokenizer

BertTokenizerFast(name_or_path='google/bert_uncased_L-2_H-128_A-2', vocab_size=30522, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

In [19]:
# Show the raw text first, then what the tokenizer produces for it.
sample_sentence = train_dataset[0]['sentence']
print(sample_sentence)
tokenizer(sample_sentence)

Our friends won't buy this analysis, let alone the next one we propose.


{'input_ids': [101, 2256, 2814, 2180, 1005, 1056, 4965, 2023, 4106, 1010, 2292, 2894, 1996, 2279, 2028, 2057, 16599, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [20]:
# Round trip: turn the first training sentence into ids, then decode the ids to text.
first_ids = tokenizer(train_dataset[0]['sentence'])['input_ids']
tokenizer.decode(first_ids)

"[CLS] our friends won't buy this analysis, let alone the next one we propose. [SEP]"

In [21]:
def encode(examples, max_length=512):
    """Tokenize the ``"sentence"`` entry of ``examples`` with the notebook's tokenizer.

    Args:
        examples: Mapping with a ``"sentence"`` key; its value is handed to
            ``tokenizer`` unchanged.
        max_length: Length passed to the tokenizer as ``max_length``. Together
            with ``truncation=True`` and ``padding="max_length"`` it is the
            target length for both cutting and padding. Defaults to 512, the
            value previously hard-coded here.

    Returns:
        Whatever ``tokenizer`` returns for the given sentences.
    """
    return tokenizer(
        examples["sentence"],
        truncation=True,
        # Pad to a fixed length rather than to the longest item in the call.
        padding="max_length",
        max_length=max_length,
    )

In [22]:
# Run encode over the training split; batched=True passes encode a batch of
# examples per call instead of a single example.
train_dataset = train_dataset.map(function=encode, batched=True)

Loading cached processed dataset at /Users/gm_main/.cache/huggingface/datasets/glue/cola/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-db2498a8c712be41.arrow


# Formatting
<hr style = "border:2px solid black" ></hr>

In [23]:
# Ask the dataset to hand back torch-formatted values, limited to these columns.
train_dataset.set_format(
    type='torch',
    columns=['input_ids', 'attention_mask', 'label'],
)

# Data Loader
<hr style = "border:2px solid black" ></hr>

In [24]:
# Wrap the formatted training split so it can be iterated in batches of 32.
dataloader = torch.utils.data.DataLoader(dataset=train_dataset, batch_size=32)

In [25]:
next(iter(dataloader))

{'label': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0,
         1, 0, 0, 1, 1, 1, 1, 1]),
 'input_ids': tensor([[  101,  2256,  2814,  ...,     0,     0,     0],
         [  101,  2028,  2062,  ...,     0,     0,     0],
         [  101,  2028,  2062,  ...,     0,     0,     0],
         ...,
         [  101,  5965, 12808,  ...,     0,     0,     0],
         [  101,  2198, 10948,  ...,     0,     0,     0],
         [  101,  3021, 24471,  ...,     0,     0,     0]]),
 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]])}

In [26]:
# Walk every batch and print the shape of each of its three tensors on one line.
for batch in dataloader:
    shapes = [batch[key].shape for key in ('input_ids', 'attention_mask', 'label')]
    print(*shapes)

torch.Size([32, 512]) torch.Size([32, 512]) torch.Size([32])
torch.Size([32, 512]) torch.Size([32, 512]) torch.Size([32])
torch.Size([32, 512]) torch.Size([32, 512]) torch.Size([32])
torch.Size([32, 512]) torch.Size([32, 512]) torch.Size([32])
torch.Size([32, 512]) torch.Size([32, 512]) torch.Size([32])
torch.Size([32, 512]) torch.Size([32, 512]) torch.Size([32])
torch.Size([32, 512]) torch.Size([32, 512]) torch.Size([32])
torch.Size([32, 512]) torch.Size([32, 512]) torch.Size([32])
torch.Size([32, 512]) torch.Size([32, 512]) torch.Size([32])
torch.Size([32, 512]) torch.Size([32, 512]) torch.Size([32])
torch.Size([32, 512]) torch.Size([32, 512]) torch.Size([32])
torch.Size([32, 512]) torch.Size([32, 512]) torch.Size([32])
torch.Size([32, 512]) torch.Size([32, 512]) torch.Size([32])
torch.Size([32, 512]) torch.Size([32, 512]) torch.Size([32])
torch.Size([32, 512]) torch.Size([32, 512]) torch.Size([32])
torch.Size([32, 512]) torch.Size([32, 512]) torch.Size([32])
torch.Size([32, 512]) to

torch.Size([32, 512]) torch.Size([32, 512]) torch.Size([32])
torch.Size([32, 512]) torch.Size([32, 512]) torch.Size([32])
torch.Size([32, 512]) torch.Size([32, 512]) torch.Size([32])
torch.Size([32, 512]) torch.Size([32, 512]) torch.Size([32])
torch.Size([32, 512]) torch.Size([32, 512]) torch.Size([32])
torch.Size([32, 512]) torch.Size([32, 512]) torch.Size([32])
torch.Size([32, 512]) torch.Size([32, 512]) torch.Size([32])
torch.Size([32, 512]) torch.Size([32, 512]) torch.Size([32])
torch.Size([32, 512]) torch.Size([32, 512]) torch.Size([32])
torch.Size([32, 512]) torch.Size([32, 512]) torch.Size([32])
torch.Size([32, 512]) torch.Size([32, 512]) torch.Size([32])
torch.Size([32, 512]) torch.Size([32, 512]) torch.Size([32])
torch.Size([32, 512]) torch.Size([32, 512]) torch.Size([32])
torch.Size([32, 512]) torch.Size([32, 512]) torch.Size([32])
torch.Size([32, 512]) torch.Size([32, 512]) torch.Size([32])
torch.Size([32, 512]) torch.Size([32, 512]) torch.Size([32])
torch.Size([32, 512]) to