# Download dataset

In [1]:
from datasets import load_dataset

In [2]:
# Fetch the "cola" configuration of the "glue" benchmark (cached after the
# first download) and show which splits and columns it contains.
cola_dataset = load_dataset(path="glue", name="cola")
print(cola_dataset)

Downloading builder script:   0%|          | 0.00/28.8k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/28.7k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/27.9k [00:00<?, ?B/s]

Downloading and preparing dataset glue/cola to /home/hphuocthanh/.cache/huggingface/datasets/glue/cola/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad...


Downloading data:   0%|          | 0.00/377k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/8551 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1043 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1063 [00:00<?, ? examples/s]

Dataset glue downloaded and prepared to /home/hphuocthanh/.cache/huggingface/datasets/glue/cola/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 8551
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1043
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1063
    })
})


In [3]:
# Grab the training split and look at its first record to see the raw schema.
train_dataset = cola_dataset["train"]
print(train_dataset[0])

{'sentence': "Our friends won't buy this analysis, let alone the next one we propose.", 'label': 1, 'idx': 0}


# Create a PyTorch Lightning data module
## Preprocess data (setup, tokenization, ...)

In [16]:
import lightning.pytorch as pl
from torch.utils.data import DataLoader
from transformers import AutoTokenizer

# Defaults shared by the data module and the model defined later in the notebook.
model_nn = 'google/bert_uncased_L-2_H-128_A-2'  # pretrained checkpoint name
bs = 16  # mini-batch size for both dataloaders


class DataModule(pl.LightningDataModule):
  """Lightning data module for the GLUE CoLA sentence-classification data.

  Downloads the dataset, tokenizes the ``sentence`` column and exposes the
  ``train`` and ``validation`` splits as PyTorch dataloaders.

  Args:
    model_name: Checkpoint name handed to ``AutoTokenizer.from_pretrained``.
    batch_size: Number of examples per batch in both dataloaders.
    max_length: Every sentence is truncated/padded to exactly this many tokens.
  """

  def __init__(self, model_name=model_nn, batch_size=bs, max_length=256):
    super().__init__()

    self.batch_size = batch_size
    self.max_length = max_length
    self.tokenizer = AutoTokenizer.from_pretrained(model_name)
    # Populated by setup(); declared here so the attributes always exist.
    self.train_data = None
    self.val_data = None

  def prepare_data(self):
    """Download (or reuse the cached copy of) the dataset.

    No attributes are assigned here: Lightning does not run this hook on
    every process, so state set here would be missing on the others.
    """
    load_dataset("glue", "cola")

  def tokenize_data(self, sample):
    """Tokenize ``sample['sentence']`` (one string or a list of strings).

    Returns the tokenizer output, truncated and padded to ``max_length``.
    """
    return self.tokenizer(
      sample['sentence'],
      truncation=True,
      padding="max_length",
      max_length=self.max_length,
    )

  def _tokenize_split(self, split):
    """Tokenize one dataset split and expose the model inputs as torch tensors."""
    tokenized = split.map(self.tokenize_data, batched=True)
    # set_format works in place; only these columns are returned when indexing.
    tokenized.set_format(
      type="torch", columns=['input_ids', 'attention_mask', 'label']
    )
    return tokenized

  def setup(self, stage=None):
    """Load and tokenize the train/validation splits for fitting or validating."""
    if stage in ('fit', 'validate') or stage is None:
      # Reads from the local cache that prepare_data() filled.
      cola_dataset = load_dataset("glue", "cola")
      self.train_data = self._tokenize_split(cola_dataset['train'])
      self.val_data = self._tokenize_split(cola_dataset['validation'])

  def train_dataloader(self):
    """Return the shuffled training dataloader."""
    return DataLoader(self.train_data, batch_size=self.batch_size, shuffle=True)

  def val_dataloader(self):
    """Return the validation dataloader (fixed order, so runs are comparable)."""
    return DataLoader(self.val_data, batch_size=self.batch_size, shuffle=False)

# Define the model with LightningModule

In [12]:
import lightning.pytorch as pl
from torch import nn
from transformers import AutoTokenizer, AutoModel
import torch.nn.functional as F
import torch
from sklearn.metrics import accuracy_score

# Default learning rate for the Adam optimizer.
# NOTE(review): 1e-2 is much larger than the values usually used to fine-tune
# BERT-style encoders (around 2e-5 to 5e-5) — confirm this is intentional.
learning_rate = 1e-2


class ColaModel(pl.LightningModule):
  """Pretrained encoder plus a linear head for two-class sentence classification.

  Args:
    model_name: Checkpoint name handed to ``AutoModel.from_pretrained``.
    lr: Learning rate for Adam; stored in ``self.hparams`` by
      ``save_hyperparameters`` so it is written into checkpoints.
  """

  def __init__(self, model_name=model_nn, lr=learning_rate):
    super().__init__()
    self.save_hyperparameters()

    self.num_classes = 2
    self.bert = AutoModel.from_pretrained(model_name)
    # Classification head on top of the first-token hidden state.
    self.W = nn.Linear(self.bert.config.hidden_size, self.num_classes)

  def forward(self, input_ids, attention_mask):
    """Return the class logits, one row per input sequence."""
    outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)

    # Hidden state at position 0 of every sequence is used as its summary.
    h_cls = outputs.last_hidden_state[:, 0]
    return self.W(h_cls)

  def training_step(self, batch, batch_idx):
    """Compute and log the cross-entropy loss for one training batch."""
    logits = self(batch['input_ids'], batch['attention_mask'])
    loss = F.cross_entropy(logits, batch['label'])
    self.log("train_loss", loss, prog_bar=True)
    return loss

  def validation_step(self, batch, batch_idx):
    """Log cross-entropy loss and accuracy for one validation batch."""
    labels = batch['label']
    logits = self(batch['input_ids'], batch['attention_mask'])
    loss = F.cross_entropy(logits, labels)
    # Accuracy is computed with tensor ops so nothing is copied to the CPU.
    preds = logits.argmax(dim=1)
    val_acc = (preds == labels).float().mean()
    self.log("val_loss", loss, prog_bar=True)
    self.log("val_acc", val_acc, prog_bar=True)

  def configure_optimizers(self):
    """Return an Adam optimizer over all parameters using the stored ``lr``."""
    return torch.optim.Adam(self.parameters(), lr=self.hparams["lr"])

# Train the model

In [18]:
from lightning.pytorch.callbacks import ModelCheckpoint
from lightning.pytorch.callbacks.early_stopping import EarlyStopping

# Seed the Python/NumPy/torch RNGs before the model is built so that weight
# initialisation and batch shuffling are repeatable across runs.
SEED = 42
pl.seed_everything(SEED)

cola_data = DataModule()
cola_model = ColaModel()

# Keep the checkpoint with the lowest validation loss under ./models.
checkpoint_callback = ModelCheckpoint(
    dirpath="./models", monitor="val_loss", mode="min"
)
# Stop once validation loss has not improved for 3 consecutive checks.
early_stopping_callback = EarlyStopping(
    monitor="val_loss", patience=3, verbose=True, mode="min"
)

trainer = pl.Trainer(
    # Pick the accelerator explicitly and always request one device:
    # `devices=0` is not a valid value when running on CPU.
    accelerator=("gpu" if torch.cuda.is_available() else "cpu"),
    devices=1,
    max_epochs=1,
    fast_dev_run=False,
    logger=pl.loggers.TensorBoardLogger("logs/", name="cola", version=1),
    default_root_dir="logs",
    callbacks=[checkpoint_callback, early_stopping_callback],
)
trainer.fit(cola_model, cola_data)

Some weights of the model checkpoint at google/bert_uncased_L-2_H-128_A-2 were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using

  0%|          | 0/3 [00:00<?, ?it/s]

Map:   0%|          | 0/8551 [00:00<?, ? examples/s]

Map:   0%|          | 0/1043 [00:00<?, ? examples/s]

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name | Type      | Params
-----------------------------------
0 | bert | BertModel | 4.4 M 
1 | W    | Linear    | 258   
-----------------------------------
4.4 M     Trainable params
0         Non-trainable params
4.4 M     Total params
17.545    Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(
  rank_zero_warn(
  rank_zero_warn(


Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Metric val_loss improved. New best score: 0.621
`Trainer.fit` stopped: `max_epochs=1` reached.


# Run inference with the trained model

In [19]:
class ColaPredictor:
    """Load a trained ``ColaModel`` checkpoint and score single sentences.

    Args:
        model_path: Path to a checkpoint file written during training.
    """

    def __init__(self, model_path):
        self.model_path = model_path
        # Restore the trained weights, then switch to inference behaviour:
        # eval mode plus frozen parameters.
        self.model = ColaModel.load_from_checkpoint(model_path)
        self.model.eval()
        self.model.freeze()
        # Reuse the data module's tokenizer so inputs match the training format.
        self.processor = DataModule()
        # Applied to a single 1-D logit vector, hence dim=0.
        self.softmax = torch.nn.Softmax(dim=0)
        # Position in this list corresponds to the class index of the logits.
        self.labels = ["unacceptable", "acceptable"]

    def predict(self, text):
        """Return one ``{"label", "score"}`` dict per class for ``text``.

        Args:
            text: The sentence to classify.

        Returns:
            A list, in the order of ``self.labels``, of dicts holding the label
            name and its softmax probability.
        """
        processed = self.processor.tokenize_data({"sentence": text})
        # Build the inputs on whatever device the restored model lives on, so
        # the call works whether the checkpoint was loaded to CPU or GPU.
        device = self.model.device
        input_ids = torch.tensor([processed["input_ids"]], device=device)
        attention_mask = torch.tensor([processed["attention_mask"]], device=device)
        with torch.no_grad():
            logits = self.model(input_ids, attention_mask)
        scores = self.softmax(logits[0]).tolist()
        return [
            {"label": label, "score": score}
            for label, score in zip(self.labels, scores)
        ]

In [20]:
sentence = "The boy is doing a standup comedy"
# Use the checkpoint recorded by the training cell's ModelCheckpoint callback;
# the generated file name depends on the epoch/step count, so a hard-coded name
# goes stale. Fall back to the originally recorded file if nothing was saved.
checkpoint_path = checkpoint_callback.best_model_path or "./models/epoch=0-step=535.ckpt"
predictor = ColaPredictor(checkpoint_path)
print(predictor.predict(sentence))

Some weights of the model checkpoint at google/bert_uncased_L-2_H-128_A-2 were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


[{'label': 'unacceptable', 'score': 0.28197792172431946}, {'label': 'acceptable', 'score': 0.7180220484733582}]
