In [1]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Select the Runtime > "Change runtime type" menu to enable a GPU accelerator, ')
  print('and then re-execute this cell.')
else:
  print(gpu_info)

Tue May 18 17:41:15 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 465.19.01    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   38C    P0    36W / 300W |      0MiB / 16160MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

# Fine tuning pictoBERT for maskless language modeling

In [2]:
import torch

# Tag used for the logger run name and the checkpoint file names.
MODEL_VERSION = "gloss"

# Training and data-loading settings.
MAX_EPOCHS = 1
BATCH_SIZE = 160
LEARNING_RATE = 1e-4
MLM_PROBABILITY = 0.15
NUM_WORKERS = 4

# Hardware: use every visible CUDA device; 16-bit precision only when at least
# one GPU is present, otherwise full 32-bit.
GPUS = torch.cuda.device_count()
PRECISION = 32 if GPUS == 0 else 16

In [3]:
# Install the training dependencies. pytorch_lightning is pinned to 1.2.10.
# NOTE(review): transformers is not pinned, so a fresh run may install a
# different release than the one this notebook was developed with.
!pip install pytorch_lightning==1.2.10 transformers 



## loading tokenizer

In [4]:
# Download the tokenizer definition from Google Drive; it is saved as
# childes_all_new.json in the working directory.
!gdown https://drive.google.com/uc?id=1-2g-GCxjBwESqDn3JByAJABU9Dkuqy0m

Downloading...
From: https://drive.google.com/uc?id=1-2g-GCxjBwESqDn3JByAJABU9Dkuqy0m
To: /content/childes_all_new.json
  0% 0.00/332k [00:00<?, ?B/s]100% 332k/332k [00:00<00:00, 98.1MB/s]


In [5]:
TOKENIZER_PATH = "./childes_all_new.json" # you can change this path to use your custom tokenizer

from transformers import PreTrainedTokenizerFast

# Build the fast tokenizer from the tokenizer file, then explicitly register the
# BERT-style special tokens on it. The data collator created later receives
# this tokenizer object.
loaded_tokenizer = PreTrainedTokenizerFast(tokenizer_file=TOKENIZER_PATH)
loaded_tokenizer.pad_token = "[PAD]"
loaded_tokenizer.sep_token = "[SEP]"
loaded_tokenizer.mask_token = "[MASK]"
loaded_tokenizer.cls_token = "[CLS]"
loaded_tokenizer.unk_token = "[UNK]"

## loading datasets

In [6]:
# Download the pre-tokenized dataset splits from Google Drive. In order, the
# files are saved as train_childes_all.pt, test_childes_all.pt and
# val_childes_all.pt.
!gdown https://drive.google.com/uc?id=1Mn2eIeSCBLk8tpU2NwNQblooFcqSbjmh
!gdown https://drive.google.com/uc?id=1qEX2zVxX8xFbBfY4mLSf0qG4wgkHFggD
!gdown https://drive.google.com/uc?id=1BwPXeP3lKQk-orKAq1bs2C7hg-B1D6QG

Downloading...
From: https://drive.google.com/uc?id=1Mn2eIeSCBLk8tpU2NwNQblooFcqSbjmh
To: /content/train_childes_all.pt
247MB [00:01, 148MB/s]
Downloading...
From: https://drive.google.com/uc?id=1qEX2zVxX8xFbBfY4mLSf0qG4wgkHFggD
To: /content/test_childes_all.pt
2.52MB [00:00, 79.4MB/s]
Downloading...
From: https://drive.google.com/uc?id=1BwPXeP3lKQk-orKAq1bs2C7hg-B1D6QG
To: /content/val_childes_all.pt
2.52MB [00:00, 80.3MB/s]


In [7]:
from torch.utils.data import Dataset, Subset
from torch import tensor
from sklearn.model_selection import train_test_split

class MyDataset(Dataset):
  """Map-style dataset over pre-tokenized examples.

  `examples` must provide the keys 'input_ids', 'attention_mask' and
  'special_tokens_mask', each an indexable collection with one entry per
  example. Every item is returned as a dict of tensors under the same keys.
  """

  def __init__(self, examples):
    # Keep a reference to each column; nothing is converted up front.
    self.input_ids = examples['input_ids']
    self.attention_mask = examples['attention_mask']
    self.special_tokens_mask = examples['special_tokens_mask']

  def __len__(self):
    # The number of examples is taken from the input_ids column.
    return len(self.input_ids)

  def __getitem__(self, idx):
    # Convert the idx-th entry of each column to a tensor on access.
    columns = (
      ("input_ids", self.input_ids),
      ("attention_mask", self.attention_mask),
      ("special_tokens_mask", self.special_tokens_mask),
    )
    return {name: tensor(column[idx]) for name, column in columns}


In [8]:
import pickle

# Load the three pickled splits and wrap each one in a MyDataset.
# Each file is opened in a `with` block so the handle is closed right after
# reading instead of being left open.
# NOTE(security): pickle.load can execute arbitrary code contained in the file;
# only load dataset files obtained from a source you trust.
with open('./train_childes_all.pt', 'rb') as train_file:
    tds = pickle.load(train_file)
train_dataset = MyDataset(tds)

with open('./val_childes_all.pt', 'rb') as val_file:
    vds = pickle.load(val_file)
val_dataset = MyDataset(vds)

with open('./test_childes_all.pt', 'rb') as test_file:
    tsds = pickle.load(test_file)
test_dataset = MyDataset(tsds)


In [9]:
from transformers import DataCollatorForLanguageModeling
data_collator = DataCollatorForLanguageModeling(tokenizer=loaded_tokenizer, mlm=False)

In [10]:
from torch.utils.data import DataLoader
from torch.utils.data.distributed import DistributedSampler

# Keyword arguments common to the train, validation and test loaders.
loader_options = dict(
    batch_size=BATCH_SIZE,
    num_workers=NUM_WORKERS,
    pin_memory=True,
    collate_fn=data_collator,
    drop_last=True,
)

# Only the training loader shuffles its examples.
train_dataloader = DataLoader(train_dataset, shuffle=True, **loader_options)
val_dataloader = DataLoader(val_dataset, **loader_options)
test_dataloader = DataLoader(test_dataset, **loader_options)

## Defining the optimizer

BERT uses an Adam optimizer with weight decay. The code below was taken from [here](https://github.com/jcyk/BERT/blob/88982eb2d8fdfb8984a93df2aa00de07f63af82d/adam.py)

In [11]:
import torch
from torch.optim import Optimizer

class AdamWeightDecayOptimizer(Optimizer):
    """A basic Adam optimizer that includes "correct" L2 weight decay.

    The decay term is added to the Adam update itself instead of to the
    gradient, so it never passes through the moment estimates. No bias
    correction is applied to the moment estimates.

    References:
        https://github.com/google-research/bert/blob/master/optimization.py
        https://raw.githubusercontent.com/pytorch/pytorch/v1.0.0/torch/optim/adam.py

    Arguments:
        params: iterable of parameters, or of dicts defining parameter groups.
        lr (float): learning rate (default: 1e-3).
        betas (Tuple[float, float]): decay rates of the running averages of
            the gradient and of its square (default: (0.9, 0.999)).
        eps (float): value added to the denominator (default: 1e-8).
        weight_decay (float): coefficient of the weight-decay term (default: 0).
        amsgrad (bool): normalize by the running maximum of the second-moment
            average instead of its current value (default: False).

    Raises:
        ValueError: if `lr` or `eps` is negative, or a beta lies outside [0, 1).
    """
    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8,
                 weight_decay=0, amsgrad=False):
        if not 0.0 <= lr:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if not 0.0 <= eps:
            raise ValueError("Invalid epsilon value: {}".format(eps))
        if not 0.0 <= betas[0] < 1.0:
            raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0]))
        if not 0.0 <= betas[1] < 1.0:
            raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1]))
        defaults = dict(lr=lr, betas=betas, eps=eps,
                        weight_decay=weight_decay, amsgrad=amsgrad)
        super(AdamWeightDecayOptimizer, self).__init__(params, defaults)

    def __setstate__(self, state):
        """Restore optimizer state, defaulting `amsgrad` to False for groups that lack it."""
        super(AdamWeightDecayOptimizer, self).__setstate__(state)
        for group in self.param_groups:
            group.setdefault('amsgrad', False)

    def step(self, closure=None):
        """Performs a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.

        Returns:
            The value returned by `closure`, or None when no closure is given.

        Raises:
            RuntimeError: if a parameter has a sparse gradient.
        """
        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            for p in group['params']:
                # Parameters that received no gradient are left untouched.
                if p.grad is None:
                    continue
                grad = p.grad.data
                if grad.is_sparse:
                    raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead')
                amsgrad = group['amsgrad']

                state = self.state[p]

                # State initialization
                if len(state) == 0:
                    state['step'] = 0
                    # Exponential moving average of gradient values
                    state['exp_avg'] = torch.zeros_like(p.data)
                    # Exponential moving average of squared gradient values
                    state['exp_avg_sq'] = torch.zeros_like(p.data)
                    if amsgrad:
                        # Maintains max of all exp. moving avg. of sq. grad. values
                        state['max_exp_avg_sq'] = torch.zeros_like(p.data)

                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
                if amsgrad:
                    max_exp_avg_sq = state['max_exp_avg_sq']
                beta1, beta2 = group['betas']

                state['step'] += 1

                # Decay the first and second moment running average coefficient.
                # The scalar is passed by keyword (`alpha=` / `value=`); the
                # positional-scalar overloads are deprecated.
                exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
                exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
                if amsgrad:
                    # Maintains the maximum of all 2nd moment running avg. till now
                    torch.max(max_exp_avg_sq, exp_avg_sq, out=max_exp_avg_sq)
                    # Use the max. for normalizing running avg. of gradient
                    denom = max_exp_avg_sq.sqrt().add_(group['eps'])
                else:
                    denom = exp_avg_sq.sqrt().add_(group['eps'])

                # Just adding the square of the weights to the loss function is *not*
                # the correct way of using L2 regularization/weight decay with Adam,
                # since that will interact with the m and v parameters in strange ways.
                #
                # Instead we want to decay the weights in a manner that doesn't interact
                # with the m/v parameters. This is equivalent to adding the square
                # of the weights to the loss with plain (non-momentum) SGD.
                update = (exp_avg / denom).add_(p.data, alpha=group['weight_decay'])
                p.data.add_(update, alpha=-group['lr'])
        return loss

## Define model

In [12]:
from argparse import ArgumentParser
import math
import torch
import torch.nn as nn
import pytorch_lightning as pl
from pytorch_lightning.metrics.functional import accuracy
from pytorch_lightning.callbacks import ModelCheckpoint


from transformers import BertLMHeadModel
from transformers import BertForMaskedLM

class LitBertClassifier(pl.LightningModule):
    """LightningModule that fine-tunes a `BertForMaskedLM` checkpoint.

    Besides its constructor argument, the class reads several notebook-level
    names: BATCH_SIZE, LEARNING_RATE, NUM_WORKERS, train_dataset,
    train_dataloader, data_collator and AdamWeightDecayOptimizer.

    Arguments:
        pretrained_model_name (str): model name or local directory passed to
            `BertForMaskedLM.from_pretrained`.
    """

    def __init__(self, pretrained_model_name='bert-large-uncased'):
        super().__init__()
        self.save_hyperparameters()
        self.batch_size = BATCH_SIZE
        self.lr = LEARNING_RATE
        self.train_dataset = train_dataset
        self.bert = BertForMaskedLM.from_pretrained(pretrained_model_name)

    def freeze_to(self, layers):
        """Disable gradients for the first `layers` encoder layers."""
        for param in self.bert.bert.encoder.layer[:layers].parameters():
            param.requires_grad = False

    def forward(self, input_ids, attention_mask, labels=None):
        """Run the wrapped model, forwarding `labels` only when they are given.

        Returns whatever `self.bert` returns.
        """
        # Identity check: `== None` on a tensor is not a reliable None test.
        if labels is None:
            return self.bert(
                input_ids=input_ids,
                attention_mask=attention_mask,
            )
        return self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )

    def training_step(self, batch, batch_idx):
        """Compute, log and return the training loss for one batch."""
        outputs = self._shared_step(batch, batch_idx)
        # The first element of the model output is used as the loss.
        loss = outputs[0]

        self.log("train_loss", loss, on_epoch=True, prog_bar=True,)

        return loss

    def train_dataloader(self):
        """Build a training DataLoader over `self.train_dataset`.

        NOTE(review): unlike the notebook-level train loader, this one sets
        neither `shuffle` nor `drop_last`. The notebook passes its own loader
        to `trainer.fit`; confirm whether this method is still needed.
        """
        return DataLoader(self.train_dataset, batch_size=self.batch_size, num_workers=NUM_WORKERS, pin_memory=True, collate_fn=data_collator)

    def validation_step(self, batch, batch_idx):
        """Return the detached loss of one validation batch under 'val_loss'."""
        with torch.no_grad():
            result = self._shared_step(batch, batch_idx)
            loss = result[0].detach()

            return {
                "val_loss": loss
            }

    def validation_epoch_end(self, outputs):
        """Log the mean of the per-batch validation losses as 'val_loss'."""
        val_loss = torch.stack([x['val_loss'] for x in outputs]).mean()
        self.log("val_loss", val_loss, on_epoch=True, prog_bar=True,)

    def test_step(self, batch, batch_idx):
        """Log and return the loss and perplexity of one test batch."""
        with torch.no_grad():
            result = self._shared_step(batch, batch_idx)
            loss = result[0].detach()
            # Perplexity is the exponential of this batch's loss.
            # NOTE(review): the logged epoch value is presumably an average of
            # per-batch perplexities rather than exp(mean loss) — confirm.
            perplexity = torch.exp(loss)
            self.log("test_ppl", perplexity, on_epoch=True, prog_bar=True,)
            self.log("test_loss", loss, on_epoch=True, prog_bar=True,)

            return {
                "test_ppl": perplexity,
                "test_loss": loss
            }

    def _shared_step(self, batch, batch_idx):
        """Run a forward pass on the batch's input_ids, attention_mask and labels."""
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        labels = batch["labels"]

        outputs = self.forward(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )

        return outputs

    def configure_optimizers(self):
        """Return the optimizer and a linear one-cycle learning-rate scheduler."""
        optimizer = AdamWeightDecayOptimizer(self.parameters(), lr=self.lr, betas=(0.9, 0.999), eps=1e-6)
        # NOTE(review): the schedule length is hard-coded to 500 epochs and
        # uses the notebook-level `train_dataloader`, independent of the
        # Trainer's max_epochs. Also confirm how often Lightning steps this
        # scheduler (per epoch or per batch) with this return format.
        scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer, max_lr=self.lr, steps_per_epoch=len(train_dataloader), epochs=500, anneal_strategy='linear')
        return [optimizer], [scheduler]

    def backward(self, loss, optimizer, idx):
        """Backpropagate `loss`; `optimizer` and `idx` are not used."""
        loss.backward()


## Download pictoBERT

In [14]:
# Download both pretrained model archives from Google Drive: first
# pictobert-large-contextual.zip, then pictobert-large-gloss.zip.
!gdown https://drive.google.com/uc?id=11FaLUqVe3hhgkwoX7pG7Yp0LAwyqGzq6
!gdown https://drive.google.com/uc?id=1KzyPFFfU5rm_ZmoF2L1uGPRlenQ3YjVJ

Downloading...
From: https://drive.google.com/uc?id=11FaLUqVe3hhgkwoX7pG7Yp0LAwyqGzq6
To: /content/pictobert-large-contextual.zip
1.18GB [00:10, 117MB/s]
Downloading...
From: https://drive.google.com/uc?id=1KzyPFFfU5rm_ZmoF2L1uGPRlenQ3YjVJ
To: /content/pictobert-large-gloss.zip
1.18GB [00:12, 96.6MB/s]


In [14]:
# Extract both downloaded archives. The gloss archive unpacks into
# pictobert-gloss/, the directory the model is loaded from below.
!unzip pictobert-large-contextual.zip
!unzip pictobert-large-gloss.zip

Archive:  pictobert-large-gloss.zip
   creating: pictobert-gloss/
  inflating: pictobert-gloss/config.json  
  inflating: pictobert-gloss/pytorch_model.bin  


In [13]:
# Instantiate the Lightning module from one of the extracted model directories.
# Switch the commented line to fine-tune the contextualized variant instead.
# model = LitBertClassifier("./pictobert") # contextualized
model = LitBertClassifier("./pictobert-gloss") # gloss-based

## logger

In [14]:
from pytorch_lightning import loggers as pl_loggers

# TensorBoard logger writing under ./logs, with MODEL_VERSION as the run name.
tb_logger = pl_loggers.TensorBoardLogger("./logs",name=MODEL_VERSION)

## checkpoint

In [15]:
# Keep the checkpoint with the lowest validation loss, plus the latest one.
# `monitor` names the metric logged by LitBertClassifier.validation_epoch_end;
# without it, `mode='min'` has no metric to compare.
checkpoint_callback = ModelCheckpoint(
    dirpath="./checkpoints",
    filename='bert'+MODEL_VERSION+'-large-{epoch:02d}-{val_loss:.2f}',
    monitor='val_loss',
    mode='min',
    save_last=True
)



## trainer

In [16]:
# Trainer used for fine-tuning. Epoch count, GPU count and precision come from
# the configuration cell; logging and checkpointing use the objects built above.
# NOTE(review): confirm that accelerator='ddp' is intended inside a notebook
# and when GPUS is 0.
trainer = pl.Trainer(
    accelerator='ddp',
    max_epochs=MAX_EPOCHS,
    logger=tb_logger,
    gpus=GPUS,
    callbacks=[checkpoint_callback],
    precision=PRECISION,
)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
Using native 16bit precision.


## training

In [21]:
# Fine-tune the model with the train and validation DataLoaders built earlier.
trainer.fit(model, train_dataloader,val_dataloader)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Missing logger folder: ./logs/gloss
initializing ddp: GLOBAL_RANK: 0, MEMBER: 1/1

  | Name | Type            | Params
-----------------------------------------
0 | bert | BertForMaskedLM | 317 M 
-----------------------------------------
317 M     Trainable params
0         Non-trainable params
317 M     Total params
1,271.248 Total estimated model params size (MB)


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validation sanity check', layout=Layout…



HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…

	add_(Number alpha, Tensor other)
Consider using one of the following signatures instead:
	add_(Tensor other, *, Number alpha) (Triggered internally at  /pytorch/torch/csrc/utils/python_arg_parser.cpp:1005.)


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

Saving latest checkpoint...





1

In [16]:
# Restore a LitBertClassifier from a checkpoint written during training.
# NOTE(review): the file name embeds the epoch and the rounded val_loss, so it
# changes between runs. Also, `checkpoint` is not used by the later cells
# (`trainer.test` evaluates `model`) — confirm which object should be tested.
# checkpoint = LitBertClassifier.load_from_checkpoint("checkpoints/bert-large-epoch=00-val_loss=0.00.ckpt")
checkpoint = LitBertClassifier.load_from_checkpoint("checkpoints/bertgloss-large-epoch=00-val_loss=0.02.ckpt")

In [20]:
# Trainer used for evaluation, configured like the training Trainer.
# Precision comes from the PRECISION constant (16 with a GPU, 32 without)
# rather than a hard-coded 16, so this cell matches the training setup and
# does not request 16-bit precision on a CPU-only runtime.
trainer = pl.Trainer(
    accelerator='ddp',
    max_epochs=MAX_EPOCHS,
    logger=tb_logger,
    gpus=GPUS,
    callbacks=[checkpoint_callback],
    precision=PRECISION,
)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
Using native 16bit precision.


In [17]:
# Evaluate `model` on the test DataLoader; the logged metrics are test_loss
# and test_ppl.
trainer.test(model,test_dataloaders=test_dataloader)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
initializing ddp: GLOBAL_RANK: 0, MEMBER: 1/1


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Testing', layout=Layout(flex='2'), max=…


--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'test_loss': 3.1758742332458496, 'test_ppl': 24.04897117614746}
--------------------------------------------------------------------------------


[{'test_loss': 3.1758742332458496, 'test_ppl': 24.04897117614746}]