In [4]:
import logging
logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO
    )
import numpy as np
import scipy
import torch
import torch.nn.functional as F
from torch.utils.data.dataset import Dataset
import argparse
import os
from pathlib import Path
from torch.optim import SGD, Adam
import pytorch_lightning as pl
from torchmetrics import Accuracy
from datetime import datetime 
from pathlib import Path
from pytorch_lightning import loggers as pl_loggers
import time
from argparse import Namespace
import json
import shutil
logger = logging.getLogger(__name__)

class BaseModel(pl.LightningModule):
    def __init__(
        self,
        **config_kwargs
    ):
        """Initialize a model, tokenizer and config."""
        logger.info("Initilazing BaseModel")
        super().__init__()
        self.save_hyperparameters() #save hyperparameters to checkpoint
        self.step_count = 0
        self.output_dir = Path(self.hparams.output_dir)
        self.model = self._load_model()

        self.accuracy = Accuracy()

    def _load_model(self):
        raise NotImplementedError

    def forward(self, **inputs):
        return self.model(**inputs)

    def batch2input(self, batch):
        raise NotImplementedError

    def training_step(self, batch, batch_idx):
        input = self.batch2input(batch)
        labels = input['labels']
        loss, pred_labels, _ = self(**input)

        self.log('train_loss', loss, prog_bar=False)
        self.log('train_acc', self.accuracy(pred_labels.view(-1), labels.view(-1).int()), prog_bar=False)
        
        return {"loss": loss}

    def validation_step(self, batch, batch_idx):
        input = self.batch2input(batch)
        labels = input['labels']
        loss, pred_labels, _ = self(**input)

        self.log('val_loss', loss)
        self.log('val_acc', self.accuracy(pred_labels.view(-1), labels.view(-1).int()))

    def test_step(self, batch, batch_nb):
        input = self.batch2input(batch)
        labels = input['labels']
        loss, pred_labels, _ = self(**input)

        self.log('test_loss', loss)
        self.log('test_acc', self.accuracy(pred_labels.view(-1), labels.view(-1).int()))

    def configure_optimizers(self):
        """Prepare optimizer and schedule (linear warmup and decay)"""
        model = self.model
        # optimizer = SGD(model.parameters(), lr=self.hparams.learning_rate)
        optimizer = Adam(model.parameters(), lr=self.hparams.learning_rate)

        self.opt = optimizer
        return [optimizer]

    def setup(self, stage):
        if stage == "fit":
            self.train_loader = self.get_dataloader("train", self.hparams.train_batch_size, shuffle=True)

    def train_dataloader(self):
        return self.train_loader

    def val_dataloader(self):
        return self.get_dataloader("dev", self.hparams.eval_batch_size, shuffle=False)

    def test_dataloader(self):
        return self.get_dataloader("test", self.hparams.eval_batch_size, shuffle=False)

    @staticmethod
    def add_generic_args(parser, root_dir) -> None:
        parser.add_argument(
            "--max_epochs",
            default=10,
            type=int,
            help="The number of epochs to train your model.",
        )
        ############################################################
        ## WARNING: set --gpus 0 if you do not have access to GPUS #
        ############################################################
        parser.add_argument(
            "--gpus",
            default=1,
            type=int,
            help="The number of GPUs allocated for this, it is by default 1. Set to 0 for no GPU.",
        )
        parser.add_argument(
            "--output_dir",
            default=None,
            type=str,
            required=True,
            help="The output directory where the model predictions and checkpoints will be written.",
        )
        parser.add_argument("--do_train", action="store_true", default=True, help="Whether to run training.")
        parser.add_argument("--do_predict", action="store_true", help="Whether to run predictions on the test set.")
        parser.add_argument("--seed", type=int, default=42, help="random seed for initialization")
        parser.add_argument(
            "--data_dir",
            default="./",
            type=str,
            help="The input data dir. Should contain the training files.",
        )
        parser.add_argument("--learning_rate", default=1e-2, type=float, help="The initial learning rate for training.")
        parser.add_argument("--num_workers", default=16, type=int, help="kwarg passed to DataLoader")
        parser.add_argument("--num_train_epochs", dest="max_epochs", default=3, type=int)
        parser.add_argument("--train_batch_size", default=32, type=int)
        parser.add_argument("--eval_batch_size", default=32, type=int)
    
def generic_train(
    model: BaseModel,
    args: argparse.Namespace,
    early_stopping_callback=False,
    extra_callbacks=[],
    checkpoint_callback=None,
    logging_callback=None,
    **extra_train_kwargs
):
    
    # init model
    odir = Path(model.hparams.output_dir)
    odir.mkdir(exist_ok=True)
    log_dir = Path(os.path.join(model.hparams.output_dir, 'logs'))
    log_dir.mkdir(exist_ok=True)

    # Tensorboard logger
    pl_logger = pl_loggers.TensorBoardLogger(
        save_dir=log_dir,
        version="version_" + datetime.now().strftime("%d-%m-%Y--%H-%M-%S"),
        name="",
        default_hp_metric=True
    )

    # add custom checkpoints
    ckpt_path = os.path.join(
        args.output_dir, pl_logger.version, "checkpoints",
    )
    if checkpoint_callback is None:
        checkpoint_callback = pl.callbacks.ModelCheckpoint(
            dirpath=ckpt_path, filename="{epoch}-{val_acc:.2f}", monitor="val_acc", mode="max", save_top_k=1, verbose=True
        )

    train_params = {}

    train_params["max_epochs"] = args.max_epochs

    if args.gpus > 1:
        train_params["distributed_backend"] = "ddp"

    trainer = pl.Trainer.from_argparse_args(
        args,
        enable_model_summary=False,
        callbacks= [checkpoint_callback] + extra_callbacks,
        logger=pl_logger,
        **train_params,
    )

    if args.do_train:
        trainer.fit(model)
        # track model performance under differnt hparams settings in "Hparams" of TensorBoard
        pl_logger.log_hyperparams(params=model.hparams, metrics={'hp_metric': checkpoint_callback.best_model_score.item()})
        pl_logger.save()

        # save best model to `best_model.ckpt`
        target_path = os.path.join(ckpt_path, 'best_model.ckpt')
        logger.info(f"Copy best model from {checkpoint_callback.best_model_path} to {target_path}.")
        shutil.copy(checkpoint_callback.best_model_path, target_path)

    
    # Optionally, predict on test set and write to output_dir
    if args.do_predict:
        best_model_path = os.path.join(ckpt_path, "best_model.ckpt")
        model = model.load_from_checkpoint(best_model_path)
        return trainer.test(model)
    
    return trainer


In [5]:
from nltk.tokenize import WordPunctTokenizer 
tokenizer = WordPunctTokenizer()

class LSTM_PL(BaseModel):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        
    def _load_model(self):
        self.hparams.vocab = json.load(
            open(
            os.path.join(self.hparams.data_dir, self.hparams.vocab_filename)
            )
        )
        self.hparams.vocab_size = len(self.hparams.vocab)
        if self.hparams.attention:
            return LSTM_Attention(self.hparams.vocab, self.hparams.vocab_size, self.hparams.word_embedding_size, self.hparams.use_glove)
        else:
            return LSTM(self.hparams.vocab, self.hparams.vocab_size, self.hparams.word_embedding_size, self.hparams.use_glove)

    def get_dataloader(self, type_path, batch_size, shuffle=False):
        # dataset path (change if necessary)
        datapath = os.path.join(self.hparams.data_dir, f"sst2.{type_path}")
        data = open(datapath).readlines()
        data = [d.strip().split(" ", maxsplit=1) for d in data] # list of [label, text] pair
        dataset = SST2Dataset(self.hparams.vocab, data)

        logger.info(f"Loading {type_path} data and labels from {datapath}")
        data_loader = torch.utils.data.DataLoader(
            dataset=dataset,
            batch_size=batch_size,
            shuffle=shuffle,
            num_workers=self.hparams.num_workers,
            collate_fn=dataset.collate_fn
        )
        
        return data_loader    

    def configure_optimizers(self):
        """Prepare optimizer and schedule (linear warmup and decay)"""
        model = self.model
        optimizer = Adam(model.parameters(), lr=self.hparams.learning_rate)
        self.opt = optimizer
        return [optimizer]
    
    def batch2input(self, batch):
        return {"input_ids": batch[0], "labels": batch[1], "lengths": batch[2], "masks": batch[3]}

    @staticmethod
    def add_model_specific_args(parser, root_dir):
        parser.add_argument(
            "--vocab_filename",
            default=None,
            type=str,
            required=True,
            help="Pretrained tokenizer name or path",
        )
        parser.add_argument(
            "--optimizer",
            default="adam",
            type=str,
            required=True,
            help="Whether to use SGD or not",
        )
        parser.add_argument(
            "--word_embedding_size",
            default=300,
            type=int,
            help="Pretrained tokenizer name or path",
        )
        parser.add_argument(
            "--attention",
            action="store_true",
            help="Use attention or not",
        )
        parser.add_argument("--use_glove", action="store_true", help="Whether to use vector representaion from GloVe")

        return parser

In [48]:
class LSTM(torch.nn.Module):
    """
    LSTM Seq classification model
    """
    def __init__(self, vocab, vocab_size, word_embedding_size, use_glove=None):
        """
        # Paramters
          vocab_size: int
              size of the vocabulary.
        """
        super(LSTM, self).__init__()
        self.embedding = torch.nn.Embedding(vocab_size, word_embedding_size, padding_idx=0)
        if use_glove:
            self._load_glove(vocab, word_embedding_size)
        #######################################
        ## TODO: add LSTM and output layer(s) #
        #######################################
        hidden_size = 300

        self.LSTM = torch.nn.LSTM(word_embedding_size, hidden_size, batch_first=True)
        self.relu = torch.nn.ReLU()
        self.dropout = torch.nn.Dropout()
        self.linear = torch.nn.Linear(hidden_size, 1)
        self.sigmoid = torch.nn.Sigmoid()

        self.loss = torch.nn.BCELoss()

    def forward(self, input_ids, labels, lengths, masks):
        """
        # Parameters
        input_ids: 
            matrix of size (batch_size, feature_length). Each row in data represents a sequence of token ids coming from tokenzied input text and vocabulary. 
        label: matrix of size (batch_size,). 
            Ground truth labels.
        lengths: matrix of size (batch_size, 1). 
            Token length of input text. Help you to compute average word embedding
        mask: matrix of size (batch_size, feature_length). 
            Input mask that tells you whether the token is pad or not. If not masks = 1, else = 0. This helps you to compute attention weights
        # Returns
        loss: tensor
            loss should be a scalar averaged accross batches
        predicted_labels : model predictions. 
            Should be either 0 or 1 based on a threshold (usually 0.5).
        """
        #################################################################
        ## TODO: compute loss and predicted_labels based on model output#
        #################################################################

        # HINT: you can use lengths to retrieve the hidden state corresponding to the last word
        # you may find this link helpful: https://discuss.pytorch.org/t/selecting-element-on-dimension-from-list-of-indexes/36319
        lengths = lengths - 1
        out = self.embedding(input_ids)
        # print(f"embedding shape: {out.shape}")
        out, (hidden, cell) = self.LSTM(out)
        # print(f"after lstm: {out.shape}")
        out = torch.cat([out[i:i+1][:,ind-1] for i,ind in enumerate(lengths)], dim=0)
        # out = out[torch.arange(out.size(0)), lengths]
        out = out.view(out.shape[0],out.shape[2])
        # print(f"before linear: {out.shape}")
        # return
        # out = self.relu(out)
        # out = self.dropout(out)
        out = self.linear(out)
        # print(f"after linear: {out.shape}")
        probs = self.sigmoid(out)
        # print(f"probs shape: {probs.shape}")
        # print(probs)
        # return
        loss = self.loss(probs, labels)
        predicted_labels = torch.tensor([0 if p < 0.5 else 1 for p in probs]).cuda()
        # print(f"predicted_labels shape: {predicted_labels.shape}")
        # print(predicted_labels)
        # return
        return loss, predicted_labels, [] # use empty list to keep number of return tensors consistant with lstm attention



In [49]:
import logging
logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO
    )
import time
import argparse
import glob
import os
logger = logging.getLogger(__name__)

def main():
    ########################################################
    ## TODO: change args if needed according to your files #
    ########################################################
    mock_args = f"--word_embedding_size 300 --data_dir {DATA_DIR} --output_dir lstm --optimizer adam \
    --vocab_filename unigram_vocab.json --learning_rate 0.001 --max_epochs 10 --do_predict \
    --train_batch_size 16 --use_glove"

    # load hyperparameters
    parser = argparse.ArgumentParser()
    BaseModel.add_generic_args(parser, os.getcwd())
    parser = LSTM_PL.add_model_specific_args(parser, os.getcwd())
    args = parser.parse_args(mock_args.split())
    print(args)
    # fix random seed to make sure the result is reproducible
    pl.seed_everything(args.seed)

    # If output_dir not provided, a folder will be generated in pwd
    if args.output_dir is None:
        args.output_dir = os.path.join(
            "./results",
            f"{args.task}_{time.strftime('%Y%m%d_%H%M%S')}",
        )
        os.makedirs(args.output_dir)
    dict_args = vars(args)
    model = LSTM_PL(**dict_args)
    trainer = generic_train(model, args)


if __name__ == "__main__":
    main()


02/15/2022 00:40:09 - INFO - pytorch_lightning.utilities.seed -   Global seed set to 42
02/15/2022 00:40:09 - INFO - __main__ -   Initilazing BaseModel
02/15/2022 00:40:09 - INFO - __main__ -   Load glove pretrained word embeddings


Namespace(attention=False, data_dir='./data', do_predict=True, do_train=True, eval_batch_size=32, gpus=1, learning_rate=0.001, max_epochs=10, num_workers=16, optimizer='adam', output_dir='lstm', seed=42, train_batch_size=16, use_glove=True, vocab_filename='unigram_vocab.json', word_embedding_size=300)


02/15/2022 00:40:11 - INFO - pytorch_lightning.utilities.distributed -   GPU available: True, used: True
02/15/2022 00:40:11 - INFO - pytorch_lightning.utilities.distributed -   TPU available: False, using: 0 TPU cores
02/15/2022 00:40:11 - INFO - pytorch_lightning.utilities.distributed -   IPU available: False, using: 0 IPUs
02/15/2022 00:40:11 - INFO - __main__ -   Loading train data and labels from ./data/sst2.train
  cpuset_checked))
02/15/2022 00:40:11 - INFO - pytorch_lightning.accelerators.gpu -   LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Validation sanity check: 0it [00:00, ?it/s]

02/15/2022 00:40:11 - INFO - __main__ -   Loading dev data and labels from ./data/sst2.dev
02/15/2022 00:40:12 - INFO - pytorch_lightning.utilities.seed -   Global seed set to 42


Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

02/15/2022 00:40:26 - INFO - pytorch_lightning.utilities.distributed -   Epoch 0, global step 432: val_acc reached 0.78096 (best 0.78096), saving model to "/content/lstm/version_15-02-2022--00-40-11/checkpoints/epoch=0-val_acc=0.78.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

02/15/2022 00:40:41 - INFO - pytorch_lightning.utilities.distributed -   Epoch 1, global step 865: val_acc reached 0.82225 (best 0.82225), saving model to "/content/lstm/version_15-02-2022--00-40-11/checkpoints/epoch=1-val_acc=0.82.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

02/15/2022 00:40:55 - INFO - pytorch_lightning.utilities.distributed -   Epoch 2, global step 1298: val_acc was not in top 1


Validating: 0it [00:00, ?it/s]

02/15/2022 00:41:09 - INFO - pytorch_lightning.utilities.distributed -   Epoch 3, global step 1731: val_acc was not in top 1


Validating: 0it [00:00, ?it/s]

02/15/2022 00:41:23 - INFO - pytorch_lightning.utilities.distributed -   Epoch 4, global step 2164: val_acc was not in top 1


Validating: 0it [00:00, ?it/s]

02/15/2022 00:41:40 - INFO - pytorch_lightning.utilities.distributed -   Epoch 5, global step 2597: val_acc was not in top 1


Validating: 0it [00:00, ?it/s]

02/15/2022 00:41:55 - INFO - pytorch_lightning.utilities.distributed -   Epoch 6, global step 3030: val_acc was not in top 1


Validating: 0it [00:00, ?it/s]

02/15/2022 00:42:09 - INFO - pytorch_lightning.utilities.distributed -   Epoch 7, global step 3463: val_acc was not in top 1


Validating: 0it [00:00, ?it/s]

02/15/2022 00:42:23 - INFO - pytorch_lightning.utilities.distributed -   Epoch 8, global step 3896: val_acc was not in top 1


Validating: 0it [00:00, ?it/s]

02/15/2022 00:42:37 - INFO - pytorch_lightning.utilities.distributed -   Epoch 9, global step 4329: val_acc was not in top 1
02/15/2022 00:42:38 - INFO - __main__ -   Copy best model from /content/lstm/version_15-02-2022--00-40-11/checkpoints/epoch=1-val_acc=0.82.ckpt to lstm/version_15-02-2022--00-40-11/checkpoints/best_model.ckpt.
02/15/2022 00:42:38 - INFO - __main__ -   Initilazing BaseModel
02/15/2022 00:42:38 - INFO - __main__ -   Load glove pretrained word embeddings
02/15/2022 00:42:40 - INFO - pytorch_lightning.accelerators.gpu -   LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
02/15/2022 00:42:40 - INFO - __main__ -   Loading test data and labels from ./data/sst2.test


Testing: 0it [00:00, ?it/s]

--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'test_acc': 0.833058774471283, 'test_loss': 0.3802799880504608}
--------------------------------------------------------------------------------


In [247]:

class LSTM_Attention(torch.nn.Module):
    """
    LSTM with Attention Seq classification model
    """
    def __init__(self, vocab, vocab_size, word_embedding_size, use_glove=None):
        """
        # Parameters
        vocab_size: int
            size of the vocabulary.
        """
        super(LSTM_Attention, self).__init__()
        self.embedding = torch.nn.Embedding(vocab_size, word_embedding_size, padding_idx=0)
        if use_glove:
            self._load_glove(vocab, word_embedding_size)
        #################################################
        ## TODO: add LSTM, attention, and output layers #
        #################################################
        hidden_size = 300
        self.lamda = 3

        self.LSTM = torch.nn.LSTM(word_embedding_size, hidden_size, batch_first=True)
        # self.relu = torch.nn.ReLU()
        self.dropout = torch.nn.Dropout()

        self.attention = torch.nn.Linear(hidden_size, 1)
        self.softmax = torch.nn.Softmax()

        self.linear = torch.nn.Linear(hidden_size, 1)
        self.sigmoid = torch.nn.Sigmoid()

        self.loss = torch.nn.BCELoss()
        
        
    def _load_glove(self, vocab, word_embedding_size):
        logger.info("Load glove pretrained word embeddings")
        vectors = {}
        with open(os.path.join(DATA_DIR, "glove.small.300d.txt")) as fin:
            for line in fin:
                parts = line.split()
                vectors[parts[0]] = np.array([float(v) for v in parts[1:]])
        weights = []
        id2word = {k: w for w, k in vocab.items()}
        for i in range(len(vocab)):
            word = id2word[i]
            if word in vectors:
                weights.append(torch.from_numpy(vectors[word]))
            elif word in ["<pad>"]:
                weights.append(torch.zeros((word_embedding_size,)))
            else:
                weights.append(torch.randn((word_embedding_size,)))
        weights = torch.stack(weights).float()
        self.embedding.load_state_dict({"weight":weights})
        

    def forward(self, input_ids, labels, lengths, masks):
        """
        # Parameters
        input_ids: matrix of size (batch_size, feature_length). 
            Each row in data represents a sequence of token ids coming from tokenzied input text and vocabulary. 
        label: matrix of size (batch_size,).
            Ground truth labels.
        lengths: matrix of size (batch_size, 1). 
            Token length of input text. Help you to compute average word embedding
        mask: matrix of size (batch_size, feature_length). 
            Input mask that tells you whether the token is pad or not. If not masks = 1, else = 0. This helps you to compute attention weights
        # Returns
        loss: loss should be a scalar averaged accross batches
        predicted_labels : model predictions. Should be either 0 or 1 based on a threshold (usually 0.5).
        """
        #################################################################
        ## TODO: compute loss and predicted_labels based on model output#
        #################################################################
        # lengths = lengths - 1
        out = self.embedding(input_ids)
        # print(f"embedding shape: {out.shape}")
        out, (hidden, cell) = self.LSTM(out) # (batch_size, feature_length, hidden_size)
        # print(f"after lstm: {out.shape}") 
        # out = torch.cat([out[i:i+1][:,ind-1] for i,ind in enumerate(lengths)], dim=0)
        # out = out.view(out.shape[0],out.shape[2])
        weights = self.attention(out)/self.lamda
        weights[masks <1 ] = -1e9
        # print(f"weights: {weights.shape}")
        # print(f"weights: {weights[0]}")
        weights = self.softmax(weights) # (batch_size, feature_length)
        # print(f"weights: {weights.shape}")
        print(f"weights: {weights[0]}")
        print(f"weights: {masks[0]}")
        return

        out = torch.sum(weights * out, dim=1)
        # print(f"after weighted sum: {out.shape}")
        # return
        # out = self.relu(out)
        # out = self.dropout(out)
        out = self.linear(out)
        # print(f"after linear: {out.shape}")
        probs = self.sigmoid(out)
        # print(f"probs shape: {probs.shape}")
        # print(probs)
        # return
        loss = self.loss(probs, labels)
        predicted_labels = torch.tensor([0 if p < 0.5 else 1 for p in probs]).cuda()
        # print(f"predicted_labels shape: {predicted_labels.shape}")
        # print(predicted_labels)
        # return
        
        # HINT: you can assign -1e9 to padded tokens based on masks so that after softmax, these tokens get zero attention
        
        return loss, predicted_labels, weights

In [248]:
import logging
logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO
    )
import time
import argparse
import glob
import os
logger = logging.getLogger(__name__)

def main():
    ########################################################
    ## TODO: change args if needed according to your files #
    ########################################################
    mock_args = f"--word_embedding_size 300 --data_dir {DATA_DIR} --output_dir lstm-att --optimizer adam \
    --vocab_filename unigram_vocab.json --learning_rate 0.001 --max_epochs 5 --do_predict --attention --use_glove \
    --train_batch_size 16" 
    
    # load hyperparameters
    parser = argparse.ArgumentParser()
    BaseModel.add_generic_args(parser, os.getcwd())
    parser = LSTM_PL.add_model_specific_args(parser, os.getcwd())
    args = parser.parse_args(mock_args.split())
    print(args)
    # fix random seed to make sure the result is reproducible
    pl.seed_everything(args.seed)

    # If output_dir not provided, a folder will be generated in pwd
    if args.output_dir is None:
        args.output_dir = os.path.join(
            "./results",
            f"{args.task}_{time.strftime('%Y%m%d_%H%M%S')}",
        )
        os.makedirs(args.output_dir)
    dict_args = vars(args)
    model = LSTM_PL(**dict_args)
    trainer = generic_train(model, args)


if __name__ == "__main__":
    main()


02/15/2022 02:43:05 - INFO - pytorch_lightning.utilities.seed -   Global seed set to 42
02/15/2022 02:43:05 - INFO - __main__ -   Initilazing BaseModel
02/15/2022 02:43:05 - INFO - __main__ -   Load glove pretrained word embeddings


Namespace(attention=True, data_dir='./data', do_predict=True, do_train=True, eval_batch_size=32, gpus=1, learning_rate=0.001, max_epochs=5, num_workers=16, optimizer='adam', output_dir='lstm-att', seed=42, train_batch_size=16, use_glove=True, vocab_filename='unigram_vocab.json', word_embedding_size=300)


02/15/2022 02:43:07 - INFO - pytorch_lightning.utilities.distributed -   GPU available: True, used: True
02/15/2022 02:43:07 - INFO - pytorch_lightning.utilities.distributed -   TPU available: False, using: 0 TPU cores
02/15/2022 02:43:07 - INFO - pytorch_lightning.utilities.distributed -   IPU available: False, using: 0 IPUs
02/15/2022 02:43:07 - INFO - __main__ -   Loading train data and labels from ./data/sst2.train
  cpuset_checked))
02/15/2022 02:43:07 - INFO - pytorch_lightning.accelerators.gpu -   LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Validation sanity check: 0it [00:00, ?it/s]

02/15/2022 02:43:07 - INFO - __main__ -   Loading dev data and labels from ./data/sst2.dev


weights: tensor([[0.0311],
        [0.0313],
        [0.0309],
        [0.0311],
        [0.0306],
        [0.0309],
        [0.0000],
        [0.0000],
        [0.0000],
        [0.0000],
        [0.0000],
        [0.0000],
        [0.0000],
        [0.0000],
        [0.0000],
        [0.0000],
        [0.0000],
        [0.0000],
        [0.0000],
        [0.0000],
        [0.0000],
        [0.0000],
        [0.0000],
        [0.0000],
        [0.0000],
        [0.0000],
        [0.0000],
        [0.0000],
        [0.0000],
        [0.0000],
        [0.0000],
        [0.0000],
        [0.0000],
        [0.0000],
        [0.0000],
        [0.0312],
        [0.0312],
        [0.0312],
        [0.0312],
        [0.0312],
        [0.0312],
        [0.0312],
        [0.0312],
        [0.0312],
        [0.0312],
        [0.0312],
        [0.0312],
        [0.0312],
        [0.0312],
        [0.0312]], device='cuda:0')
weights: tensor([1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 

TypeError: ignored

In [None]:
%reload_ext  tensorboard
%tensorboard --logdir lstm/

# Visualize Attention Weights

In [None]:
# This is a helper function for you to visualize your attention weights
from IPython.display import HTML, display
def visualize_attention_weights(tokens, att_weights):
    """
    # Paramters
    tokens: list of strings
        tokenized words of a sentence. 
    att_weights: list of floats, each weight should be in [0, 1]
        att_weights gerneated by LSTM Attention model. 
    """
    html_template = """<span style="background-color:rgb(255, {}, {})">{}</span>"""
    out = []
    for t, w in zip(tokens, att_weights):
        rgb = 255 - w*255
        out.append(html_template.format(rgb,rgb,t))
    html = " ".join(out)
    display(HTML(html), metadata=dict(isolated=True))
    
############## YOU NEED TO USE ACTUAL EXAMPLES AND ATTENTION FROM YOUR MODEL ###
tokens = ["this", "is", "good"]
att_weights = [0.1, 0.2, 0.7]
visualize_attention_weights(tokens, att_weights)
############## 

In [95]:
## HINT: If you want to run predictions on test data to get attention weights, you may adapt the following code:
## On CPU
"""
model = LR_PL.load_from_checkpoint("PATH_TO_CHECKPOINT")
test_loader=model.test_loader()
for batch in test_loader:
    input = model.batch2input(batch)
    loss, pred, att_weights = model(**input)
"""

## On GPU
""" 
model = LR_PL.load_from_checkpoint("PATH_TO_CHECKPOINT").to('cuda')
test_loader = model.test_loader()
for batch in test_loader:
    model.transfer_batch_to_device(batch) # move data to gpu
    input = model.batch2input(batch)
    loss, pred, att_weights = model(**input)
"""

# Error Analysis

In [223]:
def sample_predictions(k, ckpt):
    model = LSTM_PL.load_from_checkpoint(ckpt)
    test_dataloader = model.test_dataloader()

    correct_dict = {}
    errors_dict = {}
    idx = 0
    for i, batch in enumerate(test_dataloader):
        input = model.batch2input(batch)
        labels = input['labels'].view(-1)
        offset = i*len(labels)

        loss, pred, att_weights = model(**input)
        att_weights = att_weights.detach().numpy()

        correct = np.where(labels==pred.cpu())[0]
        errors = np.where(labels!=pred.cpu())[0]
        correct_dict.update({idx+offset: att_weights[idx] for idx in correct})
        errors_dict.update({idx+offset: att_weights[idx] for idx in errors})

        if len(correct_dict) >= k and len(errors_dict) >= k:
          return correct_dict, errors_dict

In [238]:
ckpt = "lstm-att/version_15-02-2022--02-36-35/checkpoints/best_model.ckpt"
correct, errors = sample_predictions(10, ckpt)

02/15/2022 02:37:55 - INFO - __main__ -   Initilazing BaseModel
02/15/2022 02:37:55 - INFO - __main__ -   Load glove pretrained word embeddings
02/15/2022 02:37:57 - INFO - __main__ -   Loading test data and labels from ./data/sst2.test
  cpuset_checked))


In [239]:
datapath = os.path.join(DATA_DIR, "sst2.test")
data = open(datapath).readlines()
data = [d.strip().split(" ", maxsplit=1) for d in data]
for idx, weights in errors.items():
  print(data[idx])
  visualize_attention_weights(tokens, att_weights)

['0', 'gangs of new york is an unapologetic mess , whose only saving grace is that it ends by blowing just about everything up .']


['1', 'the movie exists for its soccer action and its fine acting .']


['1', 'jason x has cheesy effects and a hoary plot , but its macabre , self-deprecating sense of humor makes up for a lot .']


['0', 'oft-described as the antidote to american pie-type sex comedies , it actually has a bundle in common with them , as the film diffuses every opportunity for a breakthrough']


['0', 'those who managed to avoid the deconstructionist theorizing of french philosopher jacques derrida in college can now take an 85-minute brush-up course with the documentary derrida .']


['0', 'but what saves lives on the freeway does not necessarily make for persuasive viewing .']


['1', "steve irwin 's method is ernest hemmingway at accelerated speed and volume ."]


['1', "it 's a pleasure to see seinfeld griping about the biz with buddies chris rock , garry shandling and colin quinn ."]


['1', 'haneke challenges us to confront the reality of sexual aberration .']


['0', 'the thrill is -lrb- long -rrb- gone .']


['1', 'much monkeyfun for all .']


['1', "for the first time in several years , mr. allen has surpassed himself with the magic he 's spun with the hollywood empress of ms. leoni 's ellie ."]


['1', "it represents better-than-average movie-making that does n't demand a dumb , distracted audience ."]


['0', "it may be an easy swipe to take , but this barbershop just does n't make the cut ."]


In [206]:
tokens = data[idx][1].split()
visualize_attention_weights(tokens, weights)

In [240]:
np.where(input['masks'][0]>0)

(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10]),)

In [233]:
input['masks'][0]

tensor([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [241]:
att_weights[0]

array([[0.03673287],
       [0.01778186],
       [0.01811536],
       [0.02769268],
       [0.03694467],
       [0.01897873],
       [0.01764604],
       [0.01257802],
       [0.01774774],
       [0.02374153],
       [0.03052772],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.03125   ],
       [0.03125   ],
       [0.03125   ],
       [0.03125   ],
       [0.03125   ],
       [0.03125   ],
       [0.031

In [230]:

model = LSTM_PL.load_from_checkpoint(ckpt)
test_dataloader = model.test_dataloader()

correct_dict = {}
errors_dict = {}
idx = 0
for i, batch in enumerate(test_dataloader):
    input = model.batch2input(batch)
    labels = input['labels'].view(-1)
    offset = i*len(labels)

    loss, pred, att_weights = model(**input)
    att_weights = att_weights.detach().numpy()

    correct = np.where(labels==pred.cpu())[0]
    errors = np.where(labels!=pred.cpu())[0]
    correct_dict.update({idx+offset: att_weights[idx] for idx in correct})
    errors_dict.update({idx+offset: att_weights[idx] for idx in errors})
    break

02/15/2022 02:34:41 - INFO - __main__ -   Initilazing BaseModel
02/15/2022 02:34:41 - INFO - __main__ -   Load glove pretrained word embeddings
02/15/2022 02:34:43 - INFO - __main__ -   Loading test data and labels from ./data/sst2.test
  cpuset_checked))


# BERT 
To reduce the computation, we use `distill BERT` which has much less parameters (66m) than `BERT base` model (~100m) https://github.com/huggingface/transformers/tree/master/examples/distillation .

In [None]:
!pip install transformers=="4.2.2"

In [None]:
from transformers import AutoModelForSequenceClassification, AutoConfig, AutoTokenizer

class BERTSST2Dataset(Dataset):
    """
    Using dataset to process input text on-the-fly
    """
    def __init__(self, tokenizer, data):
        self.data = data
        self.tokenizer = tokenizer
        self.max_len = 50 # assigned based on length analysis of training set

    def __getitem__(self, index):
        note = []
        label, text = int(self.data[index][0]), self.data[index][1]
        return text, label

    def collate_fn(self, batch_data):
        texts, labels = list(zip(*batch_data))
        # print(text)
        encodings = self.tokenizer(list(texts), padding=True, truncation=True, max_length=self.max_len, return_tensors= 'pt')
        return (
                encodings['input_ids'],
                encodings['attention_mask'],
                torch.LongTensor(labels).view(-1,1)
               )

    def __len__(self):
        return len(self.data)

class BERT_PL(BaseModel):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.tokenizer = AutoTokenizer.from_pretrained(self.hparams.model_name)
        
    def _load_model(self):
        model_config = AutoConfig.from_pretrained(
            self.hparams.model_name,
            num_labels=2,
        )
        return AutoModelForSequenceClassification.from_pretrained(self.hparams.model_name, config=model_config)

    def forward(self, **args):
        outputs = self.model(**args)
        loss, logits = outputs[0], outputs[1]
        predicted_labels = torch.argmax(logits, dim=1)
        return loss, predicted_labels, []

    def get_dataloader(self, type_path, batch_size, shuffle=False):
        # todo add dataset path
        datapath = os.path.join(self.hparams.data_dir, f"sst2.{type_path}")
        data = open(datapath).readlines()
        data = [d.strip().split(" ", maxsplit=1) for d in data] # list of [label, text] pair
        dataset = BERTSST2Dataset(self.tokenizer, data)

        logger.info(f"Loading {type_path} data and labels from {datapath}")
        data_loader = torch.utils.data.DataLoader(
            dataset=dataset,
            batch_size=batch_size,
            shuffle=shuffle,
            num_workers=self.hparams.num_workers,
            collate_fn=dataset.collate_fn
        )
        
        return data_loader    

    def configure_optimizers(self):
        """Prepare optimizer and schedule (linear warmup and decay)"""
        model = self.model
        optimizer = Adam(model.parameters(), lr=self.hparams.learning_rate)
        self.opt = optimizer
        return [optimizer]
    
    def batch2input(self, batch):
        return {"input_ids": batch[0], "labels": batch[2], "attention_mask": batch[1]}

    @staticmethod
    def add_model_specific_args(parser, root_dir):
        parser.add_argument(
            "--model_name",
            default=None,
            type=str,
            required=True,
            help="Pretrained tokenizer name or path",
        )
        parser.add_argument(
            "--optimizer",
            default="adam",
            type=str,
            required=True,
            help="Whether to use SGD or not",
        )
        return parser

In [None]:
import logging
logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO
    )
import time
import argparse
import glob
import os
logger = logging.getLogger(__name__)

def main():
    ########################################################
    ## TODO: change args if needed according to your files #
    ########################################################
    mock_args = f"--data_dir {DATA_DIR} --output_dir bert --optimizer adam \
    --model_name distilbert-base-uncased --learning_rate 0.00005 --max_epochs 3 --do_predict" # change model_name here

    # load hyperparameters
    parser = argparse.ArgumentParser()
    BaseModel.add_generic_args(parser, os.getcwd())
    parser = BERT_PL.add_model_specific_args(parser, os.getcwd())
    args = parser.parse_args(mock_args.split())
    print(args)
    # fix random seed to make sure the result is reproducible
    pl.seed_everything(args.seed)

    # If output_dir not provided, a folder will be generated in pwd
    if args.output_dir is None:
        args.output_dir = os.path.join(
            "./results",
            f"{args.task}_{time.strftime('%Y%m%d_%H%M%S')}",
        )
        os.makedirs(args.output_dir)
    dict_args = vars(args)
    model = BERT_PL(**dict_args)
    trainer = generic_train(model, args)


if __name__ == "__main__":
    main()

In [None]:
%reload_ext  tensorboard
%tensorboard --logdir bert/