In [1]:
! nvidia-smi

Mon Apr 18 12:53:12 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.63.01    Driver Version: 470.63.01    CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:61:00.0 Off |                    0 |
| N/A   42C    P0    32W / 250W |   4236MiB / 12198MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  Tesla P100-PCIE...  Off  | 00000000:DB:00.0 Off |                    0 |
| N/A   51C    P0    56W / 250W |  15971MiB / 16280MiB |    100%      Default |
|       

In [5]:
import os, re, math, copy, time, sys
import torch
from torch.utils.data import Dataset, DataLoader
from torch import nn
from torch.autograd import Variable
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import tqdm

from transformers import BertForPreTraining, BertModel, BertConfig, BertTokenizer
from transformers import BertTokenizer, BertForSequenceClassification
from tokenizers import ByteLevelBPETokenizer

In [6]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import random
import wandb

In [7]:
wandb.init(project='design-lab', entity='jitaishik')

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mjitaishik[0m (use `wandb login --relogin` to force relogin)
[34m[1mwandb[0m: wandb version 0.12.14 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


In [8]:
%env CUDA_VISIBLE_DEVICES=0

env: CUDA_VISIBLE_DEVICES=0


In [9]:
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

In [10]:
MAX_LEN=128

In [11]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [12]:
import torch
import torch.nn as nn
from transformers import BertModel

class BertClassifier(nn.Module):

    def __init__(self, freeze_bert=False):

        super(BertClassifier, self).__init__()

        self.PAD = tokenizer.pad_token_id

        self.bert = BertForSequenceClassification.from_pretrained('bert-base-uncased',num_labels=3)

        
    def forward(self, C, c_labels):

        output = self.bert(
            input_ids = C, 
            attention_mask = (C!=self.PAD),
            labels = c_labels
        )   
        

        return output

In [13]:
from datasets import load_dataset

dataset = load_dataset("multi_nli")

Using custom data configuration default
Reusing dataset multi_nli (/home/aishik-pg/.cache/huggingface/datasets/multi_nli/default/0.0.0/591f72eb6263d1ab527561777936b199b714cda156d35716881158a2bd144f39)


In [14]:
dataset

DatasetDict({
    train: Dataset({
        features: ['promptID', 'pairID', 'premise', 'premise_binary_parse', 'premise_parse', 'hypothesis', 'hypothesis_binary_parse', 'hypothesis_parse', 'genre', 'label'],
        num_rows: 392702
    })
    validation_matched: Dataset({
        features: ['promptID', 'pairID', 'premise', 'premise_binary_parse', 'premise_parse', 'hypothesis', 'hypothesis_binary_parse', 'hypothesis_parse', 'genre', 'label'],
        num_rows: 9815
    })
    validation_mismatched: Dataset({
        features: ['promptID', 'pairID', 'premise', 'premise_binary_parse', 'premise_parse', 'hypothesis', 'hypothesis_binary_parse', 'hypothesis_parse', 'genre', 'label'],
        num_rows: 9832
    })
})

In [15]:
len(dataset['train'])

392702

In [16]:
text = []
tag = []

In [17]:
premise = dataset['train']['premise']
hypo = dataset['train']['hypothesis']
label = dataset['train']['label']
length = len(dataset['train'])

In [18]:
for i in tqdm.auto.tqdm(range(length)):
  if label[i]>=0:
    text.append("[CLS]" + premise[i] + "[SEP]" + hypo[i] + "[SEP]")
    tag.append(label[i])

HBox(children=(FloatProgress(value=0.0, max=392702.0), HTML(value='')))




In [19]:
class multi_nli():
      
  def __init__(self, split, max_len):
    super(multi_nli, self).__init__()
    text = []
    tag = []
    if split=='train':

      premise = dataset['train']['premise']
      hypo = dataset['train']['hypothesis']
      label = dataset['train']['label']
      length = len(dataset['train'])

      for i in tqdm.auto.tqdm(range(length)):
        if label[i]>=0:
          text.append("[CLS]" + premise[i] + "[SEP]" + hypo[i] + "[SEP]")
          tag.append(label[i])

    if split=='val':

      premise = dataset['validation_matched']['premise']
      hypo = dataset['validation_matched']['hypothesis']
      label = dataset['validation_matched']['label']
      length = len(dataset['validation_matched'])

      for i in tqdm.auto.tqdm(range(length)):
        if label[i]>=0:
          text.append("[CLS]" + premise[i] + "[SEP]" + hypo[i] + "[SEP]")
          tag.append(label[i])
      
    self.data=[]
                
    data_u=[]
    data_a=[]

    temploader = DataLoader(list(zip(text, tag)), batch_size=256)
    for utts, acts in tqdm.auto.tqdm(temploader):
        enc_utts = tokenizer(list(utts), padding='max_length', add_special_tokens=False)['input_ids']
        data_u.extend(enc_utts)
        data_a.extend(acts.tolist())

    for u, a in zip(data_u, data_a):
        self.data.append([torch.tensor(u[:max_len]), a])

  def __getitem__(self, index):
    return self.data[index]
  def __len__(self):
    return len(self.data)

In [20]:
bert_classifier = BertClassifier(freeze_bert=True)
pytorch_total_params = sum(p.numel() for p in bert_classifier.parameters() if p.requires_grad)
print(f"## Training model with {pytorch_total_params/1000000:0.2F}M trainable parameters.")
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"# Using device: {device}")
bert_classifier.to(device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

## Training model with 109.48M trainable parameters.
# Using device: cuda


BertClassifier(
  (bert): BertForSequenceClassification(
    (bert): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(30522, 768, padding_idx=0)
        (position_embeddings): Embedding(512, 768)
        (token_type_embeddings): Embedding(2, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0): BertLayer(
            (attention): BertAttention(
              (self): BertSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): BertSelfOutput(
                (dense): Linear(in_features=768, out_features=768, bias=Tru

In [21]:
data_train = multi_nli('train', MAX_LEN)

HBox(children=(FloatProgress(value=0.0, max=392702.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1534.0), HTML(value='')))




In [22]:
data_val = multi_nli('val',MAX_LEN)

HBox(children=(FloatProgress(value=0.0, max=9815.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=39.0), HTML(value='')))




In [23]:
train_dataloader = DataLoader(data_train,  batch_size=32)
val_dataloader = DataLoader(data_val,  batch_size=32)

In [24]:
from transformers import AdamW, get_linear_schedule_with_warmup
from torch.optim import Adam
num_epochs = 3
import torch.optim
# Create the optimizer
optimizer = torch.optim.Adam(bert_classifier.parameters(),
                  lr=2e-5,    # Default learning rate
                  eps=1e-8,   # Default epsilon value
                  )

# Total number of training steps
total_steps = len(train_dataloader) * num_epochs

# Set up the learning rate scheduler
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=0, # Default value
                                            num_training_steps=total_steps)

In [25]:
import random
import time

# Specify loss function
# loss_fn = nn.CrossEntropyLoss()

def set_seed(seed_value=42):
    """Set seed for reproducibility.
    """
    random.seed(seed_value)
    np.random.seed(seed_value)
    torch.manual_seed(seed_value)
    torch.cuda.manual_seed_all(seed_value)

In [26]:
def train(model, train_dataloader, val_dataloader=None, epochs=10, evaluation=False):
    """Train the BertClassifier model.
    """
    # Start training loop
    print("Start training...\n")
    for epoch_i in range(epochs):
        # =======================================
        #               Training
        # =======================================
        # Print the header of the result table
        print(f"{'Epoch':^7} | {'Batch':^7} | {'Train Loss':^12} | {'Val Loss':^10} | {'Val Acc':^9} | {'Elapsed':^9}")
        print("-"*70)

        # Measure the elapsed time of each epoch
        t0_epoch, t0_batch = time.time(), time.time()

        # Reset tracking variables at the beginning of each epoch
        total_loss, batch_loss, batch_counts = 0, 0, 0
        best_acc = 0

        # Put the model into the training mode
        model.train()

        # For each batch of training data...
        for step, batch in enumerate(train_dataloader):
            batch_counts +=1
            # Load batch to GPU
            c, c_labels = tuple(t.to(device) for t in batch)

            # Zero out any previously calculated gradients
            model.zero_grad()

            # Perform a forward pass. This will return logits.
            outputs = model.forward(c, c_labels)
            
            loss, logits = outputs[:2]

            # Compute loss and accumulate the loss values
#             loss = loss_fn(logits, c_labels)
            batch_loss += loss.item()
            total_loss += loss.item()

            wandb.log({
                "step_training_loss": loss.item(), "train_step": step
            })

            # Perform a backward pass to calculate gradients
            loss.backward()

            # Clip the norm of the gradients to 1.0 to prevent "exploding gradients"
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            # Update parameters and the learning rate
            optimizer.step()
            scheduler.step()

            # Print the loss values and time elapsed for every 20 batches
            if (step % 1000 == 0 and step != 0) or (step == len(train_dataloader) - 1):
                # Calculate time elapsed for 20 batches
                time_elapsed = time.time() - t0_batch

                # Print training results
                print(f"{epoch_i + 1:^7} | {step:^7} | {batch_loss / batch_counts:^12.6f} | {'-':^10} | {'-':^9} | {time_elapsed:^9.2f}")

                # Reset batch tracking variables
                batch_loss, batch_counts = 0, 0
                t0_batch = time.time()

        # Calculate the average loss over the entire training data
        avg_train_loss = total_loss / len(train_dataloader)
        wandb.log({
                "batch_train_loss": avg_train_loss, "epoch": epoch_i
            })

        print("-"*70)
        # =======================================
        #               Evaluation
        # =======================================
        if evaluation == True:
            # After the completion of each training epoch, measure the model's performance
            # on our validation set.
            val_loss, val_accuracy = evaluate(model, val_dataloader)
            wandb.log({
                "val_loss": val_loss, "val_accuracy": val_accuracy
            })
            if(val_accuracy > best_acc):
                best_acc = val_accuracy
                torch.save(model.state_dict(), 'bert-parameters.pt')

            # Print performance over the entire training data
            time_elapsed = time.time() - t0_epoch
            
            print(f"{epoch_i + 1:^7} | {'-':^7} | {avg_train_loss:^12.6f} | {val_loss:^10.6f} | {val_accuracy:^9.2f} | {time_elapsed:^9.2f}")
            print("-"*70)
        print("\n")
    
    print("Training complete!")


def evaluate(model, val_dataloader):
    """After the completion of each training epoch, measure the model's performance
    on our validation set.
    """
    # Put the model into the evaluation mode. The dropout layers are disabled during
    # the test time.
    model.eval()

    # Tracking variables
    val_accuracy = []
    val_loss = []

    # For each batch in our validation set...
    for batch in val_dataloader:
        # Load batch to GPU
        c, c_labels = tuple(t.to(device) for t in batch)

        # Compute logits
        with torch.no_grad():
            loss, logits = model.forward(c,c_labels)[:2]

        # Compute loss
#         loss = loss_fn(logits, c_labels)
        val_loss.append(loss.item())

        # Get the predictions
        preds = torch.argmax(logits, dim=1).flatten()

        # Calculate the accuracy rate
        accuracy = (preds == c_labels).cpu().numpy().mean() * 100
        val_accuracy.append(accuracy)

    # Compute the average accuracy and loss over the validation set.
    val_loss = np.mean(val_loss)
    val_accuracy = np.mean(val_accuracy)

    return val_loss, val_accuracy

In [27]:
set_seed(42)    # Set seed for reproducibility
train(bert_classifier, train_dataloader, val_dataloader, epochs=num_epochs, evaluation=True)

Start training...

 Epoch  |  Batch  |  Train Loss  |  Val Loss  |  Val Acc  |  Elapsed 
----------------------------------------------------------------------
   1    |   100   |   1.051940   |     -      |     -     |   49.03  
   1    |   200   |   0.889283   |     -      |     -     |   48.44  
   1    |   300   |   0.823125   |     -      |     -     |   48.66  
   1    |   400   |   0.753662   |     -      |     -     |   48.45  
   1    |   500   |   0.758669   |     -      |     -     |   48.08  
   1    |   600   |   0.720206   |     -      |     -     |   47.79  
   1    |   700   |   0.707333   |     -      |     -     |   48.38  
   1    |   800   |   0.668656   |     -      |     -     |   48.31  
   1    |   900   |   0.666253   |     -      |     -     |   48.40  
   1    |  1000   |   0.651586   |     -      |     -     |   48.34  
   1    |  1100   |   0.630928   |     -      |     -     |   48.34  
   1    |  1200   |   0.652890   |     -      |     -     |   48.74  


[34m[1mwandb[0m: Network error resolved after 0:02:09.683049, resuming normal operation.


   1    |  2500   |   0.551663   |     -      |     -     |   46.57  
   1    |  2600   |   0.566162   |     -      |     -     |   46.53  
   1    |  2700   |   0.581768   |     -      |     -     |   46.50  
   1    |  2800   |   0.572137   |     -      |     -     |   46.53  
   1    |  2900   |   0.542776   |     -      |     -     |   46.55  
   1    |  3000   |   0.564081   |     -      |     -     |   46.61  
   1    |  3100   |   0.534022   |     -      |     -     |   46.58  
   1    |  3200   |   0.558082   |     -      |     -     |   46.48  
   1    |  3300   |   0.534841   |     -      |     -     |   46.46  
   1    |  3400   |   0.562220   |     -      |     -     |   46.47  
   1    |  3500   |   0.536603   |     -      |     -     |   46.55  
   1    |  3600   |   0.539281   |     -      |     -     |   46.47  
   1    |  3700   |   0.539415   |     -      |     -     |   46.46  
   1    |  3800   |   0.548842   |     -      |     -     |   46.55  
   1    |  3900   | 