<a href="https://colab.research.google.com/github/hamidrezayaghobi/Learning-How-to-Mask-Text-Input-for-Better-Generalization/blob/main/masking_text.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Initialization

In [None]:
# Mount Google Drive at /content/drive so later cells can reach files stored on Drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Switch the working directory to the project folder on Drive; later cells use relative paths.
cd /content/drive/MyDrive/Colab\ Notebooks/Lab/

/content/drive/MyDrive/Colab Notebooks/Lab


## Imports

In [None]:
# Install Hugging Face Transformers into the runtime (the recorded output shows 4.30.2).
# NOTE(review): the version is not pinned, so a fresh run may pull a different release — consider pinning.
pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.30.2-py3-none-any.whl (7.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m67.9 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.15.1-py3-none-any.whl (236 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m236.8/236.8 kB[0m [31m27.7 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m125.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90

In [None]:
import numpy as np
import torch
import torch.nn as nn
import pandas as pd

from torch.utils.data import Dataset, DataLoader
from tqdm.notebook import tqdm
from transformers import BertTokenizer, BertModel, BertForSequenceClassification
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

In [None]:
# Scratch cell: builds an nn.Sigmoid module only to display its repr; the result is not stored.
nn.Sigmoid()

Sigmoid()

## Config

In [None]:
#MODEL
# Checkpoint name used for both the tokenizer and the BERT encoder.
PRE_TRAINED_MODEL_NAME = 'bert-base-uncased'

#DATASET
# Maximum number of tokens per example; also reused as D_MODEL for the mask predictor.
MAX_LENGTH = 64
# NOTE(review): the models in this notebook are built with num_labels=2 and this constant is
# not referenced in the visible cells — confirm whether 4 is a leftover from another dataset.
NUM_LABELS = 4

#TRAINING
# Default values of fine_tune()'s num_epochs / batch_size / lr parameters.
NUM_EPOCHS = 3
BATCH_SIZE = 64
LEARNING_RATE = 0.0001

In [None]:
# Run on the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Data Preparation

## Data Loading

In [None]:
# train_df = pd.read_csv('train.csv')
# test_df = pd.read_csv('test.csv')

# train_texts, train_labels = train_df['Description'].tolist(), (train_df['Class Index'] - 1).tolist()
# test_texts, test_labels = test_df['Description'].tolist(), (test_df['Class Index'] - 1).tolist()

In [None]:
# Read the IMDB train/test CSV files (paths are relative to the current working directory).
train_df = pd.read_csv('./imdb_dataset/train_imdb.csv')
test_df = pd.read_csv('./imdb_dataset/test_imdb.csv')

# Plain Python lists: review texts feed the tokenizer, sentiment values become the labels.
train_texts = train_df['review'].tolist()
train_labels = train_df['sentiment'].tolist()
test_texts = test_df['review'].tolist()
test_labels = test_df['sentiment'].tolist()

## Tokenize

In [None]:
# Tokenizer matching the pre-trained checkpoint.
tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME, do_lower_case=True)

# Pad/truncate every example to exactly MAX_LENGTH tokens. The mask predictor's first Linear
# layer has in_features = D_MODEL = MAX_LENGTH, so the width must not depend on the longest
# text in the split (which is what padding=True would pad to).
train_encodings = tokenizer(train_texts, truncation=True, padding='max_length', max_length=MAX_LENGTH)
test_encodings = tokenizer(test_texts, truncation=True, padding='max_length', max_length=MAX_LENGTH)

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

## Custom Dataset

In [None]:
class AGNewsDataset(torch.utils.data.Dataset):
    """Map-style dataset that pairs tokenizer encodings with integer labels.

    Args:
        encodings: Mapping from field name to a per-example sequence; it must provide
            ``.items()`` (the tokenizer output is used for this in the notebook).
        labels: Sequence of per-example labels; each one is cast with ``int``.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # The dataset size is defined by the number of labels.
        return len(self.labels)

    def __getitem__(self, idx):
        # Convert every encoded field of example `idx` into a tensor.
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(int(self.labels[idx]))
        return sample

# Wrap the tokenized train/test splits so a DataLoader can batch them.
train_dataset = AGNewsDataset(train_encodings, train_labels)
test_dataset = AGNewsDataset(test_encodings, test_labels)

# Model

In [None]:
class Bert(nn.Module):
    """Pre-trained BERT encoder followed by dropout and a linear classification head.

    Args:
        num_labels: Number of output classes of the linear head.
        tune_only_last_layer: When True, every parameter of the wrapped BertModel is frozen,
            so only `last_layer_classifier` remains trainable.
    """

    def __init__(self, num_labels, tune_only_last_layer=True):
        super(Bert, self).__init__()

        #Pre Trained Bert
        self.bert_model = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)

        #Freezing Layers
        if tune_only_last_layer:
            # The classification head lives outside `self.bert_model`, and BertModel has no
            # parameter whose name contains 'classifier', so the whole encoder is frozen.
            for param in self.bert_model.parameters():
                param.requires_grad = False

        self.num_labels = num_labels

        #Classification Layer
        self.dropout = nn.Dropout(0.2)
        self.last_layer_classifier = nn.Linear(self.bert_model.config.hidden_size, self.num_labels)

    def forward(self, input_ids, attention_mask=None, token_type_ids=None,
                position_ids=None, head_mask=None, inputs_embeds=None,
                labels=None):
        """Classify a batch.

        Returns:
            ``{'logits': ...}``, plus a ``'loss'`` entry (cross-entropy) when `labels` is given.
        """
        outputs = self.bert_model(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids,
                                  position_ids=position_ids, head_mask=head_mask, inputs_embeds=inputs_embeds)

        # outputs[1] is the second element returned by the encoder (BertModel's pooled output).
        output = self.dropout(outputs[1])
        logits = self.last_layer_classifier(output)

        if labels is not None:
            loss_fn = nn.CrossEntropyLoss()
            loss = loss_fn(logits.view(-1, self.num_labels), labels.view(-1))
            return {'loss': loss, 'logits': logits}
        else:
            return {'logits': logits}

In [None]:
class TransformerRationalePredictor(torch.nn.Module):
  """Linear -> stack of Transformer encoder layers -> Linear, all in float64.

  The input's last dimension must be `d_model`; it is projected to `d_model * num_heads`,
  passed through `num_layers` encoder layers and projected back to `d_model`.

  Args:
      num_layers: Number of encoder layers (each with its own weights).
      d_model: Size of the input/output feature dimension.
      num_heads: Attention heads per layer; also the width multiplier of the hidden size.
      dff: Feed-forward width inside each encoder layer.
      dropout_rate: Dropout used inside each encoder layer.
  """
  def __init__(self, num_layers, d_model, num_heads,
               dff, dropout_rate=0.1):
    super().__init__()

    self.num_layers = num_layers
    self.d_model = d_model
    self.num_heads = num_heads
    self.dff = dff

    hidden_size = self.d_model * self.num_heads

    self.linear = torch.nn.Linear(self.d_model, hidden_size, dtype=torch.float64)
    # Not used in forward(); kept so the module's parameters/buffers stay the same as before.
    self.norm = torch.nn.BatchNorm1d(hidden_size)

    #TODO: ADD NUM_HEADS
    # Build a fresh layer per position. Repeating one layer object with `[layer] * n`
    # would make every position share the same weights.
    self.enc_layers = torch.nn.Sequential(
        *[nn.TransformerEncoderLayer(d_model=hidden_size,
                                     dtype=torch.float64,
                                     nhead=self.num_heads,
                                     dim_feedforward=self.dff,
                                     dropout=dropout_rate,
                                     batch_first=True)
          for _ in range(num_layers)]
        )

    self.linear2 = torch.nn.Linear(hidden_size, self.d_model, dtype=torch.float64)
    # Not used in forward() (the activation there is commented out).
    self.relu = torch.nn.ReLU()


  def forward(self, x):
    '''
    inputs:
            x : [batch_size, num_tokens, d_model]
    NOTE(review): MyBert passes a 2-D float64 tensor of input ids here — confirm the
    intended layout.
    '''
    x = self.linear(x)
    # May be batch normalization ?!
    x = self.enc_layers(x)
    x = self.linear2(x)
    # x = self.relu(x)
    return x  # Same shape as the input; last dimension is d_model

In [None]:
class MyBert(nn.Module):
    """Classifier wrapper that learns a hard top-K token mask applied to the attention mask.

    Args:
        bert_model: Module called as ``bert_model(input_ids, attention_mask=..., labels=...)``
            that returns a dict with a 'logits' entry.
        num_labels: Number of classes, used to reshape the logits for the loss.
        verbose: When True (default), forward() prints the first example's input ids and its
            attention mask before and after masking.
    """
    def __init__(self, bert_model, num_labels=4, verbose=True):
        super(MyBert, self).__init__()

        self.num_labels = num_labels
        self.verbose = verbose

        #Base Model
        self.bert_model = bert_model

        #Transformer Attention
        self.attention_mask_predictor = TransformerRationalePredictor(
            num_layers=NUM_LAYERS,
            d_model=D_MODEL,
            num_heads=NUM_HEADs,
            dff=DFF,
            )
        self.attention_mask_predictor.to(DEVICE)

    def get_mask(self, predicted_attention_mask):
        """Turn predictor scores into a 0/1 mask with K ones per row.

        The forward value is the hard mask; the gradient is that of the softmax scores
        (straight-through trick: ``(mask - z)`` is computed without gradient and added to ``z``).
        """
        z = torch.nn.functional.softmax(predicted_attention_mask, -1)
        indices = torch.topk(z, k=K).indices
        # Allocate on the same device as the scores instead of relying on the global DEVICE.
        mask = torch.zeros(z.shape[0], z.shape[1], device=z.device)
        mask.scatter_(1, indices, 1.)
        with torch.no_grad():
            neg = mask - z
        return neg + z

    def cal_continuity_loss(self, z):
        """Mean absolute difference between neighbouring positions along dimension 1."""
        return torch.mean(torch.abs(z[:, 1:] - z[:, :-1]))

    def forward(self, input_ids, attention_mask=None, token_type_ids=None,
                position_ids=None, head_mask=None, inputs_embeds=None,
                labels=None):
        """Predict a token mask, apply it to `attention_mask` and run the wrapped classifier.

        Returns:
            ``{'logits': ...}``, plus ``'loss'`` (cross-entropy + continuity loss of the mask)
            when `labels` is given.
        """
        # The signature allows attention_mask=None; treat that as "attend to every position"
        # instead of failing on the indexing/multiplication below.
        if attention_mask is None:
            attention_mask = torch.ones_like(input_ids)

        if self.verbose:
            print("----INPUT----")
            print(input_ids[0])
            print('--PRE-MASK---')
            print(attention_mask[0])

        # The predictor works in float64 and receives the raw token ids as features.
        predicted_attention_mask = (
            self.attention_mask_predictor(input_ids.to(torch.float64))
        )
        mask = self.get_mask(predicted_attention_mask)
        mask_loss = self.cal_continuity_loss(mask)
        attention_mask = attention_mask * mask

        if self.verbose:
            print('--NEW-MASK---')
            print(attention_mask[0])

        outputs = self.bert_model(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids,
                                  position_ids=position_ids, head_mask=head_mask, inputs_embeds=inputs_embeds,
                                  labels=labels)

        logits = outputs['logits']

        if labels is not None:
            loss_fn = nn.CrossEntropyLoss()
            loss = (
                loss_fn(logits.view(-1, self.num_labels), labels.view(-1))
                # / len(labels)
                + mask_loss
             )
            return {'loss': loss, 'logits': logits}
        else:
            return {'logits': logits}

# Hyper-parameters of the mask predictor; MyBert reads these module-level names.
NUM_LAYERS = 1
# MyBert feeds rows of input_ids to the predictor, so its feature width is the sequence length.
D_MODEL = MAX_LENGTH
NUM_HEADs = 1
DFF = 256
# Number of positions per row set to 1 by the top-k step in MyBert.get_mask.
K = 50

# Fine Tune Last Layer (Classification Layer)

In [None]:
def fine_tune(model, num_epochs=NUM_EPOCHS, lr=LEARNING_RATE,
              batch_size=BATCH_SIZE, train_dataset=train_dataset, test_dataset=test_dataset):
    """Train `model` with AdamW and evaluate it on the test set after every epoch.

    Args:
        model: Module called as ``model(input_ids, attention_mask=..., labels=...)`` that
            returns a dict with 'loss' and 'logits'.
        num_epochs: Number of passes over `train_dataset`.
        lr: AdamW learning rate.
        batch_size: Batch size used for both data loaders.
        train_dataset: Dataset whose items provide 'input_ids', 'attention_mask', 'labels'.
        test_dataset: Same item layout as `train_dataset`; used for the per-epoch evaluation.

    Returns:
        The trained model. If a NaN loss is encountered, training stops immediately and the
        offending ``(input_ids, attention_mask, labels)`` batch is returned instead (debug aid).
    """
    # Define data loader
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    # Evaluation does not need shuffling; a fixed order keeps it deterministic.
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    # Define the optimizer and the loss function
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr)

    # Define the training loop
    for epoch in range(num_epochs):
        # The evaluation below switches to eval mode; switch back at the start of every
        # epoch, otherwise all epochs after the first would train with dropout disabled.
        model.train()

        train_loss = 0
        train_acc = 0
        pbar = tqdm(train_loader)
        for batch_idx, batch in enumerate(pbar):
            optimizer.zero_grad()
            input_ids = batch['input_ids'].to(DEVICE)
            attention_mask = batch['attention_mask'].to(DEVICE)
            labels = batch['labels'].to(DEVICE)

            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs['loss']
            logits = outputs['logits']

            # Debug aid: hand the failing batch back to the caller for inspection.
            if torch.isnan(loss):
                print("BBBBBBBBUUUUUUUUUUUGGGGGGG")
                return input_ids, attention_mask, labels

            preds = torch.argmax(logits, dim=1)

            loss.backward()
            optimizer.step()
            train_loss += loss.item()
            train_acc += accuracy_score(labels.cpu().numpy(), preds.cpu().numpy())
            avg_loss = train_loss / (batch_idx + 1)
            avg_acc = train_acc / (batch_idx + 1)

            pbar.set_description(f"AvgTrainLoss: {avg_loss:.4f}, AvgTrainAcc: {avg_acc:.4f}")

        # Running means over batches (same quantities as shown in the progress bar).
        train_loss /= len(train_loader)
        train_acc /= len(train_loader)

        model.eval()
        test_loss = 0
        eval_preds = []
        eval_labels = []

        with torch.no_grad():
            for batch in tqdm(test_loader):
                input_ids = batch['input_ids'].to(DEVICE)
                attention_mask = batch['attention_mask'].to(DEVICE)
                labels = batch['labels'].to(DEVICE)

                outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
                test_loss += outputs['loss'].item()

                preds = torch.argmax(outputs['logits'], dim=1)
                eval_preds.extend(preds.cpu().numpy())
                eval_labels.extend(labels.cpu().numpy())

        test_loss /= len(test_loader)
        # Accuracy over all test examples; averaging per-batch accuracies would over-weight
        # a smaller final batch.
        test_acc = accuracy_score(eval_labels, eval_preds)
        precision, recall, f1, _ = precision_recall_fscore_support(eval_labels, eval_preds, average='macro')

        print(f'Epoch {epoch + 1}: train_loss={train_loss:.4f}, train_acc={train_acc:.4f}, test_loss={test_loss:.4f}, test_acc={test_acc:.4f}, precision={precision:.4f}, recall={recall:.4f}, f1={f1:.4f}')

    return model


In [None]:
# Hyper-parameters for each training run. The keys match fine_tune's parameter names and
# are listed in the same order as its parameters.
args = {
    # Settings labelled for tuning the BERT model.
    'bert_tuning': {
        'num_epochs': 5,
        'lr': 0.0001,
        'batch_size': 64,
    },
    # Settings used when only the classification head is trained.
    'bert_last_layer_tuning': {
        'num_epochs': 10,
        'lr': 0.001,
        'batch_size': 2048,
    },
    # Settings used when training MyBert.
    'my_bert_tuning': {
        'num_epochs': 10,
        'lr': 0.001,
        'batch_size': 64,
    }
}

In [None]:
# Train only the classification head on top of the frozen BERT encoder.
bert_model = Bert(num_labels=2, tune_only_last_layer=True)
bert_model = bert_model.to(DEVICE)
# Pass the settings by keyword so they cannot be mis-assigned if the dict's key order changes.
fine_tuned_bert_model = fine_tune(bert_model, **args['bert_last_layer_tuning'])

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/13 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

Epoch 1: train_loss=0.7106, train_acc=0.5014, test_loss=0.6918, test_acc=0.5092, precision=0.5362, recall=0.5099, f1=0.4013


  0%|          | 0/13 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

Epoch 2: train_loss=0.6805, train_acc=0.5649, test_loss=0.6727, test_acc=0.6175, precision=0.6215, recall=0.6179, f1=0.6151


  0%|          | 0/13 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

Epoch 3: train_loss=0.6666, train_acc=0.6259, test_loss=0.6628, test_acc=0.6159, precision=0.6357, recall=0.6174, f1=0.6041


  0%|          | 0/13 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

Epoch 4: train_loss=0.6557, train_acc=0.6339, test_loss=0.6523, test_acc=0.6422, precision=0.6542, recall=0.6416, f1=0.6342


  0%|          | 0/13 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

Epoch 5: train_loss=0.6475, train_acc=0.6489, test_loss=0.6417, test_acc=0.6577, precision=0.6593, recall=0.6577, f1=0.6568


  0%|          | 0/13 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

Epoch 6: train_loss=0.6394, train_acc=0.6615, test_loss=0.6352, test_acc=0.6706, precision=0.6708, recall=0.6700, f1=0.6696


  0%|          | 0/13 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

Epoch 7: train_loss=0.6313, train_acc=0.6715, test_loss=0.6304, test_acc=0.6617, precision=0.6710, recall=0.6624, f1=0.6581


  0%|          | 0/13 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

Epoch 8: train_loss=0.6265, train_acc=0.6695, test_loss=0.6223, test_acc=0.6783, precision=0.6785, recall=0.6780, f1=0.6777


  0%|          | 0/13 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

Epoch 9: train_loss=0.6213, train_acc=0.6759, test_loss=0.6210, test_acc=0.6676, precision=0.6785, recall=0.6670, f1=0.6616


  0%|          | 0/13 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

Epoch 10: train_loss=0.6142, train_acc=0.6832, test_loss=0.6121, test_acc=0.6851, precision=0.6877, recall=0.6853, f1=0.6843


In [None]:
# Save the weights after the first 10-epoch run.
torch.save(fine_tuned_bert_model.state_dict(), 'imdb_fine_tuned_bert_model_epoch=10_lr=0.001_batch_size=2048.pt')

In [None]:
# Continue training the same model for another run; settings are passed by keyword so they
# do not depend on the dict's key order.
fine_tuned_bert_model = fine_tune(fine_tuned_bert_model, **args['bert_last_layer_tuning'])

  0%|          | 0/13 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

Epoch 1: train_loss=0.6263, train_acc=0.6544, test_loss=0.6159, test_acc=0.6752, precision=0.6918, recall=0.6777, f1=0.6716


  0%|          | 0/13 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

Epoch 2: train_loss=0.6113, train_acc=0.6796, test_loss=0.6080, test_acc=0.6892, precision=0.6903, recall=0.6889, f1=0.6884


  0%|          | 0/13 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

Epoch 3: train_loss=0.6068, train_acc=0.6858, test_loss=0.6035, test_acc=0.6917, precision=0.6899, recall=0.6898, f1=0.6898


  0%|          | 0/13 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

Epoch 4: train_loss=0.6034, train_acc=0.6908, test_loss=0.6004, test_acc=0.6929, precision=0.6935, recall=0.6935, f1=0.6935


  0%|          | 0/13 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

Epoch 5: train_loss=0.6009, train_acc=0.6938, test_loss=0.5985, test_acc=0.6906, precision=0.6922, recall=0.6900, f1=0.6891


  0%|          | 0/13 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

Epoch 6: train_loss=0.5991, train_acc=0.6907, test_loss=0.5958, test_acc=0.6991, precision=0.7007, recall=0.6968, f1=0.6954


  0%|          | 0/13 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

Epoch 7: train_loss=0.5951, train_acc=0.6955, test_loss=0.5937, test_acc=0.6986, precision=0.6993, recall=0.6993, f1=0.6993


  0%|          | 0/13 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

Epoch 8: train_loss=0.5936, train_acc=0.6935, test_loss=0.5898, test_acc=0.7031, precision=0.7021, recall=0.7020, f1=0.7020


  0%|          | 0/13 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

Epoch 9: train_loss=0.5901, train_acc=0.6975, test_loss=0.5863, test_acc=0.7035, precision=0.7021, recall=0.7020, f1=0.7020


  0%|          | 0/13 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

Epoch 10: train_loss=0.5870, train_acc=0.6973, test_loss=0.5894, test_acc=0.6975, precision=0.7058, recall=0.6972, f1=0.6940


In [None]:
# Save the weights after the second 10-epoch run (20 epochs in total, per the file name).
torch.save(fine_tuned_bert_model.state_dict(), 'imdb_fine_tuned_bert_model_epoch=20_lr=0.001_batch_size=2048.pt')

In [None]:
# Third training run on the same model; settings are passed by keyword so they do not
# depend on the dict's key order.
fine_tuned_bert_model = fine_tune(fine_tuned_bert_model, **args['bert_last_layer_tuning'])

  0%|          | 0/13 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

Epoch 1: train_loss=0.5943, train_acc=0.6880, test_loss=0.5926, test_acc=0.6884, precision=0.7052, recall=0.6894, f1=0.6834


  0%|          | 0/13 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

Epoch 2: train_loss=0.5894, train_acc=0.6938, test_loss=0.5827, test_acc=0.7066, precision=0.7063, recall=0.7061, f1=0.7061


  0%|          | 0/13 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

Epoch 3: train_loss=0.5849, train_acc=0.7001, test_loss=0.5849, test_acc=0.6992, precision=0.7038, recall=0.7002, f1=0.6989


  0%|          | 0/13 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

Epoch 4: train_loss=0.5819, train_acc=0.7008, test_loss=0.5792, test_acc=0.7098, precision=0.7091, recall=0.7089, f1=0.7088


  0%|          | 0/13 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

Epoch 5: train_loss=0.5793, train_acc=0.7062, test_loss=0.5801, test_acc=0.7057, precision=0.7085, recall=0.7082, f1=0.7081


  0%|          | 0/13 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

Epoch 6: train_loss=0.5793, train_acc=0.7015, test_loss=0.5761, test_acc=0.7120, precision=0.7102, recall=0.7101, f1=0.7101


  0%|          | 0/13 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

Epoch 7: train_loss=0.5771, train_acc=0.7074, test_loss=0.5736, test_acc=0.7116, precision=0.7119, recall=0.7117, f1=0.7117


  0%|          | 0/13 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

Epoch 8: train_loss=0.5741, train_acc=0.7068, test_loss=0.5726, test_acc=0.7129, precision=0.7139, recall=0.7134, f1=0.7132


  0%|          | 0/13 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

Epoch 9: train_loss=0.5746, train_acc=0.7065, test_loss=0.5715, test_acc=0.7133, precision=0.7128, recall=0.7127, f1=0.7127


  0%|          | 0/13 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

Epoch 10: train_loss=0.5731, train_acc=0.7060, test_loss=0.5711, test_acc=0.7162, precision=0.7159, recall=0.7155, f1=0.7154


In [None]:
# Save the weights after the third run; this checkpoint is the one reloaded in later cells.
torch.save(fine_tuned_bert_model.state_dict(), 'imdb_fine_tuned_bert_model_epoch=30_lr=0.001_batch_size=2048.pt')

In [None]:
# Rebuild the classifier and restore the saved weights.
fine_tuned_bert_model = Bert(num_labels=2, tune_only_last_layer=True)
fine_tuned_bert_model = fine_tuned_bert_model.to(DEVICE)
# map_location lets a checkpoint saved on a GPU be restored on whatever DEVICE is in use.
fine_tuned_bert_model.load_state_dict(
    torch.load('imdb_fine_tuned_bert_model_epoch=30_lr=0.001_batch_size=2048.pt', map_location=DEVICE)
)

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


<All keys matched successfully>

In [None]:
# Freeze every parameter of the fine-tuned classifier, so that within MyBert only the
# mask predictor's parameters remain trainable.
fine_tuned_bert_model.requires_grad_(False)

my_bert_model = MyBert(fine_tuned_bert_model, num_labels=2)
my_bert_model = my_bert_model.to(DEVICE)

In [None]:
# Train MyBert; settings are passed by keyword so they do not depend on the dict's key order.
fine_tuned_my_bert_model = fine_tune(my_bert_model, **args['my_bert_tuning'])

  0%|          | 0/391 [00:00<?, ?it/s]

  0%|          | 0/391 [00:00<?, ?it/s]

Epoch 1: train_loss=0.8985, train_acc=0.6909, test_loss=0.8575, test_acc=0.6915, precision=0.6916, recall=0.6916, f1=0.6915


  0%|          | 0/391 [00:00<?, ?it/s]

  0%|          | 0/391 [00:00<?, ?it/s]

Epoch 2: train_loss=0.8567, train_acc=0.6890, test_loss=0.8510, test_acc=0.6893, precision=0.6894, recall=0.6894, f1=0.6894


  0%|          | 0/391 [00:00<?, ?it/s]

  0%|          | 0/391 [00:00<?, ?it/s]

Epoch 3: train_loss=0.8544, train_acc=0.6928, test_loss=0.8587, test_acc=0.6878, precision=0.6878, recall=0.6878, f1=0.6877


  0%|          | 0/391 [00:00<?, ?it/s]

  0%|          | 0/391 [00:00<?, ?it/s]

Epoch 4: train_loss=0.8538, train_acc=0.6927, test_loss=0.8504, test_acc=0.6883, precision=0.6883, recall=0.6883, f1=0.6883


  0%|          | 0/391 [00:00<?, ?it/s]

  0%|          | 0/391 [00:00<?, ?it/s]

Epoch 5: train_loss=0.8516, train_acc=0.6910, test_loss=0.8512, test_acc=0.6910, precision=0.6910, recall=0.6910, f1=0.6910


  0%|          | 0/391 [00:00<?, ?it/s]

  0%|          | 0/391 [00:00<?, ?it/s]

Epoch 6: train_loss=0.8491, train_acc=0.6922, test_loss=0.8507, test_acc=0.6896, precision=0.6895, recall=0.6895, f1=0.6895


  0%|          | 0/391 [00:00<?, ?it/s]

  0%|          | 0/391 [00:00<?, ?it/s]

Epoch 7: train_loss=0.8490, train_acc=0.6927, test_loss=0.8478, test_acc=0.6901, precision=0.6902, recall=0.6902, f1=0.6902


  0%|          | 0/391 [00:00<?, ?it/s]

  0%|          | 0/391 [00:00<?, ?it/s]

Epoch 8: train_loss=0.8495, train_acc=0.6931, test_loss=0.8534, test_acc=0.6895, precision=0.6895, recall=0.6895, f1=0.6895


  0%|          | 0/391 [00:00<?, ?it/s]

  0%|          | 0/391 [00:00<?, ?it/s]

Epoch 9: train_loss=0.8497, train_acc=0.6908, test_loss=0.8509, test_acc=0.6896, precision=0.6896, recall=0.6896, f1=0.6896


  0%|          | 0/391 [00:00<?, ?it/s]

  0%|          | 0/391 [00:00<?, ?it/s]

Epoch 10: train_loss=0.8503, train_acc=0.6926, test_loss=0.8516, test_acc=0.6908, precision=0.6908, recall=0.6908, f1=0.6908


In [None]:
# Save the trained MyBert weights; the Debug section loads this exact path.
# NOTE(review): the file name says lr=0.0001 while args['my_bert_tuning'] sets lr to 0.001 — confirm.
torch.save(fine_tuned_my_bert_model.state_dict(), 'imdb_fine_tuned_my_bert_epoch=10_lr=0.0001_batch_size=64.pt')

In [None]:
# Another MyBert training run; settings are passed by keyword so they do not depend on the
# dict's key order.
fine_tuned_my_bert_model = fine_tune(my_bert_model, **args['my_bert_tuning'])

  0%|          | 0/1875 [00:00<?, ?it/s]

  0%|          | 0/119 [00:00<?, ?it/s]

Epoch 1: train_loss=1.0639, train_acc=0.6771, test_loss=0.9647, test_acc=0.7056, precision=0.7789, recall=0.7057, f1=0.7118


  0%|          | 0/1875 [00:00<?, ?it/s]

  0%|          | 0/119 [00:00<?, ?it/s]

Epoch 2: train_loss=0.9740, train_acc=0.7235, test_loss=0.9774, test_acc=0.7275, precision=0.7886, recall=0.7275, f1=0.7322


  0%|          | 0/1875 [00:00<?, ?it/s]

  0%|          | 0/119 [00:00<?, ?it/s]

Epoch 3: train_loss=0.9650, train_acc=0.7317, test_loss=0.9647, test_acc=0.7246, precision=0.7830, recall=0.7246, f1=0.7293


  0%|          | 0/1875 [00:00<?, ?it/s]

  0%|          | 0/119 [00:00<?, ?it/s]

Epoch 4: train_loss=0.9502, train_acc=0.7392, test_loss=0.9492, test_acc=0.7405, precision=0.7948, recall=0.7404, f1=0.7442


  0%|          | 0/1875 [00:00<?, ?it/s]

  0%|          | 0/119 [00:00<?, ?it/s]

Epoch 5: train_loss=0.9371, train_acc=0.7448, test_loss=0.9418, test_acc=0.7357, precision=0.7917, recall=0.7357, f1=0.7396


In [None]:
# Save the MyBert weights under a second file name.
# NOTE(review): the name encodes epoch=5 / lr=0.0001, whereas args['my_bert_tuning'] specifies
# 10 epochs and lr=0.001 — possibly a leftover from an earlier run; verify before reuse.
torch.save(fine_tuned_my_bert_model.state_dict(), 'fine_tuned_my_bert_epoch=5_lr=0.0001_batch_size=64.pt')

# Debug

In [None]:
# Rebuild both models and restore their saved weights for inspection.
# map_location lets checkpoints saved on a GPU be restored on whatever DEVICE is in use.
fine_tuned_bert_model = Bert(num_labels=2, tune_only_last_layer=True)
fine_tuned_bert_model = fine_tuned_bert_model.to(DEVICE)
fine_tuned_bert_model.load_state_dict(
    torch.load('imdb_fine_tuned_bert_model_epoch=30_lr=0.001_batch_size=2048.pt', map_location=DEVICE)
)

fine_tuned_my_bert_model = MyBert(fine_tuned_bert_model, num_labels=2).to(DEVICE)
fine_tuned_my_bert_model.load_state_dict(
    torch.load('imdb_fine_tuned_my_bert_epoch=10_lr=0.0001_batch_size=64.pt', map_location=DEVICE)
)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


<All keys matched successfully>

In [None]:
# Push a single test batch through MyBert so its forward() prints the masks for one example.
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=True)

# Inspection only: eval mode turns dropout off (so the mask depends only on the input) and
# no_grad avoids building an autograd graph that is never used.
fine_tuned_my_bert_model.eval()
with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(DEVICE)
        attention_mask = batch['attention_mask'].to(DEVICE)
        labels = batch['labels'].to(DEVICE)
        fine_tuned_my_bert_model(input_ids, attention_mask=attention_mask)
        break

----INPUT----
tensor([  101,  1000, 15640,  1000,  2003,  1996,  2190,  2773,  1045,  2071,
         2228,  2005,  2023,  2143,  1010,  2926,  6195,  1996, 10156,  4391,
         2009,  8267,  2013,  2070,  2060,  5198,  1012,  1026,  7987,  1013,
         1028,  1026,  7987,  1013,  1028,  2028,  2518,  2008,  2428, 27594,
         2015,  1996,  2143,  2003,  2008,  2009,  2003, 14477, 22083,  9072,
         2135,  7704,  1006,  1999,  2119,  9456,  1997,  1996,  2773,  1007,
         1012,  2025,  2069,   102], device='cuda:0')
--PRE-MASK---
tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], device='cuda:0')
--NEW-MASK---
tensor([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 1., 1., 1., 0.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 1., 1., 1., 0., 0., 1., 1., 1.,
        1., 1., 1., 0., 1., 1., 0., 0., 0.

In [None]:
import numpy as np
# Token ids of the example printed under "----INPUT----" in the previous cell's output.
# NOTE(review): the name `input` shadows the Python builtin; kept so other cells that may
# reference it keep working.
input = [  101,  1000, 15640,  1000,  2003,  1996,  2190,  2773,  1045,  2071,
         2228,  2005,  2023,  2143,  1010,  2926,  6195,  1996, 10156,  4391,
         2009,  8267,  2013,  2070,  2060,  5198,  1012,  1026,  7987,  1013,
         1028,  1026,  7987,  1013,  1028,  2028,  2518,  2008,  2428, 27594,
         2015,  1996,  2143,  2003,  2008,  2009,  2003, 14477, 22083,  9072,
         2135,  7704,  1006,  1999,  2119,  9456,  1997,  1996,  2773,  1007,
         1012,  2025,  2069,   102]

# One string per token id, so position i lines up with new_mask[i]. Splitting the decoded
# text on whitespace yields fewer items than tokens (punctuation and word pieces are glued
# to their neighbours), which shifts the mask relative to the text.
string_list = tokenizer.convert_ids_to_tokens(input)

# Per-token mask for the same example (presumably copied from the "--NEW-MASK---" print).
new_mask = [1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 1., 1., 1., 0.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 1., 1., 1., 0., 0., 1., 1., 1.,
        1., 1., 1., 0., 1., 1., 0., 0., 0., 1., 1., 0., 1., 1., 1., 1., 1., 1.,
        1., 0., 1., 1., 1., 1., 1., 1., 0., 1.]

assert len(string_list) == len(new_mask), "mask and tokens must have the same length"

# Replace masked-out tokens with a placeholder, and collect them separately.
masked_string_list = [token if keep else 'NONE' for token, keep in zip(string_list, new_mask)]

filtered_string_list = [token for token, keep in zip(string_list, new_mask) if keep == 0]

print('------ORIGINAL TEXT------')
print(' '.join(string_list))

print("-------MASKED TEXT-------")
print(' '.join(masked_string_list))

print("------MASKED TOKEN-------")
print(filtered_string_list)

------ORIGINAL TEXT------
[CLS] " disappointing " is the best word i could think for this film, especially considering the glowing reviews it receives from some other users. < br / > < br / > one thing that really spoils the film is that it is unabashedly partial ( in both senses of the word ). not only [SEP]
-------MASKED TEXT-------
[CLS] " disappointing " is the best word i could think NONE NONE NONE especially considering the NONE reviews it receives from some other users. < br NONE > < br NONE NONE one thing that really spoils the NONE is that NONE NONE NONE partial ( NONE both senses of the word ). not NONE [SEP]
------MASKED TOKEN-------
['for', 'this', 'film,', 'glowing', '/', '/', '>', 'film', 'it', 'is', 'unabashedly', 'in', 'only']
