<a href="https://colab.research.google.com/github/juhi10071998/Transfer_learning_exps/blob/main/bert_movie_reviews.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install pytorch_pretrained_bert pytorch-nlp

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import random as rn
import sys

import numpy as np
import torch
from pytorch_pretrained_bert import BertModel
from pytorch_pretrained_bert import BertTokenizer
from torch import nn
from torch.nn.utils import clip_grad_norm_
from torch.optim import Adam
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from torchnlp.datasets import imdb_dataset
from keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import classification_report
from IPython.display import clear_output

In [None]:
# Seed every random number generator used in this notebook (Python's `random`,
# NumPy, and PyTorch on CPU and CUDA) with the same value for repeatable runs.
SEED = 321
for seed_fn in (rn.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
    seed_fn(SEED)

In [None]:
# Fetch both IMDB splits, shuffle each one in place (train first, then test),
# and keep only a small subset: 1000 training reviews and 100 test reviews.
train_data, test_data = imdb_dataset(train=True, test=True)
for split in (train_data, test_data):
    rn.shuffle(split)
train_data, test_data = train_data[:1000], test_data[:100]

In [None]:
# Peek at the first three shuffled training records; each record is a dict
# with a 'sentiment' label and the raw review 'text'.
train_data[0:3]

[{'sentiment': 'pos',
  'text': "War is hell. But this documentary of WWII is heaven.<br /><br />Not only is this series a breath-taking, almost-exhaustive look at the Second World War, it's a poetic masterpiece told clearly and superbly by Laurence Olivier.<br /><br />This documentary series defines the genre. It's sweepingly long, no doubt, but you will enjoy all of them and want to come back for more and more. (I have the series on DVD and I probably watch the series three times a year).<br /><br />Truly, this is an impeccable bit of film-making. Other than Olivier, the best part of the series is listening to the veterans tell their stories; whether it be about an actual battle or about finding a hog to butcher so they could have something delicious for supper.<br /><br />I'm going to go watch it right now (again, my... 11th time)."},
 {'sentiment': 'pos',
  'text': 'Due to the invention of a "The Domestication Collar", flesh-eating zombies are brought under control, and become prod

In [None]:
# Transpose the list of review dicts into two parallel tuples per split:
#   [(T1, S1), (T2, S2), ...]  ->  (T1, T2, ...), (S1, S2, ...)
# where T is the review text and S its sentiment label.
train_texts, train_labels = zip(*((d['text'], d['sentiment']) for d in train_data))
test_texts, test_labels = zip(*((d['text'], d['sentiment']) for d in test_data))

# Sanity check: texts and labels must have the same length within each split.
len(train_texts), len(train_labels), len(test_texts), len(test_labels)

(1000, 1000, 100, 100)

In [None]:
# Load the pretrained tokenizer that belongs to 'bert-base-uncased';
# do_lower_case=True makes it lower-case the text before splitting it.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)


In [None]:
# Quick demo: the text is lower-cased and a word missing from the vocabulary
# is split into sub-word pieces (continuations carry a '##' prefix).
tokenizer.tokenize('Hi my name is Dima')


['hi', 'my', 'name', 'is', 'dim', '##a']

In [None]:
# Wrap every review in BERT's special tokens. Only the first 510 word pieces
# are kept, so with [CLS] and [SEP] added a sequence has at most 512 tokens.
train_tokens = [['[CLS]', *tokenizer.tokenize(text)[:510], '[SEP]'] for text in train_texts]
test_tokens = [['[CLS]', *tokenizer.tokenize(text)[:510], '[SEP]'] for text in test_texts]

len(train_tokens), len(test_tokens)

(1000, 100)

In [None]:
# Map tokens to vocabulary ids, then pad or truncate every sequence at its end
# to exactly 512 ids so each split becomes one rectangular integer array.
PAD_KWARGS = dict(maxlen=512, truncating="post", padding="post", dtype="int")
train_tokens_ids = pad_sequences(
    [tokenizer.convert_tokens_to_ids(tokens) for tokens in train_tokens], **PAD_KWARGS
)
test_tokens_ids = pad_sequences(
    [tokenizer.convert_tokens_to_ids(tokens) for tokens in test_tokens], **PAD_KWARGS
)

train_tokens_ids.shape, test_tokens_ids.shape

((1000, 512), (100, 512))

In [None]:
# Boolean targets: True where the sentiment label is 'pos', False otherwise.
train_y, test_y = (np.array(split_labels) == 'pos' for split_labels in (train_labels, test_labels))
# Shapes plus the share of positive reviews in each split.
train_y.shape, test_y.shape, np.mean(train_y), np.mean(test_y)

((1000,), (100,), 0.489, 0.5)

In [None]:
# Attention masks: 1.0 wherever the token id is positive, 0.0 wherever it is
# not (i.e. at the zero-padded positions).
# The comparison is vectorised over the whole id array instead of looping over
# every id in Python; .tolist() keeps the result a list of lists of Python
# floats, exactly what the nested comprehension used to produce.
train_masks = (train_tokens_ids > 0).astype(float).tolist()
test_masks = (test_tokens_ids > 0).astype(float).tolist()

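The classifier below takes the pooled output of the pre-trained BERT model (the final-layer representation of the `[CLS]` token) and passes it through dropout and a linear classification layer followed by a sigmoid. The original note calls this a feature-based approach; note, however, that the optimizer created later in the notebook receives all of `bert_model.parameters()`, so the BERT weights are updated during training as well.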


In [None]:
class BertBinaryClassifier(nn.Module):
  """Binary classifier on top of the pretrained 'bert-base-uncased' model.

  BERT's pooled output goes through dropout and a single linear unit; a
  sigmoid then turns that score into a probability.
  """

  def __init__(self, dropout = 0.1):
    """Load the pretrained BERT weights and build the classification head.

    Args:
      dropout: drop probability applied to BERT's pooled output.
    """
    super().__init__()
    self.bert = BertModel.from_pretrained('bert-base-uncased')
    self.dropout = nn.Dropout(dropout)
    self.linear = nn.Linear(768,1)  # 768 input features -> one score
    self.sigmoid = nn.Sigmoid()

  def forward(self, tokens, masks = None):
    """Return one probability per input sequence.

    Args:
      tokens: token ids, handed to BERT unchanged.
      masks: optional attention mask, handed to BERT as `attention_mask`.

    Returns:
      The sigmoid of the linear layer's output.
    """
    # BERT returns a pair; only its second element (the pooled output) is used.
    _encoded, pooled = self.bert(tokens, attention_mask = masks, output_all_encoded_layers = False)
    score = self.linear(self.dropout(pooled))
    return self.sigmoid(score)


In [None]:
# Build the classifier with its default dropout of 0.1; the constructor loads
# the pretrained 'bert-base-uncased' weights.
bert_model = BertBinaryClassifier()

In [None]:
# Prefer the GPU when one is available, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# First three training reviews (token ids) as a small batch on that device.
x = torch.tensor(train_tokens_ids[:3]).to(device)

In [None]:
# CUDA memory currently allocated by tensors, in units of 10^6 bytes.
f"{torch.cuda.memory_allocated(device) / 1000000}M"


'0.012288M'

In [None]:
# Move the model to whichever device was selected above. A bare .cuda() would
# raise on a machine without a GPU even though `device` falls back to the CPU.
bert_model.to(device)
# Allocated CUDA memory (in units of 10^6 bytes) now that the weights are loaded.
str(torch.cuda.memory_allocated(device)/1000000 ) + 'M'


'439.077376M'

In [None]:
# Smoke test of the bare BERT encoder on the sample batch. no_grad() skips
# building an autograd graph, which would otherwise keep the intermediate
# activations alive on the device.
with torch.no_grad():
  y, pooled_output = bert_model.bert(x, output_all_encoded_layers=False)
# Print the actual tensor shapes rather than a batch length and a hard-coded number.
print(f" y shape is {tuple(y.shape)} and pooled output shape is {tuple(pooled_output.shape)}")

 y shape is 3 and pooled output shape is 4


In [None]:
# Run the full classifier (BERT, dropout, linear layer, sigmoid) on the sample batch.
prob = bert_model(x)

In [None]:
# Show the probabilities the not-yet-trained classifier assigns to the sample batch.
prob

tensor([[0.4063],
        [0.4388],
        [0.3783]], device='cuda:0', grad_fn=<SigmoidBackward0>)

In [None]:
# Drop every reference to the tensors created by the smoke tests so that their
# device memory can be released before training. The results are stored in
# `pooled_output` and `prob`, so those are the names that must be cleared.
y, x, pooled_output, prob = None, None, None, None
torch.cuda.empty_cache()
# Allocated CUDA memory (in units of 10^6 bytes) after the clean-up.
str(torch.cuda.memory_allocated(device)/1000000 ) + 'M'

'5791.379968M'

In [None]:
# Training hyper-parameters.
BATCH_SIZE = 1  # number of reviews per DataLoader batch
EPOCHS = 2  # number of passes over the training set

In [None]:
# Boolean targets again: True where the sentiment label is 'pos'.
train_y = np.asarray(train_labels) == 'pos'
test_y = np.asarray(test_labels) == 'pos'
train_y.shape


(1000,)

In [None]:
# Token ids and attention masks are converted to tensors as they are.
train_tokens_tensor, test_tokens_tensor = torch.tensor(train_tokens_ids), torch.tensor(test_tokens_ids)
train_masks_tensor, test_masks_tensor = torch.tensor(train_masks), torch.tensor(test_masks)

# Labels become float column vectors with one row per review, matching the
# one-value-per-row output of the classifier.
train_y_tensor = torch.tensor(train_y).float().reshape(-1, 1)
test_y_tensor = torch.tensor(test_y).float().reshape(-1, 1)

In [None]:
# Show both label shapes together; a notebook cell only displays its last
# expression, so two separate lines would hide the training shape.
train_y_tensor.shape, test_y_tensor.shape

torch.Size([100, 1])

In [None]:
# Bundle (token ids, attention mask, label) triples for each split.
train_dataset = TensorDataset(train_tokens_tensor, train_masks_tensor, train_y_tensor)
test_dataset = TensorDataset(test_tokens_tensor, test_masks_tensor, test_y_tensor)

# Training examples are visited in random order, test examples in stored order.
train_sampler = RandomSampler(train_dataset)
test_sampler = SequentialSampler(test_dataset)

train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, sampler=train_sampler)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, sampler=test_sampler)

In [None]:
# Adam over every parameter of the model (the BERT encoder as well as the
# classification head) with a learning rate of 3e-6.
optimizer = Adam(bert_model.parameters(), lr=3e-6)


In [None]:
# NOTE(review): `bert_model.sigmoid` is an nn.Sigmoid, which has no learnable
# parameters, so this list comes out empty. The optimizer in this notebook is
# built from bert_model.parameters(), not from these groups. Confirm the intent.
param_optimizer = [*bert_model.sigmoid.named_parameters()]
optimizer_grouped_parameters = [{"params": [param for _name, param in param_optimizer]}]

In [None]:
# Release cached, currently unused CUDA memory before training starts.
torch.cuda.empty_cache()


In [None]:
# BCELoss takes probabilities, which is what the classifier's sigmoid returns.
# The criterion holds no per-step state, so it is built once instead of on every step.
lossfun = nn.BCELoss()
epoch_losses = []  # mean training loss of every finished epoch

for epoch_num in range(EPOCHS):
  bert_model.train()
  train_loss = 0
  for step_number, batch_data in enumerate(train_dataloader):
    print(f"step number is {step_number}")
    # Unpack the batch into token ids, masks and labels and move them to the
    # device, because the model is on the device as well.
    token_ids, masks, labels = tuple(t.to(device) for t in batch_data)
    logits = bert_model(token_ids, masks)
    batch_loss = lossfun(logits, labels)
    train_loss+=batch_loss.item()
    # Clear gradients left over from the previous step before back-propagating.
    bert_model.zero_grad()
    batch_loss.backward()
    # Rescale the gradients so that their overall norm does not exceed 1.0.
    clip_grad_norm_(parameters=bert_model.parameters(), max_norm=1.0)
    optimizer.step()
    # Keep the cell output to a single, constantly refreshed progress line.
    clear_output(wait=True)
  # max(..., 1) guards against an empty dataloader.
  epoch_losses.append(train_loss / max(len(train_dataloader), 1))

# Report the accumulated loss, which would otherwise never be shown. This is
# printed after the loop because clear_output() wipes whatever was printed
# between steps.
print(f"mean training loss per epoch: {[round(loss, 4) for loss in epoch_losses]}")

step number is 999


In [None]:
# Inference over the test set: eval() switches off dropout and no_grad()
# disables gradient tracking.
bert_model.eval()
bert_predicted, all_logits = [], []
with torch.no_grad():
  for step_num, batch_data in enumerate(test_dataloader):
    token_ids, masks, labels = (t.to(device) for t in batch_data)
    # Despite the name, these are sigmoid outputs (probabilities).
    logits = bert_model(token_ids, masks)
    numpy_logits = logits.detach().cpu().numpy()
    positive_prob = numpy_logits[:, 0]
    # A review is predicted positive when its probability exceeds 0.5.
    bert_predicted.extend(positive_prob > 0.5)
    all_logits.extend(positive_prob)

In [None]:
# Per-class precision, recall and F1 of the thresholded predictions against
# the true test labels (classification_report is imported in the import cell).
print(classification_report(test_y, bert_predicted))


              precision    recall  f1-score   support

       False       0.98      0.84      0.90        50
        True       0.86      0.98      0.92        50

    accuracy                           0.91       100
   macro avg       0.92      0.91      0.91       100
weighted avg       0.92      0.91      0.91       100



In [None]:
# NOTE(review): stray scratch cell; this bare literal is not used by any visible cell.
1