In [2]:
import os
import pandas as pd
import numpy as np
import transformers
import torch
import torch.nn as nn
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import torch.optim as optim
import torch.nn.functional as F
from torchsummary import summary
from torchmetrics import Accuracy, F1Score
from tqdm import tqdm

pd.set_option('display.max_colwidth', None)
RAW_DATASET_DIR = "raw_datasets/SST_2"

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
cpu_device = torch.device("cpu")
device

device(type='cuda')

In [4]:
df = pd.read_csv('https://github.com/clairett/pytorch-sentiment-classification/raw/master/data/SST2/train.tsv', delimiter='\t', header=None)
df

Unnamed: 0,0,1
0,"a stirring , funny and finally transporting re imagining of beauty and the beast and 1930s horror films",1
1,apparently reassembled from the cutting room floor of any given daytime soap,0
2,"they presume their audience wo n't sit still for a sociology lesson , however entertainingly presented , so they trot out the conventional science fiction elements of bug eyed monsters and futuristic women in skimpy clothes",0
3,"this is a visually stunning rumination on love , memory , history and the war between art and commerce",1
4,jonathan parker 's bartleby should have been the be all end all of the modern office anomie films,1
...,...,...
6915,"painful , horrifying and oppressively tragic , this film should not be missed",1
6916,"take care is nicely performed by a quintet of actresses , but nonetheless it drags during its 112 minute length",0
6917,"the script covers huge , heavy topics in a bland , surfacey way that does n't offer any insight into why , for instance , good things happen to bad people",0
6918,a seriously bad film with seriously warped logic by writer director kurt wimmer at the screenplay level,0


### Word piece tokenizer

In [5]:
bert_model_name = "bert-base-uncased"

In [6]:
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained(bert_model_name)

sequence = "A Titan RTX has 24GB of VRAM"

In [7]:
tokenized_sequence = tokenizer.tokenize(sequence)
print(tokenized_sequence)

['a', 'titan', 'rt', '##x', 'has', '24', '##gb', 'of', 'vr', '##am']


In [8]:
encoded_sequence = tokenizer(sequence)["input_ids"]
print(encoded_sequence)

[101, 1037, 16537, 19387, 2595, 2038, 2484, 18259, 1997, 27830, 3286, 102]


In [9]:
decoded_sequence = tokenizer.decode(encoded_sequence)
print(decoded_sequence)

[CLS] a titan rtx has 24gb of vram [SEP]


### Dataloader

In [12]:
class BertDataset(Dataset):
    def __init__(self, tokenizer,max_length, device=torch.device("cuda"), split="train"):
        super(BertDataset, self).__init__()
        self.data_df=pd.read_csv(os.path.join(RAW_DATASET_DIR, split + ".tsv"), delimiter='\t')
        self.tokenizer=tokenizer
        self.target=self.data_df.iloc[:,1]
        self.max_length=max_length
        
    def __len__(self):
        return len(self.data_df)
    
    def __getitem__(self, index):
        
        text1 = self.data_df.iloc[index,0].lower()
        
        inputs = self.tokenizer.encode_plus(
            text1 ,
            None, # since we have only 1 sentence as input
            pad_to_max_length=True,
            add_special_tokens=True,
            return_attention_mask=True,
            max_length=self.max_length,
        )
        ids = inputs["input_ids"]
        token_type_ids = inputs["token_type_ids"]
        mask = inputs["attention_mask"]

        return {
            'ids': torch.tensor(ids, dtype=torch.long).to(device),
            'mask': torch.tensor(mask, dtype=torch.long).to(device),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long).to(device),
            'target': torch.tensor(self.data_df.iloc[index, 1], dtype=torch.long).to(device)
            }

In [13]:
tokenizer = transformers.BertTokenizer.from_pretrained(bert_model_name)
BATCH_SIZE = 32
MAX_SENT_LENGTH = 56

# train dataset
train_dataset= BertDataset(tokenizer, max_length=MAX_SENT_LENGTH, split="train")
train_dataloader=DataLoader(dataset=train_dataset,batch_size=BATCH_SIZE)

# dev dataset
dev_dataset= BertDataset(tokenizer, max_length=MAX_SENT_LENGTH, split="dev")
dev_dataloader=DataLoader(dataset=dev_dataset,batch_size=BATCH_SIZE)

# test dataset
test_dataset= BertDataset(tokenizer, max_length=MAX_SENT_LENGTH, split="test")
test_dataloader=DataLoader(dataset=test_dataset,batch_size=BATCH_SIZE)

In [14]:
# testing data loaders
next(iter(train_dataloader));

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


### Model

In [15]:
class BERT(nn.Module):
    def __init__(self, d_model=768, H = 50, n_classes=2):
        super(BERT, self).__init__()
        self.bert_model = transformers.BertModel.from_pretrained(bert_model_name)
        self.classifier = nn.Sequential(
            nn.Linear(d_model, H),
            nn.ReLU(),
            nn.Linear(H, n_classes)
        )
        
    def forward(self,ids,mask,token_type_ids):
        # need to pass positional embeddings
        last_hidden_states, _ = self.bert_model(ids, attention_mask=mask, token_type_ids=token_type_ids, return_dict=False)
        # last_hidden_state.shape = (batch_size, sequence_length, hidden_size)
        
        # Extract the last hidden state of the token `[CLS]` for classification task
        cls = last_hidden_states[:, 0, :] # (Batch, MAX_length, hidden_size)
        logits = self.classifier(cls)
        return logits

In [16]:
bert_model_name

'bert-base-uncased'

In [17]:
tokenizer = transformers.BertTokenizer.from_pretrained(bert_model_name)
op = tokenizer.encode_plus(
            "I'm sentence 1", # text
            "I am going to ve a 2nd sentence", # text pair
            apad_to_max_length=True,
            add_special_tokens=True,
            return_attention_mask=True,
            max_length=56,
        )
op

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Keyword arguments {'apad_to_max_length': True} not recognized.
Keyword arguments {'apad_to_max_length': True} not recognized.


{'input_ids': [101, 1045, 1005, 1049, 6251, 1015, 102, 1045, 2572, 2183, 2000, 2310, 1037, 3416, 6251, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

## Training

In [20]:
model=BERT(d_model=768, H=50).to(device)
loss_fn = nn.CrossEntropyLoss()

#Initialize Optimizer
lr = 5e-5
optimizer= optim.Adam(model.parameters(),lr= lr)

In [159]:
# # only finetune classification head
# for param in model.bert_model.parameters():
#     param.requires_grad = False

In [21]:
epochs = 3
train_losses = []

for epoch in range(epochs):
    model.train()
    loop=tqdm(enumerate(train_dataloader),leave=False,total=len(train_dataloader))
    print(epoch)
    total_matches = 0
    for batch, dl in loop:
        
        # input
        ids, token_type_ids, mask, label = dl['ids'], dl['token_type_ids'], dl['mask'], dl['target']
        optimizer.zero_grad()
        output=F.softmax(model(ids=ids,mask=mask,token_type_ids=token_type_ids), dim=1) # (B, 2)
        
        # loss
        loss=loss_fn(output,label)
        loss.backward()
        train_losses.append(loss)

        optimizer.step()
        
        # prediction
        pred = torch.argmax(output, dim=-1) # (B,1)
        accuracy = Accuracy(task = "binary").to(device)(pred, label)
        total_matches += (torch.sum(pred == label)).item()

        # Show progress while training
        loop.set_description_str(f"Epoch={epoch}/{epochs} loss={loss.item()} acc={accuracy}")

    
    print(f"Train Accuracy :{epoch} = {total_matches/len(train_dataset)}") 

In [161]:
# save model
model_name = f"{model_name}_SST2_{epochs}_{lr}_FULL.pt"
torch.save(model.state_dict(), f"ckpts/{model_name}")

## Tesing

In [22]:
# test dataset
test_losses = []

model.eval()
loop=tqdm(enumerate(test_dataloader),leave=False,total=len(test_dataloader))
total_matches = 0
with torch.no_grad():
    for batch, dl in loop:
        ids, token_type_ids, mask, label = dl['ids'], dl['token_type_ids'], dl['mask'], dl['target']
        output=F.softmax(model(ids=ids,mask=mask,token_type_ids=token_type_ids), dim=1)
        loss=loss_fn(output,label)
        test_losses.append(loss)

        pred = torch.argmax(output, dim=-1)

        matches = torch.sum(pred == label)
        total_matches += matches.item()

        # Show progress while training
        loop.set_description(f'loss={loss.item()}')


    print(f"Test Accuracy :{total_matches/len(test_dataset)}")        

## Custom Sentences

In [59]:
class BertCustomDataset(Dataset):
    def __init__(self, tokenizer, max_length, sentences, device=torch.device("cuda")):
        super(BertCustomDataset, self).__init__()
        self.data_df=pd.DataFrame(custom_sentences, columns=["text"])
        self.tokenizer=tokenizer
        self.max_length=max_length
        
    def __len__(self):
        return len(self.data_df)
    
    def __getitem__(self, index):
        
        text1 = self.data_df.iloc[index,0]
        
        inputs = self.tokenizer.encode_plus(
            text1 ,
            None,
            pad_to_max_length=True,
            add_special_tokens=True,
            return_attention_mask=True,
            max_length=self.max_length,
        )
        ids = inputs["input_ids"]
        token_type_ids = inputs["token_type_ids"]
        mask = inputs["attention_mask"]

        return {
            'ids': torch.tensor(ids, dtype=torch.long).to(device),
            'mask': torch.tensor(mask, dtype=torch.long).to(device),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long).to(device),
            }

In [102]:
custom_sentences = ["I do not like food there.", "I like these mangoes", "You made 1 run in only 50 balls", "Opps! You made 1 run in only 50 balls", "Wow! You made 1 run in only 50 balls", "You made only 1 run in 50 balls", "Opps! You made only 1 run in 50 balls", "Wow! You made only 1 run in 50 balls"]
custom_sentences = ["You made 100 run in only 50 balls", "You made only 1 run in 50 balls", "Wow! You made only 1 run in 50 balls", "Despite being a topper you are just passed", "Despite being a back bencher you are passed"]

custom_dataset = BertCustomDataset(tokenizer, 56, custom_sentences)
custom_dataloader = DataLoader(custom_dataset, batch_size=len(custom_sentences))
model.eval()
with torch.no_grad():
    data = next(iter(custom_dataloader))
    ids, token_type_ids, mask = data['ids'], data['token_type_ids'], data['mask']
    output=F.softmax(model(ids=ids,mask=mask,token_type_ids=token_type_ids), dim=1)
#     print("Prediction:")
#     print(list(zip(custom_sentences, torch.argmax(output, dim=1))))
    
df = pd.DataFrame(custom_sentences, columns=["input"])
df["Prediction"] = torch.argmax(output, dim=-1).to(cpu_device)
df["Prediction"] = df["Prediction"].apply(lambda x: {0: "Negative", 1: "Positive"}[x])
df["Probability"] = torch.max(output, dim=-1).values.to(cpu_device)
df



Unnamed: 0,input,Prediction,Probability
0,You made 100 run in only 50 balls,Negative,0.99999
1,You made only 1 run in 50 balls,Negative,0.999993
2,Wow! You made only 1 run in 50 balls,Negative,0.66792
3,Despite being a topper you are just passed,Negative,0.581516
4,Despite being a back bencher you are passed,Positive,0.999947


Unnamed: 0,text,Prediction,Probability
0,I do not like food there.,0,0.999993
1,I like these mangoes,1,0.999988
2,Opps! You made 1 runs in only 50 balls,0,0.999992
3,Wow! You made only 1 run in 50 balls,0,0.667919
