# BERT: Bidirectional Encoder Representations from Transformers

The pre-trained BERT model can be fine-tuned with just one additional output layer to create state-of-the-art models for a wide range of tasks, such as question answering and language inference, without substantial task-specific architecture modifications.

<img src="https://i.imgur.com/O7ps2Hl.jpg" alt="ensemble" width="800px"/>

**Reference**
* Hugging Face Models : [link](https://huggingface.co/models)
* Bert-base-uncased : [link](https://huggingface.co/bert-base-uncased)
* Hugging Face BERT Docs: [link](https://huggingface.co/transformers/model_doc/bert.html)
* BERT Paper : [link](https://arxiv.org/abs/1810.04805)

In [None]:
!pip install transformers==3



## Import

In [1]:
from transformers import BertTokenizer, BertModel

import shutil, sys  
import numpy as np
import pandas as pd
from sklearn.linear_model import SGDClassifier

import torch
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

root_path = 'Sentiment Classification on Movie Reviews/'

## BERT Tokenizer

The `tokenizer.encode_plus` function combines multiple steps for us:

1. Split the sentence into tokens.
2. Add the special `[CLS]` and `[SEP]` tokens.
3. Map the tokens to their IDs.
4. Pad or truncate all sentences to the same length.
5. Create the `attention masks` which explicitly differentiate real tokens from `[PAD]` tokens.


**Reference**
* Utilities for Tokenizers `encode_plus()`: [Docs](https://huggingface.co/transformers/internal/tokenization_utils.html#transformers.tokenization_utils_base.PreTrainedTokenizerBase.encode_plus)

In [None]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

## Custom Dataset Class

* **IMDB Dataset**: [Kaggle](https://www.kaggle.com/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews?select=IMDB+Dataset.csv)

In [None]:
class IMDBDataset(Dataset):
    def __init__(self, mode, filepath, tokenizer, max_len=256):
        assert mode in ['train', 'val']

        self.mode = mode
        # self.df = pd.read_csv(filepath).sample(frac=0.1) # get smaple
        self.df = pd.read_csv(filepath)  # please use this line
        self.tokenizer = tokenizer
        self.max_len = max_len

        # label to index
        self.label_map = {
            'positive':1,
            'negative':0
        }

        self.len = len(self.df)
        self.train_len = int(self.len * 0.8)
        if mode == 'train':
            self.df = self.df[: self.train_len]
            print('train size:', len(self.df))
        else: 
            self.df = self.df[self.train_len:]
            print('validation size:', len(self.df))
        
    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        text = self.df.review.str.replace(r'(<.*\/>)', '').iloc[idx]
        label_str = self.df.sentiment.iloc[idx]
        label = self.label_map[label_str]

        inputs = self.tokenizer.encode_plus(
            text=text,
            text_pair=None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True
        )
        
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs['token_type_ids']

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(label, dtype=torch.float)
        }

## Dataset & DataLoader

In [None]:
# train
train_dataset = IMDBDataset('train', root_path + 'IMDB Dataset.csv', tokenizer)
test_dataset = IMDBDataset('val', root_path + 'IMDB Dataset.csv', tokenizer)

# test
train_dataloader = DataLoader(train_dataset, 16, shuffle=True)
test_dataloader = DataLoader(test_dataset, 16, shuffle=True)

train size: 40000
validation size: 10000


## Models

In [None]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [None]:
class PretrainBERT(torch.nn.Module):
    def __init__(self):
        super(PretrainBERT, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')

    def forward(self, ids, mask, token_type_ids):
        _, features = self.bert(ids, attention_mask = mask, token_type_ids = token_type_ids)
        return features

In [None]:
model = PretrainBERT().to(device)

In [None]:
model

PretrainBERT(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)

In [None]:
clf = SGDClassifier(learning_rate = 'constant', eta0=0.01)

## Features Extraction & Training Model

In [None]:
for b_idx, data in enumerate(train_dataloader):
    ids = data['ids'].to(device)
    mask = data['mask'].to(device)
    token_type_ids = data['token_type_ids'].to(device)
    targets = data['targets'].numpy() # shape: batch_size
    features = model(ids, mask, token_type_ids) # shape: batch_szie, 768
    
    features = features.to('cpu').detach().numpy()

    clf.partial_fit(features, targets, classes=np.unique(targets))
    del features

## Model Validation

In [None]:
acc = 0
count = 0
for b_idx, data in enumerate(test_dataloader):
    ids = data['ids'].to(device)
    mask = data['mask'].to(device)
    token_type_ids = data['token_type_ids'].to(device)
    targets = data['targets'].numpy() # shape: batch_size
    features = model(ids, mask, token_type_ids) # shape: batch_szie, 768
    
    features = features.to('cpu').detach().numpy()
    e_acc = clf.score(features, targets)
    acc+= e_acc
    count = b_idx

print('- Accuracy Mean:', acc/count)

- Accuracy Mean: 0.6332131410256411
