In [None]:
!pip install transformers

In [3]:
# Imported in this cell so that it runs on a fresh kernel: the notebook's main
# import cell appears further down.
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

sentence_a = "this is a sentence"
sentence_b = "this is another sentence"

# Encode both sentences together as a single sentence-pair input.
encoding = tokenizer(sentence_a, sentence_b, padding="max_length", truncation=True)

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [22]:
from transformers import BertTokenizer, BertModel, logging
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

## BERT Tokenizer

In [3]:
# Name of the pretrained checkpoint whose tokenizer is loaded.
BERT_CHECKPOINT = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

## Set the Device

In [4]:
# Use the GPU when PyTorch reports one as available, otherwise the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
device

'cuda'

## Dataset Class

* Input format: `[CLS]` Sentence 1 (we use the label text as an example here) `[SEP]` Sentence 2 (the review text) `[SEP]`.

In [68]:
class IMDB(Dataset):
    """Sentence-pair dataset read from a CSV with `review` and `sentiment` columns.

    The first 80% of the rows form the 'train' split and the remaining rows the
    'val' split. Each item encodes the pair (sentiment text, review text) with
    the supplied tokenizer, padded/truncated to `max_len` tokens.

    Args:
        mode: Which split to expose, either 'train' or 'val'.
        filepath: Path of the CSV file to load.
        tokenizer: Object providing `encode_plus(...)` whose result contains
            'input_ids', 'attention_mask' and 'token_type_ids'.
        max_len: Length passed to the tokenizer as `max_length`.

    Raises:
        AssertionError: If `mode` is not 'train' or 'val'.
    """

    # Matches one self-closing tag (e.g. `<br />`) at a time. Excluding `<` and
    # `>` inside the tag keeps the match from swallowing the review text that
    # lies between two separate tags.
    _TAG_PATTERN = r'<[^<>]*/>'

    def __init__(self, mode, filepath, tokenizer, max_len=64):
        assert mode in ['train', 'val']

        self.mode = mode
        self.df = pd.read_csv(filepath)

        self.tokenizer = tokenizer
        self.max_len = max_len

        self.total_len = len(self.df)

        # Number of leading rows that belong to the training split.
        self.train_len = int(self.total_len * 0.8)

        if mode == 'train':
            self.df = self.df[: self.train_len]
        else:
            self.df = self.df[self.train_len:]

        # Strip the tags once for the whole split instead of re-processing the
        # entire column on every item lookup. `regex=True` is explicit because
        # the pandas default for this argument differs between versions.
        self._clean_reviews = self.df.review.str.replace(
            self._TAG_PATTERN, '', regex=True
        )

        print(f'* {mode} Size:', len(self.df))

    def __len__(self):
        """Return the number of rows in the selected split."""
        return len(self.df)

    def __getitem__(self, idx):
        """Encode the (sentiment, cleaned review) pair at position `idx`.

        Returns:
            dict with 'ids', 'mask' and 'token_type_ids', each a `torch.long`
            tensor built from the tokenizer output.
        """
        text = self._clean_reviews.iloc[idx]
        label = self.df.sentiment.iloc[idx]

        # longest_first: removing a token from the longest sequence in the pair until the proper length is reached.
        inputs = self.tokenizer.encode_plus(
            text=label,
            text_pair=text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding="max_length",
            truncation='longest_first'
            )

        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs['token_type_ids']

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long)
        }


### Datasets & DataLoader

In [72]:
# Settings shared by both splits.
IMDB_CSV_PATH = '/content/IMDB Dataset.csv'
BATCH_SIZE = 4

# dataset
train_dataset = IMDB('train', IMDB_CSV_PATH, tokenizer)
test_dataset = IMDB('val', IMDB_CSV_PATH, tokenizer)

# dataloader
train_dataloader = DataLoader(train_dataset, BATCH_SIZE, shuffle=True)
# The held-out split is not shuffled so its batches come out in the same
# order on every run.
test_dataloader = DataLoader(test_dataset, BATCH_SIZE, shuffle=False)

* train Size: 40000
* val Size: 10000


In [71]:
# Stop after the first batch: the loop exists only to leave `idx` and `data`
# bound to that batch so it can be inspected in later cells.
for idx, data in enumerate(train_dataloader):
    break

[CLS] negative [SEP] very disappointing film. by the end i no longer cared for any of the characters. i did enjoy seeing ving rhames in a very small part, and william macy was good as always, still not worth watching. it starts out strong and just keeps getting weaker and weaker. insomnia [SEP]
[CLS] positive [SEP] jack frost 2, is probably the most cheesiest movie i have ever seen in my life. the complete title of the film, is jack frost 2 : revenge of the mutant killer snowman. horror movie fans that have a taste for campy story lines, will be delighted to watch this. [SEP]
[CLS] positive [SEP] if you are looking for a modern film version of buster crabbe or johnny weismuller's overcoming the machinations of unscrupulous, white safari guides or cunning, black tribesmen, while saving the animal kingdom, this is not the movie for you. this is a recount [SEP]
[CLS] negative [SEP] worst movie on earth. i don't even know where to begin but i hope i can save another person from punishing t