In [None]:
!pip install torchtext --user

In [None]:
!pip install torchdata --user

In [5]:
import torch

In [1]:
from torchtext.datasets import IMDB 
# Build the train and test IMDB splits (train first, then test) in one statement.
train_dataset, test_dataset = (
    IMDB(split=split_name) for split_name in ('train', 'test')
)

In [2]:
# Count the training examples by iterating over the split once.
sum(1 for _ in train_dataset)

12500

In [3]:
# Count the test examples by iterating over the split once.
sum(1 for _ in test_dataset)

25000

In [6]:
from torch.utils.data.dataset import random_split 
# Seed the global RNG so the random train/validation partition is reproducible.
torch.manual_seed(1)

# Materialise the training split once so its length is known. The validation
# hold-out stays at 500 examples; the training size is derived from the actual
# length instead of being hard-coded, because random_split raises when the
# requested lengths do not sum to the dataset size.
train_examples = list(train_dataset)
num_valid = 500
train_dataset, valid_dataset = random_split(
    train_examples, [len(train_examples) - num_valid, num_valid]
)

In [11]:
import re 
from collections import Counter, OrderedDict 
def tokenizer(text):
    """Split a raw review string into a list of tokens.

    Steps:
      1. Strip HTML-like tags (anything between ``<`` and ``>``).
      2. Collect emoticons such as ``:)``, ``;-(``, ``=D`` or ``:P``.
      3. Lower-case the text and replace every run of non-word characters
         with a single space.
      4. Append the emoticons (with any ``-`` nose removed) after the words.

    Args:
        text: The raw review text.

    Returns:
        A list of string tokens: the lower-cased words in order, followed by
        the emoticons found in the text.
    """
    text = re.sub(r'<[^>]*>', '', text)
    # Search the original-cased text: the pattern lists uppercase ``D``/``P``,
    # which could never match once the text had been lower-cased.
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    words = re.sub(r'[\W]+', ' ', text.lower())
    # The explicit ' ' separator keeps the first emoticon from being glued onto
    # the last word when the cleaned text does not end in whitespace.
    text = words + ' ' + ' '.join(emoticons).replace('-', '')
    tokenized = text.split()
    return tokenized

In [12]:
# Tally how often every token occurs across all reviews in the training split.
token_counts = Counter(
    token
    for _label, review in train_dataset
    for token in tokenizer(review)
)
print('Vocab-size:', len(token_counts))

Vocab-size: 54011


In [13]:
# Tokenize the text (element 1 of the (label, text) pair) of the first training
# example. NOTE(review): `pepe` is not referenced again in the visible cells —
# looks like a scratch variable; confirm before relying on it.
pepe = tokenizer(next(iter(train_dataset))[1])

In [15]:
## Step 3: encoding each unique token into integers 
from torchtext.vocab import vocab 
# Rank (token, count) pairs from most to least frequent; tokens with equal
# counts keep the order in which they were first counted.
sorted_by_freq_tuples = token_counts.most_common()
ordered_dict = OrderedDict(sorted_by_freq_tuples)

# Build the vocabulary object; from here on `vocab` names that object rather
# than the imported factory function.
vocab = vocab(ordered_dict)

# Reserve index 0 for padding and index 1 for unknown tokens, then make the
# unknown-token index the fallback for any token missing from the vocabulary.
vocab.insert_token("<pad>", 0)
vocab.insert_token("<unk>", 1)
vocab.set_default_index(vocab["<unk>"])

In [19]:
# Show the integer index the vocabulary assigns to a few sample tokens.
example_tokens = ['this', 'is', 'an', 'example']
print([vocab[word] for word in example_tokens])

[11, 7, 40, 431]


In [24]:
def text_pipeline(x):
    """Tokenize a raw text string and return the list of vocabulary indices of its tokens."""
    return [vocab[word] for word in tokenizer(x)]
def label_pipeline(x):
    """Convert a raw IMDB label into a float target: 1.0 for positive, else 0.0.

    Accepts both label encodings: the string ``'pos'`` and the integer ``2``.
    The torchtext IMDB documentation describes integer labels "1 to 2"; this
    treats 2 as positive. Comparing only against ``'pos'`` would silently turn
    every integer label into 0.0.

    Args:
        x: The raw label, either a string (``'pos'``/``'neg'``) or an int (2/1).

    Returns:
        ``1.0`` if the label denotes a positive review, otherwise ``0.0``.
    """
    return 1. if x in ('pos', 2) else 0.

In [25]:
import torch.nn as nn

In [21]:
def collate_batch(batch):
    """Turn a list of (label, text) pairs into padded tensors for one mini-batch.

    Each label goes through ``label_pipeline`` and each text through
    ``text_pipeline``; the resulting index sequences are padded to the length
    of the longest one in the batch using ``pad_sequence``'s default padding
    value.

    Args:
        batch: An iterable of ``(label, text)`` pairs.

    Returns:
        A tuple ``(padded_texts, labels, lengths)`` where ``padded_texts`` is an
        int64 tensor laid out batch-first, ``labels`` is a tensor of the
        converted labels, and ``lengths`` holds each sequence's length before
        padding.
    """
    # Single pass over the batch, converting the label and then the text of
    # each example.
    encoded_pairs = [
        (
            label_pipeline(raw_label),
            torch.tensor(text_pipeline(raw_text), dtype=torch.int64),
        )
        for raw_label, raw_text in batch
    ]
    sequences = [sequence for _, sequence in encoded_pairs]

    targets = torch.tensor([target for target, _ in encoded_pairs])
    sequence_lengths = torch.tensor([sequence.size(0) for sequence in sequences])
    padded_sequences = nn.utils.rnn.pad_sequence(sequences, batch_first=True)
    return padded_sequences, targets, sequence_lengths

In [26]:
from torch.utils.data import DataLoader 
# Small demo loader: four examples per batch, served in dataset order, with
# collate_batch converting and padding each batch.
dataloader = DataLoader(
    dataset=train_dataset,
    batch_size=4,
    shuffle=False,
    collate_fn=collate_batch,
)

In [27]:
# Fetch the first batch to inspect the (padded texts, labels, lengths) tuple
# produced by collate_batch.
next(iter(dataloader))


(tensor([[  165,     9,   793,  ...,     6,     2,   961],
         [   11,    15,    14,  ...,     0,     0,     0],
         [ 5664, 33222,  5073,  ...,     0,     0,     0],
         [   11,    15,     7,  ...,     0,     0,     0]]),
 tensor([0., 0., 0., 0.]),
 tensor([925, 202, 185, 230]))