!pip install torchtext --user

!pip install torchdata --user

In [1]:
import torch
import os

In [2]:
os.environ['CUDA_LAUNCH_BLOCKING'] = "1" 
#device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device = torch.device('cuda')
device

device(type='cuda')

In [3]:
from torchtext.datasets import IMDB 
train_dataset = IMDB(split='train') 
test_dataset = IMDB(split='test')

In [4]:
len(list(train_dataset))

12500

In [5]:
len(list(test_dataset))

25000

In [6]:
from torch.utils.data.dataset import random_split 
torch.manual_seed(1) 
train_dataset, valid_dataset = random_split(list(train_dataset), [12000, 500])

In [7]:
import re 
from collections import Counter, OrderedDict 
def tokenizer(text):
    text = re.sub('<[^>]*>', '', text) 
    emoticons = re.findall( '(?::|;|=)(?:-)?(?:\)|\(|D|P)', text.lower() )  
    text = re.sub('[\W]+', ' ', text.lower())+' '.join(emoticons).replace('-', '')
    tokenized = text.split()
    return tokenized

In [8]:
token_counts = Counter() 
for label, line in train_dataset: 
    tokens = tokenizer(line) 
    token_counts.update(tokens) 
print('Vocab-size:', len(token_counts))

Vocab-size: 54011


In [9]:
pepe = tokenizer(next(iter(train_dataset))[1])

In [10]:
## Step 3: encoding each unique token into integers 
from torchtext.vocab import vocab 
sorted_by_freq_tuples = sorted(  token_counts.items(), key=lambda x: x[1], reverse=True  ) 
ordered_dict = OrderedDict(sorted_by_freq_tuples)
vocab = vocab(ordered_dict)
vocab.insert_token("<pad>", 0)
vocab.insert_token("<unk>", 1)
vocab.set_default_index(1)

In [11]:
print([vocab[token] for token in ['this', 'is', 'an', 'example']])

[11, 7, 40, 431]


In [12]:
text_pipeline = lambda x: [vocab[token] for token in tokenizer(x)]
label_pipeline = lambda x: 1. if x == 'pos' else 0.

In [13]:
import torch.nn as nn

In [14]:
def collate_batch(batch): 
    label_list, text_list, lengths = [], [], [] 
    for _label, _text in batch: 
        label_list.append(label_pipeline(_label)) 
        processed_text = torch.tensor(text_pipeline(_text), dtype=torch.int64) 
        text_list.append(processed_text) 
        lengths.append(processed_text.size(0)) 
    label_list = torch.tensor(label_list) 
    lengths = torch.tensor(lengths) 
    padded_text_list = nn.utils.rnn.pad_sequence( text_list, batch_first=True) 
    return padded_text_list, label_list, lengths

In [15]:
from torch.utils.data import DataLoader 
dataloader = DataLoader(train_dataset, batch_size=4, shuffle=False, collate_fn=collate_batch)

In [16]:
next(iter(dataloader))[0][0]


tensor([  165,     9,   793,     9,   166,     6,  3232,   125,    12,    11,
           15,    10,     4,     5,   405,     7,    67,    90,    87,     4,
           90,   899,    38,    33,   542,   702, 33216,    41,   974,  1095,
           32,  6250,     8,     7,  1038,   527,     4,   642,    10,     2,
          211,  1674,    19,     3,   175,     5,  1375,     8,     7,     3,
           67,    54,    15,    16,    12,     9,   195,     8,     2,   119,
          399,   203,   247,    75,  1925,   117,    62,    18,    46,    22,
           51,   311,     3,  2674,   244,  2675,     9,   166,   234,     6,
          194,    45,     2,  1363,  2732,  1166,     5,    11,    21,     4,
           83,  2014,     8,     7,     6,  1892,    80,     3,   175,     5,
         1375,    19,    43,     8,     7,   961,     9,  4284,    12,    11,
           21,   876,    10,     2,   737,  1472,   209,   112,    12,     2,
          278,   107,   866,  4285, 22091,     7,     3,   500, 

In [17]:
batch_size = 32
train_dl = DataLoader(train_dataset, batch_size=batch_size,  shuffle=True, collate_fn=collate_batch, pin_memory=False) 
valid_dl = DataLoader(valid_dataset, batch_size=batch_size,  shuffle=False, collate_fn=collate_batch, pin_memory=False)
test_dl = DataLoader(test_dataset, batch_size=batch_size,  shuffle=False, collate_fn=collate_batch, pin_memory=False)

In [18]:
embedding = nn.Embedding(  num_embeddings=10,  embedding_dim=3,  padding_idx=0)

In [19]:
class RNN(nn.Module): 
    def __init__(self, vocab_size, embed_dim, rnn_hidden_size,  fc_hidden_size): 
        super().__init__() 
        self.embedding = nn.Embedding(vocab_size,  embed_dim,  padding_idx=0) 
        self.rnn = nn.LSTM(embed_dim, rnn_hidden_size,  batch_first=True) 
        self.fc1 = nn.Linear(rnn_hidden_size, fc_hidden_size) 
        self.relu = nn.ReLU()  
        self.fc2 = nn.Linear(fc_hidden_size, 1) 
        self.sigmoid = nn.Sigmoid()  
    def forward(self, text, lengths): 
        out = self.embedding(text) 
        out = nn.utils.rnn.pack_padded_sequence(  out, lengths.cpu().numpy(), enforce_sorted=False, batch_first=True  ) 
        out, (hidden, cell) = self.rnn(out) 
        out = hidden[-1, :, :] 
        out = self.fc1(out) 
        out = self.relu(out) 
        out = self.fc2(out) 
        out = self.sigmoid(out) 
        return out

In [20]:
vocab_size = len(vocab) 
embed_dim = 20 
rnn_hidden_size = 64 
fc_hidden_size = 64 
torch.manual_seed(1) 
model = RNN(vocab_size, embed_dim, rnn_hidden_size, fc_hidden_size)

In [21]:
loss_fn = nn.BCELoss() 
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [23]:
def train(dataloader): 
    model.train() 
    total_acc, total_loss = 0, 0 
    for text_batch, label_batch, lengths in dataloader: 
        optimizer.zero_grad() 
        pred = model(text_batch, lengths)[:, 0] 
        loss = loss_fn(pred, label_batch) 
        loss.backward() 
        optimizer.step() 
        total_acc += (  (pred >= 0.5).float() == label_batch  ).float().sum().item() 
        total_loss += loss.item()*label_batch.size(0) 
    return total_acc/len(dataloader.dataset),  total_loss/len(dataloader.dataset)

In [25]:
def evaluate(dataloader): 
    model.eval() 
    total_acc, total_loss = 0, 0
    with torch.no_grad(): 
        for text_batch, label_batch, lengths in dataloader: 
            pred = model(text_batch, lengths)[:, 0] 
            loss = loss_fn(pred, label_batch) 
            total_acc += (  (pred>=0.5).float() == label_batch  ).float().sum().item() 
            total_loss += loss.item()*label_batch.size(0) 
    return total_acc/len(dataloader.dataset),  total_loss/len(dataloader.dataset)

In [26]:
num_epochs = 10 
torch.manual_seed(1) 
for epoch in range(num_epochs): 
    acc_train, loss_train = train(train_dl) 
    acc_valid, loss_valid = evaluate(valid_dl) 
    print(f'Epoch {epoch} accuracy:{acc_train:.4f}'  f' val_accuracy: {acc_valid:.4f}')

Epoch 0 accuracy:1.0000 val_accuracy: 1.0000
Epoch 1 accuracy:1.0000 val_accuracy: 1.0000
Epoch 2 accuracy:1.0000 val_accuracy: 1.0000


KeyboardInterrupt: 