In [1]:
!pip install datasets

Collecting datasets
  Downloading datasets-1.11.0-py3-none-any.whl (264 kB)
[?25l[K     |█▎                              | 10 kB 30.8 MB/s eta 0:00:01[K     |██▌                             | 20 kB 26.3 MB/s eta 0:00:01[K     |███▊                            | 30 kB 17.1 MB/s eta 0:00:01[K     |█████                           | 40 kB 14.8 MB/s eta 0:00:01[K     |██████▏                         | 51 kB 9.9 MB/s eta 0:00:01[K     |███████▍                        | 61 kB 11.2 MB/s eta 0:00:01[K     |████████▋                       | 71 kB 10.2 MB/s eta 0:00:01[K     |██████████                      | 81 kB 11.3 MB/s eta 0:00:01[K     |███████████▏                    | 92 kB 8.9 MB/s eta 0:00:01[K     |████████████▍                   | 102 kB 9.1 MB/s eta 0:00:01[K     |█████████████▋                  | 112 kB 9.1 MB/s eta 0:00:01[K     |██████████████▉                 | 122 kB 9.1 MB/s eta 0:00:01[K     |████████████████                | 133 kB 9.1 MB/s eta 0:00

In [2]:
import functools
import sys

import torch
import torch.nn as nn
import torch.optim as optim
import torchtext
import tqdm

import datasets
import numpy as np

In [3]:
# Load the IMDB 'train' and 'test' splits (25,000 examples each, per the map progress bars further down).
tr_data, te_data = datasets.load_dataset('imdb', split=['train', 'test'])

Downloading:   0%|          | 0.00/1.92k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.05k [00:00<?, ?B/s]

Downloading and preparing dataset imdb/plain_text (download: 80.23 MiB, generated: 127.02 MiB, post-processed: Unknown size, total: 207.25 MiB) to /root/.cache/huggingface/datasets/imdb/plain_text/1.0.0/e3c66f1788a67a89c7058d97ff62b6c30531e05b549de56d3ab91891f0561f9a...


Downloading:   0%|          | 0.00/84.1M [00:00<?, ?B/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

Dataset imdb downloaded and prepared to /root/.cache/huggingface/datasets/imdb/plain_text/1.0.0/e3c66f1788a67a89c7058d97ff62b6c30531e05b549de56d3ab91891f0561f9a. Subsequent calls will reuse this data.


In [4]:
# torchtext's built-in 'basic_english' tokenizer; the same object is reused for dataset mapping and for ad-hoc prediction.
tokenizer = torchtext.data.utils.get_tokenizer('basic_english')

def tokenize_data(example, tokenizer, max_length):
    """Tokenize one example's text, keeping at most ``max_length`` tokens.

    Args:
        example: Mapping with a 'text' entry holding the raw string.
        tokenizer: Callable that turns a string into a sequence of tokens.
        max_length: Slice bound applied to the token sequence.

    Returns:
        dict with the single key 'tokens' holding the truncated sequence.
    """
    all_tokens = tokenizer(example['text'])
    truncated = all_tokens[:max_length]
    return dict(tokens=truncated)

# Add a 'tokens' column to both splits; each review is cut to its first 256 tokens.
tr_data = tr_data.map(tokenize_data, fn_kwargs={'tokenizer': tokenizer, 'max_length': 256})
te_data = te_data.map(tokenize_data, fn_kwargs={'tokenizer': tokenizer, 'max_length': 256})

  0%|          | 0/25000 [00:00<?, ?ex/s]

  0%|          | 0/25000 [00:00<?, ?ex/s]

In [5]:
# Fraction of the original training split held out for validation.
test_size = 0.3
# Fixed seed so the train/validation partition -- and therefore the vocabulary
# built from it -- is the same on every Restart & Run All.
seed = 1234

# NOTE: tr_data is rebound to the reduced training portion below, so re-running
# this cell without first re-running the load/tokenize cells splits the
# already-reduced set a second time.
train_valid_data = tr_data.train_test_split(test_size=test_size, seed=seed)
tr_data = train_valid_data['train']
valid_data = train_valid_data['test']

# Tokens occurring fewer than min_freq times in the training portion are left
# out of the vocabulary; the two specials are inserted first.
min_freq = 7
special_tokens = ['<unk>', '<pad>']

# The vocabulary is built from the training portion only (not validation/test).
vocab = torchtext.vocab.build_vocab_from_iterator(tr_data['tokens'],
                                                  min_freq=min_freq,
                                                  specials=special_tokens)

In [6]:
# Integer ids of the two special tokens registered when the vocabulary was built.
unk_index = vocab['<unk>']
pad_index = vocab['<pad>']

In [7]:
# Make lookups of tokens missing from the vocabulary return the '<unk>' id (torchtext default-index mechanism).
vocab.set_default_index(unk_index)

In [8]:
def numericalize_data(example, vocab):
    """Convert an example's tokens to vocabulary ids.

    Args:
        example: Mapping with a 'tokens' entry (iterable of tokens).
        vocab: Object supporting ``vocab[token]`` lookup.

    Returns:
        dict with the single key 'ids': one looked-up value per token, in order.
    """
    token_ids = []
    for token in example['tokens']:
        token_ids.append(vocab[token])
    return dict(ids=token_ids)

In [9]:
# Add an 'ids' column to all three splits using the vocabulary built from the training portion.
tr_data = tr_data.map(numericalize_data, fn_kwargs={'vocab': vocab})
valid_data = valid_data.map(numericalize_data, fn_kwargs={'vocab': vocab})
te_data = te_data.map(numericalize_data, fn_kwargs={'vocab': vocab})

  0%|          | 0/17500 [00:00<?, ?ex/s]

  0%|          | 0/7500 [00:00<?, ?ex/s]

  0%|          | 0/25000 [00:00<?, ?ex/s]

In [10]:
# Have the datasets hand back only 'ids' and 'label', as torch tensors, when indexed (what collate consumes).
tr_data = tr_data.with_format(type='torch', columns=['ids', 'label'])
valid_data = valid_data.with_format(type='torch', columns=['ids', 'label'])
te_data = te_data.with_format(type='torch', columns=['ids', 'label'])

In [11]:
class CNN(nn.Module):
    """Multi-width 1-D convolutional text classifier.

    Token ids are embedded, run through one ``Conv1d`` per entry of
    ``filter_sizes``, max-pooled over the sequence axis, concatenated and
    projected to ``output_dim`` logits. A single dropout module is applied
    both to the embeddings and to the concatenated pooled features.
    """

    def __init__(self, vocab_size, embedding_dim, n_filters, filter_sizes, output_dim, dropout_rate,
                 pad_index):
        """Create the layers.

        Args:
            vocab_size: Number of rows in the embedding table.
            embedding_dim: Width of each embedding vector (conv input channels).
            n_filters: Output channels of every convolution.
            filter_sizes: Iterable of kernel sizes; one convolution is built per entry.
            output_dim: Number of output logits.
            dropout_rate: Probability passed to ``nn.Dropout``.
            pad_index: Index passed to the embedding as ``padding_idx``.
        """
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=vocab_size,
                                      embedding_dim=embedding_dim,
                                      padding_idx=pad_index)
        conv_layers = []
        for kernel_width in filter_sizes:
            conv_layers.append(nn.Conv1d(in_channels=embedding_dim,
                                         out_channels=n_filters,
                                         kernel_size=kernel_width))
        self.convs = nn.ModuleList(conv_layers)
        # One pooled vector of n_filters values per kernel size feeds the classifier.
        self.fc = nn.Linear(n_filters * len(filter_sizes), output_dim)
        self.dropout = nn.Dropout(p=dropout_rate)

    def forward(self, ids):
        """Compute logits for a batch of token-id sequences.

        Args:
            ids: Integer tensor of token ids, batch first. Each sequence must
                be at least as long as the largest kernel size.

        Returns:
            Tensor of unnormalised logits, one row per sequence.
        """
        token_vectors = self.dropout(self.embedding(ids))
        # Conv1d wants channels before length: (batch, seq, emb) -> (batch, emb, seq).
        channels_first = token_vectors.permute(0, 2, 1)
        pooled_features = []
        for conv in self.convs:
            activated = torch.relu(conv(channels_first))
            # Max over the sequence axis keeps the strongest response per filter.
            pooled_features.append(activated.max(dim=-1).values)
        merged = torch.cat(pooled_features, dim=-1)
        return self.fc(self.dropout(merged))

In [12]:
# Model hyperparameters.
vocab_size = len(vocab)
# 300 matches the dimensionality of the FastText vectors loaded into the embedding later on.
embedding_dim = 300
n_filters = 120
# One convolution per kernel width; the largest width is also the minimum usable sequence length.
filter_sizes = [3,5,7,9]
# Number of distinct label values in the training split (2 according to the printed model's final layer).
output_dim = len(tr_data.unique('label'))
dropout_rate = 0.3

model = CNN(vocab_size, embedding_dim, n_filters, filter_sizes, output_dim, dropout_rate, pad_index)

In [13]:
def initializeWeights(m):
    """Initialise one module in place; meant to be passed to ``nn.Module.apply``.

    ``nn.Linear`` weights get Xavier-normal initialisation and ``nn.Conv1d``
    weights get Kaiming-normal initialisation for a ReLU nonlinearity. Biases
    of both layer types are zeroed when present. Any other module type is
    left untouched.

    Args:
        m: The module to initialise.
    """
    if isinstance(m, nn.Linear):
        nn.init.xavier_normal_(m.weight)
    elif isinstance(m, nn.Conv1d):
        nn.init.kaiming_normal_(m.weight, nonlinearity='relu')
    else:
        return
    # Layers built with bias=False expose `bias` as None; zeros_ would raise on it.
    if m.bias is not None:
        nn.init.zeros_(m.bias)

# apply() calls initializeWeights on every submodule; only Linear and Conv1d layers are re-initialised, the Embedding is left as constructed.
model.apply(initializeWeights)

CNN(
  (embedding): Embedding(16932, 300, padding_idx=1)
  (convs): ModuleList(
    (0): Conv1d(300, 120, kernel_size=(3,), stride=(1,))
    (1): Conv1d(300, 120, kernel_size=(5,), stride=(1,))
    (2): Conv1d(300, 120, kernel_size=(7,), stride=(1,))
    (3): Conv1d(300, 120, kernel_size=(9,), stride=(1,))
  )
  (fc): Linear(in_features=480, out_features=2, bias=True)
  (dropout): Dropout(p=0.3, inplace=False)
)

In [14]:
# Load pretrained FastText vectors; on first use this downloads wiki.en.vec (about 6.6 GB per the log below).
vectors = torchtext.vocab.FastText()
# One vector per vocabulary entry, ordered like vocab.get_itos().
pretrained_embedding = vectors.get_vecs_by_tokens(vocab.get_itos())
# Replace the embedding table wholesale. This also overwrites the '<unk>' and '<pad>' rows with
# whatever FastText returns for those strings (presumably its unknown-token default -- verify).
model.embedding.weight.data = pretrained_embedding

.vector_cache/wiki.en.vec: 6.60GB [02:58, 36.9MB/s]                            
  0%|          | 0/2519370 [00:00<?, ?it/s]Skipping token b'2519370' with 1-dimensional vector [b'300']; likely a header
100%|██████████| 2519370/2519370 [04:19<00:00, 9690.40it/s]


In [15]:
# Adam with default hyperparameters over every model parameter (the embedding table included, so it is fine-tuned).
optimizer = optim.Adam(model.parameters())
criterion = nn.CrossEntropyLoss()
# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)
criterion = criterion.to(device)

In [16]:
def collate(batch, pad_index):
    """Assemble a list of examples into one padded batch.

    Args:
        batch: Sequence of mappings, each with an 'ids' tensor (1-D, variable
            length) and a 'label' tensor.
        pad_index: Value used to right-pad shorter 'ids' sequences.

    Returns:
        dict with 'ids' (sequences padded to the longest one, batch first)
        and 'label' (the labels stacked along a new leading axis).
    """
    id_sequences, labels = [], []
    for example in batch:
        id_sequences.append(example['ids'])
        labels.append(example['label'])
    padded_ids = nn.utils.rnn.pad_sequence(id_sequences, batch_first=True, padding_value=pad_index)
    return {'ids': padded_ids, 'label': torch.stack(labels)}

In [17]:
batch_size = 512
# NOTE: this rebinds the name `collate` from the two-argument function to a partial with
# pad_index already bound; from here on `collate(batch)` is the one-argument form.
collate = functools.partial(collate, pad_index=pad_index)
# Only the training loader shuffles; validation and test keep dataset order.
tr_dataloader = torch.utils.data.DataLoader(tr_data, batch_size=batch_size, collate_fn=collate, shuffle=True)
valid_dataloader = torch.utils.data.DataLoader(valid_data, batch_size=batch_size, collate_fn=collate)
te_dataloader = torch.utils.data.DataLoader(te_data, batch_size=batch_size, collate_fn=collate)

In [19]:
def train_cnn(dataloader, model, criterion, optimizer, device):
    """Run one optimisation pass over ``dataloader``.

    Args:
        dataloader: Iterable of batches, each a mapping with 'ids' and 'label' tensors.
        model: Module called as ``model(ids)`` to produce predictions.
        criterion: Loss called as ``criterion(prediction, label)``.
        optimizer: Optimizer stepped once per batch.
        device: Device the batch tensors are moved to.

    Returns:
        (losses, accuracies): two lists of Python floats, one entry per batch.
    """
    model.train()
    batch_losses, batch_accs = [], []

    progress = tqdm.tqdm(dataloader, desc='training the model...', file=sys.stdout)
    for batch in progress:
        ids, label = batch['ids'].to(device), batch['label'].to(device)

        # Clear gradients left over from the previous step before the new backward pass.
        optimizer.zero_grad()
        prediction = model(ids)
        loss = criterion(prediction, label)
        accuracy = get_accuracy(prediction, label)
        loss.backward()
        optimizer.step()

        batch_losses.append(loss.item())
        batch_accs.append(accuracy.item())

    return batch_losses, batch_accs

In [23]:
def evaluate_cnn(dataloader, model, criterion, device):
    """Score ``model`` on every batch of ``dataloader`` without updating it.

    The model is switched to eval mode and gradients are disabled for the
    whole pass.

    Args:
        dataloader: Iterable of batches, each a mapping with 'ids' and 'label' tensors.
        model: Module called as ``model(ids)`` to produce predictions.
        criterion: Loss called as ``criterion(prediction, label)``.
        device: Device the batch tensors are moved to.

    Returns:
        (losses, accuracies): two lists of Python floats, one entry per batch.
    """
    model.eval()
    batch_losses, batch_accs = [], []

    progress = tqdm.tqdm(dataloader, desc='evaluating the model...', file=sys.stdout)
    with torch.no_grad():
        for batch in progress:
            ids, label = batch['ids'].to(device), batch['label'].to(device)
            prediction = model(ids)
            batch_losses.append(criterion(prediction, label).item())
            batch_accs.append(get_accuracy(prediction, label).item())

    return batch_losses, batch_accs

In [21]:
def get_accuracy(prediction, label):
    """Fraction of rows whose highest-scoring class equals the label.

    Args:
        prediction: 2-D tensor of scores, one row per example.
        label: Tensor of class indices compared element-wise with the argmax.

    Returns:
        0-dim tensor: number of matches divided by the number of rows.
    """
    n_examples, _n_classes = prediction.shape
    hits = (prediction.argmax(dim=-1) == label).sum()
    return hits / n_examples

In [22]:
n_epochs = 5
# Lowest mean validation loss seen so far; decides when the checkpoint is overwritten.
best_valid_loss = float('inf')

# Per-batch histories accumulated across all epochs (one entry per batch, not per epoch).
train_losses = []
train_accs = []
valid_losses = []
valid_accs = []

for epoch in range(n_epochs):

    train_loss, train_acc = train_cnn(tr_dataloader, model, criterion, optimizer, device)
    valid_loss, valid_acc = evaluate_cnn(valid_dataloader, model, criterion, device)

    train_losses.extend(train_loss)
    train_accs.extend(train_acc)
    valid_losses.extend(valid_loss)
    valid_accs.extend(valid_acc)
    
    # NOTE(review): these are unweighted means over batches, so a smaller final batch
    # counts as much as a full one; the epoch figures are close to, but not exactly,
    # per-example averages.
    epoch_train_loss = np.mean(train_loss)
    epoch_train_acc = np.mean(train_acc)
    epoch_valid_loss = np.mean(valid_loss)
    epoch_valid_acc = np.mean(valid_acc)
    
    # Keep only the weights of the epoch with the best validation loss.
    if epoch_valid_loss < best_valid_loss:
        best_valid_loss = epoch_valid_loss
        torch.save(model.state_dict(), 'cnn_model.pt')
    
    print(f'epoch: {epoch+1}')
    print(f'train_loss: {epoch_train_loss:.3f}, train_acc: {epoch_train_acc:.3f}')
    print(f'valid_loss: {epoch_valid_loss:.3f}, valid_acc: {epoch_valid_acc:.3f}')

training the model...: 100%|██████████| 35/35 [00:11<00:00,  3.11it/s]
evaluating...: 100%|██████████| 15/15 [00:02<00:00,  6.34it/s]
epoch: 1
train_loss: 0.829, train_acc: 0.579
valid_loss: 0.551, valid_acc: 0.720
training the model...: 100%|██████████| 35/35 [00:09<00:00,  3.54it/s]
evaluating...: 100%|██████████| 15/15 [00:02<00:00,  6.62it/s]
epoch: 2
train_loss: 0.523, train_acc: 0.736
valid_loss: 0.403, valid_acc: 0.824
training the model...: 100%|██████████| 35/35 [00:10<00:00,  3.40it/s]
evaluating...: 100%|██████████| 15/15 [00:02<00:00,  6.50it/s]
epoch: 3
train_loss: 0.385, train_acc: 0.831
valid_loss: 0.341, valid_acc: 0.852
training the model...: 100%|██████████| 35/35 [00:10<00:00,  3.43it/s]
evaluating...: 100%|██████████| 15/15 [00:02<00:00,  6.81it/s]
epoch: 4
train_loss: 0.312, train_acc: 0.867
valid_loss: 0.321, valid_acc: 0.865
training the model...: 100%|██████████| 35/35 [00:09<00:00,  3.51it/s]
evaluating...: 100%|██████████| 15/15 [00:02<00:00,  6.95it/s]
epoch:

In [24]:
# Restore the checkpoint with the lowest validation loss. map_location remaps
# the saved tensors onto the current device, so a checkpoint written on a GPU
# runtime still loads when this cell runs on a CPU-only one.
model.load_state_dict(torch.load('cnn_model.pt', map_location=device))
test_loss, test_acc = evaluate_cnn(te_dataloader, model, criterion, device)
# Unweighted means over the per-batch values returned by evaluate_cnn.
epoch_test_loss = np.mean(test_loss)
epoch_test_acc = np.mean(test_acc)

print(f'test_loss: {epoch_test_loss:.3f}, test_acc: {epoch_test_acc:.3f}')

evaluating the model...: 100%|██████████| 49/49 [00:08<00:00,  6.12it/s]
test_loss: 0.310, test_acc: 0.867


In [25]:
def predSentiment(text, model, tokenizer, vocab, device, min_length, pad_index):
    """Classify one raw text string with ``model``.

    Inference runs in eval mode with gradients disabled, so dropout cannot
    randomise the result; the model's previous train/eval mode is restored
    afterwards.

    Args:
        text: Raw input string.
        model: ``nn.Module`` called as ``model(ids)`` with a (1, length) id tensor.
        tokenizer: Callable turning ``text`` into a sequence of tokens.
        vocab: Object supporting ``vocab[token]`` lookup to an integer id.
        device: Device the id tensor is moved to.
        min_length: Minimum sequence length; shorter inputs are right-padded.
        pad_index: Id used for padding.

    Returns:
        (predicted_class, probability): the argmax class index as an int and
        the softmax probability assigned to that class as a float.
    """
    ids = [vocab[token] for token in tokenizer(text)]
    # Right-pad short inputs so the sequence reaches the required minimum length.
    shortfall = min_length - len(ids)
    if shortfall > 0:
        ids.extend([pad_index] * shortfall)
    tensor = torch.LongTensor(ids).unsqueeze(dim=0).to(device)

    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            prediction = model(tensor).squeeze(dim=0)
    finally:
        # Leave the model in whatever mode the caller had it in.
        model.train(was_training)

    probability = torch.softmax(prediction, dim=-1)
    predicted_class = prediction.argmax(dim=-1).item()
    return predicted_class, probability[predicted_class].item()

In [26]:
# Inputs are padded up to the widest convolution kernel so that every Conv1d has at least one full window.
min_length = max(filter_sizes)

# Probe with a clearly negative adjective.
predSentiment("This film is terrible!", model, tokenizer, vocab, device, min_length, pad_index)

(0, 0.9363068342208862)

In [27]:
# Negation probe: the recorded output below is class 1, the same class the plain "great" sentence gets, so the negation is not picked up.
predSentiment("This film is not great!", model, tokenizer, vocab, device, min_length, pad_index)

(1, 0.8856527805328369)

In [28]:
# Probe with a clearly positive adjective.
predSentiment("This film is great!", model, tokenizer, vocab, device, min_length, pad_index)

(1, 0.9504768252372742)

In [29]:
# Mixed probe containing both "terrible" and "not that great".
predSentiment("This film is terrible, it's not that great!", model, tokenizer, vocab, device, min_length, pad_index)

(0, 0.8816438913345337)