In [None]:
!pip install datasets



In [None]:
from datasets import load_dataset
# Load the IMDB dataset by name through `datasets.load_dataset`.
imdb = load_dataset("imdb")

Reusing dataset imdb (/root/.cache/huggingface/datasets/imdb/plain_text/1.0.0/90099cb476936b753383ba2ae6ab2eae419b2e87f71cd5189cb9c8e5814d12a3)


In [None]:
# Show the available splits together with their features and row counts.
print(imdb)

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})


In [None]:
# Pick the two splits used in this notebook out of the loaded dataset.
train_data = imdb['train']
test_data = imdb['test']

In [None]:
# Inspect one raw training example (a dict holding 'label' and 'text').
print(train_data[100])

{'label': 1, 'text': "I was prepared for a turgid talky soap opera cum travelogue, but was pleased to find a fast-paced script, an underlying moral, excellent portrayals from all the actors, especially Peter Finch, amazing special effects, suspense, and beautiful cinematography--there's even a shot of the majestic stone Buddhas recently destroyed by the Taliban. Not to mention Elizabeth Taylor at her most gloriously beautiful and sympathetic, before she gave in to the gaspy hysterics that marred her later work. All the supporting players round it out, and I do wonder who trained all those elephants.<br /><br />Speaking of the stone-Buddha sequence, you really can discern that it's Vivien Leigh in the long shots. Her shape and the way she moves is distinct from Taylor's. The only thing marring that sequence are the poorly done process shots, where the background moves by much too fast for horses at a walk.<br /><br />If you want a thought-provoking film that is beautiful to watch and ne

In [None]:
# Hyperparameters shared by the tokenizer and the model defined further down.
# Vocabulary cap: passed as `num_words` to the tokenizer and as the embedding table size.
vocab_size = 10000
# Width of each token embedding vector.
embedding_dim = 64
# Every review is truncated or padded to exactly this many token ids.
max_length = 140

Chúng ta sẽ xây dựng một mô hình phân loại cảm xúc (Sentiment Analysis) ở mức đơn giản nhất có thể.

Các bước cần thực hiện là:
- 1. Tiền xử lý:
    - Tách từ
    - Tạo bộ vocab
    - Chuyển câu thành list các token index
    - Padding để các câu bằng nhau
    - Chuyển các câu (list các token index) sang `torch.LongTensor`

- 2. Xây dựng model: Các model trong pytorch sẽ extend từ `torch.nn.Module`. Model của chúng ta sẽ thực sự basic (and stupid ofc). Model baseline sẽ gồm:
    - 1. Layer Embedding, mapping token sang vector
    - 2. Flatten các vector: 1 câu gồm N tokens, có embedding D, sẽ flatten thành vector (NxD,)
    - 3. Stack các tầng Dense
    - 4. Kết thúc là 1 tầng Dense output 1, có activation sigmoid :D

- 3. Huấn luyện:
    - Forward từng batch input, có prediction probabilities
    - Tính loss
    - Backward và update

## 1. Tiền xử lý

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

In [None]:
import re

def WHITE_SPACE_TOKENIZER(string):
    """Split ``string`` on runs of whitespace.

    Leading/trailing whitespace produces empty strings at the edges of the
    returned list; callers are expected to filter those out.
    """
    return re.compile(r"\s+").split(string)

In [None]:
from collections import Counter
from itertools import chain
from tqdm.notebook import tqdm
from typing import List


class SimpleTokenizer:
    """Word-level tokenizer with a two-way mapping between tokens and integer ids.

    The special tokens ``<pad>`` and ``<unk>`` are registered first, so they
    receive ids 0 and 1. ``tokenizer`` is any callable that splits a string
    into a list of tokens; empty tokens it returns are ignored.
    """
    def __init__(self, tokenizer=WHITE_SPACE_TOKENIZER):
        self.token2id = {}
        self.id2token = {}
        self.pad = self._add_token("<pad>")
        self.unk = self._add_token("<unk>")
        # Ids that `decode` silently drops from its output.
        self.dismissed_tokens = [self.pad]
        self.tokenizer = tokenizer

    def _add_token(self, token: str) -> int:
        """Register ``token`` if it is new and return its id.

        An already-known token returns its existing id (the mapping is left
        untouched), so the return value is always a valid index.
        """
        idx = self.token2id.get(token)
        if idx is None:
            idx = len(self.id2token)
            self.id2token[idx] = token
            self.token2id[token] = idx
        return idx

    def fit_on_texts(self, corpus: List[str], num_words: int = 16000, freq_cutoff: int = 1):
        """Build the vocabulary from ``corpus``.

        Args:
            corpus: iterable of raw text lines.
            num_words: maximum vocabulary size, special tokens included.
            freq_cutoff: minimum number of occurrences for a word to be kept.

        Words are added from most to least frequent (ties keep first-seen
        order) until the vocabulary holds ``num_words`` entries.
        """
        # Count line by line instead of keeping every tokenized line in memory.
        word_freq = Counter()
        for line in tqdm(corpus):
            word_freq.update(token for token in self.tokenizer(line) if token)

        valid_words = [w for w, v in word_freq.items() if v >= freq_cutoff]
        print(
            "number of word types: {}, number of word types w/ frequency >= {}: {}".format(
                len(word_freq), freq_cutoff, len(valid_words)
            )
        )
        ranked_words = sorted(valid_words, key=lambda w: word_freq[w], reverse=True)
        for word in ranked_words:
            # Stop on the actual vocabulary size rather than a hard-coded
            # "minus two special tokens" slice: this stays correct for tiny
            # `num_words` values and when fitting more than once.
            if len(self.token2id) >= num_words:
                break
            self._add_token(word)

    def encode(self, string):
        """Tokenize ``string`` and map each non-empty token to its id (``<unk>`` id if unknown)."""
        return [self.token2id.get(token, self.unk) for token in self.tokenizer(string) if token]

    def decode(self, list_indices: List[int]):
        """Turn a list of ids back into a space-joined string.

        Ids listed in ``dismissed_tokens`` (padding) are skipped; ids missing
        from the vocabulary are rendered as empty strings.
        """
        return " ".join(
            [
                self.id2token.get(idx, "")
                for idx in list_indices
                if idx not in self.dismissed_tokens  # hide dismissed tokens
            ]
        )

    def pad_sequence(self, seq: List[int], maxlen=256):
        """Truncate ``seq`` to ``maxlen`` ids, then right-pad it with the pad id up to ``maxlen``."""
        strip_seq = seq[:maxlen]
        num_pad_indices = maxlen - len(strip_seq)
        return strip_seq + [self.pad] * num_pad_indices

    def texts_to_sequences(self, lines: List[str], maxlen=256):
        """Encode every line and pad/truncate each result to ``maxlen`` ids."""
        return [self.pad_sequence(self.encode(line), maxlen) for line in tqdm(lines)]

In [None]:
# Build the vocabulary from the training texts, capped at `vocab_size` entries.
tokenizer = SimpleTokenizer(tokenizer=WHITE_SPACE_TOKENIZER)
tokenizer.fit_on_texts(corpus=train_data['text'], num_words=vocab_size)

HBox(children=(FloatProgress(value=0.0, max=25000.0), HTML(value='')))


number of word types: 280617, number of word types w/ frequency >= 1: 280617


In [None]:
# Encode every training review as a list of exactly `max_length` token ids
# (longer reviews are truncated, shorter ones are right-padded).
train_sequences = tokenizer.texts_to_sequences(train_data['text'], maxlen=max_length)

HBox(children=(FloatProgress(value=0.0, max=25000.0), HTML(value='')))




In [None]:
# Round-trip check: tokens outside the vocabulary come back as <unk>.
tokenizer.decode(tokenizer.encode(train_data[0]['text']))

"<unk> High is a cartoon comedy. It ran at the same time as some other programs about school life, such as <unk> My 35 years in the teaching profession lead me to believe that <unk> <unk> satire is much closer to reality than is <unk> The <unk> to survive <unk> the insightful students who can see right through their pathetic <unk> <unk> the <unk> of the whole situation, all remind me of the schools I knew and their students. When I saw the episode in which a student repeatedly tried to burn down the school, I immediately <unk> <unk> at <unk> <unk> A classic line: <unk> I'm here to <unk> one of your <unk> <unk> <unk> to <unk> <unk> I expect that many adults of my age think that <unk> High is far <unk> What a pity that it <unk>"

In [None]:
# Compare the fixed-length id sequence of the first review with its decoded text.
print(train_sequences[0])
print(tokenizer.decode(train_sequences[0]))

[1, 2717, 7, 3, 1398, 1350, 56, 2215, 31, 2, 169, 83, 15, 46, 80, 9006, 42, 518, 776, 138, 15, 1, 359, 7182, 190, 8, 2, 5768, 8850, 493, 84, 6, 263, 10, 1, 1, 2618, 7, 78, 2641, 6, 897, 70, 7, 1, 19, 1, 6, 2701, 1, 2, 7910, 1890, 35, 64, 67, 257, 148, 55, 1703, 1, 1, 2, 1, 5, 2, 213, 5388, 37, 2973, 84, 5, 2, 8534, 9, 635, 4, 55, 9007, 283, 9, 207, 2, 482, 8, 61, 3, 1831, 4548, 762, 6, 4381, 215, 2, 3138, 9, 1369, 1, 1, 31, 1, 1, 133, 441, 7520, 1, 160, 225, 6, 1, 32, 5, 117, 1, 1, 1, 6, 1, 1, 9, 543, 10, 102, 2041, 5, 66, 877, 98, 10, 1, 2717, 7, 237, 1, 255, 3, 3139, 10, 12, 1]
<unk> High is a cartoon comedy. It ran at the same time as some other programs about school life, such as <unk> My 35 years in the teaching profession lead me to believe that <unk> <unk> satire is much closer to reality than is <unk> The <unk> to survive <unk> the insightful students who can see right through their pathetic <unk> <unk> the <unk> of the whole situation, all remind me of the schools I knew and t

In [None]:
# Convert the nested Python lists of token ids into one int64 tensor.
# `torch.as_tensor` replaces the legacy `torch.LongTensor` constructor and
# returns the input unchanged when it is already an int64 tensor, so this
# cell can be re-run safely.
train_sequences = torch.as_tensor(train_sequences, dtype=torch.long)
print(train_sequences.shape)

torch.Size([25000, 140])


In [None]:
# Encode the test reviews with the vocabulary fitted on the training set,
# then convert them to an int64 tensor (explicit dtype instead of the legacy
# `torch.LongTensor` constructor).
test_sequences = tokenizer.texts_to_sequences(test_data['text'], maxlen=max_length)
test_sequences = torch.as_tensor(test_sequences, dtype=torch.long)
print(test_sequences.shape)
# `decode` looks ids up in a dict keyed by Python ints, hence `.tolist()`.
print(tokenizer.decode(test_sequences[0].tolist()))

HBox(children=(FloatProgress(value=0.0, max=25000.0), HTML(value='')))


torch.Size([25000, 140])
I went and saw this movie last night after being <unk> to by a few friends of <unk> I'll admit that I was reluctant to see it because from what I knew of <unk> <unk> he was only able to do comedy. I was wrong. <unk> played the character of Jake <unk> very well, and Kevin <unk> played Ben <unk> with such <unk> The sign of a good movie is that it can toy with our emotions. This one did exactly that. The entire theater (which was sold <unk> was overcome by laughter during the first half of the movie, and were moved to tears during the second half. While <unk> the theater I not only saw many women in <unk> but many full grown men as well, trying desperately not to let anyone see them <unk> This movie was


In [None]:
# Labels as float32 column vectors of shape (N, 1), matching the model output
# that BCELoss compares them against. The dtype is spelled out explicitly
# instead of relying on the legacy `torch.Tensor(...)` constructor.
train_labels = torch.as_tensor(train_data['label'], dtype=torch.float32).view(-1, 1)
test_labels = torch.as_tensor(test_data['label'], dtype=torch.float32).view(-1, 1)

## 2. Xây dựng model, hàm loss và optimizer

Model của chúng ta sẽ rấttttttt đơn giản >.< 

In [None]:
# Baseline classifier: embed each token id, flatten the whole sequence into
# one vector, then apply a small MLP ending in a sigmoid.
model = nn.Sequential(
    # Token id -> `embedding_dim` vector; the pad id is registered as padding_idx.
    nn.Embedding(vocab_size, embedding_dim=embedding_dim, padding_idx=tokenizer.pad),
    # (batch, max_length, embedding_dim) -> (batch, max_length * embedding_dim)
    nn.Flatten(),
    nn.Linear(embedding_dim*max_length, 10),
    nn.ReLU(),
    nn.Linear(10, 1),
    # Squash the single output into (0, 1) so it can be fed to BCELoss.
    nn.Sigmoid()
)

In [None]:
# Print the architecture and check that a batch of 3 sequences yields one
# output value per sequence.
print(model)
assert model(train_sequences[:3]).shape == (3, 1), "Có gì đó sai sai"

Sequential(
  (0): Embedding(10000, 64, padding_idx=0)
  (1): Flatten(start_dim=1, end_dim=-1)
  (2): Linear(in_features=8960, out_features=10, bias=True)
  (3): ReLU()
  (4): Linear(in_features=10, out_features=1, bias=True)
  (5): Sigmoid()
)


In [None]:
# Binary cross-entropy on probabilities (the model already ends with a sigmoid).
criterion = nn.BCELoss()
# Adam with its default hyperparameters.
optimizer = optim.Adam(model.parameters())
N_EPOCHS = 20
BATCH_SIZE = 128

# Each dataset is a materialized list of (sequence, label) pairs; only the
# training loader shuffles its samples.
train_dataloader = DataLoader(list(zip(train_sequences, train_labels)), batch_size=BATCH_SIZE, shuffle=True)
test_dataloader = DataLoader(list(zip(test_sequences, test_labels)), batch_size=BATCH_SIZE)

In [None]:
# Peek at the first batch only: print its shape, then the label and decoded
# text of its first 10 samples.
for batch_X, batch_Y in train_dataloader:
    print(batch_X.shape)
    for ids, lab in zip(batch_X[:10], batch_Y[:10]):
        print(lab, tokenizer.decode(ids.tolist()))
    break

torch.Size([128, 140])
tensor([0.]) <unk> again from the <unk> What ever happened to the great Barry <unk> He directed two of my all-time favorites in <unk> and <unk> He had some fine movies as well <unk> but always provided something of interest ... until now. I believe the worst thing you can ever say about a comedy is that it is boring. <unk> is the definition of boring. Never of big fan of pure slap stick <unk> and <unk> I was just stunned at how <unk> this movie is. There are maybe 2 <unk> in the whole thing - if you can pay attention that long. The best part of the film is the running gag of the title song by a <unk> <unk> If the film had been written as well as the song, it would have been <unk> Rachel <unk> is a
tensor([0.]) Although in some aspects Seven <unk> is solid and interesting in some of its narrative style, <unk> <unk> project is rather mediocre. The movie becomes more and more sappy and manipulative as it move toward the <unk> hearts human and <unk> eyes physical and

In [None]:
for epoch in range(N_EPOCHS):
    # Explicitly (re-)enter training mode so this cell still behaves correctly
    # when it is re-run after the model has been switched to eval().
    model.train()
    epoch_losses = []
    for inputs, labels in tqdm(train_dataloader, desc=f'EPOCH {epoch:02d}'):
        optimizer.zero_grad()            # clear gradients left from the previous step
        prediction = model(inputs)       # forward pass -> probabilities
        loss = criterion(prediction, labels)
        loss.backward()                  # backpropagate
        optimizer.step()                 # update the parameters
        epoch_losses.append(loss.item())
    # Mean of the per-batch training losses for this epoch.
    print(sum(epoch_losses) / len(epoch_losses))
print("Finished training")

HBox(children=(FloatProgress(value=0.0, description='EPOCH 00', max=196.0, style=ProgressStyle(description_wid…


0.6904374485721394


HBox(children=(FloatProgress(value=0.0, description='EPOCH 01', max=196.0, style=ProgressStyle(description_wid…


0.5723713782368874


HBox(children=(FloatProgress(value=0.0, description='EPOCH 02', max=196.0, style=ProgressStyle(description_wid…


0.36304976912785547


HBox(children=(FloatProgress(value=0.0, description='EPOCH 03', max=196.0, style=ProgressStyle(description_wid…


0.18574763910502803


HBox(children=(FloatProgress(value=0.0, description='EPOCH 04', max=196.0, style=ProgressStyle(description_wid…


0.08974502032280576


HBox(children=(FloatProgress(value=0.0, description='EPOCH 05', max=196.0, style=ProgressStyle(description_wid…


0.049015900747356365


HBox(children=(FloatProgress(value=0.0, description='EPOCH 06', max=196.0, style=ProgressStyle(description_wid…


0.03152686369377283


HBox(children=(FloatProgress(value=0.0, description='EPOCH 07', max=196.0, style=ProgressStyle(description_wid…


0.024478453339780777


HBox(children=(FloatProgress(value=0.0, description='EPOCH 08', max=196.0, style=ProgressStyle(description_wid…


0.020203364331142178


HBox(children=(FloatProgress(value=0.0, description='EPOCH 09', max=196.0, style=ProgressStyle(description_wid…


0.01762387785010458


HBox(children=(FloatProgress(value=0.0, description='EPOCH 10', max=196.0, style=ProgressStyle(description_wid…


0.016011005304920087


HBox(children=(FloatProgress(value=0.0, description='EPOCH 11', max=196.0, style=ProgressStyle(description_wid…


0.014050147132662942


HBox(children=(FloatProgress(value=0.0, description='EPOCH 12', max=196.0, style=ProgressStyle(description_wid…


0.013197235774061148


HBox(children=(FloatProgress(value=0.0, description='EPOCH 13', max=196.0, style=ProgressStyle(description_wid…


0.012493864271423913


HBox(children=(FloatProgress(value=0.0, description='EPOCH 14', max=196.0, style=ProgressStyle(description_wid…


0.011534719830093791


HBox(children=(FloatProgress(value=0.0, description='EPOCH 15', max=196.0, style=ProgressStyle(description_wid…


0.010994643163160487


HBox(children=(FloatProgress(value=0.0, description='EPOCH 16', max=196.0, style=ProgressStyle(description_wid…


0.010554827777793565


HBox(children=(FloatProgress(value=0.0, description='EPOCH 17', max=196.0, style=ProgressStyle(description_wid…


0.010041357181985311


HBox(children=(FloatProgress(value=0.0, description='EPOCH 18', max=196.0, style=ProgressStyle(description_wid…


0.009738462535802237


HBox(children=(FloatProgress(value=0.0, description='EPOCH 19', max=196.0, style=ProgressStyle(description_wid…


0.009268549476288097
Finished training


In [None]:
test_sen = ["This movie is bad"]

# Same preprocessing as for the training data: encode, pad/truncate, int64 tensor.
test_seq = tokenizer.texts_to_sequences(test_sen, maxlen=max_length)
test_seq = torch.as_tensor(test_seq, dtype=torch.long)
print(test_seq.shape)

# Inference only: switch to eval mode and disable autograd so no computation
# graph is built for the prediction.
model.eval()
with torch.no_grad():
    print(model(test_seq))


HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))


torch.Size([1, 140])
tensor([[0.0004]], grad_fn=<SigmoidBackward>)
