<a href="https://colab.research.google.com/github/jaeohshin/ML_with_Pytorch_Sklearn_rasbt/blob/main/ch15_part2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from IPython.display import Image
%matplotlib inline

In [None]:
import torch
import torch.nn as nn

In [None]:
pip install torchtext

Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=2.3.0->torchtext)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=2.3.0->torchtext)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=2.3.0->torchtext)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=2.3.0->torchtext)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch>=2.3.0->torchtext)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch>=2.3.0->torchtext)
  Using cached nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl.metadata (1.5 

In [None]:
pip install portalocker

Collecting portalocker
  Downloading portalocker-2.10.1-py3-none-any.whl.metadata (8.5 kB)
Downloading portalocker-2.10.1-py3-none-any.whl (18 kB)
Installing collected packages: portalocker
Successfully installed portalocker-2.10.1


In [None]:
from torchtext.datasets import IMDB
from torch.utils.data.dataset import random_split



In [None]:
pip install torchdata

Collecting torchdata
  Downloading torchdata-0.7.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Downloading torchdata-0.7.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.7/4.7 MB[0m [31m50.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: torchdata
Successfully installed torchdata-0.7.1


In [None]:
# Step 1: load the IMDB splits and carve a validation set out of the training data

train_dataset = IMDB(split='train')
# The test split is materialised into a list right away.
test_dataset = list(IMDB(split='test'))

# Seed the global generator before splitting so the partition is reproducible.
torch.manual_seed(1)

# 20000 samples stay in the training subset, 5000 go to validation.
split_sizes = [20000, 5000]
train_dataset, valid_dataset = random_split(list(train_dataset), split_sizes)

In [None]:
## Step 2: find unique tokens (words)
import re
from collections import Counter, OrderedDict

# Running tally of how often each token occurs; filled by the loop further
# down in this cell and used afterwards to build the vocabulary.
token_counts = Counter()

def tokenizer(text):
    """Split a raw review into lowercase word tokens followed by emoticon tokens.

    Steps:
      1. Strip anything that looks like an HTML tag (``<...>``).
      2. Collect emoticons such as ``:)``, ``;-(``, ``=D`` or ``:P``.
      3. Lowercase the text and replace every run of non-word characters
         with a single space.
      4. Append the emoticons (with the ``-`` nose removed) after the words.

    Args:
        text: The raw review string.

    Returns:
        A list of string tokens; empty when ``text`` contains no word
        characters and no emoticons.
    """
    # Raw strings keep regex escapes such as `\)` and `\W` from being
    # interpreted (and warned about) as invalid Python string escapes.
    text = re.sub(r'<[^>]*>', '', text)
    # Search the original-case text: the pattern lists upper-case `D` and `P`,
    # which can never match once the text has been lowercased.
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    words = re.sub(r'[\W]+', ' ', text.lower())
    emoticon_text = ' '.join(emoticons).replace('-', '')
    # The explicit separator stops the first emoticon from being glued onto
    # the last word when the text ends in a word character.
    return (words + ' ' + emoticon_text).split()

# Tokenize every training review and accumulate the token frequencies;
# the label of each sample is not needed for this step.
for _label, review in train_dataset:
    token_counts.update(tokenizer(review))

# The number of distinct tokens seen in the training subset.
print('Vocab-size:', len(token_counts))

Vocab-size: 69023


In [None]:
## Step 3: encoding each unique token into integers
from torchtext.vocab import vocab

# Rank tokens from most to least frequent (ties keep their first-seen order).
sorted_by_freq_tuples = token_counts.most_common()
ordered_dict = OrderedDict(sorted_by_freq_tuples)

# Build the vocabulary object; note that this rebinds the name `vocab` from
# the imported factory function to the resulting vocabulary.
vocab = vocab(ordered_dict)

# Index 0 is used for padding and index 1 for unknown tokens; any token
# missing from the vocabulary is looked up as index 1.
for special_index, special_token in enumerate(("<pad>", "<unk>")):
    vocab.insert_token(special_token, special_index)
vocab.set_default_index(1)

# Demo lookups. The tokens are looked up verbatim (no tokenizer call), so the
# capitalised 'I' is not found and maps to the default index 1.
for sample_tokens in (['I', 'will', 'love', 'you'], ['I', 'cool', 'a', 'boy']):
    print([vocab[token] for token in sample_tokens])

[1, 77, 115, 23]
[1, 620, 4, 408]


In [None]:
# Tell the reader up front when PyTorch can see a CUDA device.
cuda_ready = torch.cuda.is_available()
if cuda_ready:
    print("Warning: This code will run on GPU")



In [None]:
import torchtext

In [None]:
## Step 3-A: define the functions for transformation

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


def text_pipeline(x):
    """Tokenize a raw review and map every token to its vocabulary index."""
    return [vocab[token] for token in tokenizer(x)]


from torchtext import __version__ as torchtext_version
from pkg_resources import parse_version

# The label encoding depends on the installed torchtext version. The alias
# imported just above is used for the check so this cell does not depend on a
# separate `import torchtext` cell having been run.
if parse_version(torchtext_version) > parse_version("0.10"):
    _POSITIVE_LABEL = 2      # 1 ~ negative review, 2 ~ positive review
else:
    _POSITIVE_LABEL = 'pos'


def label_pipeline(x):
    """Map a raw dataset label to 1.0 (positive review) or 0.0 (anything else)."""
    return 1. if x == _POSITIVE_LABEL else 0.

def collate_batch(batch):
    """Turn a list of (label, text) samples into padded tensors on `device`.

    Each text is encoded with `text_pipeline` (text --> token ids) and each
    label with `label_pipeline` (either 1. or 0.). Sequences are padded on the
    right with zeros up to the longest sequence in the batch.

    Returns:
        A tuple ``(padded_texts, labels, lengths)`` where ``padded_texts`` is
        an int64 tensor of shape (batch, max_len), ``labels`` holds one value
        per sample and ``lengths`` holds the unpadded length of each sequence.
    """
    samples = list(batch)
    encoded_texts = [
        torch.tensor(text_pipeline(raw_text), dtype=torch.int64)
        for _, raw_text in samples
    ]
    labels = torch.tensor([label_pipeline(raw_label) for raw_label, _ in samples])
    seq_lengths = torch.tensor([encoded.size(0) for encoded in encoded_texts])
    padded_texts = nn.utils.rnn.pad_sequence(encoded_texts, batch_first=True)
    return padded_texts.to(device), labels.to(device), seq_lengths.to(device)


In [None]:
## Take a small batch

from torch.utils.data import DataLoader
# Pull a single un-shuffled batch of five reviews to inspect what
# collate_batch produces.
dataloader = DataLoader(
    train_dataset,
    batch_size=5,
    shuffle=False,
    collate_fn=collate_batch,
)
text_batch, label_batch, length_batch = next(iter(dataloader))

# Padded token ids, labels, unpadded lengths, then the padded batch shape.
for shown in (text_batch, label_batch, length_batch, text_batch.shape):
    print(shown)

tensor([[   35,  1739,     7,  ...,     0,     0,     0],
        [  216,   175,   724,  ...,     0,     0,     0],
        [   10,   121,    24,  ...,     8,    13,   428],
        [18923,     7,     4,  ...,     0,     0,     0],
        [   10,   256,     2,  ...,     0,     0,     0]], device='cuda:0')
tensor([1., 1., 1., 0., 1.], device='cuda:0')
tensor([165,  86, 218, 145, 116], device='cuda:0')
torch.Size([5, 218])


In [None]:
## Step 4: batching the datasets

batch_size = 32

# All three loaders share the batch size and the padding collate function;
# only the training loader reshuffles its samples.
loader_kwargs = dict(batch_size=batch_size, collate_fn=collate_batch)

train_dl = DataLoader(train_dataset, shuffle=True, **loader_kwargs)
valid_dl = DataLoader(valid_dataset, shuffle=False, **loader_kwargs)
test_dl = DataLoader(test_dataset, shuffle=False, **loader_kwargs)

In [None]:
## Toy embedding layer: a lookup table with 10 rows (one per token id 0..9),
## each row being a 3-dimensional vector. Index 0 is declared as the padding
## index, matching the <pad> id used when batches are padded.
embedding = nn.Embedding(10, 3, padding_idx=0)

In [None]:
# Display the layer's repr to confirm how it was configured.
embedding

Embedding(10, 3, padding_idx=0)

In [None]:
# Look up four token ids; each id selects one row of the embedding table,
# so the result holds one 3-dimensional vector per id.
text_encoded_input = torch.tensor([1, 2, 4, 5], dtype=torch.long)
embedded_tokens = embedding(text_encoded_input)
print(embedded_tokens)

tensor([[ 0.7039, -0.8321, -0.4651],
        [-0.3203,  2.2408,  0.5566],
        [-0.4643,  0.3046,  0.7046],
        [-0.7106, -0.2959,  0.8356]], grad_fn=<EmbeddingBackward0>)


In [None]:
## RNN model (disabled warm-up sketch)

## A two-layer nn.RNN whose last layer's final hidden state feeds a single
## linear output unit. The whole sketch is wrapped in one string literal, so
## none of it is executed; the notebook only echoes the string back as the
## cell's output.

"""
class RNN(nn.Module):
    def __init__(self, input_size, hidden_size):
        super().__init__()
        self.rnn = nn.RNN(input_size,
                          hidden_size,
                          num_layers=2,
                          batch_first=True)
        self.fc = nn.Linear(hidden_size, 1)

    def forward(self, x):
        _, hidden = self.rnn(x)
        out = hidden[-1, :, :]
        out = self.fc(out)
        return out

model = RNN(64, 32)
print(model)

#print(torch.randn(5, 3, 64))
model(torch.randn(5, 1, 64))
"""

'\nclass RNN(nn.Module):\n    def __init__(self, input_size, hidden_size):\n        super().__init__()\n        self.rnn = nn.RNN(input_size,\n                          hidden_size,\n                          num_layers=2,\n                          batch_first=True)\n        self.fc = nn.Linear(hidden_size, 1)\n\n    def forward(self, x):\n        _, hidden = self.rnn(x)\n        out = hidden[-1, :, :]\n        out = self.fc(out)\n        return out\n\nmodel = RNN(64, 32)\nprint(model)\n\n#print(torch.randn(5, 3, 64))\nmodel(torch.randn(5, 1, 64))\n'

In [None]:
class RNN(nn.Module):
    """Sentiment classifier: embedding -> LSTM -> two linear layers -> sigmoid.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_dim: Size of each embedding vector (and the LSTM input size).
        rnn_hidden_size: Size of the LSTM hidden state.
        fc_hidden_size: Width of the hidden fully connected layer.
    """

    def __init__(self, vocab_size, embed_dim, rnn_hidden_size, fc_hidden_size):
        super().__init__()
        # Index 0 is the padding id produced when batches are padded.
        self.embedding = nn.Embedding(
            num_embeddings=vocab_size, embedding_dim=embed_dim, padding_idx=0)
        self.rnn = nn.LSTM(
            input_size=embed_dim, hidden_size=rnn_hidden_size, batch_first=True)
        self.fc1 = nn.Linear(in_features=rnn_hidden_size, out_features=fc_hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(in_features=fc_hidden_size, out_features=1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, text, lengths):
        """Return one sigmoid output per sample, shaped (batch, 1).

        Args:
            text: Padded batch of token ids, batch dimension first.
            lengths: Unpadded length of every sequence in `text`.
        """
        embedded = self.embedding(text)
        # Packing lets the LSTM skip the padded positions; the lengths are
        # moved to the CPU before being handed to the packing helper.
        packed = nn.utils.rnn.pack_padded_sequence(
            embedded, lengths.cpu().numpy(),
            batch_first=True, enforce_sorted=False)
        _, (hidden, _cell) = self.rnn(packed)
        # Final hidden state of the last LSTM layer.
        last_hidden = hidden[-1]
        activated = self.relu(self.fc1(last_hidden))
        return self.sigmoid(self.fc2(activated))

# Model hyperparameters; the embedding table needs one row per vocabulary entry.
vocab_size = len(vocab)
embed_dim = 20
rnn_hidden_size = 64
fc_hidden_size = 64

# Seed before construction so the initial weights are reproducible, then move
# the model to the selected device.
torch.manual_seed(1)
model = RNN(
    vocab_size=vocab_size,
    embed_dim=embed_dim,
    rnn_hidden_size=rnn_hidden_size,
    fc_hidden_size=fc_hidden_size,
).to(device)

In [None]:
def train(dataloader):
    """Run one training epoch and return ``(accuracy, mean_loss)``.

    Uses the module-level `model`, `optimizer` and `loss_fn`. Both metrics are
    averaged over the samples actually yielded by `dataloader`, so they stay
    correct when the loader does not cover its whole dataset (for example
    with ``drop_last=True``) or when the dataset has no ``__len__``.

    Args:
        dataloader: Iterable of ``(text_batch, label_batch, lengths)`` batches.

    Returns:
        Tuple of floats: fraction of correct predictions and per-sample loss.

    Raises:
        ZeroDivisionError: If `dataloader` yields no samples.
    """
    model.train()
    correct, loss_sum, num_samples = 0.0, 0.0, 0
    for text_batch, label_batch, lengths in dataloader:
        optimizer.zero_grad()
        # The model returns shape (batch, 1); drop the trailing dimension.
        pred = model(text_batch, lengths)[:, 0]
        loss = loss_fn(pred, label_batch)
        loss.backward()
        optimizer.step()

        samples_in_batch = label_batch.size(0)
        # pred >= 0.5 -> positive review, pred < 0.5 -> negative review
        correct += ((pred >= 0.5).float() == label_batch).float().sum().item()
        # Weight the batch loss by the batch size so a smaller final batch
        # does not skew the epoch average.
        loss_sum += loss.item() * samples_in_batch
        num_samples += samples_in_batch
    return correct / num_samples, loss_sum / num_samples

def evaluate(dataloader):
    """Evaluate the model without gradient tracking; return ``(accuracy, mean_loss)``.

    Uses the module-level `model` and `loss_fn`. Both metrics are averaged
    over the samples actually yielded by `dataloader`; the dataset is no
    longer copied into a list just to obtain its size.

    Args:
        dataloader: Iterable of ``(text_batch, label_batch, lengths)`` batches.

    Returns:
        Tuple of floats: fraction of correct predictions and per-sample loss.

    Raises:
        ZeroDivisionError: If `dataloader` yields no samples.
    """
    model.eval()
    correct, loss_sum, num_samples = 0.0, 0.0, 0
    with torch.no_grad():
        for text_batch, label_batch, lengths in dataloader:
            # The model returns shape (batch, 1); drop the trailing dimension.
            pred = model(text_batch, lengths)[:, 0]
            loss = loss_fn(pred, label_batch)

            samples_in_batch = label_batch.size(0)
            # pred >= 0.5 -> positive review, pred < 0.5 -> negative review
            correct += ((pred >= 0.5).float() == label_batch).float().sum().item()
            loss_sum += loss.item() * samples_in_batch
            num_samples += samples_in_batch
    return correct / num_samples, loss_sum / num_samples



In [None]:
num_epochs = 10

# Binary cross-entropy on the model's sigmoid outputs, optimised with Adam.
loss_fn = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# Seed once before training (the training loader shuffles its samples).
torch.manual_seed(1)

for epoch in range(num_epochs):
    train_metrics = train(train_dl)
    valid_metrics = evaluate(valid_dl)
    acc_train, loss_train = train_metrics
    acc_valid, loss_valid = valid_metrics
    # Report the epoch index with training and validation accuracy.
    report = f'에포크 {epoch} 정확도: {acc_train:.4f} 검증 정확도: {acc_valid:.4f}'
    print(report)

에포크 0 정확도: 0.6096 검증 정확도: 0.6852
에포크 1 정확도: 0.7257 검증 정확도: 0.7452
에포크 2 정확도: 0.7466 검증 정확도: 0.6284
에포크 3 정확도: 0.7253 검증 정확도: 0.5366
에포크 4 정확도: 0.7972 검증 정확도: 0.7492
에포크 5 정확도: 0.8619 검증 정확도: 0.7784
에포크 6 정확도: 0.8911 검증 정확도: 0.8040
에포크 7 정확도: 0.9162 검증 정확도: 0.8574
에포크 8 정확도: 0.9328 검증 정확도: 0.8598
에포크 9 정확도: 0.9504 검증 정확도: 0.8634


In [None]:
# Final check on the held-out test split; only the accuracy is reported.
test_metrics = evaluate(test_dl)
acc_test, _ = test_metrics
test_report = f'테스트 정확도: {acc_test:.4f}'
print(test_report)

테스트 정확도: 0.8571
