In [2]:
!pip install -U torchtext==0.6.0

Collecting torchtext==0.6.0
  Downloading torchtext-0.6.0-py3-none-any.whl (64 kB)
[K     |████████████████████████████████| 64 kB 2.6 MB/s 
[?25hCollecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 16.4 MB/s 
Installing collected packages: sentencepiece, torchtext
  Attempting uninstall: torchtext
    Found existing installation: torchtext 0.12.0
    Uninstalling torchtext-0.12.0:
      Successfully uninstalled torchtext-0.12.0
Successfully installed sentencepiece-0.1.96 torchtext-0.6.0


In [3]:
!python -m spacy download en

Collecting en_core_web_sm==2.2.5
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.5/en_core_web_sm-2.2.5.tar.gz (12.0 MB)
[K     |████████████████████████████████| 12.0 MB 8.9 MB/s 
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
[38;5;2m✔ Linking successful[0m
/usr/local/lib/python3.7/dist-packages/en_core_web_sm -->
/usr/local/lib/python3.7/dist-packages/spacy/data/en
You can now load the model via spacy.load('en')


In [4]:
import torch
from torchtext import data


TEXT = data.Field(tokenize = 'spacy',
                  tokenizer_language = 'en',
                  include_lengths = True)

LABEL = data.LabelField(dtype = torch.float) # postive = 1 / negative = 0

In [5]:
### Load and Split data

from torchtext import datasets

train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)

downloading aclImdb_v1.tar.gz


aclImdb_v1.tar.gz: 100%|██████████| 84.1M/84.1M [00:01<00:00, 69.5MB/s]


In [6]:
import random

SEED = 1234

torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

train_data, valid_data = train_data.split(random_state = random.seed(SEED))

In [7]:
print(f'training examples : {len(train_data)}')
print(f'validations examples : {len(train_data)}')
print(f'test examples : {len(test_data)}')

training examples : 17500
validations examples : 17500
test examples : 25000


In [8]:
### Build Vocabulary

### Use Pretrained Word Embedding -> Glove.6B.100d(embedding vector 100 dimension)

MAX_VOCAB_SIZE = 25000

TEXT.build_vocab(train_data,
                 max_size = MAX_VOCAB_SIZE,
                 vectors = 'glove.6B.100d',
                 unk_init = torch.Tensor.normal_) # unk_init => initialization pre-trained embeddings
LABEL.build_vocab(train_data)

.vector_cache/glove.6B.zip: 862MB [02:39, 5.40MB/s]                           
100%|█████████▉| 399999/400000 [00:13<00:00, 28959.23it/s]


In [9]:
print(f'Unique tokens in TEXT vocabulary : {len(TEXT.vocab)}')

Unique tokens in TEXT vocabulary : 25002


In [10]:
import torch

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [11]:
BATCH_SIZE = 64

train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size = BATCH_SIZE,
    sort_within_batch = True,
    device = device
)

### **LSTM (Long short-Term Memory)**
- 기존 RNN의 문제점인 vanishing gradient를 해결
- LSTM은 이전 step의 정보를 얼마나 forget할지 곱해주고 그 결과를 현재의 정보의 더해서 다음 step으로 전달


### **Bidirectional RNN**

- 과거의 hidden state를 저장하는 기존 RNN과 달리 미래의 hidden state 또한 고려하도록 확장된 모델  

In [46]:
import torch.nn as nn

class RNN(nn.Module):
  def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim, n_layers,
               bidirectional, dropout, pad_idx):
    super().__init__()

    self.embedding = nn.Embedding(input_dim, embedding_dim, padding_idx = pad_idx)

    self.rnn = nn.LSTM(embedding_dim,
                       hidden_dim,
                       num_layers = n_layers,
                       bidirectional = bidirectional,
                       dropout = dropout)
    
    self.bidirectional = bidirectional

    if bidirectional:
      self.fc = nn.Linear(hidden_dim * 2, output_dim) ## bidirectional 
    else:
      self.fc = nn.Linear(hidden_dim, output_dim)
    
    self.dropout = nn.Dropout(dropout)

  def forward(self, text, text_length):

    # input_text = [sentence_length, batch_size]

    embedded = self.dropout(self.embedding(text))
    # embedded = [sentence_length, batch_size, embedding_dim]

    # pack sequence
    packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_length.to('cpu'))
    
    # output of LSTM are hidden_state and cell state(memory cell)
    packed_output, (hidden, cell) = self.rnn(packed_embedded)

    # unpack sequence -> assigning elements into variables
    
    output, output_length = nn.utils.rnn.pad_packed_sequence(packed_output)

    # output = [sentence_length, batch_size, hidden_dim * 2]
    # hidden = [num_layers * 2, batch_size, hidden_dim] -> short-term memory
    # cell = [num_layers *2, batch_size, hidden_dim] -> long-term memory

    # left final layer of hidden state

    if self.bidirectional:
      hidden = self.dropout(torch.cat((hidden[-2,:,:], hidden[-1, :, :]), dim = 1))
    else:
      hidden = self.dropout(hidden[-1, :, :])
    
    # hidden = [batch_size, hidden_dim * 2]

    return self.fc(hidden)

In [47]:
INPUT_DIM = len(TEXT.vocab) 
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
OUTPUT_DIM = 1
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.5
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token] # = 1 (<pad> token의 index)

model = RNN(INPUT_DIM, 
            EMBEDDING_DIM, 
            HIDDEN_DIM, 
            OUTPUT_DIM, 
            N_LAYERS, 
            BIDIRECTIONAL, 
            DROPOUT, 
            PAD_IDX)

In [48]:
pretrained_embeddings = TEXT.vocab.vectors

print(pretrained_embeddings.shape)

torch.Size([25002, 100])


In [49]:
### initial weight -> UNK_TOKEN, PADDING TOKEN

UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token] 

model.embedding.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_DIM)
model.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)

In [50]:
### Training

import torch.optim as optim

optimizer =optim.Adam(model.parameters())
criterion = nn.BCEWithLogitsLoss()

# GPU
model = model.to(device)
criterion = criterion.to(device)

def binary_accuracy(preds, y):

  rounded_preds = torch.round(torch.sigmoid(preds))
  # rounded_preds : [batch size]
  # y : batch.label
  correct = (rounded_preds == y).float()
  acc = correct.sum() / len(correct)
  return acc


def train(model, iterator, optimizer, criterion):

  epoch_loss = 0
  epoch_acc = 0

  model.train()

  for batch in iterator:

    optimizer.zero_grad()

    text, text_length = batch.text

    # [batch_size, 1] -> [batch_size]
    predictions = model(text, text_length).squeeze(1)

    # output / label -> loss
    loss = criterion(predictions, batch.label)

    acc = binary_accuracy(predictions, batch.label)

    # backpropagation
    loss.backward() 

    # parameter update 
    optimizer.step()

    epoch_loss += loss.item()
    epoch_acc += acc.item()

  return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [55]:
### evaluate

def evaluate(model, iterator, criterion):
  epoch_loss = 0
  epoch_acc = 0

  model.eval()

  with torch.no_grad():
    
    for batch in iterator:
      text, text_length = batch.text

      predictions = model(text, text_length).squeeze(1)

      loss = criterion(predictions, batch.label)
      acc = binary_accuracy(predictions, batch.label)

      epoch_loss += loss.item()
      epoch_acc += acc.item()

  return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [56]:
import time

def epoch_time(start_time, end_time):
  elapsed_time = end_time - start_time
  elapsed_mins = int(elapsed_time / 60)
  elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
  return elapsed_mins, elapsed_secs

N_EPOCHS = 5

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

  start_time = time.time()
  
  train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
  valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)

  end_time = time.time()

  epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    

  if valid_loss < best_valid_loss:
    best_valid_loss = valid_loss
    
  print(f'Epoch: {epoch + 1:02} | Epoch Time : {epoch_mins}m {epoch_secs}s')
  print(f'\tTrain Loss : {train_loss:.3f} | Train Acc : {train_acc * 100:.2f}%')
  print(f'\tValid Loss : {valid_loss:.3f} | Valid Acc : {valid_acc * 100:.2f}%')

Epoch: 01 | Epoch Time : 0m 34s
	Train Loss : 0.569 | Train Acc : 70.68%
	Valid Loss : 0.710 | Valid Acc : 67.01%
Epoch: 02 | Epoch Time : 0m 34s
	Train Loss : 0.591 | Train Acc : 68.27%
	Valid Loss : 0.627 | Valid Acc : 63.14%
Epoch: 03 | Epoch Time : 0m 33s
	Train Loss : 0.609 | Train Acc : 66.29%
	Valid Loss : 0.599 | Valid Acc : 66.42%
Epoch: 04 | Epoch Time : 0m 34s
	Train Loss : 0.549 | Train Acc : 72.64%
	Valid Loss : 0.603 | Valid Acc : 61.63%
Epoch: 05 | Epoch Time : 0m 34s
	Train Loss : 0.490 | Train Acc : 77.15%
	Valid Loss : 0.816 | Valid Acc : 60.09%


In [57]:
test_loss, test_acc = evaluate(model, test_iterator, criterion)

print(f'Test Loss : {test_loss:3f} | Test Acc : {test_acc*100:.2f}%')

Test Loss : 0.816957 | Test Acc : 60.02%


In [30]:
### Test

import spacy
nlp = spacy.load('en')

def predict_sentiment(model, sentence):
  model.eval()
  tokenized = [tok.text for tok in nlp.tokenizer(sentence)]
  indexed = [TEXT.vocab.stoi[t] for t in tokenized]
  length = [len(indexed)]
  tensor = torch.LongTensor(length)
  length_tensor = torch.LongTensor(length)
  prediction = torch.sigmoid(model(tensor, length_tensor))
  return prediction.item()

def predict_test_sentiment(model, tokenized):
  model.eval()
  indexed = [TEXT.vocab.stoi[t] for t in tokenized]
  length = [len(indexed)]
  tensor = torch.unsqueeze(1)
  length_tensor = torch.LongTensor(length)
  prediction = torch.sigmoid(model(tensor, length_tensor))
  return prediction.item()

In [59]:
### Multi-layer

INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
OUTPUT_DIM = 1
N_LAYERS = 3
BIDIRECTIONAL = True
DROPOUT = 0.2
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token] # = 1 (<pad> token의 index)
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]

model = RNN(INPUT_DIM, 
            EMBEDDING_DIM, 
            HIDDEN_DIM, 
            OUTPUT_DIM, 
            N_LAYERS, 
            BIDIRECTIONAL, 
            DROPOUT, 
            PAD_IDX)


pretrained_embeddings = TEXT.vocab.vectors
model.embedding.weight.data.copy_(pretrained_embeddings)
 
model.embedding.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_DIM)
model.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)

optimizer =optim.Adam(model.parameters())
criterion = nn.BCEWithLogitsLoss()
# GPU
model = model.to(device)
criterion = criterion.to(device)

N_EPOCHS = 5

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    start_time = time.time()
    
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut2-model5.pt')

    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

test_loss, test_acc = evaluate(model, test_iterator, criterion)

print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

Epoch: 01 | Epoch Time: 0m 53s
	Train Loss: 0.653 | Train Acc: 62.11%
	 Val. Loss: 0.622 |  Val. Acc: 67.58%
Epoch: 02 | Epoch Time: 0m 53s
	Train Loss: 0.569 | Train Acc: 71.70%
	 Val. Loss: 0.496 |  Val. Acc: 76.92%
Epoch: 03 | Epoch Time: 0m 53s
	Train Loss: 0.402 | Train Acc: 82.32%
	 Val. Loss: 0.365 |  Val. Acc: 85.15%
Epoch: 04 | Epoch Time: 0m 53s
	Train Loss: 0.292 | Train Acc: 88.17%
	 Val. Loss: 0.315 |  Val. Acc: 87.05%
Epoch: 05 | Epoch Time: 0m 53s
	Train Loss: 0.231 | Train Acc: 91.18%
	 Val. Loss: 0.303 |  Val. Acc: 88.12%
Test Loss: 0.317 | Test Acc: 87.55%
