In [1]:
!pip install torchtext==0.4
!pip install transformers

Collecting torchtext==0.4
  Using cached torchtext-0.4.0-py3-none-any.whl (53 kB)
Installing collected packages: torchtext
Successfully installed torchtext-0.4.0
Collecting transformers
  Using cached transformers-3.0.2-py3-none-any.whl (769 kB)
Processing /home/jovyan/.cache/pip/wheels/69/09/d1/bf058f7d6fa0ecba2ce7c66be3b8d012beb4bf61a6e0c101c0/sacremoses-0.0.43-py3-none-any.whl
Collecting sentencepiece!=0.1.92
  Using cached sentencepiece-0.1.91-cp37-cp37m-manylinux1_x86_64.whl (1.1 MB)
Collecting filelock
  Using cached filelock-3.0.12-py3-none-any.whl (7.6 kB)
Collecting tokenizers==0.8.1.rc1
  Using cached tokenizers-0.8.1rc1-cp37-cp37m-manylinux1_x86_64.whl (3.0 MB)
Collecting regex!=2019.12.17
  Using cached regex-2020.7.14-cp37-cp37m-manylinux2010_x86_64.whl (660 kB)
Installing collected packages: regex, sacremoses, sentencepiece, filelock, tokenizers, transformers
Successfully installed filelock-3.0.12 regex-2020.7.14 sacremoses-0.0.43 sentencepiece-0.1.91 tokenizers-0.8.1rc1 

## Preparing Data

    1. random state: random / numpy random / torch random
    2. make tokenizer: WordPiece(subword) 단위로 토큰화 / 최대 길이 조절
    3. read data: torchtext.Field 활용 / vocab 만들기 (여기서는 tokenizer에 vocab attribute가 이미 존재)
    4. batch iterator: batch size를 설정하고 iterator 만들기

In [2]:
import torch
import random
import numpy as np

SEED = 1234

# Python, NumPy and PyTorch each keep their own random state, so every one of
# them is seeded with the same value.
for seed_rng in (random.seed, np.random.seed, torch.manual_seed):
    seed_rng(SEED)

# Request deterministic cuDNN behaviour.
torch.backends.cudnn.deterministic = True

In [3]:
from transformers import BertTokenizer

# Load the tokenizer published with the 'bert-base-uncased' checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [35]:
# Check the size of the tokenizer's vocabulary
print(len(tokenizer.vocab))

30522


In [4]:
# Tokenize a mixed-case sentence; the printed tokens come back lower-cased.
tokens = tokenizer.tokenize('Hello WoRLD, how ARE yoU?')
print(tokens)

['hello', 'world', ',', 'how', 'are', 'you', '?']


In [5]:
# Map each token string to its integer id in the tokenizer's vocabulary.
indexes = tokenizer.convert_tokens_to_ids(tokens)
print(indexes)

[7592, 2088, 1010, 2129, 2024, 2017, 1029]


In [6]:
# Special-token strings exposed by the tokenizer: sequence start, sequence
# end/separator, padding and unknown.
init_token, eos_token, pad_token, unk_token = (
    tokenizer.cls_token,
    tokenizer.sep_token,
    tokenizer.pad_token,
    tokenizer.unk_token,
)

print(init_token, eos_token, pad_token, unk_token)

[CLS] [SEP] [PAD] [UNK]


In [7]:
# Look up the vocabulary ids of the four special tokens in one call; passing a
# list of tokens returns the matching list of ids.
special_token_ids = tokenizer.convert_tokens_to_ids(
    [init_token, eos_token, pad_token, unk_token])
init_token_idx, eos_token_idx, pad_token_idx, unk_token_idx = special_token_ids

print(init_token_idx, eos_token_idx, pad_token_idx, unk_token_idx)

101 102 0 100


In [8]:
# Maximum input size registered for the 'bert-base-uncased' checkpoint.
max_input_length = tokenizer.max_model_input_sizes['bert-base-uncased'] # author's open question: is the BERT model's max length needed here rather than the max length of my own data?
print(max_input_length)

512


In [9]:
def tokenize_and_cut(sentence):
    """Tokenize ``sentence`` and keep at most ``max_input_length - 2`` tokens.

    Uses the module-level ``tokenizer`` and ``max_input_length``.
    """
    # Author's open question: must the last two positions always be cut off?
    # NOTE(review): presumably the two spare positions leave room for the
    # init/eos ids that the TEXT field is configured with -- confirm.
    token_budget = max_input_length - 2
    return tokenizer.tokenize(sentence)[:token_budget]

In [10]:
from torchtext import data

# Text field: tokenization (tokenize_and_cut) and token->id conversion
# (tokenizer.convert_tokens_to_ids) are both delegated to the BERT tokenizer,
# so torchtext is told not to build a vocabulary of its own (use_vocab=False)
# and the special tokens are given as integer ids rather than strings.
TEXT = data.Field(batch_first=True,
                  use_vocab=False,
                  tokenize=tokenize_and_cut,
                  preprocessing=tokenizer.convert_tokens_to_ids,
                  init_token=init_token_idx,
                  eos_token=eos_token_idx,
                  pad_token=pad_token_idx,
                  unk_token=unk_token_idx)

# NOTE(review): float labels presumably match what BCEWithLogitsLoss (the
# criterion used later in this notebook) expects as targets -- confirm.
LABEL = data.LabelField(dtype=torch.float) # author's open question: why is LABEL's dtype float?

In [11]:
from torchtext import datasets

# Load IMDB with its predefined train/test splits.
train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)

# `random.seed()` returns None, so `random_state=random.seed(SEED)` passed None
# and the split was reproducible only through the seeding side effect.
# Seed explicitly, then hand `split` the RNG state it is documented to take
# (a value returned by `random.getstate()`). The resulting split is unchanged.
random.seed(SEED)
train_data, valid_data = train_data.split(random_state=random.getstate())

In [12]:
# Report how many examples each split holds.
for split_name, split_data in (('training', train_data),
                               ('validation', valid_data),
                               ('test', test_data)):
    print('number of {} samples: {}'.format(split_name, len(split_data)))

number of training samples: 17500
number of validation samples: 7500
number of test samples: 25000


In [13]:
# Build the label vocabulary from the training split only, then show the
# string -> index mapping.
LABEL.build_vocab(train_data)
print(LABEL.vocab.stoi)

defaultdict(None, {'neg': 0, 'pos': 1})


In [14]:
# Label counts collected when the vocabulary was built from the training split.
LABEL.vocab.freqs

Counter({'neg': 8810, 'pos': 8690})

In [15]:
# Build one batch iterator per split.
BATCH_SIZE = 128

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

all_splits = (train_data, valid_data, test_data)
train_iter, valid_iter, test_iter = data.BucketIterator.splits(
    all_splits,
    batch_size=BATCH_SIZE,
    device=device,
)

## Build Model
    
    1. Model class 구성
        - Layer 구성: Bert Layer(Embedding) > bidirectional GRU 2개 > Dropout > Fully Connected
        - forward method 정의 (backward는 autograd가 자동으로 처리하므로 직접 구현하지 않음)
    2. Dimension:
        - hidden dim = 256
        - output dim = 1 (logit 하나; sigmoid를 거치면 0~1 사이의 긍/부정 수치)
    3. optimizer, criterion 설정: Adam(model.parameters()), BCEWithLogitsLoss
    4. to GPU: model, criterion을 device로 이동. optimizer는 왜 안 보내도 되는가? → model의 parameter를 참조할 뿐이므로 따로 옮길 필요 없음

In [16]:
from transformers import BertTokenizer, BertModel

# Load the pretrained 'bert-base-uncased' model; it is handed to
# BERTGRUSentiment below as the embedding layer.
bert = BertModel.from_pretrained('bert-base-uncased')

In [17]:
import torch.nn as nn

class BERTGRUSentiment(nn.Module):
    """Sentiment classifier: BERT embeddings -> (bi)GRU -> dropout -> linear.

    Args:
        bert: module that produces the token embeddings. It must expose
            ``config.hidden_size`` and, when called on a batch of token ids,
            return a sequence whose first element has shape
            [batch size, sent len, emb dim].
        hidden_dim: size of the GRU hidden state (per direction).
        output_dim: number of units produced by the final linear layer.
        n_layers: number of stacked GRU layers.
        bidirectional: whether the GRU reads the sequence in both directions.
        dropout: dropout probability, applied between GRU layers (only when
            ``n_layers >= 2``) and to the final hidden state.
    """

    def __init__(self, bert, hidden_dim, output_dim, n_layers, bidirectional, dropout):
        super().__init__()
        self.bert = bert
        # Width of the vectors BERT emits per token; this is the GRU's input
        # size. Read straight from the config instead of dumping it to a dict.
        embedding_dim = bert.config.hidden_size
        self.rnn = nn.GRU(embedding_dim,
                          hidden_dim,
                          num_layers=n_layers,
                          bidirectional=bidirectional,
                          batch_first=True,
                          # inter-layer dropout is only requested when there is
                          # more than one layer
                          dropout=0 if n_layers < 2 else dropout)
        # A bidirectional GRU contributes one final state per direction.
        self.out = nn.Linear(hidden_dim * 2 if bidirectional else hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text):
        """Return the raw (pre-sigmoid) scores for a batch of token ids.

        Args:
            text: tensor of token ids, shape [batch size, sent len].

        Returns:
            Tensor of shape [batch size, out dim].
        """
        # BERT is used as a fixed feature extractor: no gradients are tracked
        # through it.
        with torch.no_grad():
            embedded = self.bert(text)[0]

        # embedded = [batch size, sent len, emb dim]
        _, hidden = self.rnn(embedded)

        # hidden = [n_layers * n_directions, batch size, hid dim]
        if self.rnn.bidirectional:
            # The last two entries are the top layer's forward and backward
            # final states.
            hidden = self.dropout(torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1))
        else:
            hidden = self.dropout(hidden[-1, :, :])

        # hidden = [batch size, hid dim * n_directions]
        output = self.out(hidden)

        # output = [batch size, out dim]
        return output

In [18]:
# Hyperparameters of the GRU head that sits on top of BERT.
HIDDEN_DIM = 256
OUTPUT_DIM = 1
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.25

model = BERTGRUSentiment(
    bert=bert,
    hidden_dim=HIDDEN_DIM,
    output_dim=OUTPUT_DIM,
    n_layers=N_LAYERS,
    bidirectional=BIDIRECTIONAL,
    dropout=DROPOUT,
)

In [19]:
def count_parameters(model):
    """Return the total element count of ``model``'s parameters that have ``requires_grad`` set."""
    trainable_total = 0
    for param in model.parameters():
        if param.requires_grad:
            trainable_total += param.numel()
    return trainable_total

# Trainable-parameter count while the BERT weights still require gradients.
print('Model has {:,} trainable parameters'.format(count_parameters(model)))

Model has 112,241,409 trainable parameters


In [20]:
# Freeze every parameter whose name starts with 'bert' so the optimizer only
# updates the remaining layers.
bert_weights = [weight for weight_name, weight in model.named_parameters()
                if weight_name.startswith('bert')]
for param in bert_weights:
    param.requires_grad = False

In [21]:
# Trainable-parameter count after the 'bert*' parameters were frozen.
print('Model has {:,} trainable parameters'.format(count_parameters(model)))

Model has 2,759,169 trainable parameters


In [22]:
# List the names of the parameters that will still receive gradients.
trainable_names = [name for name, param in model.named_parameters()
                   if param.requires_grad]
for name in trainable_names:
    print(name)

rnn.weight_ih_l0
rnn.weight_hh_l0
rnn.bias_ih_l0
rnn.bias_hh_l0
rnn.weight_ih_l0_reverse
rnn.weight_hh_l0_reverse
rnn.bias_ih_l0_reverse
rnn.bias_hh_l0_reverse
rnn.weight_ih_l1
rnn.weight_hh_l1
rnn.bias_ih_l1
rnn.bias_hh_l1
rnn.weight_ih_l1_reverse
rnn.weight_hh_l1_reverse
rnn.bias_ih_l1_reverse
rnn.bias_hh_l1_reverse
out.weight
out.bias


In [23]:
import torch.optim as optim

# Adam with its default hyperparameters over all model parameters.
# NOTE(review): the frozen BERT weights are included here too; they should
# never receive a gradient, so presumably Adam leaves them untouched -- confirm.
optimizer = optim.Adam(model.parameters())

In [24]:
# Binary cross-entropy computed directly on the model's raw logits.
criterion = nn.BCEWithLogitsLoss()

In [25]:
# Move the model and the loss module to the selected device.
model, criterion = model.to(device), criterion.to(device)

## Train

    1. define accuracy, epoch_time function
    2. define train, evaluate function
        - train(model, iterator, optimizer, criterion)
            --아래가 하나의 epoch--
            1)model.train()을 열어주고 (실제로 어떤 역할? → dropout 등을 학습 모드로 전환)
            2)batch 하나씩 넣어서
            3)optimizer.zero_grad()
            4)prediction 후 loss와 acc 계산
            5)backward()와 optimizer.step()
            6)loss와 acc를 더해주고
            7)전 학습이 완료되면 loss와 acc를 미니배치 개수로 나눠주면 각 배치 당 평균 loss, acc 구할 수 있음
        - evaluate(model, iterator, criterion): do not optimize parameters
    3. train iteration
        - epoch 수 만큼 train loop
        - 시간 계산
        - 최저 valid loss를 찾아서 그 때의 state_dict()를 '.pt' 파일에 저장
        - 각 epoch 소요 시간과 train, valid accuracy와 loss를 출력

In [26]:
def binary_accuracy(preds, y):
    """Return the fraction of correct predictions in a batch (8 of 10 right -> 0.8).

    ``preds`` are raw logits; they go through a sigmoid and are rounded before
    being compared element-wise with the labels ``y``.
    """
    # sigmoid then round: 1 if greater than 0.5, otherwise 0
    hard_preds = torch.sigmoid(preds).round()
    hits = torch.eq(hard_preds, y).float()
    return hits.sum() / len(hits)

In [27]:
def train(model, iterator, optimizer, criterion):
    """Run one training epoch over ``iterator``.

    Returns:
        ``(loss, accuracy)``, each the sum of the per-batch values divided by
        the number of batches.
    """
    model.train()
    loss_total = 0
    acc_total = 0

    for batch in iterator:
        # Clear gradients left over from the previous batch.
        optimizer.zero_grad()

        logits = model(batch.text).squeeze(1)
        batch_loss = criterion(logits, batch.label)
        batch_acc = binary_accuracy(logits, batch.label)

        batch_loss.backward()
        optimizer.step()

        # Author's open question: what does .item() return?
        # (It extracts the value of a one-element tensor as a Python number.)
        loss_total += batch_loss.item()
        acc_total += batch_acc.item()

    n_batches = len(iterator)
    return loss_total / n_batches, acc_total / n_batches

In [28]:
def evaluate(model, iterator, criterion):
    """Score ``model`` on ``iterator`` without updating any parameters.

    Returns:
        ``(loss, accuracy)``, each the sum of the per-batch values divided by
        the number of batches.
    """
    model.eval()
    loss_total = 0
    acc_total = 0

    # No gradients are needed for evaluation.
    with torch.no_grad():
        for batch in iterator:
            logits = model(batch.text).squeeze(1)
            loss_total += criterion(logits, batch.label).item()
            acc_total += binary_accuracy(logits, batch.label).item()

    n_batches = len(iterator)
    return loss_total / n_batches, acc_total / n_batches

In [29]:
import time

def epoch_time(start_time, end_time):
    """Split the interval between two timestamps (in seconds) into whole minutes and seconds.

    Returns:
        ``(minutes, seconds)`` as integers, both truncated.
    """
    total_seconds = end_time - start_time
    whole_minutes = int(total_seconds / 60)
    leftover_seconds = int(total_seconds - (whole_minutes * 60))
    return whole_minutes, leftover_seconds

In [30]:
# Train for a fixed number of epochs, checkpointing whenever the validation
# loss improves on the best value seen so far.
N_EPOCHS = 5
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    # The timed span covers both the training pass and the validation pass.
    start_time = time.time()
    train_loss, train_acc = train(model, train_iter, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iter, criterion)
    end_time = time.time()
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    # Keep only the weights from the epoch with the lowest validation loss;
    # the file is overwritten on every improvement.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut6-model.pt')

    # Accuracies are fractions in [0, 1]; scale by 100 for display.
    print('Epoch: {:02} | Epoch Time: {}m {}s'.format(epoch+1, epoch_mins, epoch_secs))
    print('Train Loss: {:.3f} | Train Acc: {:.2f}%'.format(train_loss, train_acc*100))
    print('Val. Loss: {:.3f} | Val. Acc: {:.2f}%'.format(valid_loss, valid_acc*100))

Epoch: 01 | Epoch Time: 4m 30s
Train Loss: 0.481 | Train Acc: 75.72%
Val. Loss: 0.266 | Val. Acc: 89.26%
Epoch: 02 | Epoch Time: 4m 40s
Train Loss: 0.273 | Train Acc: 88.91%
Val. Loss: 0.247 | Val. Acc: 90.05%
Epoch: 03 | Epoch Time: 4m 38s
Train Loss: 0.238 | Train Acc: 90.70%
Val. Loss: 0.223 | Val. Acc: 91.03%
Epoch: 04 | Epoch Time: 4m 38s
Train Loss: 0.201 | Train Acc: 92.36%
Val. Loss: 0.221 | Val. Acc: 91.37%
Epoch: 05 | Epoch Time: 4m 38s
Train Loss: 0.190 | Train Acc: 92.64%
Val. Loss: 0.215 | Val. Acc: 91.49%


## Test

    1. test with test dataset
    2. test with real samples

In [31]:
# Restore the checkpoint written at the lowest validation loss.
# map_location remaps the stored tensors onto the current device, so a
# checkpoint saved on a GPU still loads when this cell runs on a CPU-only
# machine (and vice versa).
model.load_state_dict(torch.load('tut6-model.pt', map_location=device))
test_loss, test_acc = evaluate(model, test_iter, criterion)
print('Test Loss: {:.3f} | Test Acc: {:.2f}'.format(test_loss, test_acc*100))

Test Loss: 0.197 | Test Acc: 92.16


In [43]:
def predict_sentiment(model, tokenizer, sentence, verbose=True):
    """Return the model's sigmoid score for a single raw sentence.

    The sentence is tokenized, truncated to ``max_input_length - 2`` tokens,
    wrapped with the init/eos token ids and scored as a batch of one.

    Args:
        model: the trained classifier; it is switched to eval mode.
        tokenizer: tokenizer providing ``tokenize`` and ``convert_tokens_to_ids``.
        sentence: raw text to score.
        verbose: when True (the default, matching the original behaviour),
            print the input id tensor and the prediction tensor.

    Returns:
        The sigmoid of the model output as a Python float; the label
        vocabulary built earlier maps 'pos' to 1 and 'neg' to 0.
    """
    model.eval()

    tokens = tokenizer.tokenize(sentence)[:max_input_length - 2]
    indexed = [init_token_idx] + tokenizer.convert_tokens_to_ids(tokens) + [eos_token_idx]

    # Put the input wherever the model's weights live instead of trusting the
    # global `device`; fall back to the global for a parameter-less model.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else device

    # Shape [1, sent len]: a batch containing the single sentence.
    tensor = torch.LongTensor(indexed).to(target_device).unsqueeze(0)
    if verbose:
        print(tensor)

    # Inference only: do not build an autograd graph.
    with torch.no_grad():
        prediction = torch.sigmoid(model(tensor))
    if verbose:
        print(prediction)

    return prediction.item()

In [44]:
# Two hand-written reviews for a quick sanity check of the trained model.
review1 = 'This film is great'
# Adjacent string literals are concatenated; unlike a backslash continuation
# inside the quotes, this keeps the next line's indentation out of the text.
review2 = ("it was really boring, so i would like to quit it. My Friends also feel like that. "
           "I don't wanna watch it again at all.")

In [45]:
# Score the short positive review; 'pos' is label 1, so a value near 1 means positive.
predict_sentiment(model, tokenizer, review1)

tensor([[ 101, 2023, 2143, 2003, 2307,  102]], device='cuda:0')
tensor([[0.9755]], device='cuda:0', grad_fn=<SigmoidBackward>)


0.9754907488822937

In [46]:
# Score the longer negative review; 'neg' is label 0, so a value near 0 means negative.
predict_sentiment(model, tokenizer, review2)

tensor([[  101,  2009,  2001,  2428, 11771,  1010,  2061,  1045,  2052,  2066,
          2000,  8046,  2009,  1012,  2026,  2814,  2036,  2514,  2066,  2008,
          1012,  1045,  2123,  1005,  1056, 10587,  3422,  2009,  2153,  2012,
          2035,  1012,   102]], device='cuda:0')
tensor([[0.3265]], device='cuda:0', grad_fn=<SigmoidBackward>)


0.3265240490436554