<a href="https://colab.research.google.com/github/jonas-jun/nlp_imdb_sentiment/blob/master/sentimental_analysis_IMDb_200725.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Simple Sentiment Analysis Using the IMDb Dataset**

This notebook follows [bentrevett's PyTorch sentiment analysis tutorial on GitHub](https://github.com/bentrevett/pytorch-sentiment-analysis).



## Prepare Datasets



1.   Field 설정
2.   데이터셋 다운로드
3.   validation set 나누기



    데이터 셋을 매번 다운로드 받지 않고 load할 수 있는 방법은?

In [None]:
import torch
from torchtext import data
from torchtext import datasets #download IMDb dataset
import random
import torch.nn as nn
import torch.optim as optim
import time #check timedelta for an epoch

In [None]:
# Fix the PyTorch RNG seed so repeated runs of the notebook are comparable.
seed = 1
torch.manual_seed(seed)
# Restrict cuDNN to deterministic algorithms.
torch.backends.cudnn.deterministic = True
# Open question: what exactly does a Field do?
# TEXT tokenizes the review text with spaCy; LABEL stores the label as a float.
TEXT = data.Field(tokenize='spacy')
LABEL = data.LabelField(dtype=torch.float)

In [None]:
# Download the IMDb dataset and load its train/test splits, processed
# through the TEXT and LABEL fields.
train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)

In [None]:
# Report how many examples each loaded split contains.
print(f'Number of training examples: {len(train_data)}')
print(f'Number of testing examples: {len(test_data)}')

In [None]:
# Inspect the attributes stored on the first training example.
print(vars(train_data.examples[0]))

In [None]:
# Split a validation set off the training data.
# random.seed() returns None, so passing its result as random_state hands the
# splitter no state at all. Seed the generator first, then pass its state
# explicitly so the reproducible shuffle is visible in the code.
random.seed(seed)
train_data, valid_data = train_data.split(random_state=random.getstate())

In [None]:
# Report the sizes of the three splits after carving out the validation set.
print(f'Number of training examples: {len(train_data)}')
print(f'Number of validation examples: {len(valid_data)}')
print(f'Number of test examples: {len(test_data)}')

## Make Vocabulary

1.   max_vocab_size 지정
    
    그런데 25000개가 아닌 25002개가 잡히는 이유는?: unk와 pad가 존재한다.

    sentence1: I hate this film.

    sentence2: This film sucks < pad >
2.   단어 빈도 분석 가능

    vocab은 dictionary 형태로 되어 있으며, itos(int to str), stoi(s to i) 사용 가능



    vocab 파일은 저장이 되나?

In [None]:
# Build both vocabularies from the training split only. max_size caps the
# text vocabulary; the notebook notes that <unk> and <pad> come on top of it.
max_vocab_size = 25000
TEXT.build_vocab(train_data, max_size=max_vocab_size)
LABEL.build_vocab(train_data)

print(f'Unique tokens in TEXT vocabulary: {len(TEXT.vocab)}')
print(f'Unique tokens in LABEL vocabulary: {len(LABEL.vocab)}')

In [None]:
# Show the 20 most frequent tokens recorded in the vocabulary's frequency counter.
print(TEXT.vocab.freqs.most_common(20))

In [None]:
# First ten entries of the text index-to-string table (itos), followed by
# the label string-to-index mapping (stoi).
print(TEXT.vocab.itos[:10])
print(LABEL.vocab.stoi)

## Make Iterator

    BucketIterator의 기능은?

In [None]:
# Number of examples per batch for all three iterators.
batch_size = 64

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# One iterator per split; batches are placed on `device`.
# NOTE(review): BucketIterator is expected to group examples of similar length
# to reduce padding -- confirm against the installed torchtext version.
train_iter, valid_iter, test_iter = data.BucketIterator.splits(
    (train_data, valid_data, test_data), batch_size=batch_size, device=device)

In [12]:
# Show which device the iterators and the model will use.
print(f'device: {device}')

device: cuda


## Build the Model

**Make RNN model**

input dimension: the length of the one-hot vectors, which equals the vocabulary size

embedding dimension: the size of the dense word vectors, usually around 50-250 dimensions.

hidden dimension: the size of the hidden states.

output dimension: the size of the model's output. Here it is 1: there are only two classes (0 or 1), so a single value per review is enough.

    assert와 squeeze(1)의 기능은?
    embedding dim은 어떻게 정해지는지?

In [None]:
class RNN(nn.Module):
    """Recurrent classifier: embedding table -> nn.RNN -> linear head.

    Args:
        input_dim: number of rows in the embedding table.
        embedding_dim: width of each embedding vector.
        hidden_dim: width of the recurrent hidden state.
        output_dim: width of the linear layer's output.
    """

    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
        super().__init__()
        # Attribute names double as state_dict keys, so they are kept as-is.
        self.embedding = nn.Embedding(num_embeddings=input_dim,
                                      embedding_dim=embedding_dim)
        self.rnn = nn.RNN(input_size=embedding_dim, hidden_size=hidden_dim)
        self.fc = nn.Linear(in_features=hidden_dim, out_features=output_dim)

    def forward(self, text):
        """Return the linear head applied to the RNN's final hidden state.

        `text` holds token indices; the recurrent layer's output is indexed
        along its first axis to pick the last step.
        """
        all_states, last_state = self.rnn(self.embedding(text))
        final_state = last_state.squeeze(0)

        # Sanity check: the last step of the output sequence must be the same
        # tensor as the returned hidden state.
        assert torch.equal(all_states[-1], final_state)

        return self.fc(final_state)

In [None]:
# Model hyperparameters: the embedding table needs one row per vocabulary
# entry, and the model emits a single value per example.
input_dim = len(TEXT.vocab)
embedding_dim = 100
hidden_dim = 128
output_dim = 1

model = RNN(
    input_dim=input_dim,
    embedding_dim=embedding_dim,
    hidden_dim=hidden_dim,
    output_dim=output_dim,
)

In [None]:
def count_parameteres(model):
    """Return the total element count of the parameters that require gradients."""
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

# Report the trainable parameter count with thousands separators.
print(f'The model has {count_parameteres(model):,} trainable parameters')

## Train the Model



1.   Optimizer 정의
2.   Loss function 정의
3. model과 loss 연산을 모두 device로 보내기
4. train과 evaluate 함수 각각 정의
5. training에 걸리는 시간 측정
6. 실제로 트레이닝(여러 차례의 epochs)
    
    최적의 validation loss를 가진 parameters를 저장하여 이후 test set에 사용

Optimizer argument 1: the parameters we will update

Optimizer argument 2: the learning rate


    최적의 parameter가 어디에 저장이 되는지?

In [None]:
# Move the model and the loss module to the target device BEFORE building the
# optimizer: the torch.optim docs advise constructing optimizers only after
# the model is on its final device, so they hold the parameters actually trained.
model = model.to(device)
criterion = nn.BCEWithLogitsLoss()
criterion = criterion.to(device)

# Plain SGD over all model parameters with a fixed learning rate.
optimizer = optim.SGD(model.parameters(), lr=1e-3)

In [None]:
# Accuracy helper shared by the training and evaluation loops.
def binary_accuracy(preds, y):
    """Return the fraction of predictions in a batch that match the labels.

    `preds` are raw model outputs: they are passed through a sigmoid and
    rounded before being compared with `y`. The result is a ratio
    (8 of 10 correct gives 0.8, not 8).
    """
    hard_labels = torch.sigmoid(preds).round()
    # Cast the boolean matches to float so they can be summed and divided.
    hits = hard_labels.eq(y).float()
    return hits.sum() / len(hits)

In [None]:
def train(model, iterator, optimizer, criterion):
    """Run one training pass over `iterator` and update the model in place.

    For every batch: clear the gradients, compute the loss and accuracy of the
    model's predictions, backpropagate, and take one optimizer step.

    Returns:
        A (mean loss, mean accuracy) tuple, each averaged over the batches.
    """
    model.train()

    loss_total = 0
    acc_total = 0

    for batch in iterator:
        optimizer.zero_grad()

        # The model output carries a trailing axis of size 1; drop it so the
        # predictions line up with the labels.
        logits = model(batch.text).squeeze(1)
        targets = batch.label
        batch_loss = criterion(logits, targets)
        batch_acc = binary_accuracy(logits, targets)

        batch_loss.backward()
        optimizer.step()

        loss_total += batch_loss.item()
        acc_total += batch_acc.item()

    n_batches = len(iterator)
    return loss_total / n_batches, acc_total / n_batches

In [None]:
# Validation/test pass: the parameter-optimization steps are not needed here
# (no backpropagation, no optimizer step).
def evaluate(model, iterator, criterion):
    """Compute the mean loss and accuracy of `model` over `iterator`.

    The model is switched to evaluation mode and gradients are disabled, so
    no parameters are updated.

    Returns:
        A (mean loss, mean accuracy) tuple, each averaged over the batches.

    Raises:
        ZeroDivisionError: if `iterator` contains no batches.
    """

    epoch_loss = 0
    epoch_acc = 0

    # Put the model in evaluation mode; without this it keeps whatever mode
    # the preceding training pass left it in.
    model.eval()

    with torch.no_grad():
        for batch in iterator:
            predictions = model(batch.text).squeeze(1)
            loss = criterion(predictions, batch.label)
            acc = binary_accuracy(predictions, batch.label)

            epoch_loss += loss.item()
            epoch_acc += acc.item()

    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [None]:
# Timing helper used to compare training times between models.
def epoch_time(start_time, end_time):
    """Split the interval between two timestamps into whole minutes and seconds.

    Both timestamps are in seconds; fractional parts are truncated.

    Returns:
        An (elapsed minutes, remaining seconds) tuple of ints.
    """
    duration = end_time - start_time
    whole_minutes = int(duration / 60)
    leftover_seconds = int(duration - whole_minutes * 60)
    return whole_minutes, leftover_seconds

In [None]:
n_epochs = 5

# Lowest validation loss seen so far; starts at infinity and is lowered
# whenever an epoch does better.
best_valid_loss = float('inf')

for epoch in range(n_epochs):
    start_time = time.time()

    train_loss, train_acc = train(model, train_iter, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iter, criterion)

    end_time = time.time()
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    # Keep the weights of the epoch with the best validation loss on disk.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut1-model.pt')  # open question: what is a .pt file?

    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:0.3f} | Train Acc: {train_acc*100:0.2f}%')
    print(f'\t Val. Loss: {valid_loss:0.3f} |  Val. Acc: {valid_acc*100:0.2f}%')

## Prediction with testset

In [None]:
# Restore the checkpoint with the best validation loss. map_location remaps the
# stored tensors onto the current device, so a checkpoint saved on a GPU
# runtime still loads on a CPU-only one.
model.load_state_dict(torch.load('tut1-model.pt', map_location=device))
test_loss, test_acc = evaluate(model, test_iter, criterion)
print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')