# Fast text
#### https://github.com/bentrevett/pytorch-sentiment-analysis/blob/master/3%20-%20Faster%20Sentiment%20Analysis.ipynb

# 1. Preparing Data

* 입력 문장의 마지막에 문장 구성 토큰들의 n-gram을 추가로 넣자! 
* n-grams가 아닌 bi-grams를 사용할 것
* bi-gram: a pair of words/tokens that appear consecutively within a sentence.
* 예)  "how are you ?" -- "how are", "are you" and "you ?"

In [2]:
def generate_bigrams(x):
    n_grams = set(zip(*[x[i:] for i in range(2)]))
    for n_gram in n_grams:
        x.append(' '.join(n_gram))
    return x

In [3]:
generate_bigrams(['This', 'film', 'is', 'terrible'])

['This', 'film', 'is', 'terrible', 'is terrible', 'film is', 'This film']

torchtext의 Field는 preprocessing 과정이 있어서 여기에 generate_bigrams 함수를 넣어 토크나이징 후 적용되도록 하였다. 

In [4]:
import torch
from torchtext import data
from torchtext import datasets

SEED = 1234

torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True # CUDA와 CuDNN에서 Randomness 제어를 위한 설정법

# 실제 텍스트를 위한 TEXT 객체
TEXT = data.Field(tokenize = 'spacy', # 어떤 토큰화 함수를 사용할 것인지 지정(string.split이 기본값)
                  tokenizer_language = 'en_core_web_sm',
                  preprocessing = generate_bigrams)

# 레이블 데이터를 위한 LABEL 객체
LABEL = data.LabelField(dtype = torch.float)

* IMDB dataset load와 split

In [5]:
import random

train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)

train_data, valid_data = train_data.split(random_state = random.seed(SEED))

vocab을 만들고 pre-trainde word embeddings load

In [6]:
MAX_VOCAB_SIZE = 25000

# 단어 집합 생성
TEXT.build_vocab(train_data, 
                 max_size = MAX_VOCAB_SIZE, # 단어 집합의 최대 크기를 지정
                 vectors = "glove.6B.100d", 
                 unk_init = torch.Tensor.normal_)

LABEL.build_vocab(train_data)

iterators 생성하기(데이터로더 만들기)

In [7]:
BATCH_SIZE = 64

# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data), 
    batch_size = BATCH_SIZE) #,device = device)

# 2. Build the model

In [8]:
import torch.nn as nn
import torch.nn.functional as F

class FastText(nn.Module):
    def __init__(self, vocab_size, embedding_dim, output_dim, pad_idx):
        
        super().__init__()
        
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
        
        self.fc = nn.Linear(embedding_dim, output_dim)
        
    def forward(self, text):
        
        #text = [sent len, batch size]
        
        embedded = self.embedding(text)
                
        #embedded = [sent len, batch size, emb dim]
        
        embedded = embedded.permute(1, 0, 2)
        
        #embedded = [batch size, sent len, emb dim]
        
        pooled = F.avg_pool2d(embedded, (embedded.shape[1], 1)).squeeze(1) 
        
        #pooled = [batch size, embedding_dim]
                
        return self.fc(pooled)

In [9]:
# 필요한 부분 정의하기

INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 100
OUTPUT_DIM = 1
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]

model = FastText(INPUT_DIM, EMBEDDING_DIM, OUTPUT_DIM, PAD_IDX)

vocab_size=INPUT_DIM
embedding_dim=EMBEDDING_DIM
pad_idx=PAD_IDX 
output_dim=OUTPUT_DIM

embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
fc=nn.Linear(embedding_dim, output_dim)

*  def forward(self, text) 시행하기

In [10]:
for batch in train_iterator:
    text = batch.text
    
text.shape #text = [sent len, batch size]

torch.Size([2225, 64])

In [11]:
text

tensor([[ 2953,     0,   551,  ...,   178,    11,    25],
        [11739, 13877,     2,  ...,  5019,    34,  4009],
        [  195,  3964,   153,  ...,     3,    99,     7],
        ...,
        [    1,     1,     1,  ...,     1,     1,     1],
        [    1,     1,     1,  ...,     1,     1,     1],
        [    1,     1,     1,  ...,     1,     1,     1]])

In [12]:
embedded = embedding(text)
embedded.shape # #embedded = [sent len, batch size, emb dim]

torch.Size([2225, 64, 100])

In [13]:
embedded

tensor([[[-1.3625, -0.8420,  0.2043,  ...,  2.1156, -0.4357,  0.8511],
         [ 0.1372,  1.2721, -2.2575,  ...,  0.2783, -0.8423, -0.1621],
         [-1.3672,  0.0165, -0.3256,  ..., -1.2031, -0.9534,  1.0318],
         ...,
         [ 0.2830, -0.0760,  0.2256,  ..., -0.0775, -0.5676, -1.0094],
         [-0.9970, -0.3779, -1.0603,  ..., -0.0518,  0.2768, -2.7892],
         [-0.7995, -0.4348,  0.7561,  ..., -0.5735, -0.4235, -0.2271]],

        [[ 1.9615, -1.0437,  0.3060,  ..., -0.5402,  1.9692, -1.0461],
         [ 0.6536,  0.6243,  0.3493,  ...,  1.0296,  0.4269,  0.1809],
         [ 0.4591,  0.5450, -0.7152,  ...,  0.1359, -0.8589,  1.6946],
         ...,
         [-0.2882,  0.3293, -0.0051,  ..., -0.4664,  0.3850,  1.8702],
         [-0.4237,  1.0587,  0.3736,  ...,  0.0263, -0.4617, -1.8794],
         [ 0.8797, -0.2252,  0.1204,  ..., -0.2871,  1.1661,  0.1121]],

        [[-1.1513,  0.8139,  0.8720,  ..., -0.1208,  0.1230, -0.7933],
         [ 0.1780, -0.2542, -1.3582,  ..., -0

In [14]:
embedded = embedded.permute(1, 0, 2)
embedded.shape #embedded = [batch size, sent len, emb dim]

torch.Size([64, 2225, 100])

In [15]:
pooled = F.avg_pool2d(embedded, (embedded.shape[1], 1)).squeeze(1) 
pooled.shape # pooled = [batch size, embedding_dim]

torch.Size([64, 100])

In [16]:
fc(pooled).shape 

torch.Size([64, 1])

# 3. Train & Test

In [17]:
import torch.optim as optim

optimizer = optim.Adam(model.parameters())
criterion = nn.BCEWithLogitsLoss()

In [21]:
def binary_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    """
    #round predictions to the closest integer
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float() #convert into float for division 
    acc = correct.sum() / len(correct)
    return acc

In [18]:
def train(model, iterator, optimizer, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()
    
    for batch in iterator:
        
        optimizer.zero_grad()
        
        predictions = model(batch.text).squeeze(1)
        
        loss = criterion(predictions, batch.label)
        
        acc = binary_accuracy(predictions, batch.label)
        
        loss.backward()
        
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [19]:
def evaluate(model, iterator, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.eval()
    
    with torch.no_grad():
    
        for batch in iterator:

            predictions = model(batch.text).squeeze(1)
            
            loss = criterion(predictions, batch.label)
            
            acc = binary_accuracy(predictions, batch.label)

            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [22]:
N_EPOCHS = 

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss

    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

	Train Loss: 0.689 | Train Acc: 54.80%
	 Val. Loss: 0.653 |  Val. Acc: 66.94%
	Train Loss: 0.663 | Train Acc: 68.30%
	 Val. Loss: 0.547 |  Val. Acc: 75.20%
	Train Loss: 0.607 | Train Acc: 77.18%
	 Val. Loss: 0.447 |  Val. Acc: 78.90%
	Train Loss: 0.540 | Train Acc: 82.25%
	 Val. Loss: 0.406 |  Val. Acc: 82.24%
	Train Loss: 0.475 | Train Acc: 85.17%
	 Val. Loss: 0.385 |  Val. Acc: 84.31%


In [23]:
test_loss, test_acc = evaluate(model, test_iterator, criterion)

print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

Test Loss: 0.393 | Test Acc: 83.86%


# 4. User Input

In [24]:
import spacy
nlp = spacy.load('en_core_web_sm')

def predict_sentiment(model, sentence):
    model.eval()
    tokenized = generate_bigrams([tok.text for tok in nlp.tokenizer(sentence)])
    indexed = [TEXT.vocab.stoi[t] for t in tokenized]
    tensor = torch.LongTensor(indexed)#.to(device)
    tensor = tensor.unsqueeze(1)
    prediction = torch.sigmoid(model(tensor))
    return prediction.item()

In [25]:
predict_sentiment(model, "This film is terrible") # negative review

1.7625741577376175e-07

In [26]:
predict_sentiment(model, "This film is great") # positive review

1.0