In [6]:
import os
import re
import time
import json
import torch
import random
import codecs
import gensim
import torchtext
import numpy as np
import pandas as pd

from tqdm.notebook import tqdm

import warnings
warnings.filterwarnings(action='ignore')

print(f"PyTorch version: {torch.__version__}\nTorchtext version: {torchtext.__version__}")

PyTorch version: 1.7.1
Torchtext version: 0.8.0a0+0f911ec


In [7]:
SEED = 1234

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
# torch.backend.cudnn.deterministic = True



<torch._C.Generator at 0x7f90cc05e990>

In [8]:
def text_preprocessing(string):
    """
    Tokenization/string cleaning for all datasets except for SST.
    Every dataset is lower cased except for TREC
    """
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)     
    string = re.sub(r"\'s", " \'s", string) 
    string = re.sub(r"\'ve", " \'ve", string) 
    string = re.sub(r"n\'t", " n\'t", string) 
    string = re.sub(r"\'re", " \'re", string) 
    string = re.sub(r"\'d", " \'d", string) 
    string = re.sub(r"\'ll", " \'ll", string) 
    string = re.sub(r",", " , ", string) 
    string = re.sub(r"!", " ! ", string) 
    string = re.sub(r"\(", " \( ", string) 
    string = re.sub(r"\)", " \) ", string) 
    string = re.sub(r"\?", " \? ", string) 
    string = re.sub(r"\s{2,}", " ", string)
    
    return string.strip().lower()

In [9]:
def build_data(data_path, cv=10):
  
    with open(os.path.join(data_path, "rt-polarity.pos"), "r") as f:
        pos = f.read()
    
    with open(os.path.join(data_path, "rt-polarity.neg"), "r") as f:
        neg = f.read()
        
    pos_sentences = [text_preprocessing(sent) for sent in pos.split('\n')[:-1]]
    neg_sentences = [text_preprocessing(sent) for sent in neg.split('\n')[:-1]]
    
    pos_length = len(pos_sentences)
    neg_length = len(neg_sentences)
    tot_length = pos_length + neg_length
    
    print(f'Positive: {pos_length}\nNegative: {neg_length}\nTotal: {tot_length}')
    
    reviews = [0] * tot_length

    for i, sent in enumerate(pos_sentences):
        datum = {'label': 1,
                 'text': sent}
        reviews[i] = datum

    for j, sent in enumerate(neg_sentences):
        datum = {'label': 0,
                 'text': sent}
        reviews[pos_length + j] = datum

    word_to_idx = {'@pad': 0}

    for sentence in pos_sentences + neg_sentences:
        for word in sentence.split():
            if word not in word_to_idx:
                word_to_idx[word] = len(word_to_idx)

    print(f'number of reviews: {len(reviews)}\nnumber of vocabularies: {len(word_to_idx)}')
    
    result_fname = os.path.join(data_path, 'polarity_df.csv')
    if not os.path.isfile(result_fname):
        df = pd.DataFrame(reviews) 
        df.to_csv(os.path.join(data_path, 'polarity_df.csv'), index=False, encoding='utf-8')
        print('reviews was successfully saved!')
    else:
        print('reviews was already exist!')
    
    return reviews, word_to_idx

In [10]:
data_path = "data"
revs, word_to_idx = build_data(data_path)

Positive: 5331
Negative: 5331
Total: 10662
number of reviews: 10662
number of vocabularies: 18765
reviews was already exist!


In [11]:
BATCH_SIZE = 64
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

TEXT   = torchtext.data.Field(sequential=True, tokenize=str.split, batch_first=True, fix_length=56, lower=True)
LABEL  = torchtext.data.LabelField(sequential=False, dtype=torch.float)
FIELDS = [('label', LABEL), ('text', TEXT)]

dataset = torchtext.data.TabularDataset(os.path.join(data_path, "polarity_df.csv"), fields=FIELDS, format='csv', skip_header=True)

train_data, test_data = dataset.split(random_state=random.seed(SEED), split_ratio=0.9)

print(f'train_data: {len(train_data)}\ntest data: {len(test_data)}')

train_data: 9596
test data: 1066


In [12]:
word2vec = gensim.models.KeyedVectors.load_word2vec_format(os.path.join(data_path, 'GoogleNews-vectors-negative300.bin.gz'), binary = True)
word2vec_index = {token: token_index for token_index, token in enumerate(word2vec.index2word)}

TEXT.build_vocab(train_data, max_size=20000)
LABEL.build_vocab(train_data)

TEXT.vocab.set_vectors(word2vec_index, torch.from_numpy(word2vec.vectors).float().to(device), 300)
# TEXT.vocab.vectors.shape
# pretrained_embeddings = torch.FloatTensor(TEXT.vocab.vectors)


In [29]:
pretrained_embeddings = torch.FloatTensor(TEXT.vocab.vectors)

In [13]:
# making iterators
train_iterator,  test_iterator = torchtext.data.BucketIterator.splits(
    (train_data, test_data), 
    batch_size=BATCH_SIZE, 
    device=device, 
    sort=False, 
    shuffle=True)

In [36]:
import torch.nn as nn
import torch.nn.functional as F

class CNN1d(nn.Module):
    def __init__(self, vocab_size, embedding_dim, n_filters, filter_sizes, output_dim, 
                 dropout, pad_idx, freeze_mode):
        super().__init__()
        
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
        self.convs = nn.ModuleList([
                                    nn.Conv1d(in_channels=embedding_dim,
                                              out_channels=n_filters,
                                              kernel_size=fs)
                                    for fs in filter_sizes
                                    ])
        self.fc = nn.Linear(len(filter_sizes) * n_filters, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text):
        #in: [batch size, sent len]
        embedded = self.embedding(text) 
        #out: [batch size, sent len, emb dim]
        
        embedded = embedded.permute(0, 2, 1)
        #out: [batch size, emb dim, sent len]

        conved = [F.tanh(conv(embedded)) for conv in self.convs]
        #out: [batch size, n_filters, sent len - filter_sizes[n] + 1]

        pooled = [F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conved]
        #out: [batch size, n_filters]

        cat = self.dropout(torch.cat(pooled, dim = 1))
        #out: [batch size, n_filters * len(filter_sizes)]
            
        return self.fc(cat)

In [32]:
INPUT_DIM = len(TEXT.vocab)
# VOCAB_SIZE = len(TEXT.vocab)
PRETRAINED_EMBEDDINGS = pretrained_embeddings
EMBEDDING_DIM = 300
N_FILTERS = 100
FILTER_SIZES = [3, 4, 5]
OUTPUT_DIM = 1
DROPOUT = 0.5
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]
FREEZE_MODE = True

In [33]:
print('INPUT_DIM', INPUT_DIM)
print('EMBEDDING_DIM', EMBEDDING_DIM)
print('PAD_IDX', PAD_IDX)
print('UNK_IDX', UNK_IDX)

INPUT_DIM 17863
EMBEDDING_DIM 300
PAD_IDX 1
UNK_IDX 0


In [46]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

model = CNN1d(VOCAB_SIZE, EMBEDDING_DIM, N_FILTERS, 
              FILTER_SIZES, OUTPUT_DIM, DROPOUT, PAD_IDX, FREEZE_MODE)

model.embedding.weight.data[UNK_IDX] = torch.nn.init.uniform_(torch.empty(EMBEDDING_DIM), -0.25, 0.25)
model.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)
model.embedding.weight.data.copy_(TEXT.vocab.vectors)

model.embedding.weight.requires_grad = True

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 5,719,501 trainable parameters


In [47]:
optimizer = torch.optim.Adadelta(model.parameters())

# BCEWithLogitsLoss automatically does softmax function
criterion = torch.nn.BCEWithLogitsLoss()

model     = model.to(device)
criterion = criterion.to(device)

In [48]:
def binary_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    """

    #round predictions to the closest integer
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float()
    acc = correct.sum() / len(correct)
    
    return acc

In [49]:
def max_norm_scailing(model, max_val=3, eps=1e-8):
    param = model.fc.weight.norm()
    norm = param.norm(2, dim=0, keepdim=True)
    # torch.cla
    desired = torch.clamp(norm, 0, max_val)
    param = param * (desired / (eps + norm))

def train(model, iterator, optimizer, criterion):
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()
    
    for batch in iterator:
        
        optimizer.zero_grad()
        
        predictions = model(batch.text).squeeze(1)
        loss = criterion(predictions, batch.label)
        acc = binary_accuracy(predictions, batch.label)
        
        loss.backward()
        optimizer.step()
        
        if True:
            max_norm_scailing(model, max_val=3)
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

def evaluate(model, iterator, criterion):
    epoch_loss = 0
    epoch_acc = 0
    
    model.eval()
    
    with torch.no_grad():
    
        for batch in iterator:
            predictions = model(batch.text).squeeze(1)
            loss = criterion(predictions, batch.label)
            acc = binary_accuracy(predictions, batch.label)

            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    
    return elapsed_mins, elapsed_secs

In [50]:
N_EPOCHS = 10
best_test_loss = float('inf')

for epoch in range(N_EPOCHS):
    start_time = time.time()
    
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    test_loss, test_acc = evaluate(model, test_iterator, criterion)
    
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if test_loss < best_test_loss:
        best_test_loss = test_loss
        torch.save(model.state_dict(), os.path.join('weights', 'latest_weigths.pt'))
    
    print(f'Epoch: {epoch + 1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc * 100:.2f}%')
    print(f'\tTest Loss: {test_loss:.3f} |  Val. Acc: {test_acc * 100:.2f}%')

Epoch: 01 | Epoch Time: 0m 27s
	Train Loss: 0.568 | Train Acc: 70.61%
	Test Loss: 0.480 |  Val. Acc: 76.58%
Epoch: 02 | Epoch Time: 0m 27s
	Train Loss: 0.429 | Train Acc: 80.31%
	Test Loss: 0.439 |  Val. Acc: 80.74%
Epoch: 03 | Epoch Time: 0m 27s
	Train Loss: 0.343 | Train Acc: 85.63%
	Test Loss: 0.426 |  Val. Acc: 80.41%
Epoch: 04 | Epoch Time: 0m 27s
	Train Loss: 0.275 | Train Acc: 88.61%
	Test Loss: 0.551 |  Val. Acc: 74.15%
Epoch: 05 | Epoch Time: 0m 26s
	Train Loss: 0.198 | Train Acc: 92.97%
	Test Loss: 0.515 |  Val. Acc: 78.07%
Epoch: 06 | Epoch Time: 0m 27s
	Train Loss: 0.125 | Train Acc: 96.10%
	Test Loss: 0.576 |  Val. Acc: 78.05%
Epoch: 07 | Epoch Time: 0m 26s
	Train Loss: 0.076 | Train Acc: 97.82%
	Test Loss: 0.534 |  Val. Acc: 81.01%
Epoch: 08 | Epoch Time: 0m 27s
	Train Loss: 0.043 | Train Acc: 99.11%
	Test Loss: 0.578 |  Val. Acc: 80.32%
Epoch: 09 | Epoch Time: 0m 26s
	Train Loss: 0.024 | Train Acc: 99.45%
	Test Loss: 0.620 |  Val. Acc: 79.71%
Epoch: 10 | Epoch Time: 0m 2

In [51]:
model.load_state_dict(torch.load(os.path.join('weights', 'latest_weigths.pt')))
test_loss, test_acc = evaluate(model, test_iterator, criterion)

print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc * 100:.2f}%')

Test Loss: 0.427 | Test Acc: 80.17%


In [52]:
def predict(sentence, model, fixed_length=56):
    word2id = []

    for word in my_sentence.split():
        word2id.append(TEXT.vocab.stoi[word])
        
    word2id = word2id + [1] * (fixed_length - len(word2id))
    input_tensor = torch.LongTensor(word2id).to(device).unsqueeze(0)
    probability = np.squeeze(torch.sigmoid(model(input_tensor)).detach().numpy()[0], 0)
    predicted_label = 'Pos' if probability >= 0.5 else 'Neg' 
    
    return probability, predicted_label

my_sentence = "this film is terrible"
probability, predicted_label = predict(my_sentence, model)

print(f"predicted lable: {predicted_label}\nprobability: {probability}")


predicted lable: Neg
probability: 0.05411254242062569
