In [1]:
import os
import re
import time
import json
import torch
import random
import codecs
import gensim
import torchtext
import numpy as np
import pandas as pd

from tqdm.notebook import tqdm
from torch.utils.data import DataLoader

print(f"PyTorch version: {torch.__version__}\nTorchtext version: {torchtext.__version__}")

PyTorch version: 1.7.1
Torchtext version: 0.8.0a0+0f911ec


In [2]:
# Seed every random number generator this notebook draws from so that a fresh
# "Restart & Run All" repeats the same split, initialisation and shuffling.
SEED = 1234

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Request deterministic cuDNN kernels. The flag lives under `torch.backends`
# (plural); the previous commented-out line referenced `torch.backend`, which
# does not exist.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

<torch._C.Generator at 0x7fdd5685d9b0>

In [3]:
def clean_str(string):
    """
    Tokenization/string cleaning for all datasets except for SST.

    Steps, applied in order:
      1. every character outside ``A-Za-z0-9(),!?'`` and the backtick becomes a space;
      2. the clitics 's, 've, n't, 're, 'd, 'll are split off with a leading space;
      3. ``,`` ``!`` ``(`` ``)`` ``?`` are surrounded by spaces;
      4. runs of two or more whitespace characters collapse to one space.

    The result is stripped and lower-cased (this function always lower-cases).

    Args:
        string: raw sentence.

    Returns:
        The cleaned sentence as a single space-delimited string.
    """
    # (pattern, replacement) pairs; order matters because later rules see the
    # output of earlier ones. Replacements are plain text: writing them as
    # " \( " etc. is an invalid escape sequence and makes re.sub emit a literal
    # backslash in front of the bracket / question mark.
    rules = (
        (r"[^A-Za-z0-9(),!?\'\`]", " "),
        (r"\'s", " 's"),
        (r"\'ve", " 've"),
        (r"n\'t", " n't"),
        (r"\'re", " 're"),
        (r"\'d", " 'd"),
        (r"\'ll", " 'll"),
        (r",", " , "),
        (r"!", " ! "),
        (r"\(", " ( "),
        (r"\)", " ) "),
        (r"\?", " ? "),
        (r"\s{2,}", " "),
    )
    for pattern, replacement in rules:
        string = re.sub(pattern, replacement, string)

    return string.strip().lower()

In [4]:
def build_data(data_path, cv=10):
    """
    Read the polarity corpus, clean it and persist it as a CSV.

    Reads ``rt-polarity.pos`` and ``rt-polarity.neg`` from ``data_path`` (one
    sentence per line), cleans each line with ``clean_str``, and writes
    ``polarity_df.csv`` into the same directory.

    Args:
        data_path: directory containing the two ``rt-polarity`` files.
        cv: number of folds; each sentence gets a random fold id in ``[0, cv)``.

    Returns:
        (revs, word_to_idx) where ``revs`` is a list of dicts with keys
        ``label`` (1 = pos file, 0 = neg file), ``text``, ``num_words`` and
        ``split``, and ``word_to_idx`` maps each word to an integer id
        (``'@pad'`` is id 0, other words are numbered by first appearance).
    """

    def _read_sentences(filename):
        # `with` closes the handle; undecodable bytes are dropped (errors='ignore').
        with codecs.open(os.path.join(data_path, filename), "r", encoding='utf-8', errors='ignore') as handle:
            lines = handle.read().split('\n')
        # Drop only the empty piece produced by a trailing newline, so a file
        # that does not end with '\n' keeps its last sentence.
        if lines and lines[-1] == '':
            lines.pop()
        return [clean_str(line) for line in lines]

    pos_list = _read_sentences("rt-polarity.pos")
    neg_list = _read_sentences("rt-polarity.neg")
    print('pos len', len(pos_list))
    print('neg len', len(neg_list))
    print('total len', len(pos_list) + len(neg_list))

    # Positives first, then negatives: keeps both the row order of the CSV and
    # the order in which np.random is consumed for the fold ids.
    revs = []
    for label, sentences in ((1, pos_list), (0, neg_list)):
        for sent in sentences:
            revs.append({'label': label,
                         'text': sent,
                         'num_words': len(sent.split()),
                         'split': np.random.randint(0, cv)
                         })

    word_to_idx = {'@pad': 0}
    for sent in pos_list + neg_list:
        for word in sent.split():
            if word not in word_to_idx:
                word_to_idx[word] = len(word_to_idx)

    print(f'num of total data: {len(revs)}')
    print(f'num of vocab: {len(word_to_idx)}')

    df = pd.DataFrame(revs)
    df.to_csv(os.path.join(data_path, 'polarity_df.csv'), index=False, encoding='utf-8')

    print('save the data')

    return revs, word_to_idx

In [5]:
# Directory that build_data reads rt-polarity.pos / rt-polarity.neg from and
# writes polarity_df.csv back into.
data_path = "data"
revs, word_to_idx = build_data(data_path)

pos len 5331
neg len 5331
total len 10662
num of total data: 10662
num of vocab: 18765
save the data


In [6]:
BATCH_SIZE = 64
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# TEXT: whitespace-tokenised, lower-cased, fixed to 56 tokens per example.
# LABEL: a single float target per example.
TEXT   = torchtext.data.Field(sequential=True, tokenize=str.split, batch_first=True, fix_length=56, lower=True)
LABEL  = torchtext.data.LabelField(sequential=False, dtype=torch.float)
FIELDS = [('label', LABEL), ('text', TEXT)]

dataset = torchtext.data.TabularDataset(os.path.join(data_path, "polarity_df.csv"), fields=FIELDS, format='csv', skip_header=True)

# random.seed() returns None, so passing its result as random_state only worked
# through the side effect of seeding the global generator. Seed explicitly and
# hand over the resulting state instead.
random.seed(SEED)
train_data, test_data = dataset.split(split_ratio=0.9, random_state=random.getstate())

print('train_data', len(train_data))
print('test_data', len(test_data))

train_data 9596
test_data 1066




In [7]:
# Load the binary word2vec vectors stored next to the corpus in data_path.
w2v = gensim.models.KeyedVectors.load_word2vec_format(os.path.join(data_path, 'GoogleNews-vectors-negative300.bin.gz'), binary = True)
w2v

<gensim.models.keyedvectors.Word2VecKeyedVectors at 0x7fdd5bc806d0>

In [8]:
# Both vocabularies are built from the training split only.
TEXT.build_vocab(train_data)
LABEL.build_vocab(train_data)

In [9]:
EMBEDDING_DIM = 300

# One vector per vocabulary entry, in index order. Iterating `itos` (the
# index -> token list) makes that order explicit instead of relying on the
# insertion order of the `stoi` dict.
w2v_vectors = []

for token in tqdm(TEXT.vocab.itos):
    # Membership is tested on the KeyedVectors object itself; the `.wv`
    # indirection on KeyedVectors is deprecated.
    if token in w2v:
        w2v_vectors.append(torch.FloatTensor(w2v[token]))
    else:
        # Tokens without a pretrained vector start as all zeros.
        w2v_vectors.append(torch.zeros(EMBEDDING_DIM))

print(len(w2v_vectors))

  0%|          | 0/17863 [00:00<?, ?it/s]

17863


  """
  


In [10]:
# making iterators
# One iterator per split, sharing batch size and device; the same
# sort=False / shuffle=True settings are passed for both splits.
train_iterator,  test_iterator = torchtext.data.BucketIterator.splits(
    (train_data, test_data), 
    batch_size=BATCH_SIZE, 
    device=device, 
    sort=False, 
    shuffle = True)



In [11]:
# Peek at the first mini-batch only: print each tensor followed by its size.
for text_batch, label_batch in train_iterator:
    print(text_batch, text_batch.size(), sep="\n")
    print()
    print(label_batch, label_batch.size(), sep="\n")
    break

tensor([[  741,     3,  4650,  ...,     1,     1,     1],
        [   42,    38,    56,  ...,     1,     1,     1],
        [ 2735,   982,     8,  ...,     1,     1,     1],
        ...,
        [    4,  4401,  8082,  ...,     1,     1,     1],
        [   10,  2667, 13000,  ...,     1,     1,     1],
        [   30,     2,  1602,  ...,     1,     1,     1]])
torch.Size([64, 56])

tensor([0., 1., 1., 1., 0., 0., 1., 0., 0., 0., 0., 1., 1., 0., 1., 1., 0., 0.,
        1., 1., 0., 0., 1., 0., 0., 0., 0., 1., 1., 1., 0., 0., 1., 0., 0., 1.,
        1., 1., 0., 1., 0., 0., 0., 0., 0., 1., 0., 0., 1., 0., 0., 1., 1., 1.,
        0., 1., 1., 0., 0., 0., 1., 1., 1., 0.])
torch.Size([64])




In [12]:
import torch.nn as nn
import torch.nn.functional as F

class CNN1d(nn.Module):
    """
    1-D convolutional sentence classifier.

    Pipeline: token ids -> embedding -> one Conv1d per filter size (tanh) ->
    max-over-time pooling -> concatenation -> dropout -> linear layer.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: size of each embedding vector (also the Conv1d input channels).
        n_filters: output channels of every convolution.
        filter_sizes: iterable of kernel widths, one convolution per entry.
        output_dim: size of the final linear layer's output.
        dropout: dropout probability applied to the pooled features.
        pad_idx: index of the padding token in the embedding table.
        freeze_embedding: when True the embedding weights are excluded from
            gradient updates; defaults to False (trainable).

    The embedding layer is exposed as ``self.embedding``.
    """

    def __init__(self, vocab_size, embedding_dim, n_filters, filter_sizes, output_dim, 
                 dropout, pad_idx, freeze_embedding=False):
        super().__init__()

        # nn.Embedding() has no `freeze` keyword (only Embedding.from_pretrained
        # does), so trainability is controlled through requires_grad instead.
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
        self.embedding.weight.requires_grad = not freeze_embedding

        self.convs = nn.ModuleList([
                                    nn.Conv1d(in_channels=embedding_dim,
                                              out_channels=n_filters,
                                              kernel_size=fs)
                                    for fs in filter_sizes
                                    ])
        self.fc = nn.Linear(len(filter_sizes) * n_filters, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text):
        """Map a batch of token ids ``[batch size, sent len]`` to ``[batch size, output_dim]`` raw scores."""
        #text = [batch size, sent len]

        embedded = self.embedding(text)
        #embedded = [batch size, sent len, emb dim]

        # Conv1d expects channels (emb dim) before the sequence axis.
        embedded = embedded.permute(0, 2, 1)
        #embedded = [batch size, emb dim, sent len]

        # torch.tanh replaces the deprecated F.tanh.
        conved = [torch.tanh(conv(embedded)) for conv in self.convs]
        #conved_n = [batch size, n_filters, sent len - filter_sizes[n] + 1]

        # Pool over the full remaining sequence length, then drop that axis.
        pooled = [F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conved]
        #pooled_n = [batch size, n_filters]

        cat = self.dropout(torch.cat(pooled, dim = 1))
        #cat = [batch size, n_filters * len(filter_sizes)]

        return self.fc(cat)

In [13]:
# Model hyper-parameters. INPUT_DIM and the special-token indices are read
# from the TEXT vocabulary.
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 300
N_FILTERS = 100
FILTER_SIZES = [3, 4, 5]
OUTPUT_DIM = 1
DROPOUT = 0.5
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]

# Echo the values that depend on the data so they can be checked at a glance.
for _name, _value in (('INPUT_DIM', INPUT_DIM),
                      ('EMBEDDING_DIM', EMBEDDING_DIM),
                      ('PAD_IDX', PAD_IDX),
                      ('UNK_IDX', UNK_IDX)):
    print(_name, _value)

INPUT_DIM 17863
EMBEDDING_DIM 300
PAD_IDX 1
UNK_IDX 0


In [14]:
def count_parameters(model):
    """Return the total element count of the model's parameters that have requires_grad set."""
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

# Instantiate the classifier from the hyper-parameter constants and report
# how many of its parameters are trainable.
model = CNN1d(INPUT_DIM, EMBEDDING_DIM, N_FILTERS, FILTER_SIZES, OUTPUT_DIM, DROPOUT, PAD_IDX)
print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 5,719,501 trainable parameters


In [15]:
# for check up (before applying word2vec)
test = 'good'
test_indx = TEXT.vocab.stoi[test]
print(f'test string: {test}\ntest index: {test_indx}')

original_vector = w2v[test]
torch_model_vector = model.embedding(torch.tensor([test_indx]))[0]
print()
print(f'before apply\noriginal: {original_vector[:3]}\ntorch_vector: {torch_model_vector[:3]}')

# apply w2v
# There must be exactly one pretrained row per vocabulary entry, otherwise the
# copy below would silently misalign tokens and vectors (or fail on shape).
assert len(w2v_vectors) == len(TEXT.vocab), "w2v_vectors is out of sync with TEXT.vocab"
TEXT.vocab.set_vectors(TEXT.vocab.stoi, w2v_vectors, EMBEDDING_DIM)
pretrained_embeddings = TEXT.vocab.vectors
# Overwrite the embedding table in place without recording the copy in autograd.
with torch.no_grad():
    model.embedding.weight.copy_(pretrained_embeddings)

# for check up (after applying word2vec)
torch_model_vector = model.embedding(torch.tensor([test_indx]))[0]
print()
print(f'after apply\noriginal: {original_vector[:3]}\ntorch_vector: {torch_model_vector[:3]}')

test string: good
test string: 54

before apply
origianl: [ 0.04052734  0.0625     -0.01745605]
torch_vector: tensor([ 0.0853, -0.9901, -3.2475], grad_fn=<SliceBackward>)

after apply
origianl: [ 0.04052734  0.0625     -0.01745605]
torch_vector: tensor([ 0.0405,  0.0625, -0.0175], grad_fn=<SliceBackward>)


In [16]:
# Initialise the two special rows of the embedding table in place, outside of
# autograd tracking.
with torch.no_grad():
    # padding -> zero vectors
    model.embedding.weight[PAD_IDX].fill_(0)

    # unknown token -> randomly initialized with uniform distribution
    model.embedding.weight[UNK_IDX].copy_(
        torch.distributions.Uniform(-0.25, +0.25).sample((EMBEDDING_DIM,))
    )




In [17]:
# Move the model to its device before building the optimizer, so the optimizer
# is constructed over the parameters as they will be used during training.
model     = model.to(device)

optimizer = torch.optim.Adadelta(model.parameters())

# BCEWithLogitsLoss applies the sigmoid itself, so the model must output raw
# logits (no activation on the final layer).
criterion = torch.nn.BCEWithLogitsLoss()
criterion = criterion.to(device)

In [18]:
def binary_accuracy(preds, y):
    """
    Fraction of correct predictions in a batch, as a 0-dim tensor
    (e.g. 8 correct out of 10 gives 0.8, not 8).

    `preds` are raw logits: they are passed through a sigmoid and rounded to
    the nearest integer before being compared element-wise with `y`.
    """
    probabilities = torch.sigmoid(preds)
    hard_predictions = torch.round(probabilities)
    hits = (hard_predictions == y).float()

    return hits.sum() / len(hits)

In [19]:
def train(model, iterator, optimizer, criterion):
    """
    Run one training pass over `iterator`.

    Each batch must expose `.text` (model input) and `.label` (targets).

    Returns:
        (mean batch loss, mean batch accuracy), both averaged over
        len(iterator).
    """
    model.train()

    running_loss = 0
    running_acc = 0

    for batch in iterator:
        optimizer.zero_grad()

        logits = model(batch.text).squeeze(1)
        batch_loss = criterion(logits, batch.label)
        batch_acc = binary_accuracy(logits, batch.label)

        batch_loss.backward()
        optimizer.step()

        # Weight constraint: every element of every parameter is clamped to
        # [-3, 3] after the update. This is an element-wise bound, not a
        # rescaling by L2 norm.
        # NOTE(review): confirm whether an L2 max-norm of 3 was intended.
        with torch.no_grad():
            for weight in model.parameters():
                weight.clamp_(min=-3, max=3)

        running_loss += batch_loss.item()
        running_acc += batch_acc.item()

    n_batches = len(iterator)
    return running_loss / n_batches, running_acc / n_batches

def evaluate(model, iterator, criterion):
    """
    Score the model on `iterator` in eval mode with gradients disabled.

    Each batch must expose `.text` (model input) and `.label` (targets).

    Returns:
        (mean batch loss, mean batch accuracy), both averaged over
        len(iterator).
    """
    model.eval()

    total_loss = 0
    total_acc = 0

    with torch.no_grad():
        for batch in iterator:
            logits = model(batch.text).squeeze(1)
            total_loss += criterion(logits, batch.label).item()
            total_acc += binary_accuracy(logits, batch.label).item()

    n_batches = len(iterator)
    return total_loss / n_batches, total_acc / n_batches

def epoch_time(start_time, end_time):
    """Split the interval end_time - start_time (seconds) into (whole minutes, remaining whole seconds)."""
    total_seconds = end_time - start_time
    whole_minutes = int(total_seconds / 60)
    leftover_seconds = int(total_seconds - 60 * whole_minutes)

    return whole_minutes, leftover_seconds

In [20]:
N_EPOCHS = 10
best_test_loss = float('inf')

# torch.save does not create missing directories, so make sure the target exists.
os.makedirs('weights', exist_ok=True)
checkpoint_path = os.path.join('weights', 'latest_weigths.pt')

for epoch in range(N_EPOCHS):
    start_time = time.time()

    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    test_loss, test_acc = evaluate(model, test_iterator, criterion)

    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    # Keep the weights of the epoch with the lowest test loss.
    # NOTE(review): the checkpoint is selected on the test split, so metrics
    # reported on that same split afterwards are optimistically biased; a
    # separate validation split would avoid this.
    if test_loss < best_test_loss:
        best_test_loss = test_loss
        torch.save(model.state_dict(), checkpoint_path)

    print(f'Epoch: {epoch + 1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc * 100:.2f}%')
    # Both numbers come from the test iterator, so both are labelled "Test".
    print(f'\t Test Loss: {test_loss:.3f} |  Test Acc: {test_acc * 100:.2f}%')



Epoch: 01 | Epoch Time: 0m 25s
	Train Loss: 0.566 | Train Acc: 70.64%
	 Test. Loss: 0.494 |  Val. Acc: 75.38%
Epoch: 02 | Epoch Time: 0m 25s
	Train Loss: 0.428 | Train Acc: 79.84%
	 Test. Loss: 0.437 |  Val. Acc: 79.94%
Epoch: 03 | Epoch Time: 0m 25s
	Train Loss: 0.342 | Train Acc: 85.41%
	 Test. Loss: 0.431 |  Val. Acc: 80.64%
Epoch: 04 | Epoch Time: 0m 25s
	Train Loss: 0.266 | Train Acc: 89.24%
	 Test. Loss: 0.434 |  Val. Acc: 79.67%
Epoch: 05 | Epoch Time: 0m 25s
	Train Loss: 0.191 | Train Acc: 92.96%
	 Test. Loss: 0.463 |  Val. Acc: 80.03%
Epoch: 06 | Epoch Time: 0m 25s
	Train Loss: 0.124 | Train Acc: 95.75%
	 Test. Loss: 0.506 |  Val. Acc: 80.04%
Epoch: 07 | Epoch Time: 0m 25s
	Train Loss: 0.071 | Train Acc: 98.11%
	 Test. Loss: 0.541 |  Val. Acc: 81.00%
Epoch: 08 | Epoch Time: 0m 25s
	Train Loss: 0.044 | Train Acc: 98.89%
	 Test. Loss: 0.592 |  Val. Acc: 81.41%
Epoch: 09 | Epoch Time: 0m 25s
	Train Loss: 0.023 | Train Acc: 99.61%
	 Test. Loss: 0.616 |  Val. Acc: 80.73%
Epoch: 10 

In [21]:
# Restore the saved checkpoint. map_location lets weights that were saved from
# a GPU be loaded when `device` is the CPU.
state_dict = torch.load(os.path.join('weights', 'latest_weigths.pt'), map_location=device)
model.load_state_dict(state_dict)
test_loss, test_acc = evaluate(model, test_iterator, criterion)

print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

Test Loss: 0.428 | Test Acc: 80.73%


In [22]:
def predict(sentence, model, fixed_length=56):
    """
    Classify a single raw sentence.

    The sentence goes through the same `clean_str` cleaning and whitespace
    tokenisation used to build the training CSV, is truncated / padded to
    `fixed_length` token ids, and is scored by `model` in eval mode.

    Args:
        sentence: raw input text.
        model: classifier returning one logit per example.
        fixed_length: number of token ids fed to the model.

    Returns:
        (probability, predicted_label): `probability` is a Python float giving
        the probability of the positive class; `predicted_label` is 'Pos' when
        it is >= 0.5 and 'Neg' otherwise.
    """
    model.eval()  # dropout must be off at inference time

    vocab = TEXT.vocab
    unk_index = vocab.stoi[TEXT.unk_token]
    pad_index = vocab.stoi[TEXT.pad_token]

    # Use the `sentence` argument (not a notebook global). `.get` avoids
    # adding unseen words to the vocabulary mapping as a lookup side effect.
    tokens = clean_str(sentence).split()[:fixed_length]
    token_ids = [vocab.stoi.get(token, unk_index) for token in tokens]
    token_ids += [pad_index] * (fixed_length - len(token_ids))

    # Put the input on whatever device the model lives on.
    model_device = next(model.parameters()).device
    input_tensor = torch.LongTensor(token_ids).unsqueeze(0).to(model_device)

    with torch.no_grad():
        probability = torch.sigmoid(model(input_tensor)).item()

    # LABEL builds its own vocabulary from the CSV label strings, so the class
    # trained as target 1.0 is not necessarily the label '1'. If '1' (positive)
    # was assigned index 0, the sigmoid output is P(negative) and is flipped.
    if LABEL.vocab.stoi['1'] == 0:
        probability = 1.0 - probability

    predicted_label = 'Pos' if probability >= 0.5 else 'Neg'

    return probability, predicted_label

# Try the classifier on a hand-written sentence.
my_sentence = "this film is terrible"
probability, predicted_label = predict(my_sentence, model)

print(f"predicted label: {predicted_label}\nprobability: {probability}")


predicted lable: Neg
probability: 0.057798054069280624
