In [1]:
import os
import torch
import torchtext
import random
import numpy as np

import torch.nn as nn
import torch.nn.functional as F

import warnings
warnings.filterwarnings(action='ignore')

data_path = "data"
weights_path = "weights"

SEED = 1234

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

<torch._C.Generator at 0x7fc6e805e990>

In [2]:
if not os.path.isfile("data/train.csv"):
    with open(os.path.join(data_path, 'rt-polarity.pos'), 'rb') as f:
        pos = f.readlines()

    with open(os.path.join(data_path, 'data/rt-polarity.neg'), 'rb') as f:
        neg = f.readlines()

    dataset = [(p, 1) for p in pos] + [(n, 0) for n in neg]
    pd.DataFrame(dataset).to_csv(os.path.join(data_path, 'train.csv'), index=None)
    print("train.csv file was successfully saved!")
else:
    print("train.csv file already exist!")

train.csv file already exist!


In [3]:
TEXT   = torchtext.data.Field(sequential=True, tokenize=str.split, batch_first=True, fix_length=50, lower=True)
LABEL  = torchtext.data.LabelField(sequential=False, dtype=torch.float)
FIELDS = [('text', TEXT), ('label', LABEL)]

dataset = torchtext.data.TabularDataset(os.path.join(data_path, "train.csv"), 
                                        fields=FIELDS, format='csv', 
                                        skip_header=True)

train_data, test_data = dataset.split(split_ratio=0.8, random_state=random.seed(SEED))
train_data, valid_data = train_data.split(split_ratio=0.8, random_state=random.seed(SEED))

In [4]:
MAX_VOCAB_SIZE = np.inf

TEXT.build_vocab(train_data, 
                 max_size  = MAX_VOCAB_SIZE,
                 vectors   = "glove.6B.100d", 
                 unk_init  = torch.Tensor.normal_)

LABEL.build_vocab(train_data)

In [5]:
BATCH_SIZE = 10

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_iterator = torchtext.data.Iterator(train_data, batch_size=BATCH_SIZE, device=device)
test_iterator  = torchtext.data.Iterator(test_data, batch_size=BATCH_SIZE, device=device)
valid_iterator = torchtext.data.Iterator(valid_data, batch_size=BATCH_SIZE, device=device)

In [6]:
from torch import nn

class CNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, n_filters, filter_sizes, output_dim, 
                 dropout, pad_idx):
        
        super().__init__()
                
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx = pad_idx)
        
        self.convs = nn.ModuleList([
                                    nn.Conv2d(in_channels = 1, 
                                              out_channels = n_filters, 
                                              kernel_size = (fs, embedding_dim)) 
                                    for fs in filter_sizes
                                    ])
        
        self.fc = nn.Linear(len(filter_sizes) * n_filters, output_dim)
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, text):
                
        #text = [batch size, sent len]
        
        embedded = self.embedding(text)
                
        #embedded = [batch size, sent len, emb dim]
        
        embedded = embedded.unsqueeze(1)
        
        #embedded = [batch size, 1, sent len, emb dim]
        
        conved = [F.relu(conv(embedded)).squeeze(3) for conv in self.convs]
            
        #conved_n = [batch size, n_filters, sent len - filter_sizes[n] + 1]
                
        pooled = [F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conved]
        
        #pooled_n = [batch size, n_filters]
        
        cat = self.dropout(torch.cat(pooled, dim = 1))

        #cat = [batch size, n_filters * len(filter_sizes)]
            
        return self.fc(cat)

In [7]:
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 100
N_FILTERS = 100
FILTER_SIZES = [3,4,5]
OUTPUT_DIM = 1
DROPOUT = 0.5
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]

model = CNN(INPUT_DIM, EMBEDDING_DIM, N_FILTERS, FILTER_SIZES, OUTPUT_DIM, DROPOUT, PAD_IDX)

In [8]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 1,993,401 trainable parameters


In [9]:
pretrained_embeddings = TEXT.vocab.vectors

model.embedding.weight.data.copy_(pretrained_embeddings)

tensor([[-0.1117, -0.4966,  0.1631,  ...,  1.2647, -0.2753, -0.1325],
        [-0.8555, -0.7208,  1.3755,  ...,  0.0825, -1.1314,  0.3997],
        [-0.3398,  0.2094,  0.4635,  ..., -0.2339,  0.4730, -0.0288],
        ...,
        [ 0.8759, -0.0671, -0.3230,  ..., -0.0673, -0.4737,  0.2571],
        [ 0.5719, -0.7835,  0.4352,  ..., -0.2164, -0.7241,  0.2962],
        [ 0.6172,  0.8054,  0.1729,  ..., -1.6525, -0.1259, -1.1798]])

In [10]:
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]

model.embedding.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_DIM)
model.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)

In [11]:
import torch.optim as optim

optimizer = optim.Adam(model.parameters())

criterion = nn.BCEWithLogitsLoss()

model = model.to(device)
criterion = criterion.to(device)

In [12]:
def binary_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    """

    #round predictions to the closest integer
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float() #convert into float for division 
    acc = correct.sum() / len(correct)
    return acc

In [13]:
def train(model, iterator, optimizer, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()
    
    for batch in iterator:
        
        optimizer.zero_grad()
        
        predictions = model(batch.text).squeeze(1)
        
        loss = criterion(predictions, batch.label)
        
        acc = binary_accuracy(predictions, batch.label)
        
        loss.backward()
        
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [14]:
def evaluate(model, iterator, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.eval()
    
    with torch.no_grad():
    
        for batch in iterator:

            predictions = model(batch.text).squeeze(1)
            
            loss = criterion(predictions, batch.label)
            
            acc = binary_accuracy(predictions, batch.label)

            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [15]:
import time

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [16]:
N_EPOCHS = 10

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    start_time = time.time()
    
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), os.path.join(weights_path, 'tut4-model.pt'))
    
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

Epoch: 01 | Epoch Time: 0m 18s
	Train Loss: 0.643 | Train Acc: 62.33%
	 Val. Loss: 0.538 |  Val. Acc: 72.14%
Epoch: 02 | Epoch Time: 0m 18s
	Train Loss: 0.430 | Train Acc: 79.90%
	 Val. Loss: 0.491 |  Val. Acc: 75.36%
Epoch: 03 | Epoch Time: 0m 19s
	Train Loss: 0.271 | Train Acc: 88.85%
	 Val. Loss: 0.521 |  Val. Acc: 77.12%
Epoch: 04 | Epoch Time: 0m 19s
	Train Loss: 0.132 | Train Acc: 95.10%
	 Val. Loss: 0.624 |  Val. Acc: 76.67%
Epoch: 05 | Epoch Time: 0m 19s
	Train Loss: 0.064 | Train Acc: 98.05%
	 Val. Loss: 0.730 |  Val. Acc: 76.76%
Epoch: 06 | Epoch Time: 0m 19s
	Train Loss: 0.033 | Train Acc: 99.05%
	 Val. Loss: 0.838 |  Val. Acc: 75.32%
Epoch: 07 | Epoch Time: 0m 19s
	Train Loss: 0.017 | Train Acc: 99.50%
	 Val. Loss: 1.004 |  Val. Acc: 75.42%
Epoch: 08 | Epoch Time: 0m 19s
	Train Loss: 0.013 | Train Acc: 99.62%
	 Val. Loss: 1.104 |  Val. Acc: 75.69%
Epoch: 09 | Epoch Time: 0m 19s
	Train Loss: 0.010 | Train Acc: 99.72%
	 Val. Loss: 1.315 |  Val. Acc: 74.60%
Epoch: 10 | Epoch T

In [17]:
model.load_state_dict(torch.load(os.path.join(weights_path, 'tut4-model.pt')))

test_loss, test_acc = evaluate(model, test_iterator, criterion)

print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

Test Loss: 0.499 | Test Acc: 75.84%


In [18]:
import spacy
nlp = spacy.load('en')

def predict_sentiment(model, sentence, min_len = 5):
    model.eval()
    tokenized = [tok.text for tok in nlp.tokenizer(sentence)]
    if len(tokenized) < min_len:
        tokenized += ['<pad>'] * (min_len - len(tokenized))
    indexed = [TEXT.vocab.stoi[t] for t in tokenized]
    tensor = torch.LongTensor(indexed).to(device)
    tensor = tensor.unsqueeze(0)
    prediction = torch.sigmoid(model(tensor))
    label = 1 if prediction >= 0.5 else 0 
    
    print(f"Predicted Label: {label}\nConfidence: {prediction}")
#     return label, prediction.item()

In [19]:
predict_sentiment(model, "This film is terrible")

Predicted Label: 0
Confidence: tensor([[0.4763]], grad_fn=<SigmoidBackward>)


In [20]:
predict_sentiment(model, "This film is great")

Predicted Label: 0
Confidence: tensor([[0.1924]], grad_fn=<SigmoidBackward>)
