In [1]:
# New Stuff
from torchtext.data import Field, TabularDataset, BucketIterator, LabelField
import spacy
import torch

torch.backends.cudnn.deterministic = True

# spacy_en = spacy.load('en')

#     return [token.text for token in spacy_en.tokenizer(text)]

title = Field(sequential=True, use_vocab=True, tokenize='spacy', lower=True)
text = Field(sequential=True, use_vocab=True, tokenize='spacy', lower=True)
label = LabelField(dtype=torch.float)

fields = {'title': ('title', title), 'text': ('text', text), 'label': ('label', label)}

In [2]:
train_data = TabularDataset.splits(
    path='../data',
    train='news.csv',
    # validation='news.csv',
    # test='news.csv',
    format='csv',
    fields=fields)[0]
print("Num of training: ", len(train_data))
# print("Num of validation: ", len(validation_data))
# print("Num of testing: ", len(test_data))

Num of training:  6335


In [3]:
train_data, validation_data = train_data.split(split_ratio=0.65)
validation_data, test_data = validation_data.split(split_ratio=0.5)
print("Num of training: ", len(train_data))
print("Num of validation: ", len(validation_data))
print("Num of testing: ", len(test_data))

Num of training:  4118
Num of validation:  1108
Num of testing:  1109


In [4]:
print(train_data[0].__dict__.keys())
print(train_data[0].__dict__.values())

dict_keys(['title', 'text', 'label'])
dict_values([['2nd', 'new', 'york', 'prison', 'worker', 'charged', 'in', 'killers', "'", 'escape'], ['plattsburgh', ',', 'new', 'york', '(', 'cnn', ')', 'gene', 'palmer', ',', 'the', 'second', 'prison', 'employee', 'charged', 'in', 'connection', 'with', 'the', 'escape', 'of', 'two', 'convicted', 'murderers', 'in', 'upstate', 'new', 'york', ',', 'admitted', 'he', 'provided', 'the', 'fugitives', 'with', 'tools', 'and', 'other', 'items', 'that', 'unintentionally', '"', 'made', 'their', 'escape', 'easier', ',', '"', 'according', 'to', 'a', 'statement', 'he', 'gave', 'the', 'state', 'police', '.', '\n\n', 'palmer', ',', 'a', 'prison', 'guard', 'for', 'more', 'than', '27', 'years', ',', 'told', 'investigators', 'that', 'within', 'the', 'last', 'eight', 'months', 'he', 'provided', 'inmate', 'david', 'sweat', 'with', 'a', 'pair', 'of', 'needle', '-', 'nose', 'pliers', 'and', 'a', 'flat', '-', 'head', 'screwdriver', ',', 'according', 'to', 'the', 'court', '

In [5]:
MAX_VOCAB_SIZE = 25000
text.build_vocab(train_data, vectors = "glove.6B.100d", max_size=MAX_VOCAB_SIZE)
title.build_vocab(train_data, vectors = "glove.6B.100d", max_size=MAX_VOCAB_SIZE)
label.build_vocab(train_data)

In [6]:
BATCH_SIZE = 64

# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device = 'cpu'

train_iterator, validation_iterator, test_iterator = BucketIterator.splits(
    (train_data, validation_data, test_data),
    batch_size=BATCH_SIZE,
    sort=False,
    device=device)

In [7]:
import torch.nn as nn

class RNN(nn.Module):
    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, embedding_dim)
        self.rnn = nn.RNN(embedding_dim, hidden_dim)
        self.fc = nn.Linear(hidden_dim, output_dim)
    
    def forward(self, text):
        embedded = self.embedding(text)

        output, hidden = self.rnn(embedded)

        assert torch.equal(output[-1,:,:], hidden.squeeze(0))

        return self.fc(hidden.squeeze(0))

In [8]:
INPUT_DIM = len(text.vocab)
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
OUTPUT_DIM = 1

model = RNN(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM)

In [9]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f'This model has {count_parameters(model):,} trainable parameters')

This model has 2,592,105 trainable parameters


In [10]:
import torch.optim as optim

optimizer = optim.SGD(model.parameters(), lr=1e-3)

In [11]:
criterion = nn.BCEWithLogitsLoss()

In [12]:
model = model.to(device)
criterion = criterion.to(device)

In [13]:
def binary_accuracy(preds, y):
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float()
    acc = correct.sum()/len(correct)
    return acc

In [14]:
def train(model, iterator, optimizer, criterion):
    epoch_loss = 0
    epoch_acc = 0

    model.train()

    for batch in iterator:
        optimizer.zero_grad()
        predictions = model.forward(batch.text).squeeze(1)
        loss = criterion(predictions, batch.label)

        acc = binary_accuracy(predictions, batch.label)

        loss.backward()     # calcualte the gradient of each param
        optimizer.step()

        epoch_loss += loss.item()
        epoch_acc += acc.item()
    return epoch_loss/len(iterator), epoch_acc/len(iterator)

In [15]:
def evaluate(model, iterator, criterion):
    epoch_loss = 0
    epoch_acc = 0

    model.eval()

    with torch.no_grad():   # don't calcualte gradients in this block
        for batch in iterator:
            predictions = model.forward(batch.text).squeeze(1)
            loss = criterion(predictions, batch.label)

            acc = binary_accuracy(predictions, batch.label)

            epoch_loss += loss.item()
            epoch_acc += acc.item()

    return epoch_loss/len(iterator), epoch_acc/len(iterator)


In [16]:
import time

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time/60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [19]:

N_EPOCHS = 6
best_validation_loss = float('inf')


In [20]:
for epoch in range(N_EPOCHS):
    start_time = time.time()

    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    validation_loss, validation_acc = evaluate(model, validation_iterator, criterion)

    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    if validation_loss < best_validation_loss:
        best_valid_loss = validation_loss
        torch.save(model.state_dict(), 'RNN-Model.pt')
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\tValidation Loss: {validation_loss:.3f} | Validation Acc: {validation_acc*100:.2f}%')

Epoch: 01 | Epoch Time: 7m 17s
	Train Loss: 0.693 | Train Acc: 49.03%
	Validation Loss: 0.693 | Validation Acc: 47.12%
Epoch: 02 | Epoch Time: 7m 43s
	Train Loss: 0.693 | Train Acc: 49.54%
	Validation Loss: 0.693 | Validation Acc: 52.86%
Epoch: 03 | Epoch Time: 7m 33s
	Train Loss: 0.693 | Train Acc: 49.76%
	Validation Loss: 0.693 | Validation Acc: 47.12%
Epoch: 04 | Epoch Time: 7m 48s
	Train Loss: 0.693 | Train Acc: 49.69%
	Validation Loss: 0.692 | Validation Acc: 52.86%
Epoch: 05 | Epoch Time: 7m 35s
	Train Loss: 0.693 | Train Acc: 49.35%
	Validation Loss: 0.693 | Validation Acc: 47.12%
Epoch: 06 | Epoch Time: 7m 53s
	Train Loss: 0.693 | Train Acc: 49.78%
	Validation Loss: 0.693 | Validation Acc: 47.12%


In [21]:
model.load_state_dict(torch.load('RNN-Model.pt'))
test_loss, test_acc = evaluate(model, test_iterator, criterion)
print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

Test Loss: 0.693 | Test Acc: 52.15%
