In [3]:
# New Stuff
from torchtext.data import Field, TabularDataset, BucketIterator
import spacy
import torch

torch.backends.cudnn.deterministic = True

# spacy_en = spacy.load('en')

#     return [token.text for token in spacy_en.tokenizer(text)]

title = Field(sequential=True, use_vocab=True, tokenize='spacy', lower=True)
text = Field(sequential=True, use_vocab=True, tokenize='spacy', lower=True)
label = Field(sequential=False, use_vocab=False, dtype = torch.float)

fields = {'title': ('title', title), 'text': ('text', text), 'label': ('label', label)}

In [4]:
train_data, test_data = TabularDataset.splits(
    path='../data',
    train='news.csv',
    test='news.csv',
    format='csv',
    fields=fields)
print("Num of training: ", len(train_data))
print("Num of testing: ", len(test_data))

Num of training:  6335
Num of testing:  6335


In [5]:
train_data, validation_data = train_data.split(split_ratio=0.75)
print("Num of training: ", len(train_data))
print("Num of validation: ", len(validation_data))
print("Num of testing: ", len(test_data))

Num of training:  4751
Num of validation:  1584
Num of testing:  6335


In [6]:
print(train_data[0].__dict__.keys())
print(train_data[0].__dict__.values())

dict_keys(['title', 'text', 'label'])
dict_values([['if', 'you', 'really', 'want', 'to', 'save', 'energy', 'at', 'home', ',', 'forget', 'about', 'your', 'light', 'switches'], ['keeping', 'an', 'eye', 'on', 'your', 'own', 'energy', 'use', 'is', 'the', '"', 'duh', '"', 'approach', 'to', 'a', 'smorgasbord', 'of', 'environmental', 'problems', ',', 'up', 'to', 'and', 'including', 'climate', 'change', '.', '\xa0', 'as', 'a', 'reporter', ',', 'i', 'can', 'obsess', 'over', 'research', 'funding', 'for', 'renewable', 'technology', ',', 'or', 'streamlined', 'permitting', 'for', 'solar', 'installations', ',', 'or', 'more', 'public', 'transit', ',', 'or', 'better', 'roads', 'for', 'cyclists', 'and', 'pedestrians', ',', 'or', 'how', 'much', 'fuel', 'is', 'burned', 'in', 'schlepping', 'and', 'refrigerating', 'my', 'food', 'before', 'it', 'gets', 'to', 'me', '.', 'but', 'if', 'i', 'actually', 'want', 'to', 'feel', 'like', 'i', 'have', 'control', 'over', 'one', 'small', 'corner', 'of', 'the', 'world', 

In [7]:
MAX_VOCAB_SIZE = 25000
text.build_vocab(train_data, vectors = "glove.6B.100d", max_size=MAX_VOCAB_SIZE)
title.build_vocab(train_data, vectors = "glove.6B.100d", max_size=MAX_VOCAB_SIZE)
label.build_vocab(train_data)

.vector_cache\glove.6B.zip: 862MB [06:30, 2.21MB/s]                           
100%|█████████▉| 399999/400000 [04:01<00:00, 1656.82it/s]


In [8]:
BATCH_SIZE = 64

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_iterator, validation_iterator, test_iterator = BucketIterator.splits(
    (train_data, validation_data, test_data),
    batch_size=BATCH_SIZE,
    device=device)

In [10]:
import torch.nn as nn

class RNN(nn.Module):
    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, embedding_dim)
        self.rnn = nn.RNN(embedding_dim, hidden_dim)
        self.fc = nn.Linear(hidden_dim, output_dim)
    
    def forward(self, text):
        embedded = self.embedding(text)

        output, hidden = self.rnn(embedded)

        assert torch.equal(output[-1,:,:], hidden.squeeze(0))

        return self.fc(hidden.squeeze(0))

In [11]:
INPUT_DIM = len(text.vocab)
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
OUTPUT_DIM = 1

model = RNN(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM)

In [15]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f'This model has {count_parameters(model):,} trainable parameters')

This model has 2,592,105 trainable parameters


In [16]:
import torch.optim as optim

optimizer = optim.SGD(model.parameters(), lr=1e-3)