In [2]:
# New Stuff
from torchtext.data import Field, TabularDataset, BucketIterator
import spacy
import torch

torch.backends.cudnn.deterministic = True

# spacy_en = spacy.load('en')

#     return [token.text for token in spacy_en.tokenizer(text)]

title = Field(sequential=True, use_vocab=True, tokenize='spacy', lower=True)
text = Field(sequential=True, use_vocab=True, tokenize='spacy', lower=True)
label = Field(sequential=False, use_vocab=False, dtype = torch.float)

fields = {'title': ('title', title), 'text': ('text', text), 'label': ('label', label)}



In [3]:
train_data, test_data = TabularDataset.splits(
    path='../data',
    train='news.csv',
    test='news.csv',
    format='csv',
    fields=fields)
print("Num of training: ", len(train_data))
print("Num of testing: ", len(test_data))



Num of training:  6335
Num of testing:  6335


In [4]:
train_data, validation_data = train_data.split(split_ratio=0.75)
print("Num of training: ", len(train_data))
print("Num of validation: ", len(validation_data))
print("Num of testing: ", len(test_data))

Num of training:  4751
Num of validation:  1584
Num of testing:  6335


In [5]:
print(train_data[0].__dict__.keys())
print(train_data[0].__dict__.values())

dict_keys(['title', 'text', 'label'])
dict_values([['gop', 'hits', 'another', 'roadblock', 'on', 'obamacare', 'repeal'], ['donald', 'trump', 'is', 'considering', 'tapping', 'a', 'democrat', 'to', 'be', 'his', 'treasury', 'secretary', ',', 'politico', 'has', 'learned', '.'], 'REAL'])


In [6]:
import torch
from torchtext import data
from torchtext import datasets
import random
import numpy as np

SEED = 1234

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

TEXT = data.Field(tokenize = 'spacy', batch_first = True)
LABEL = data.LabelField(dtype = torch.float)

train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)

train_data, valid_data = train_data.split(random_state = random.seed(SEED))

aclImdb_v1.tar.gz:   0%|          | 0.00/84.1M [00:00<?, ?B/s]

downloading aclImdb_v1.tar.gz


aclImdb_v1.tar.gz: 100%|██████████| 84.1M/84.1M [00:09<00:00, 8.57MB/s]


In [7]:
MAX_VOCAB_SIZE = 25_000

TEXT.build_vocab(train_data, 
                 max_size = MAX_VOCAB_SIZE, 
                 vectors = "glove.6B.100d", 
                 unk_init = torch.Tensor.normal_)

LABEL.build_vocab(train_data)

.vector_cache/glove.6B.zip: 862MB [06:31, 2.20MB/s]                               
100%|█████████▉| 399999/400000 [00:15<00:00, 25520.19it/s]


In [8]:
BATCH_SIZE = 64

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data), 
    batch_size = BATCH_SIZE, 
    device = device)



In [16]:
import torch.nn as nn
import torch.nn.functional as F

class CNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, n_filters, filter_sizes, output_dim, dropout, pad_idx):
        super().__init__()
        
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx = pad_idx)
        
        self.conv_0 = nn.Conv2d(in_channels = 1, out_channels = n_filters,
                               kernel_size = (filter_sizes[0], embedding_dim))
        
        self.conv_1 = nn.Conv2d(in_channels = 1,
                               out_channels = n_filters,
                               kernel_size = (filter_sizes[1], embedding_dim))
        
        self.conv_2 = nn.Conv2d(in_channels = 1,
                               out_channels = n_filters,
                               kernel_size = (filter_sizes[2], embedding_dim))
        
        self.fc = nn.Linear(len(filter_sizes) * n_filters, output_dim)
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, text):
        
        #text = [batch size, sent len]
        embedded = self.embedding(text)
        
        #embedded = [batch size, sent len, emb dim]
        embedded = embedded.unsqueeze(1)
        
        #embedded = [batch size, 1, sent len, emb dim]
        conved_0 = F.relu(self.conv_0(embedded).squeeze(3))
        conved_1 = F.relu(self.conv_1(embedded).squeeze(3))
        conved_2 = F.relu(self.conv_1(embedded).squeeze(3))
        
        #conved_n = [batch size, n_filets, sent len - filter_sizes[n] + 1]
        pooled_0 = F.max_pool1d(conved_0, conved_0.shape[2]).squeeze(2)
        pooled_1 = F.max_pool1d(conved_1, conved_1.shape[2]).squeese(2)
        pooled_2 = F.max_pool1d(conved_2, conved_2.shape[2]).squeeze(2)
        
        #pooled_n = [batch size, n_filters]
        cat= self.dropout(torch.cat((pooled_0, pooled_1, pooled_2), dim=1))
        
        #cat = [batch size, n_filters * len(filter_sizes)]
        
        return self.fc(cat)
        

In [17]:
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 100
N_FILTERS = 100
FILTER_SIZES = [3,4,5]
OUTPUT_DIM = 1
DROPOUT = 0.5
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]

model = CNN(INPUT_DIM, EMBEDDING_DIM, N_FILTERS, FILTER_SIZES, OUTPUT_DIM, DROPOUT, PAD_IDX)