In [1]:
import sys
import csv
from pathlib import Path
from argparse import Namespace
import random

import numpy as np
import pandas as pd

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import f1_score

import torch
import torch.nn as nn
import torch.optim as optim

import torchtext
from torchtext import data

In [2]:
csv.field_size_limit(sys.maxsize)

131072

In [185]:
args = Namespace(
    data_path = Path('../data'),
    split_ratio = 0.8,
    max_vocab_size = None,
    min_freq = 1, 
    
    epochs = 1, 
    batch_size = 64,
    
    random_seed = 17,
    
    device = 'cpu'
)

In [4]:
args.device = torch.device('cuda' if (torch.cuda.is_available and (args.device == 'cuda')) else 'cpu')
print(args.device)

cpu


In [5]:
def set_seed_everywhere(seed, device):
    np.random.seed(seed)
    torch.manual_seed(seed)
    
    if device == 'cuda':
        torch.cuda.manual_seed_all(seed)
        
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

In [6]:
set_seed_everywhere(args.random_seed, args.device)

In [133]:
train_df = pd.read_csv(args.data_path/'train.csv')
valid_df = pd.read_csv(args.data_path/'valid.csv')
test_df = pd.read_csv(args.data_path/'test.csv', names=['id', 'title', 'text'], header=0)

train_df = train_df.fillna('')
valid_df = valid_df.fillna('')
test_df = test_df.fillna('')

full_train_df = pd.concat([train_df, valid_df], axis=0, ignore_index=True)

print(train_df.shape, valid_df.shape, full_train_df.shape, test_df.shape)
train_df.columns, valid_df.columns, full_train_df.columns, test_df.columns

(24871, 3) (3552, 3) (28423, 3) (5647, 3)


(Index(['label', 'title', 'text'], dtype='object'),
 Index(['label', 'title', 'text'], dtype='object'),
 Index(['label', 'title', 'text'], dtype='object'),
 Index(['id', 'title', 'text'], dtype='object'))

In [134]:
train_df.to_json(args.data_path/'train.json', orient='records', lines=True)
valid_df.to_json(args.data_path/'valid.json', orient='records', lines=True)
full_train_df.to_json(args.data_path/'full_train.json', orient='records', lines=True)

test_df.to_json(args.data_path/'test.json', orient='records', lines=True)

In [135]:
ID = data.Field()
TITLE = data.Field()
TEXT = data.Field()
LABEL = data.LabelField()

In [136]:
train_fields = {'title': ('title', TITLE), 'text': ('text', TEXT), 'label': ('label', LABEL)}
test_fields = {'id': ('id', ID), 'title': ('title', TITLE), 'text': ('text', TEXT)}

In [137]:
full_train_data, = data.TabularDataset.splits(
    path = '../data/',
    train = 'full_train.json',
    format = 'json',
    fields = train_fields,
)

len(full_train_data)

28423

In [138]:
train_data, valid_data = full_train_data.split(
    split_ratio = args.split_ratio,
    stratified = True, 
    strata_field = 'label',
    random_state = random.seed(args.random_seed)
)

len(train_data), len(valid_data)

(22738, 5685)

In [140]:
print(vars(train_data[0]))

{'title': [], 'text': ['Your', 'site', 'appears', 'to', 'be', 'broken', 'in', 'Chrome', 'at', 'least.', 'The', 'questions', 'section', 'is', 'unstyled', 'in', 'full', 'desktop', 'width', '(its', 'edge', 'to', 'edge', 'which', 'looks', 'pretty', 'bad', 'compared', 'to', 'the', 'rest', 'of', 'the', 'site)', 'and', 'there', 'is', 'no', 'form', 'below', 'the', '"fill', 'out', 'the', 'simple', 'form', 'below"', 'Oh', 'wait', 'I', 'have', 'to', 'click', 'to', 'show', 'the', 'form?', 'Why', 'do', 'that', 'when', 'the', 'instruction', 'text', 'suggests', 'I', 'should', 'be', 'seeing', 'a', '2:', 'That', 'form', 'is', 'horrifyingly', 'bad.'], 'label': 'other'}


In [141]:
test_data,  = data.TabularDataset.splits(
    path = '../data/',
    test = 'test.json',
    format = 'json',
    fields = test_fields,
)

len(test_data)

5647

In [142]:
TITLE.build_vocab(train_data,
                  max_size = args.max_vocab_size,
                  min_freq = args.min_freq)

print(len(TITLE.vocab))
TITLE.vocab.freqs.most_common(10)

41779


[('to', 4704),
 ('the', 3317),
 ('in', 3128),
 ('of', 2951),
 ('for', 2229),
 ('a', 2167),
 ('The', 1972),
 ('and', 1834),
 ('on', 1656),
 ('Trump', 1635)]

In [143]:
TEXT.build_vocab(train_data,
                 max_size = args.max_vocab_size,
                 min_freq = args.min_freq)

print(len(TEXT.vocab))
TEXT.vocab.freqs.most_common(10)

437303


[('the', 523125),
 ('to', 292426),
 ('of', 252964),
 ('a', 243728),
 ('and', 238967),
 ('in', 191643),
 ('that', 115872),
 ('for', 98369),
 ('is', 95825),
 ('on', 90314)]

In [144]:
LABEL.build_vocab(train_data)

print(len(LABEL.vocab))
LABEL.vocab.freqs.most_common(10)

3


[('news', 13390), ('other', 5915), ('clickbait', 3433)]

In [158]:
data.Iterator?

[0;31mInit signature:[0m [0mdata[0m[0;34m.[0m[0mIterator[0m[0;34m([0m[0mdataset[0m[0;34m,[0m [0mbatch_size[0m[0;34m,[0m [0msort_key[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m [0mdevice[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m [0mbatch_size_fn[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m [0mtrain[0m[0;34m=[0m[0;32mTrue[0m[0;34m,[0m [0mrepeat[0m[0;34m=[0m[0;32mFalse[0m[0;34m,[0m [0mshuffle[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m [0msort[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m [0msort_within_batch[0m[0;34m=[0m[0;32mNone[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m     
Defines an iterator that loads batches of data from a Dataset.

Attributes:
    dataset: The Dataset object to load Examples from.
    batch_size: Batch size.
    batch_size_fn: Function of three arguments (new example to add, current
        count of examples in the batch, and current effective batch size)
        that returns the new effective batch size

In [163]:
train_iter, valid_iter = data.BucketIterator.splits(
    (train_data, valid_data),
    batch_size = args.batch_size,
    sort_within_batch = True,
    sort_key=lambda x: data.interleave_keys(len(x.title), len(x.text)),
    device = args.device
)

In [166]:
next(iter(train_iter))


[torchtext.data.batch.Batch of size 64]
	[.title]:[torch.LongTensor of size 24x64]
	[.text]:[torch.LongTensor of size 367x64]
	[.label]:[torch.LongTensor of size 64]

In [167]:
class SimpleFC(nn.Module):
    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
        super().__init__()

        self.embedding = nn.Embedding(input_dim, embedding_dim)
        self.fc1 = nn.Linear(embedding_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_dim, output_dim)
        
    def forward(self, title):
        embedded = self.embedding(title)
        x = self.fc1(embedded)
        x = self.relu(x)
        x = self.fc2(x)
        
        return x

In [168]:
input_dim = len(TITLE.vocab)
embedding_dim = 100
hidden_dim = 100
output_dim = len(LABEL.vocab)

input_dim, embedding_dim, hidden_dim, output_dim

(41779, 100, 100, 3)

In [169]:
model = SimpleFC(input_dim, embedding_dim, hidden_dim, output_dim)

In [170]:
def count_parameters(model):
    return np.sum(params.numel() for params in model.parameters() if params.requires_grad)

count_parameters(model)

4188303

In [199]:
batch = next(iter(train_iter))
print(batch)

y_true = batch.label
y_pred = model(batch.title)
y_true.shape, y_pred.shape


[torchtext.data.batch.Batch of size 64]
	[.title]:[torch.LongTensor of size 34x64]
	[.text]:[torch.LongTensor of size 847x64]
	[.label]:[torch.LongTensor of size 64]


(torch.Size([64]), torch.Size([34, 64, 3]))

In [197]:
batch.label.shape

torch.Size([64])

In [172]:
optimizer = optim.Adam(model.parameters())
criterion = nn.CrossEntropyLoss()

model = model.to(args.device)

In [183]:
def train(model, iterator, optimizer, criterion):
    epoch_loss = 0
    epoch_f1 = 0
    
    model.train()
    
    for batch in iterator:
        optimizer.zero_grad()
        
        y_pred = model(batch.title)
        f1_score = f1_score(batch.label, y_pred, average='macro')
        
        loss = criterion(y_pred, batch.label)
        loss.backward()
        
        epoch_loss += loss.item()
        epoch_f1_score += f1_score.item()
        
    return epoch_loss / len(iterator), epoch_f1_score / len(iterator)

In [184]:
def evaluate(model, iterator, criterion):
    epoch_loss = 0
    epoch_f1 = 0
    
    model.eval()
    
    with torch.no_grad():
        y_pred = model(batch.title)
        f1_score = f1_score(batch.label, y_pred, average='macro')
        
        loss = criterion(y_pred, batch.label)
        
        epoch_loss += loss.item()
        epoch_f1_score += f1_score.item()
        
    return epoch_loss / len(iterator), epoch_f1_score / len(iterator)

In [188]:
best_valid_loss = np.float('inf')



TypeError: 'torch.dtype' object is not callable