In [118]:
import sys
import csv
from pathlib import Path
from argparse import Namespace
import random

import numpy as np
import pandas as pd

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import f1_score

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

import torchtext
from torchtext import data

In [2]:
csv.field_size_limit(sys.maxsize)

131072

In [3]:
args = Namespace(
    data_path = Path('../data'),
    split_ratio = 0.8,
    max_vocab_size = None,
    min_freq = 1, 
    
    epochs = 1, 
    batch_size = 64,
    
    random_seed = 17,
    
    device = 'cpu'
)

In [4]:
args.device = torch.device('cuda' if (torch.cuda.is_available and (args.device == 'cuda')) else 'cpu')
print(args.device)

cpu


In [10]:
def set_seed_everywhere(seed, device):
    np.random.seed(seed)
    torch.manual_seed(seed)
    
    if device == 'cuda':
        torch.cuda.manual_seed_all(seed)
        
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

In [11]:
set_seed_everywhere(args.random_seed, args.device)

In [12]:
train_df = pd.read_csv(args.data_path/'train.csv')
valid_df = pd.read_csv(args.data_path/'valid.csv')
test_df = pd.read_csv(args.data_path/'test.csv', names=['id', 'title', 'text'], header=0)

train_df = train_df.fillna('')
valid_df = valid_df.fillna('')
test_df = test_df.fillna('')

full_train_df = pd.concat([train_df, valid_df], axis=0, ignore_index=True)

print(train_df.shape, valid_df.shape, full_train_df.shape, test_df.shape)
train_df.columns, valid_df.columns, full_train_df.columns, test_df.columns

(24871, 3) (3552, 3) (28423, 3) (5647, 3)


(Index(['label', 'title', 'text'], dtype='object'),
 Index(['label', 'title', 'text'], dtype='object'),
 Index(['label', 'title', 'text'], dtype='object'),
 Index(['id', 'title', 'text'], dtype='object'))

In [13]:
train_df.to_json(args.data_path/'train.json', orient='records', lines=True)
valid_df.to_json(args.data_path/'valid.json', orient='records', lines=True)
full_train_df.to_json(args.data_path/'full_train.json', orient='records', lines=True)

test_df.to_json(args.data_path/'test.json', orient='records', lines=True)

In [62]:
ID = data.Field()
TITLE = data.Field(batch_first=True)
TEXT = data.Field(batch_first=True)
LABEL = data.LabelField()

In [63]:
train_fields = {'title': ('title', TITLE), 'text': ('text', TEXT), 'label': ('label', LABEL)}
test_fields = {'id': ('id', ID), 'title': ('title', TITLE), 'text': ('text', TEXT)}

In [64]:
full_train_data, = data.TabularDataset.splits(
    path = '../data/',
    train = 'full_train.json',
    format = 'json',
    fields = train_fields,
)

len(full_train_data)

28423

In [65]:
train_data, valid_data = full_train_data.split(
    split_ratio = args.split_ratio,
    stratified = True, 
    strata_field = 'label',
    random_state = random.seed(args.random_seed)
)

len(train_data), len(valid_data)

(22738, 5685)

In [66]:
test_data,  = data.TabularDataset.splits(
    path = '../data/',
    test = 'test.json',
    format = 'json',
    fields = test_fields,
)

len(test_data)

5647

In [67]:
TITLE.build_vocab(train_data,
                  max_size = args.max_vocab_size,
                  min_freq = args.min_freq)

print(len(TITLE.vocab))
TITLE.vocab.freqs.most_common(10)

41779


[('to', 4704),
 ('the', 3317),
 ('in', 3128),
 ('of', 2951),
 ('for', 2229),
 ('a', 2167),
 ('The', 1972),
 ('and', 1834),
 ('on', 1656),
 ('Trump', 1635)]

In [68]:
TEXT.build_vocab(train_data,
                 max_size = args.max_vocab_size,
                 min_freq = args.min_freq)

print(len(TEXT.vocab))
TEXT.vocab.freqs.most_common(10)

437303


[('the', 523125),
 ('to', 292426),
 ('of', 252964),
 ('a', 243728),
 ('and', 238967),
 ('in', 191643),
 ('that', 115872),
 ('for', 98369),
 ('is', 95825),
 ('on', 90314)]

In [69]:
LABEL.build_vocab(train_data)

print(len(LABEL.vocab))
LABEL.vocab.freqs.most_common(10)

3


[('news', 13390), ('other', 5915), ('clickbait', 3433)]

In [70]:
train_iter, valid_iter = data.BucketIterator.splits(
    (train_data, valid_data),
    batch_size = args.batch_size,
    sort_within_batch = True,
    sort_key=lambda x: data.interleave_keys(len(x.title), len(x.text)),
    device = args.device
)

In [81]:
batch = next(iter(train_iter))
print(batch)


[torchtext.data.batch.Batch of size 64]
	[.title]:[torch.LongTensor of size 64x30]
	[.text]:[torch.LongTensor of size 64x558]
	[.label]:[torch.LongTensor of size 64]


In [114]:
class SimpleFC(nn.Module):
    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
        super().__init__()

        self.embedding = nn.Embedding(input_dim, embedding_dim)
        self.fc1 = nn.Linear(embedding_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_dim, output_dim)
#         self.avg_pool = nn.AvgPool1d()
        
    def forward(self, title):
        # title = [batch_size, seq_length]
        
        embedded = self.embedding(title)
        print(embedded.shape)
        # embedded = [batch_size, seq_length, embedding_dim] 
        
        x = self.fc1(embedded)
        print(x.shape)
        # x = [batch_size, seq_length, hidden_dim]
        
        x = self.relu(x)
        print(x.shape)
        # x = [batch_size, seq_length, hidden_dim]
   
        x = self.fc2(x)
        print(x.shape)
        # x = [batch_size, seq_length, output_dim]    
        
        # ???
        
        return x

In [124]:
arr = torch.arange(24).reshape(2, 3, 4).type(torch.float)
arr

tensor([[[ 0.,  1.,  2.,  3.],
         [ 4.,  5.,  6.,  7.],
         [ 8.,  9., 10., 11.]],

        [[12., 13., 14., 15.],
         [16., 17., 18., 19.],
         [20., 21., 22., 23.]]])

In [127]:
F.avg_pool1d(arr, arr.shape[1])

tensor([[[ 1.],
         [ 5.],
         [ 9.]],

        [[13.],
         [17.],
         [21.]]])

In [119]:
input_dim = len(TITLE.vocab)
embedding_dim = 100
hidden_dim = 50
output_dim = len(LABEL.vocab)

print(input_dim, embedding_dim, hidden_dim, output_dim)
model = SimpleFC(input_dim, embedding_dim, hidden_dim, output_dim)
model

41779 100 50 3


TypeError: __init__() missing 1 required positional argument: 'kernel_size'

In [116]:
print(batch.title.shape)
model(batch.title);

torch.Size([64, 30])
torch.Size([64, 30, 100])
torch.Size([64, 30, 50])
torch.Size([64, 30, 50])
torch.Size([64, 30, 3])


In [76]:
y_true = batch.label
y_pred = model(batch.title)
y_true.shape, y_pred.shape


[torchtext.data.batch.Batch of size 64]
	[.title]:[torch.LongTensor of size 64x24]
	[.text]:[torch.LongTensor of size 64x367]
	[.label]:[torch.LongTensor of size 64]
torch.Size([64, 24])
torch.Size([64, 24, 100])


AttributeError: 'SimpleFC' object has no attribute 'relu'

In [27]:
def count_parameters(model):
    return np.sum(params.numel() for params in model.parameters() if params.requires_grad)

count_parameters(model)

4188303

In [32]:
optimizer = optim.Adam(model.parameters())
criterion = nn.CrossEntropyLoss()

model = model.to(args.device)

In [52]:
def train(model, iterator, optimizer, criterion):
    epoch_loss = 0
    epoch_f1 = 0
    
    model.train()
    
    for batch in iterator:
        optimizer.zero_grad()
        
        y_pred = model(batch.title)
        epoch_f1 = f1_score(batch.label, y_pred, average='macro')
        
        loss = criterion(y_pred, batch.label)
        loss.backward()
        
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_f1_score += f1_score.item()
        
    return epoch_loss / len(iterator), epoch_f1_score / len(iterator)

In [41]:
def evaluate(model, iterator, criterion):
    epoch_loss = 0
    epoch_f1 = 0
    
    model.eval()
    
    with torch.no_grad():
        y_pred = model(batch.title)
        epoch_f1 = f1_score(batch.label, y_pred, average='macro')
        
        loss = criterion(y_pred, batch.label)
        
        epoch_loss += loss.item()
        epoch_f1_score += f1_score.item()
        
    return epoch_loss / len(iterator), epoch_f1_score / len(iterator)

In [42]:
best_valid_loss = np.float('inf')

for epoch in range(args.epochs):
    train_loss, train_f1 = train(model, train_iter, optimizer, criterion)
    valid_loss, valid_f1 = evaluate(model, valid_iter, criterion)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'pipeline.pt')
        
    print(f'Epoch: {epoch}')
    print(f'Train loss: {train_loss}\t Train F1: {train_f1}')
    print(f'Valid loss: {valid_loss}\t Valid F1: {valid_f1}')    

ValueError: Found input variables with inconsistent numbers of samples: [64, 30]