In [1]:
import sys
import csv
from pathlib import Path
from argparse import Namespace
import random

from tqdm import tqdm, tqdm_notebook

import numpy as np
import pandas as pd

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import f1_score

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

import torchtext
from torchtext import data

In [3]:
csv.field_size_limit(sys.maxsize)
tqdm_notebook().pandas()

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [4]:
DATA_PATH = Path('../data')
RANDOM_SEED = 17

In [5]:
def set_seed_everywhere(seed):
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
        
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

set_seed_everywhere(RANDOM_SEED)

In [6]:
train_df = pd.read_csv(DATA_PATH/'train.csv')
valid_df = pd.read_csv(DATA_PATH/'valid.csv')
test_df = pd.read_csv(DATA_PATH/'test.csv', names=['id', 'title', 'text'], header=0)

train_df = train_df.fillna('')
valid_df = valid_df.fillna('')
test_df = test_df.fillna('')

full_train_df = pd.concat([train_df, valid_df], axis=0, ignore_index=True)

train_df.to_json(DATA_PATH/'train.json', orient='records', lines=True)
valid_df.to_json(DATA_PATH/'valid.json', orient='records', lines=True)
full_train_df.to_json(DATA_PATH/'full_train.json', orient='records', lines=True)
test_df.to_json(DATA_PATH/'test.json', orient='records', lines=True)

print(train_df.shape, valid_df.shape, full_train_df.shape, test_df.shape)
train_df.columns, valid_df.columns, full_train_df.columns, test_df.columns

(24871, 3) (3552, 3) (28423, 3) (5647, 3)


(Index(['label', 'title', 'text'], dtype='object'),
 Index(['label', 'title', 'text'], dtype='object'),
 Index(['label', 'title', 'text'], dtype='object'),
 Index(['id', 'title', 'text'], dtype='object'))

In [28]:
SPLIT_RATIO = 0.8

ID = data.Field()
TITLE = data.Field(batch_first=True)
TEXT = data.Field(batch_first=True)
LABEL = data.LabelField()

train_fields = {'title': ('title', TITLE), 'text': ('text', TEXT), 'label': ('label', LABEL)}
test_fields = {'id': ('id', ID), 'title': ('title', TITLE), 'text': ('text', TEXT)}

full_train_data, = data.TabularDataset.splits(
    path = '../data/',
    train = 'full_train.json',
    format = 'json',
    fields = train_fields,
)

train_data, valid_data = full_train_data.split(
    split_ratio = SPLIT_RATIO,
    stratified = True, 
    strata_field = 'label',
    random_state = random.seed(RANDOM_SEED)
)

test_data,  = data.TabularDataset.splits(
    path = '../data/',
    test = 'test.json',
    format = 'json',
    fields = test_fields,
)

len(full_train_data), len(train_data), len(valid_data), len(test_data)

(28423, 22738, 5685, 5647)

In [29]:
TITLE_MAX_VOCAB_SIZE = None
TITLE_MIN_FREQ = 1
TEXT_MAX_VOCAB_SIZE = None
TEXT_MIN_FREQ = 1

TITLE.build_vocab(train_data,
                  max_size = TITLE_MAX_VOCAB_SIZE,
                  min_freq = TITLE_MIN_FREQ)

TEXT.build_vocab(train_data,
                 max_size = TEXT_MAX_VOCAB_SIZE,
                 min_freq = TEXT_MIN_FREQ)

LABEL.build_vocab(train_data)

print(len(TITLE.vocab))
print(TITLE.vocab.freqs.most_common(10))
print(len(TEXT.vocab))
print(TEXT.vocab.freqs.most_common(10))
print(len(LABEL.vocab))
print(LABEL.vocab.freqs.most_common(10))

41797
[('to', 4662), ('the', 3293), ('in', 3117), ('of', 2956), ('for', 2237), ('a', 2176), ('The', 1973), ('and', 1818), ('on', 1641), ('Trump', 1625)]
439185
[('the', 526516), ('to', 295062), ('of', 255355), ('a', 246630), ('and', 241636), ('in', 193152), ('that', 117137), ('for', 99170), ('is', 96550), ('on', 91107)]
3
[('news', 13390), ('other', 5915), ('clickbait', 3433)]


In [30]:
DEVICE = ('cuda' if torch.cuda.is_available() else 'cpu')
DEVICE

'cpu'

In [168]:
BATCH_SIZE = 5

train_iter, valid_iter, test_iter = data.BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size = BATCH_SIZE,
    sort_within_batch = True,
    sort_key=lambda x: len(x.title),
    device = DEVICE
)

In [212]:
class SimpleFC(nn.Module):
    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
        super().__init__()

        self.embedding = nn.Embedding(input_dim, embedding_dim)
        self.fc1 = nn.Linear(embedding_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, output_dim)
        
    def forward(self, title):
        # title = [batch_size, seq_length]
        
        embedded = self.embedding(title)
        # embedded = [batch_size, seq_length, embedding_dim] 
        
        x = self.fc1(embedded)
        x = F.relu(x)
        # x = [batch_size, seq_length, hidden_dim]
        
        x = x.permute(0, 2, 1)
        x = F.avg_pool1d(x, x.shape[2])
        x = x.squeeze(2)
        # x = [batch_size, hidden_dim]
        
        x = self.fc2(x)
        x = F.relu(x)
        # x = [batch_size, hidden_dim, output_dim]    
        
        return x

In [213]:
INPUT_DIM = len(TITLE.vocab)
EMBEDDING_DIM = 50
HIDDEN_DIM = 100
OUTPUT_DIM = len(LABEL.vocab)

print(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM)

model = SimpleFC(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM)
model

41797 50 100 3


SimpleFC(
  (embedding): Embedding(41797, 50)
  (fc1): Linear(in_features=50, out_features=100, bias=True)
  (fc2): Linear(in_features=100, out_features=3, bias=True)
)

In [214]:
model(batch.title)

tensor([[0.0000, 0.2846, 0.1737],
        [0.0000, 0.2455, 0.1118],
        [0.0000, 0.2725, 0.1477],
        [0.0000, 0.1467, 0.2011],
        [0.0000, 0.1731, 0.0953]], grad_fn=<ReluBackward0>)

In [216]:
print(batch.title.shape)
F.avg_pool1d(batch.title, batch.title.shape[1])

torch.Size([5, 13])


RuntimeError: Expected 3-dimensional tensor, but got 2-dimensional tensor for argument #1 'self' (while checking arguments for avg_pool1d)

In [171]:
def count_parameters(model):
    return np.sum(params.numel() for params in model.parameters() if params.requires_grad)

count_parameters(model)

4184903

In [172]:
batch = next(iter(train_iter))
print(batch)

y_true = batch.label
y_pred = model(batch.title)
y_true.shape, y_pred.shape


[torchtext.data.batch.Batch of size 5]
	[.title]:[torch.LongTensor of size 5x13]
	[.text]:[torch.LongTensor of size 5x1711]
	[.label]:[torch.LongTensor of size 5]


(torch.Size([5]), torch.Size([5, 3]))

In [173]:
optimizer = optim.Adam(model.parameters())
criterion = nn.CrossEntropyLoss()

model = model.to(DEVICE)

In [178]:
def train(model, iterator, optimizer, criterion):
    epoch_loss = 0
    epoch_f1 = 0
    
    model.train()
    
    for batch in tqdm_notebook(iterator):
        optimizer.zero_grad()
        
        y_pred = model(batch.title)
        loss = criterion(y_pred, batch.label)
        loss.backward()
        
        y_pred_proba = F.softmax(y_pred.detach().numpy(), dim=1).argmax(dim=1)
        
        f1 = f1_score(batch.label, y_pred_proba, average='macro')
        
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_f1 += f1.item()
        
    return epoch_loss / len(iterator), epoch_f1_score / len(iterator)

In [179]:
def evaluate(model, iterator, criterion):
    epoch_loss = 0
    epoch_f1 = 0
    
    model.eval()
    
    with torch.no_grad():
        for batch in iterator:
            y_pred = model(batch.title)
            loss = criterion(y_pred, batch.label)

            f1 = f1_score(batch.label, y_pred.detach().numpy(), average='macro')

            epoch_loss += loss.item()
            epoch_f1 += f1.item()
        
    return epoch_loss / len(iterator), epoch_f1_score / len(iterator)

In [180]:
EPOCHS = 1

best_valid_loss = np.float('inf')

for epoch in range(EPOCHS):
    train_loss, train_f1 = train(model, train_iter, optimizer, criterion)
    valid_loss, valid_f1 = evaluate(model, valid_iter, criterion)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'pipeline.pt')
        
    print(f'Epoch: {epoch}')
    print(f'Train loss: {train_loss}\t Train F1: {train_f1}')
    print(f'Valid loss: {valid_loss}\t Valid F1: {valid_f1}')    

HBox(children=(IntProgress(value=0, max=4548), HTML(value='')))




AttributeError: 'numpy.ndarray' object has no attribute 'softmax'