In [1]:
import sys
import csv
from pathlib import Path
from argparse import Namespace
import random

from tqdm import tqdm, tqdm_notebook

import numpy as np
import pandas as pd

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import f1_score

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

import torchtext
from torchtext import data

In [2]:
csv.field_size_limit(sys.maxsize)
tqdm_notebook().pandas()

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [3]:
DATA_PATH = Path('../data')
MODELS_PATH = Path('../models')
RANDOM_SEED = 17

In [4]:
def set_seed_everywhere(seed):
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
        
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

set_seed_everywhere(RANDOM_SEED)

In [5]:
train_df = pd.read_csv(DATA_PATH/'train.csv')
valid_df = pd.read_csv(DATA_PATH/'valid.csv')
test_df = pd.read_csv(DATA_PATH/'test.csv', names=['id', 'title', 'text'], header=0)

train_df = train_df.fillna('<EMPTY>')
valid_df = valid_df.fillna('<EMPTY>')
test_df = test_df.fillna('<EMPTY>')

full_train_df = pd.concat([train_df, valid_df], axis=0, ignore_index=True)

train_df.to_json(DATA_PATH/'train.json', orient='records', lines=True)
valid_df.to_json(DATA_PATH/'valid.json', orient='records', lines=True)
full_train_df.to_json(DATA_PATH/'full_train.json', orient='records', lines=True)
test_df.to_json(DATA_PATH/'test.json', orient='records', lines=True)

print(train_df.shape, valid_df.shape, full_train_df.shape, test_df.shape)
train_df.columns, valid_df.columns, full_train_df.columns, test_df.columns

(24871, 3) (3552, 3) (28423, 3) (5647, 3)


(Index(['label', 'title', 'text'], dtype='object'),
 Index(['label', 'title', 'text'], dtype='object'),
 Index(['label', 'title', 'text'], dtype='object'),
 Index(['id', 'title', 'text'], dtype='object'))

In [96]:
SPLIT_RATIO = 0.8

ID = data.Field()
TITLE = data.Field(batch_first=True)
TEXT = data.Field(batch_first=True)
LABEL = data.LabelField()

train_fields = {'title': ('title', TITLE), 'text': ('text', TEXT), 'label': ('label', LABEL)}
test_fields = {'title': ('title', TITLE), 'text': ('text', TEXT)}
# test_fields = {'id': ('id', ID), 'title': ('title', TITLE), 'text': ('text', TEXT)}


full_train_data, = data.TabularDataset.splits(
    path = '../data/',
    train = 'full_train.json',
    format = 'json',
    fields = train_fields,
)

train_data, valid_data = full_train_data.split(
    split_ratio = SPLIT_RATIO,
    stratified = True, 
    strata_field = 'label',
    random_state = random.seed(RANDOM_SEED)
)

test_data,  = data.TabularDataset.splits(
    path = '../data/',
    test = 'test.json',
    format = 'json',
    fields = test_fields,
)

len(full_train_data), len(train_data), len(valid_data), len(test_data)

(28423, 22738, 5685, 5647)

In [97]:
TITLE_MAX_VOCAB_SIZE = None
TITLE_MIN_FREQ = 1
TEXT_MAX_VOCAB_SIZE = None
TEXT_MIN_FREQ = 1

TITLE.build_vocab(train_data,
                  max_size = TITLE_MAX_VOCAB_SIZE,
                  min_freq = TITLE_MIN_FREQ)

TEXT.build_vocab(train_data,
                 max_size = TEXT_MAX_VOCAB_SIZE,
                 min_freq = TEXT_MIN_FREQ)

LABEL.build_vocab(train_data)

print(len(TITLE.vocab))
print(TITLE.vocab.freqs.most_common(10))
print(len(TEXT.vocab))
print(TEXT.vocab.freqs.most_common(10))
print(len(LABEL.vocab))
print(LABEL.vocab.freqs.most_common(10))

41547
[('to', 4649), ('<EMPTY>', 4530), ('the', 3315), ('in', 3072), ('of', 2976), ('for', 2216), ('a', 2130), ('The', 2000), ('and', 1829), ('on', 1655)]
438252
[('the', 528152), ('to', 294826), ('of', 256077), ('a', 247590), ('and', 241233), ('in', 193414), ('that', 117477), ('for', 99191), ('is', 96799), ('on', 91650)]
3
[('news', 13390), ('other', 5915), ('clickbait', 3433)]


In [98]:
DEVICE = ('cuda' if torch.cuda.is_available() else 'cpu')
DEVICE

'cuda'

In [99]:
BATCH_SIZE = 64

train_iter, valid_iter, test_iter = data.BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size = BATCH_SIZE,
    sort_within_batch = True,
    sort_key=lambda x: len(x.title),
    device = DEVICE
)

In [69]:
class SimpleFC(nn.Module):
    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
        super().__init__()

        self.embedding = nn.Embedding(input_dim, embedding_dim)
        self.fc1 = nn.Linear(embedding_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, output_dim)
        
    def forward(self, title):
        # title = [batch_size, seq_length]
        
        embedded = self.embedding(title)
        # embedded = [batch_size, seq_length, embedding_dim] 
        
        x = self.fc1(embedded)
        x = F.relu(x)
        # x = [batch_size, seq_length, hidden_dim]
        
        x = x.permute(0, 2, 1)
        x = F.avg_pool1d(x, x.shape[2])
        x = x.squeeze(2)
        # x = [batch_size, hidden_dim]
        
        x = self.fc2(x)
        x = F.relu(x)
        # x = [batch_size, output_dim]    
        
        return x

In [70]:
INPUT_DIM = len(TITLE.vocab)
EMBEDDING_DIM = 100
HIDDEN_DIM = 100
OUTPUT_DIM = len(LABEL.vocab)

print(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM)

model = SimpleFC(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM)
model

41547 100 100 3


SimpleFC(
  (embedding): Embedding(41547, 100)
  (fc1): Linear(in_features=100, out_features=100, bias=True)
  (fc2): Linear(in_features=100, out_features=3, bias=True)
)

In [71]:
def count_parameters(model):
    return np.sum([params.numel() for params in model.parameters() if params.requires_grad])

count_parameters(model)

4165103

In [76]:
optimizer = optim.Adam(model.parameters(), lr=1e-4, weight_decay=1e-3)
criterion = nn.CrossEntropyLoss()

model = model.to(DEVICE)

In [77]:
def categorical_accuracy(y_pred, y_true):
    max_preds = y_pred.argmax(dim=1, keepdim=True)
    correct = max_preds.squeeze(1).eq(y_true)
    return correct.sum() / torch.FloatTensor([y_true.shape[0]])

In [78]:
def train(model, iterator, optimizer, criterion):
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()
    
    for batch in tqdm_notebook(iterator):
        optimizer.zero_grad()
        
        y_pred = model(batch.title)
        
        loss = criterion(y_pred, batch.label)
        loss.backward()
        
        acc = categorical_accuracy(y_pred, batch.label)
        
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [79]:
def evaluate(model, iterator, criterion):
    epoch_loss = 0
    epoch_acc = 0
    
    model.eval()
    
    with torch.no_grad():
        for batch in tqdm_notebook(iterator):
            y_pred = model(batch.title)
            loss = criterion(y_pred, batch.label)

            acc = categorical_accuracy(y_pred, batch.label)

            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [80]:
EPOCHS = 10

best_valid_loss = np.float('inf')

for epoch in tqdm_notebook(range(EPOCHS)):
    train_loss, train_acc = train(model, train_iter, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iter, criterion)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), MODELS_PATH/'SimpleFC.pth')
        
    print(f'Epoch: {epoch}')
    print(f'Train loss: {train_loss}, Train Accuracy: {train_acc}')
    print(f'Valid loss: {valid_loss}, Valid Accuracy: {valid_acc}')    

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

HBox(children=(IntProgress(value=0, max=356), HTML(value='')))

HBox(children=(IntProgress(value=0, max=89), HTML(value='')))

Epoch: 0
Train loss: 0.8277689650487364, Train Accuracy: 0.7585293772850144
Valid loss: 0.7141840879836779, Valid Accuracy: 0.7791664459732142


HBox(children=(IntProgress(value=0, max=356), HTML(value='')))

HBox(children=(IntProgress(value=0, max=89), HTML(value='')))

Epoch: 1
Train loss: 0.6550872507348154, Train Accuracy: 0.7876531289199765
Valid loss: 0.6395830167143532, Valid Accuracy: 0.7791664459732142


HBox(children=(IntProgress(value=0, max=356), HTML(value='')))

HBox(children=(IntProgress(value=0, max=89), HTML(value='')))

Epoch: 2
Train loss: 0.6151465155665626, Train Accuracy: 0.7872629915730337
Valid loss: 0.6159796561166812, Valid Accuracy: 0.7847844235013041


HBox(children=(IntProgress(value=0, max=356), HTML(value='')))

HBox(children=(IntProgress(value=0, max=89), HTML(value='')))

Epoch: 3
Train loss: 0.6008370483315058, Train Accuracy: 0.7880627730589235
Valid loss: 0.6050323202703776, Valid Accuracy: 0.7851355470968097


HBox(children=(IntProgress(value=0, max=356), HTML(value='')))

HBox(children=(IntProgress(value=0, max=89), HTML(value='')))

Epoch: 4
Train loss: 0.5923088225431405, Train Accuracy: 0.7886382255996212
Valid loss: 0.5984721203449737, Valid Accuracy: 0.7860133560855737


HBox(children=(IntProgress(value=0, max=356), HTML(value='')))

KeyboardInterrupt: 

In [81]:
model.load_state_dict(torch.load(MODELS_PATH/'SimpleFC.pth'))

In [None]:
preds = []

model.eval()

with tqdm_notebook(torch.no_grad()):
    for batch in test_iter:
        y_pred = model(batch.title)
        y_pred_class = y_pred.argmax(dim=1).cpu().numpy()
        preds.extend(y_pred_class)

preds

In [129]:
submission_df = pd.DataFrame({'id': test_df.index, 'label': preds})
submission_df.head()

Unnamed: 0,id,label
0,0,0
1,1,0
2,2,0
3,3,0
4,4,0


In [138]:
submission_df.label.map(lambda x: LABEL.vocab.itos[x]).value_counts()

news     5631
other      16
Name: label, dtype: int64