In [5]:
import logging
import pandas as pd
import numpy as np
import os
import pdb

import gensim
import torchtext
import nltk
import random
from nltk import word_tokenize
from pathlib import Path
from collections import Counter
from sklearn import metrics
from sklearn.model_selection import train_test_split
from tqdm import tqdm_notebook as tqdm
from IPython.display import display, clear_output

import torch
from torch import nn
from torch.autograd import Variable
import torch.nn.functional as F
from torch.nn.utils.rnn import pad_sequence
from torch import optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence

In [6]:
# Hyperparameters
nrows = 200
no_below = 3
batch_size = 32
n_embed = 32
n_hidden = 128
n_layers = 2
epochs = 5
lr=1e-3
datapath = Path('/src/data')

In [7]:
class TextFeatureTransformer(object):
    
    _reserved = ['<UNK>', '<BOS>', '<EOS>', '<PAD>']

    def __init__(self, tokenizer=word_tokenize, preprocessors=[]):
        self.vocab = gensim.corpora.Dictionary()
        self.preprocessors = preprocessors
        self.tokenizer = tokenizer

    def __call__(self, text):
        text = self.preprocess(text)
        tokens = self.tokenizer(text)
        unk = self.vocab.token2id['<UNK>']
        wids = self.vocab.doc2idx(tokens, unknown_word_index=unk)
        return wids

    def preprocess(self, text):
        for preprocessor in self.preprocessors:
            text = preprocessor(text)
        return text
        
    def fit(self, texts, no_below=1):
        self.vocab.doc2bow(self._reserved, allow_update=True)
        for text in tqdm(texts):
            text = self.preprocess(text)
            tokens = self.tokenizer(text)
            self.vocab.doc2bow(tokens, allow_update=True)
        self.vocab.filter_extremes(no_below=no_below, keep_tokens=self._reserved)


class QIQCDataset(Dataset):
    
    def __init__(self, df, device=-1):
        self.df = df
        
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, i):
        if i == len(self):
            raise StopIteration
        row = self.df.iloc[i]
        
        dataset = {}
        dataset['qid'] = row.qid
        dataset['text'] = row.question_text
        if 'target' in row:
            dataset['target'] = row.target
        
        return dataset

In [8]:
preprocessors = [str.lower]
transformer = TextFeatureTransformer(preprocessors=preprocessors)

# Load rawdata

df_train_all = pd.read_csv(datapath / 'train.csv', nrows=nrows)
df_train, df_valid = train_test_split(
    df_train_all, test_size=0.2, random_state=0)
df_valid, df_test = train_test_split(
    df_valid, test_size=0.5, shuffle=False)
df_submit = pd.read_csv(datapath / 'test.csv', nrows=nrows)

# Build vocab
texts = np.concatenate([df_train_all.question_text, df_test.question_text])
transformer.fit(texts, no_below=no_below)

# Build dataset
train_dataset = DataLoader(
    QIQCDataset(df_train),
    batch_size=batch_size,
    shuffle=True
)
valid_dataset = DataLoader(
    QIQCDataset(df_valid),
    batch_size=batch_size,
    shuffle=False
)
test_dataset = DataLoader(
    QIQCDataset(df_test),
    batch_size=batch_size,
    shuffle=False
)
submit_dataset = DataLoader(
    QIQCDataset(df_submit),
    batch_size=batch_size,
    shuffle=False
)

HBox(children=(IntProgress(value=0, max=220), HTML(value='')))




In [9]:
def evaluate(losses, ys, ts, thresholds=[0.5, 0.1, 0.01]):
    scores = {}
    ys = np.concatenate(ys)
    ts = np.concatenate(ts)
    
    scores['loss'] = np.mean(losses)
    if len(np.unique(ts)) > 1:
        scores['ap'] = metrics.average_precision_score(ts, ys)
        scores['rocauc'] = metrics.roc_auc_score(ts, ys)        
        for threshold in thresholds:
            ys_bin = np.digitize(ys, [threshold])
            prec, rec, fbeta, sup = metrics.precision_recall_fscore_support(
                ts, ys_bin, labels=[0, 1], warn_for=[])
            scores[f'prec#{threshold}'] = prec[1]
            scores[f'rec#{threshold}'] = rec[1]
            scores[f'fbeta#{threshold}'] = fbeta[1]
    
    return scores


class RNNClassifier(nn.Module):
    
    def __init__(self, transformer, n_embed, n_hidden, n_layers, dropout=0.2):
        super().__init__()
        self.transformer = transformer
        self.padding_idx = transformer.vocab.token2id['<PAD>']
        self.embed = nn.Embedding(
            len(transformer.vocab), n_embed, padding_idx=self.padding_idx)
        self.rnn = nn.LSTM(
            input_size=n_embed,
            hidden_size=n_hidden,
            num_layers=n_layers, 
            dropout = dropout,
            bidirectional=True)
        self.out = nn.Linear(
            n_hidden * 2, 1)
        self.lossfunc = nn.BCEWithLogitsLoss()
        
    def forward(self, batch):
        xs = pad_sequence(
            [torch.tensor(self.transformer(t), dtype=torch.long)
             for t in batch['text']],
            padding_value=self.padding_idx)
        batch_size = xs.size()[1]
        h = self.embed(xs)
        hs, (ht, ct) = self.rnn(h)
        h = ht[-2:].transpose(0, 1).contiguous().view(batch_size, -1) 
        h = self.out(h)
        return h
    
    def predict(self, batch):
        self.eval()
        out = self.forward(batch)
        y = torch.sigmoid(out).cpu().data.numpy()
        return y

    
class Trainer(object):
    
    def __init__(self, model, optimizer):
        self.model = model
        self.optimizer = optimizer
        
    def calc_loss(self, batch):
        y = self.model(batch).view(-1)
        t = batch['target'].type(torch.Tensor)
        loss = self.model.lossfunc(y, t)
        
        return y, t, loss
    
    def train(self, iterator, name='train'):
        ys, ts, losses = [], [], []
        for batch in tqdm(iterator, desc=name):
            self.model.train(); self.optimizer.zero_grad()
            y, t, loss = self.calc_loss(batch)
            loss.backward(); self.optimizer.step()

            losses.append(loss.cpu().data.numpy())
            ys.append(torch.sigmoid(y).cpu().data.numpy())
            ts.append(t.cpu().cpu().data.numpy())
        result = evaluate(losses, ys, ts)
        result['dataset'] = name
        return result
    
    def validate(self, iterator, name='valid'):
        ys, ts, losses = [], [], []
        for batch in tqdm(iterator, desc=name):
            self.model.eval(); self.optimizer.zero_grad()
            y, t, loss = self.calc_loss(batch)

            losses.append(loss.cpu().data.numpy())
            ys.append(torch.sigmoid(y).cpu().data.numpy())
            ts.append(t.cpu().cpu().data.numpy())
        result = evaluate(losses, ys, ts)
        result['dataset'] = name
        return result
    
    def predict(self, iterator, threshold=0.5):
        ys = []
        for batch in tqdm(iterator):
            self.model.eval()
            ys.append(self.model.predict(batch))
        ys = np.concatenate(ys)
        ys_bin = np.digitize(ys, [threshold])
        return ys_bin

In [10]:
model = RNNClassifier(transformer, n_embed, n_hidden, n_layers)
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
trainer = Trainer(model, optimizer)

train_results, valid_results, test_results = [], [], []
for epoch in range(epochs):
    train_results.append(trainer.train(train_dataset, name='train'))
    valid_results.append(trainer.validate(valid_dataset, name='valid'))
    test_results.append(trainer.validate(test_dataset, name='test'))

    clear_output(wait=True)
    display(pd.DataFrame(train_results))
    display(pd.DataFrame(valid_results))
    display(pd.DataFrame(test_results))

Unnamed: 0,ap,dataset,fbeta#0.01,fbeta#0.1,fbeta#0.5,loss,prec#0.01,prec#0.1,prec#0.5,rec#0.01,rec#0.1,rec#0.5,rocauc
0,0.082319,train,0.117647,0.117647,0.0,0.592505,0.0625,0.0625,0.0,1.0,1.0,0.0,0.591333
1,0.079785,train,0.117647,0.1,0.0,0.34138,0.0625,0.053846,0.0,1.0,0.7,0.0,0.478
2,0.059223,train,0.117647,0.0,0.0,0.258614,0.0625,0.0,0.0,1.0,0.0,0.0,0.459333
3,0.271996,train,0.117647,0.0,0.0,0.234676,0.0625,0.0,0.0,1.0,0.0,0.0,0.594667
4,0.139824,train,0.117647,0.125,0.0,0.228027,0.0625,0.166667,0.0,1.0,0.1,0.0,0.676667


Unnamed: 0,dataset,loss
0,valid,0.412621
1,valid,0.044913
2,valid,0.031113
3,valid,0.062476
4,valid,0.098794


Unnamed: 0,ap,dataset,fbeta#0.01,fbeta#0.1,fbeta#0.5,loss,prec#0.01,prec#0.1,prec#0.5,rec#0.01,rec#0.1,rec#0.5,rocauc
0,0.5,test,0.095238,0.095238,0.0,0.454623,0.05,0.05,0.0,1.0,1.0,0.0,0.947368
1,0.5,test,0.095238,0.0,0.0,0.184112,0.05,0.0,0.0,1.0,0.0,0.0,0.947368
2,0.25,test,0.095238,0.0,0.0,0.201868,0.05,0.0,0.0,1.0,0.0,0.0,0.842105
3,0.166667,test,0.095238,0.0,0.0,0.198866,0.05,0.0,0.0,1.0,0.0,0.0,0.736842
4,0.166667,test,0.095238,0.0,0.0,0.208884,0.05,0.0,0.0,1.0,0.0,0.0,0.736842


In [11]:
ys = trainer.predict(submit_dataset, threshold=0.1)

HBox(children=(IntProgress(value=0, max=7), HTML(value='')))




In [12]:
torch.cuda.device_of(list(model.parameters())[0]).idx

-1