In [1]:
%load_ext autoreload

In [2]:
from config import Config
from pprint import pprint, pformat
from logger import model_logger
# Notebook-wide logger named 'main'; its level is taken from the project Config.
log = model_logger.getLogger('main')
log.setLevel(Config.Log.MODEL.level)

2018-01-29 10:57:27,962:root:INFO   :           getLogger:: creating logger for main under MODEL


In [3]:
%autoreload 2
from trainer import Trainer, Feeder, Predictor
from datafeed import DataFeed
from torch import nn, optim
from torch.nn import functional as F
from torch.autograd import Variable
import torch

2018-01-29 10:57:28,037:root:INFO   :           getLogger:: creating logger for main under TRAINER
2018-01-29 10:57:28,271:root:INFO   :           getLogger:: creating logger for main under DATAFEED


In [4]:
import csv
def _read_csv_rows(path):
    """Return every row of the CSV file at ``path`` as a list of lists of strings.

    The file is read completely and closed before returning, so the result can
    be iterated any number of times (a bare ``csv.reader`` is exhausted after a
    single pass and keeps its file handle open).
    """
    # newline='' is what the csv module asks for, so that line breaks embedded
    # in quoted fields are parsed by the reader itself.
    with open(path, newline='') as handle:
        return list(csv.reader(handle))


# Row lists (first row included); downstream cells slice off the first row.
train_dataset = _read_csv_rows('dataset/train.csv')
test_dataset = _read_csv_rows('dataset/test.csv')

In [5]:
from collections import namedtuple
# Record type for one comment: its id, its text, and six label fields.
Sample = namedtuple(
    'Sample',
    'id comment_text toxic severe_toxic obscene threat insult identity_hate',
)

In [6]:
import unicodedata
def _to_ascii(text):
    """Decompose ``text`` with NFD normalisation and drop every non-ASCII code point."""
    return unicodedata.normalize('NFD', text).encode('ascii', 'ignore').decode()


# Training rows carry eight columns: id, comment text and six label values.
# Each label gets its own name here: the previous version unpacked both the
# `toxic` and the `threat` column into the same variable, so the `toxic` field
# of every Sample silently held the `threat` value.
train_datapoints = []
for row in list(train_dataset)[1:]:  # [1:] drops the first row (presumably the CSV header)
    _id, comment, toxic, severe_toxic, obscene, threat, insult, identity_hate = row
    labels = [int(value) for value in
              (toxic, severe_toxic, obscene, threat, insult, identity_hate)]
    train_datapoints.append(Sample(_id, _to_ascii(comment), *labels))

# Test rows carry only id and comment text; the six label fields are filled with 0.
test_datapoints = []
for row in list(test_dataset)[1:]:
    _id, comment = row
    test_datapoints.append(Sample(_id, _to_ascii(comment), 0, 0, 0, 0, 0, 0))

len(train_datapoints), len(test_datapoints)

(159571, 153164)

In [7]:
#train_datapoints = train_datapoints[:1000]

In [None]:
test_datapoints[1000:1010]

[Sample(id='01ac9982edae9977', comment_text='" \n\n Dear ,Welcome to Wikipedia!Unfortunately, using your e-mail address as your username is not a good idea. Wikipedia content is extensively copied and the site itself is one of the most visited sites in the world. Any edit you make on Wikipedia will have your username attached to it, and using your email address will make you a tempting target for spammers. We recommend that you change your username at Wikipedia:Changing username in order to prevent abuse.If you need any help, simply contact me on my talk page, or go to Wikipedia:Help desk. Another option is to place  on your own talk page, and someone will come shortly to help. Remember to sign your posts on talk pages with four tildes (~~~~). Again, welcome! -  at "', toxic=0, severe_toxic=0, obscene=0, threat=0, insult=0, identity_hate=0),
 Sample(id='01ac9ea1271d7409', comment_text='Asshole, your dirty MF people tries to steal our glory and heavy persian (tajik) history by claiming 

In [None]:
from nltk import word_tokenize
from tqdm import tqdm

# NOTE: the bare `%timeit` line magic that used to open this cell had no
# statement to time, so it was dropped; the tqdm bar already reports elapsed time.
datapoints = train_datapoints

# Collect distinct tokens straight into a set instead of first building a list
# with one entry per token occurrence.
_distinct_tokens = {word
                    for dp in tqdm(datapoints)
                    for word in word_tokenize(dp.comment_text)}

# sorted() pins every word to a stable index.  Iteration order of a set of
# strings changes between interpreter runs (string hash randomisation), which
# would make stored word indices meaningless after a kernel restart.
INPUT_VOCAB = ['<<PAD>>', '<<UNK>>'] + sorted(_distinct_tokens)
len(INPUT_VOCAB)

  3%|▎         | 4405/159571 [00:02<01:24, 1844.43it/s]

In [None]:
INPUT_VOCAB[:10]

In [None]:
# Label names, listed in the same order as the label fields of Sample.
OUTPUT_VOCAB = ['toxic','severe_toxic','obscene', 'threat','insult','identity_hate']

In [None]:
# Reverse lookup for INPUT_VOCAB: token -> its position in the list.
WORD_INDEX = dict(zip(INPUT_VOCAB, range(len(INPUT_VOCAB))))

In [None]:
sorted(list(WORD_INDEX.items()), key=lambda x: x[1])[:10], WORD_INDEX['<<PAD>>'], INPUT_VOCAB[0]

In [None]:
import random
# Sanity check: pick one random training comment, print its text, then print
# the tokens obtained by mapping each token to its index and back again.
_i = random.choice(train_datapoints)
print(_i.comment_text)
print("\n\n\n")
print(' '.join(INPUT_VOCAB[WORD_INDEX[token]]
               for token in word_tokenize(_i.comment_text)))

In [None]:
class Model(nn.Module):
    """GRU encoder followed by one two-way classifier per output label.

    A batch of token-index sequences is embedded, fed token by token through a
    single ``nn.GRUCell``, and the final hidden state is passed to
    ``output_vocab_size`` independent ``nn.Linear(hidden_dim, 2)`` heads.

    Args:
        Config: configuration object; ``hidden_dim`` and ``cuda`` are read from it.
        input_vocab_size: number of rows of the embedding table.
        output_vocab_size: number of classification heads.
    """
    def __init__(self, Config, input_vocab_size, output_vocab_size):
        super(Model, self).__init__()
        # Imported locally so the class does not rely on another cell's
        # ``import logging`` having been executed first.
        import logging

        self.input_vocab_size = input_vocab_size
        self.output_vocab_size = output_vocab_size
        self.hidden_dim = Config.hidden_dim

        self.embed = nn.Embedding(self.input_vocab_size, self.hidden_dim)
        self.encode = nn.GRUCell(self.hidden_dim, self.hidden_dim)

        # nn.ModuleList (rather than a plain Python list) registers the heads as
        # sub-modules, so parameters(), state_dict() and cuda() all include them.
        # With a plain list the heads were invisible to anything that works
        # from model.parameters() or model.state_dict().
        self.classify = nn.ModuleList([nn.Linear(self.hidden_dim, 2)
                                       for _ in range(self.output_vocab_size)])

        self.log = model_logger.getLogger('model')
        self.log.setLevel(logging.INFO)
        if Config.cuda:
            self.cuda()  # also moves the registered heads

    def _on_cuda(self):
        """Return True when the model's parameters currently live on a CUDA device."""
        return self.embed.weight.is_cuda

    def init_hidden(self, batch_size):
        """Return an all-zero hidden state of shape ``(batch_size, hidden_dim)``.

        The state is created on the same device as the model's parameters.
        """
        ret = torch.zeros(batch_size, self.hidden_dim)
        if self._on_cuda(): ret = ret.cuda()
        return Variable(ret)

    def forward(self, seq):
        """Encode a batch of token-index sequences and classify every label.

        Args:
            seq: equal-length rows of integer token indices (anything
                ``torch.LongTensor`` accepts); the first dimension is the batch.

        Returns:
            Tensor with one entry per classification head, stacked along
            dimension 0: shape ``(output_vocab_size, batch, 2)``.  Each entry
            is a softmax over the last dimension, i.e. probabilities, not
            log-probabilities.
        """
        seq = Variable(torch.LongTensor(seq))
        # Follow the device of the parameters instead of re-reading a global config.
        if self._on_cuda(): seq = seq.cuda()
        batch_size = seq.size()[0]
        # Logger arguments are passed lazily (%s) so tensors are only rendered
        # to text when DEBUG is actually enabled.
        self.log.debug('%s seq size: %s', type(seq.data), seq.size())

        # (batch, time, hidden) -> (time, batch, hidden) so iteration walks time steps.
        seq_emb = self.embed(seq).transpose(1,0)
        output = self.init_hidden(batch_size)
        # Every position is fed through the GRU cell, padding positions included.
        for token_emb in seq_emb:
            self.log.debug('token_emb := %s', token_emb)
            self.log.debug('output := %s', output)
            output = self.encode(token_emb, output)

        self.log.debug('output := %s', output)

        ret = torch.stack([F.softmax(head(output), dim=-1)
                           for head in self.classify])
        self.log.debug('ret := %s', ret)
        self.log.debug('ret size: %s', ret.size())

        return ret

In [None]:
import numpy as np
def seq_maxlen(seqs):
    """Return the length of the longest sequence in ``seqs``.

    An empty ``seqs`` yields 0 instead of raising ``ValueError``.
    """
    return max((len(seq) for seq in seqs), default=0)

# Padding index: the index WORD_INDEX assigns to the first INPUT_VOCAB entry.
PAD = WORD_INDEX[INPUT_VOCAB[0]]
print(PAD)
def pad_seq(seqs, maxlen=0, PAD=PAD):
    """Right-pad every list in ``seqs`` with ``PAD`` up to a common length.

    Args:
        seqs: collection of sequences.  Padding is applied only when the first
            element is a ``list``; anything else is returned unchanged.
        maxlen: target length; a falsy value means "length of the longest
            sequence in ``seqs``".
        PAD: value appended to short sequences.

    Returns:
        A new list of padded lists, or ``seqs`` itself when it is empty or its
        first element is not a list.  Sequences already longer than the target
        length are left as they are (no truncation).
    """
    # An empty batch has nothing to pad (and no seqs[0] to inspect).
    if len(seqs) == 0 or not isinstance(seqs[0], list):
        return seqs
    target_len = maxlen if maxlen else seq_maxlen(seqs)
    return [seq + [PAD] * (target_len - len(seq)) for seq in seqs]

def batchop(datapoints, *args, **kwargs):
    """Turn a list of samples into the arrays consumed by the model and the loss.

    Args:
        datapoints: samples exposing ``id``, ``comment_text`` and the six label
            attributes (``toxic`` ... ``identity_hate``).
        *args, **kwargs: accepted and ignored.

    Returns:
        ``(indices, (seq,), (target,))`` where ``indices`` is the list of sample
        ids, ``seq`` is a numpy array of padded token indices (one row per
        sample) and ``target`` is a numpy array with the six labels per sample.
    """
    # Tokens missing from the vocabulary map to the '<<UNK>>' entry instead of
    # raising KeyError (e.g. for text that was not used to build the vocabulary).
    unk = WORD_INDEX['<<UNK>>']
    indices = [d.id for d in datapoints]
    seq = pad_seq([[WORD_INDEX.get(w, unk) for w in word_tokenize(d.comment_text)]
                   for d in datapoints])
    target = [(d.toxic, d.severe_toxic, d.obscene, d.threat, d.insult, d.identity_hate)
              for d in datapoints]
    seq, target = np.array(seq), np.array(target)
    return indices, (seq, ), (target,)

In [None]:
def loss(output, target, loss_function=nn.NLLLoss(), *args, **kwargs):
    """Average per-sample loss over a batch.

    Args:
        output: model output indexed as (label, batch, class); holds softmax
            probabilities as produced by ``Model.forward``.
        target: 1-tuple whose only element holds the integer class of every
            label for every sample, indexed as (batch, label).
        loss_function: callable applied per sample to ``(scores, classes)``.
        *args, **kwargs: accepted and ignored.

    Returns:
        Sum of the per-sample losses divided by the batch size.
    """
    target = Variable(torch.LongTensor(target[0]))
    # Put the target on the same device as the output it is compared against.
    if output.is_cuda: target = target.cuda()
    output = output.transpose(1,0)  # -> (batch, label, class)
    batch_size = output.size()[0]

    # nn.NLLLoss expects log-probabilities, but `output` holds probabilities.
    # Convert for NLLLoss only, so a custom loss_function still receives the
    # untouched scores.
    wants_log_probs = isinstance(loss_function, nn.NLLLoss)

    total = 0
    for i, t in zip(output, target):
        # Lazy %s arguments: nothing is rendered unless DEBUG is enabled.
        log.debug('i, t sizes: %s %s', i.size(), t.size())
        # clamp keeps log() finite when a probability underflows to 0
        scores = torch.log(i.clamp(min=1e-12)) if wants_log_probs else i
        # view(-1) keeps the target 1-D even when there is a single label
        total += loss_function(scores, t.view(-1)).mean()
        log.debug('loss: %s', total)

    return total/batch_size

def accuracy(output, target, *args, **kwargs):
    """Fraction of (sample, label) pairs whose predicted class equals the target.

    Args:
        output: model output indexed as (label, batch, class).
        target: 1-tuple whose only element holds the integer class of every
            label for every sample, indexed as (batch, label).
        *args, **kwargs: accepted and ignored.

    Returns:
        A Python float in [0, 1].
    """
    target = Variable(torch.LongTensor(target[0]))
    # Put the target on the same device as the output it is compared against.
    if output.is_cuda: target = target.cuda()
    output = output.transpose(1,0)  # -> (batch, label, class)

    # Index of the highest score along the class dimension, for all samples at once.
    predicted = output.max(dim=-1)[1]
    # Cast to float before averaging so the ratio is never computed with
    # integer division on a count tensor.
    hits = (predicted == target).float()
    return float(hits.mean())
    

In [None]:
from IPython.display import HTML
from IPython.display import display
def repr_function(output, feed, batch_index):
    """Build display rows comparing targets and predictions for one batch.

    For every sample of batch ``batch_index`` two rows are produced: the
    original comment text, the text rebuilt from the token indices and the
    target labels; then two blank cells followed by the predicted class of
    every label.

    Args:
        output: model output for the batch; dimensions 0 and 1 are swapped
            before it is iterated sample by sample.
        feed: object providing ``nth_batch(batch_index)`` and ``data_dict``.
        batch_index: which batch of ``feed`` the output belongs to.

    Returns:
        List of rows (lists), two per sample.
    """
    per_sample = output.transpose(1, 0)
    sample_ids, (token_rows,), (label_rows,) = feed.nth_batch(batch_index)
    print(per_sample.size(), len(sample_ids), len(token_rows), len(label_rows))

    rows = []
    for sample_id, scores, token_ids, labels in zip(sample_ids, per_sample,
                                                    token_rows, label_rows):
        original_text = feed.data_dict[sample_id].comment_text
        rebuilt_text = ' '.join(INPUT_VOCAB[token] for token in token_ids)
        rows.append([original_text, rebuilt_text] + list(labels))

        # index of the highest score for every label
        predicted = scores.max(dim=1)[1]
        rows.append([' ', '  '] + predicted.data.cpu().numpy().tolist())
    return rows

In [None]:
# Second name for the same list object (no copy is made).
# NOTE(review): presumably kept so the full list stays reachable if
# train_datapoints is later re-bound to a subset - confirm.
_train_datapoints = train_datapoints

In [None]:
import logging
# Set the notebook logger to INFO, replacing the level taken from Config earlier.
log.setLevel(logging.INFO)

In [None]:
import random
def experiment(epochs=10, checkpoint=1, train_fraction=0.85,
               train_batch_size=128, test_batch_size=120, rounds=10000):
    """Build the model and the data feeds, then alternate prediction and training.

    Args:
        epochs: forwarded to ``Trainer``.
        checkpoint: forwarded to ``Trainer``.
        train_fraction: share of ``train_datapoints`` (taken from the front,
            in their existing order) used for training; the remainder feeds
            the test feed.
        train_batch_size: batch size of the training feed.
        test_batch_size: batch size of the test feed.
        rounds: number of predict-display-train iterations.

    Returns:
        None.  Prediction tables are displayed as a side effect.
    """
    model = Model(Config(), len(INPUT_VOCAB), len(OUTPUT_VOCAB))
    if Config().cuda:  model = model.cuda()

    split_index = int(len(train_datapoints) * train_fraction)
    train_feed = DataFeed(train_datapoints[:split_index],
                          batchop=batchop, batch_size=train_batch_size)
    test_feed = DataFeed(train_datapoints[split_index:],
                         batchop=batchop, batch_size=test_batch_size)

    trainer = Trainer(model=model, loss_function=loss, accuracy_function=accuracy,
                      checkpoint=checkpoint, epochs=epochs,
                      feeder=Feeder(train_feed, test_feed))

    predictor = Predictor(model=model, repr_function=repr_function, feed=test_feed)

    # Each round: show predictions for one random test batch, then train.
    # (A discarded predict() call that used to precede the loop was removed:
    # the first round performs the same call before any training happens.)
    for _ in range(rounds):
        output, results = predictor.predict(random.choice(range(test_feed.num_batch)))
        display(HTML(results._repr_html_()))
        trainer.train()
        
experiment()

In [None]:
# Debug aid: feed the first 100 training samples one at a time, fetch one
# random batch, and print the stored sample next to its decoded token list.
dummy_feed = DataFeed(train_datapoints[:100], batch_size=1, batchop=batchop)
indices, (seq,), (target,) = dummy_feed.nth_batch(
    random.choice(range(dummy_feed.num_batch)))
print(dummy_feed.data_dict[indices[0]])
print(list(map(INPUT_VOCAB.__getitem__, seq[0])))