In [1]:
%load_ext autoreload

In [2]:
from config import Config
from pprint import pprint, pformat
from logger import model_logger
log = model_logger.getLogger('main')
log.setLevel(Config.Log.MODEL.level)

2018-01-28 19:24:54,129:root:INFO   :           getLogger:: creating logger for main under MODEL


In [3]:
%autoreload 2
from trainer import Trainer, Feeder
from datafeed import DataFeed
from torch import nn, optim
from torch.nn import functional as F
from torch.autograd import Variable
import torch

2018-01-28 19:24:54,313:root:INFO   :           getLogger:: creating logger for main under TRAINER
2018-01-28 19:24:54,548:root:INFO   :           getLogger:: creating logger for main under DATAFEED


In [4]:
import csv
train_dataset = csv.reader(open('dataset/train.csv'))
test_dataset = csv.reader(open('dataset/test.csv'))

In [5]:
from collections import namedtuple
Sample = namedtuple('Sample', ['id','comment_text',
                               'toxic','severe_toxic','obscene',
                               'threat','insult','identity_hate'])

In [6]:
import unicodedata
train_datapoints = []
for i in list(train_dataset)[1:]:
    _id, c, t, st, o, t, ins, ih = i
    t, st, o, t, ins, ih = (int(_) for _ in [t, st, o, t, ins, ih])
    c = unicodedata.normalize('NFD', c).encode('ascii','ignore').decode()
    train_datapoints.append(Sample(_id, c, t, st, o, t, ins, ih))

train_datapoints = sorted(train_datapoints, key=lambda x: len(x.comment_text))

test_datapoints = []
for i in list(test_dataset)[1:]:
    _id, c = i
    c = unicodedata.normalize('NFD', c).encode('ascii','ignore').decode()
    test_datapoints.append(Sample(_id, c, 0, 0, 0, 0, 0, 0))

test_datapoints = sorted(test_datapoints, key=lambda x: len(x.comment_text))

len(train_datapoints), len(test_datapoints)

(159571, 153164)

In [7]:
#train_datapoints = train_datapoints[:1000]

In [8]:
test_datapoints[1000:1010]

[Sample(id='d2ef2d76fe9aa1db', comment_text='    .         ', toxic=0, severe_toxic=0, obscene=0, threat=0, insult=0, identity_hate=0),
 Sample(id='d802642c20e064a4', comment_text='::I was right.', toxic=0, severe_toxic=0, obscene=0, threat=0, insult=0, identity_hate=0),
 Sample(id='d916708f967a17b4', comment_text=':Never ending!', toxic=0, severe_toxic=0, obscene=0, threat=0, insult=0, identity_hate=0),
 Sample(id='db8ece11eb69265f', comment_text='It is in Tampa', toxic=0, severe_toxic=0, obscene=0, threat=0, insult=0, identity_hate=0),
 Sample(id='dde498ebdf5c6f5b', comment_text='::::I am sure.', toxic=0, severe_toxic=0, obscene=0, threat=0, insult=0, identity_hate=0),
 Sample(id='df46837faaadfef4', comment_text='              ', toxic=0, severe_toxic=0, obscene=0, threat=0, insult=0, identity_hate=0),
 Sample(id='df9e7b4cf096504a', comment_text=':::FYI. Cheers', toxic=0, severe_toxic=0, obscene=0, threat=0, insult=0, identity_hate=0),
 Sample(id='dffea66ea8edbbc2', comment_text='jel

In [9]:
%timeit
from nltk import word_tokenize
from tqdm import tqdm
datapoints = train_datapoints
INPUT_VOCAB = [word for dp in tqdm(datapoints) for word in word_tokenize(dp.comment_text)]
INPUT_VOCAB = ['PAD', 'UNK'] + list(set(INPUT_VOCAB))
len(INPUT_VOCAB)

100%|██████████| 159571/159571 [01:26<00:00, 106.05it/s] 


299976

In [10]:
OUTPUT_VOCAB = ['toxic','severe_toxic','obscene', 'threat','insult','identity_hate']

In [11]:
WORD_INDEX = {w: i for i, w in enumerate(INPUT_VOCAB)}

In [53]:
class Model(nn.Module):
    def __init__(self, Config, input_vocab_size, output_vocab_size):
        super(Model, self).__init__()
        self.input_vocab_size = input_vocab_size
        self.output_vocab_size = output_vocab_size
        self.hidden_dim = Config.hidden_dim

        self.embed = nn.Embedding(self.input_vocab_size, self.hidden_dim)
        self.encode = nn.GRUCell(self.hidden_dim, self.hidden_dim)

        self.classify = [nn.Linear(self.hidden_dim, 2)
                         for i in range (self.output_vocab_size)]

        self.log = model_logger.getLogger('model')
        self.log.setLevel(logging.INFO)
        if Config.cuda:
            self.cuda()
            [i.cuda() for i in self.classify]
        
    def init_hidden(self, batch_size):
        ret = torch.zeros(batch_size, self.hidden_dim)
        if Config().cuda: ret = ret.cuda()
        return Variable(ret)
    
    def forward(self, seq):
        seq = Variable(torch.LongTensor(seq))
        if Config().cuda: seq = seq.cuda()
        batch_size = seq.size()[0]
        self.log.debug('{} seq size: {}'.format(type(seq.data), seq.size()))
        seq_emb = self.embed(seq).transpose(1,0)
        output = self.init_hidden(batch_size)
        for token_emb in seq_emb:
            self.log.debug('token_emb := {}'.format(token_emb))
            self.log.debug('output := {}'.format(output))
            output = self.encode(token_emb, output)
                    
        self.log.debug('output := {}'.format(output))
    
        ret = torch.stack([F.softmax(classify(output), dim=-1) 
                           for classify in self.classify])
        self.log.debug('ret := {}'.format(ret))

        self.log.debug('ret size: {}'.format(ret.size()))

        return ret

In [13]:
import numpy as np
def seq_maxlen(seqs):
    return max([len(seq) for seq in seqs])

PAD = WORD_INDEX[INPUT_VOCAB[0]]
def pad_seq(seqs, maxlen=0, PAD=PAD):
    if type(seqs[0]) == type([]):
        maxlen = maxlen if maxlen else seq_maxlen(seqs)
        def pad_seq_(seq):
            return seq + [PAD]*(maxlen-len(seq))
        seqs = [ pad_seq_(seq) for seq in seqs ]
    return seqs

def batchop(datapoints, *args, **kwargs):
    seq   = pad_seq([ [WORD_INDEX[w] for w in word_tokenize(d.comment_text)]
                     for d in datapoints])
    target = [(d.toxic, d.severe_toxic, d.obscene, d.threat, d.insult, d.identity_hate)
              for d in datapoints]
    seq, target = np.array(seq), np.array(target)
    return (seq, ), target

In [None]:
def loss(input, target, loss_function=nn.NLLLoss(), *args, **kwargs):
    loss = 0
    target = Variable(torch.LongTensor(target))
    if Config().cuda: target = target.cuda()
    input = input.transpose(1,0)
    batch_size = input.size()[0]
    for i, t in zip(input, target):
        log.debug('i, o sizes: {} {}'.format(i, t))
        loss += loss_function(i, t.squeeze()).mean()
        log.debug('loss size: {}'.format(loss))

    return loss/batch_size

def accuracy(input, target, *args, **kwargs):
    accuracy = 0
    target = Variable(torch.LongTensor(target))
    if Config().cuda: target = target.cuda()
    input = input.transpose(1,0)
    batch_size = input.size()[0]
    class_size = input.size()[1]    
    for i, t in zip(input, target):
        correct = (i.max(dim=1)[1] == t).sum()
        accuracy += correct/class_size
        
    return (accuracy/batch_size).data[0]
    

In [59]:
import logging
log.setLevel(logging.INFO)
model =  Model(Config(), len(INPUT_VOCAB), len(OUTPUT_VOCAB))
if Config().cuda:
    model = model.cuda()

split_index = int( len(train_datapoints) * 0.85 )
train_feed = DataFeed(train_datapoints[:split_index], batchop=batchop, batch_size=128)
test_feed = DataFeed(train_datapoints[split_index:], batchop=batchop, batch_size=120)

trainer = Trainer(model=model, loss_function=loss, accuracy_function=accuracy, 
                checkpoint=1, epoch=100,
                feeder = Feeder(train_feed, test_feed))

predict_feed = DataFeed(train_datapoints[split_index:], batchop=batchop_predict, batch_size=12)
predictor = Predictor(model=model, repr_function=repr_function, feed=predict_feed)


[autoreload of datafeed failed: Traceback (most recent call last):
  File "/home/paarulakan/environments/python/pytorch-py35/lib/python3.5/site-packages/IPython/extensions/autoreload.py", line 247, in check
    superreload(m, reload, self.old_objects)
  File "/home/paarulakan/projects/kaggle/toxic-comments/datafeed.py", line 64
    self.data[ n * self.batch_size   :   (n+1) * self.batch_size) ],
                                                                ^
SyntaxError: invalid syntax
]
2018-01-28 23:28:24,018:root:INFO   :           getLogger:: creating logger for main under TRAINER
2018-01-28 23:28:27,663:root:INFO   :           getLogger:: creating logger for model under MODEL


NameError: name 'batchop_predict' is not defined

In [None]:
trainer.train()