In [1]:
%load_ext autoreload

In [2]:
from config import Config
from pprint import pprint, pformat
from logger import model_logger
log = model_logger.getLogger('main')
log.setLevel(Config.Log.MODEL.level)

2018-01-31 12:12:42,376:root:INFO   :           getLogger:: creating logger for main under MODEL


In [3]:
%autoreload 2
from trainer import Trainer, Feeder, Predictor
from datafeed import DataFeed
from torch import nn, optim
from torch.nn import functional as F
from torch.autograd import Variable
import torch

2018-01-31 12:12:42,476:root:INFO   :           getLogger:: creating logger for main under TRAINER
2018-01-31 12:12:42,694:root:INFO   :           getLogger:: creating logger for main under DATAFEED


In [4]:
import csv
train_dataset = csv.reader(open('dataset/train.csv'))
test_dataset = csv.reader(open('dataset/test.csv'))

In [5]:
from collections import namedtuple
Sample = namedtuple('Sample', ['id','comment_text',
                               'toxic','severe_toxic','obscene',
                               'threat','insult','identity_hate'])

### Unicode to ascii text

In [6]:
import unicodedata
train_datapoints = []
for i in list(train_dataset)[1:]:
    _id, c, t, st, o, t, ins, ih = i
    t, st, o, t, ins, ih = (int(_) for _ in [t, st, o, t, ins, ih])
    c = unicodedata.normalize('NFD', c).encode('ascii','ignore').decode()
    train_datapoints.append(Sample(_id, c, t, st, o, t, ins, ih))

test_datapoints = []
for i in list(test_dataset)[1:]:
    _id, c = i
    c = unicodedata.normalize('NFD', c).encode('ascii','ignore').decode()
    test_datapoints.append(Sample(_id, c.lower(), 0, 0, 0, 0, 0, 0))

len(train_datapoints), len(test_datapoints)

(159571, 153164)

In [14]:
j = 0
for i in train_datapoints:
    if len(i.comment_text) > 4000:
        print('====')
        print(i.comment_text)
        j += 1
        if j > 10:
            break

====
Ahh, Hello Witzeman 

203.92.84.161  
Symbols: ~ |            #                           $                                m m 
Characters: A a C c E e I i L l N n O o R r S s U u Y y Z z   A a E e I i O o U u   A a C c E e G g H h I i J j O o S s U u W w Y y   A a E e I i O o U u Y y      A a E e I i N n O o U u Y y   C c G g K k L l N n R r S s T t       U u   A a C c D d E e I i L l N n O o R r S s T t U u Z z   A a E e I i O o U u Y y     u u u u A a E e G g I i O o U u   C c E e G g I  Z z   A a E e I i O o U u   D d H h L l L l M m N n R r R r S s T t       O o U u                         A a        
Greek:                                                                               
Cyrillic:                                                                                                                     
IPA: t d                                                                                                                          
= My Famous Article ==witze  happines

In [7]:
cond = lambda x: sum(x[2:]) >= 1
classified_train_datapoints = [p for p in train_datapoints if cond(p)]

In [8]:
len(classified_train_datapoints)

10559

In [None]:
test_datapoints[1000:1010]

## Build vocabulary

#### buils INPUT_VOCAB

In [None]:
%timeit
from nltk import word_tokenize
from tqdm import tqdm
from collections import defaultdict
datapoints = train_datapoints
WORD_FREQ = defaultdict(int)
INPUT_VOCAB = [word for dp in tqdm(datapoints) for word in word_tokenize(dp.comment_text)]
OUTPUT_VOCAB = ['toxic','severe_toxic','obscene', 'threat','insult','identity_hate']


In [None]:
INPUT_VOCAB = INPUT_VOCAB + OUTPUT_VOCAB
for word in INPUT_VOCAB: WORD_FREQ[word] += 1

WORD_FREQ_PAIRS = sorted(WORD_FREQ.items(), key=lambda x: -x[1])
print(WORD_FREQ_PAIRS[1110:1120])

INPUT_VOCAB = [x[0] for x in WORD_FREQ_PAIRS]
INPUT_VOCAB = list(set(INPUT_VOCAB))
len(INPUT_VOCAB)

In [None]:
print(WORD_FREQ_PAIRS[:100], WORD_FREQ_PAIRS[-100:])

In [None]:
OUTPUT_VOCAB = ['toxic','severe_toxic','obscene', 'threat','insult','identity_hate']
INPUT_VOCAB = ['<<PAD>>', '<<UNK>>'] + list(set(INPUT_VOCAB + OUTPUT_VOCAB))

#### builds WORD_INDEX

In [None]:
WORD_INDEX = {w: i for i, w in enumerate(INPUT_VOCAB)}
OUTPUT_WORD_INDEX = {w: i for i, w in enumerate(OUTPUT_VOCAB)}
OUTPUT_IDS = [OUTPUT_WORD_INDEX[i] for i in OUTPUT_VOCAB]

In [None]:
sorted(list(WORD_INDEX.items()), key=lambda x: x[1])[:10], WORD_INDEX['<<PAD>>'], INPUT_VOCAB[0], INPUT_VOCAB[ WORD_INDEX['<<PAD>>'] ]

## tests INPUTVOCAB and WORD_INDEX mapping

In [None]:
import random
_i = train_datapoints[random.choice(range(len(train_datapoints)))]
print(_i.comment_text)
print("======")
print(' '.join( [INPUT_VOCAB[i] for i in 
                [WORD_INDEX[j] for j in word_tokenize(_i.comment_text)]])  )

## Baseline model

In [None]:
class Model(nn.Module):
    def __init__(self, Config, input_vocab_size, output_vocab_size):
        super(Model, self).__init__()
        self.input_vocab_size = input_vocab_size
        self.output_vocab_size = output_vocab_size
        self.hidden_dim = Config.hidden_dim

        self.embed = nn.Embedding(self.input_vocab_size, self.hidden_dim)
        self.encode = nn.GRUCell(self.hidden_dim, self.hidden_dim)
        self.classify = [nn.Linear(self.hidden_dim, 2)
                         for i in range (self.output_vocab_size)]

        self.log = model_logger.getLogger('model')
        self.log.setLevel(logging.INFO)
        if Config.cuda:
            self.cuda()
            [i.cuda() for i in self.classify]
        
    def init_hidden(self, batch_size):
        ret = torch.zeros(batch_size, self.hidden_dim)
        if Config().cuda: ret = ret.cuda()
        return Variable(ret)
    
    def forward(self, seq):
        seq = Variable(torch.LongTensor(seq))
        if Config().cuda: seq = seq.cuda()
        batch_size = seq.size()[0]
        self.log.debug('{} seq size: {}'.format(type(seq.data), seq.size()))
        seq_emb = self.embed(seq).transpose(1,0)
        output = self.init_hidden(batch_size)
        for token_emb in seq_emb:
            self.log.debug('token_emb := {}'.format(token_emb))
            self.log.debug('output := {}'.format(output))
            output = self.encode(token_emb, output)
                    
        self.log.debug('output := {}'.format(output))    
        ret = torch.stack([F.softmax(classify(output), dim=-1) for classify in self.classify])
        self.log.debug('ret := {}'.format(ret))
        self.log.debug('ret size: {}'.format(ret.size()))
        return ret

### Batching utils

In [None]:
import numpy as np
def seq_maxlen(seqs):
    return max([len(seq) for seq in seqs])

PAD = WORD_INDEX[INPUT_VOCAB[0]]
print(PAD)
def pad_seq(seqs, maxlen=0, PAD=PAD):
    if type(seqs[0]) == type([]):
        maxlen = maxlen if maxlen else seq_maxlen(seqs)
        def pad_seq_(seq):
            return seq + [PAD]*(maxlen-len(seq))
        seqs = [ pad_seq_(seq) for seq in seqs ]
    return seqs

def batchop(datapoints, *args, **kwargs):
    indices = [d.id for d in datapoints]
    seq   = pad_seq([ [WORD_INDEX[w] for w in word_tokenize(d.comment_text)]
                     for d in datapoints])
    target = [(d.toxic, d.severe_toxic, d.obscene, d.threat, d.insult, d.identity_hate)
              for d in datapoints]
    seq, target = np.array(seq), np.array(target)
    return indices, (seq, ), (target,)

## Attention based model

In [None]:
import math
class AttModel(nn.Module):
    def __init__(self, Config, input_vocab_size, output_vocab_size):
        super(AttModel, self).__init__()
        self.input_vocab_size = input_vocab_size
        self.output_vocab_size = output_vocab_size
        self.hidden_dim = Config.hidden_dim

        self.embed = nn.Embedding(self.input_vocab_size, self.hidden_dim)
        self.cembed = nn.Embedding(self.output_vocab_size, self.hidden_dim)
        self.encode = nn.GRUCell(self.hidden_dim, self.hidden_dim)
        self.dropout = nn.Dropout(0.5)
        self.attend = nn.Parameter(torch.FloatTensor(self.hidden_dim, self.hidden_dim))

        self.classify = nn.Linear(self.hidden_dim, 2)
        self.log = model_logger.getLogger('model')
        self.size_log = self.log.getLogger('size')
        self.log.setLevel(logging.INFO)
        self.size_log.setLevel(logging.INFO)
        
        #self.embed.weight.data.copy_(torch.Tensor([-0.001]).expand_as(self.embed.weight.data))
        self.attend.data.normal_(0, 0.1)
        if Config.cuda:
            self.cuda()
            
    def logsize(self, tensor, name=''):
        self.size_log.debug('{} <- {}'.format(tensor.size(), name))
        
    def init_hidden(self, batch_size):
        ret = torch.zeros(batch_size, self.hidden_dim)
        if Config().cuda: ret = ret.cuda()
        return Variable(ret)
    
    def forward(self, seq, classes=OUTPUT_IDS):
        seq = Variable(torch.LongTensor(seq))
        classes = Variable(torch.LongTensor(classes))
           
        if Config().cuda: 
            seq = seq.cuda()
            classes = classes.cuda()
            
        pad_mask = (seq > 0).float()
        self.log.debug('seq {}'.format(seq))
        self.log.debug('classes {}'.format(classes))
        self.log.debug('pad_mask {}'.format(pad_mask))
         
        batch_size = seq.size()[0]
        self.log.debug('{} seq size: {}'.format(type(seq.data), seq.size()))
        seq_emb = self.embed(seq).transpose(1,0)                  ;self.logsize(seq_emb, 'seq_emb')
        self.log.debug('seq_emb {}'.format(seq_emb))

        seq_repr = []
        output = self.init_hidden(batch_size)                     ;self.logsize(output, 'output')
        for token_emb in seq_emb:
            self.log.debug('token_emb := {}'.format(token_emb))
            self.log.debug('output := {}'.format(output))
            output = self.encode(token_emb, output)               ;self.logsize(output, 'output')
            output = self.dropout(output)
            seq_repr.append(output)

        seq_repr = torch.stack(seq_repr).transpose(1,0)           ;self.logsize(seq_repr, 'seq_repr')
        outputs = []
        attend = self.attend
        self.logsize(attend, 'attend')
        for class_ in classes:
            class_emb = self.cembed(class_)                  ;self.logsize(class_emb, 'class_emb')
            self.log.debug('class_emb: {}'.format(class_emb))
            self.log.debug('attend: {}'.format(attend))

            attn = torch.mm(class_emb, attend)        ;self.logsize(attn, 'attn')
            self.log.debug('attn: {}'.format(attn))
            #attn = attn.expand_as(seq_repr) ;self.logsize(attn, 'attn')
            attended_outputs = torch.bmm(attn.expand(seq_repr.size()[0], *attn.size()), seq_repr.transpose(1,2))                
        
            self.logsize(attended_outputs, 'attended_outputs')
            self.log.debug('{}'.format(attended_outputs))
            attended_outputs = attended_outputs.squeeze(1) * pad_mask
            self.logsize(attended_outputs, 'attended_outputs')
            self.log.debug('{}'.format(attended_outputs))
            output = attended_outputs.unsqueeze(-1) * seq_repr
            self.logsize(output, 'output')
            self.log.debug('output {}'.format(output))

            output = output.sum(1).squeeze(1)                ;self.logsize(output, 'output')
            self.log.debug('output {}'.format(output))
            output = self.classify(output)                  ;self.logsize(output, 'output')
            self.log.debug('output {}'.format(output))
            output = F.softmax(output)
            self.log.debug('output {}'.format(output))

            outputs.append(output)
            
        ret = torch.stack(outputs)
        self.log.debug('ret {}'.format(ret))

        return ret

## Loss and accuracy function

In [None]:
def loss(output, target, loss_function=nn.NLLLoss(), *args, **kwargs):
    loss = 0
    target = target[0]
    target = Variable(torch.LongTensor(target), requires_grad=False)
    if Config().cuda: target = target.cuda()
    output = output.transpose(1,0)
    batch_size = output.size()[0]
    for i, t in zip(output, target):
        log.debug('i, o sizes: {} {}'.format(i, t))
        loss += loss_function(i, t.squeeze()).mean()
        log.debug('loss size: {}'.format(loss))

    del target
    return (loss/batch_size)

def accuracy(output, target, *args, **kwargs):
    accuracy = 0.0
    target = target[0]
    target = Variable(torch.LongTensor(target), requires_grad=False)
    if Config().cuda: target = target.cuda()
    output = output.transpose(1,0)
    batch_size = output.size()[0]
    class_size = output.size()[1]    
    for i, t in zip(output, target):
        correct = (i.max(dim=1)[1] == t).sum().float()
        accuracy += correct/class_size
    del target
    return (accuracy/batch_size)
    

### repr_function to build human readable output from model

In [None]:
from IPython.display import HTML
from IPython.display import display
def repr_function(output, feed, batch_index):
    results = []
    output = output.transpose(1,0)
    indices, (seq,), (classes,) = feed.nth_batch(batch_index)
    print(output.size(), len(indices), len(seq), len(classes))
    for i, o, s, c in zip(indices, output, seq, classes):
        orig_s = feed.data_dict[i].comment_text
        s = [INPUT_VOCAB[i] for i in s]
        s = ' '.join(s)
        results.append([orig_s, s] + list(c))
        o = o.max(dim=1)[1]
        results.append([' ', '  '] + o.data.cpu().numpy().tolist())
    del indices, seq, classes
    return results

In [None]:
_train_datapoints = train_datapoints

In [None]:
import logging
log.setLevel(logging.INFO)

## Experiment on model 1

In [None]:
import random
def  experiment(epochs=10, checkpoint=1, train_datapoints=train_datapoints):
    model =  Model(Config(), len(INPUT_VOCAB), len(OUTPUT_VOCAB))
    if Config().cuda:  model = model.cuda()
        
    split_index = int( len(train_datapoints) * 0.85 )
    train_feed = DataFeed(train_datapoints[:split_index], batchop=batchop, batch_size=128)
    test_feed = DataFeed(train_datapoints[split_index:], batchop=batchop, batch_size=120)

    trainer = Trainer(model=model, loss_function=loss, accuracy_function=accuracy, 
                    checkpoint=checkpoint, epochs=epochs,
                    feeder = Feeder(train_feed, test_feed))

    predictor = Predictor(model=model, repr_function=repr_function, feed=test_feed)

    for e in range(1):
        output, results = predictor.predict(random.choice(range(test_feed.num_batch)))
        display(HTML(results._repr_html_()))
        del output, results
        trainer.train()
        
experiment(train_datapoints=classified_train_datapoints)

In [None]:
del output, results

## Experiment on model using attention

In [None]:
import random
import gc
def  experiment(eons=100, epochs=1, checkpoint=1):
    model =  AttModel(Config(), len(INPUT_VOCAB), len(OUTPUT_VOCAB))
    if Config().cuda:  model = model.cuda()
        
    split_index = int( len(classified_train_datapoints) * 0.85 )
    key=lambda x: len(word_tokenize(x.comment_text))
    #key  = None
    classified_train_feed = DataFeed(classified_train_datapoints[:split_index], batchop=batchop, batch_size=128, sort_key=key)
    classified_test_feed = DataFeed(classified_train_datapoints[split_index:], batchop=batchop, batch_size=128, sort_key=key)
    
    split_index = int( len(train_datapoints) * 0.85 )
    non_classified_train_feed = DataFeed(train_datapoints[:split_index], batchop=batchop, batch_size=128, sort_key=key)
    non_classified_test_feed = DataFeed(train_datapoints[split_index:], batchop=batchop, batch_size=128, sort_key=key)
    
    classified_trainer = Trainer(model=model, 
                    loss_function=loss, accuracy_function=accuracy, 
                    checkpoint=checkpoint, epochs=epochs,
                    feeder = Feeder(classified_train_feed, classified_test_feed))
        
    non_classified_trainer = Trainer(model=model, 
                    loss_function=loss, accuracy_function=accuracy, 
                    checkpoint=checkpoint, epochs=epochs,
                    feeder = Feeder(non_classified_train_feed, non_classified_test_feed))    
    
    predictor_classified_feed = DataFeed(classified_train_datapoints, batchop=batchop, batch_size=12)
    predictor1 = Predictor(model=model, feed=predictor_classified_feed, repr_function=repr_function)
    
    predictor_non_classified_feed = DataFeed(train_datapoints, batchop=batchop, batch_size=12)
    predictor2 = Predictor(model=model, feed=predictor_non_classified_feed, repr_function=repr_function)
    
    for e in range(eons):
        log.info('on {}th eon'.format(e))
        output, results = predictor1.predict(random.choice(range(predictor_classified_feed.num_batch)))
        display(HTML(results._repr_html_()))
        del output, results
        output, results = predictor2.predict(random.choice(range(predictor_non_classified_feed.num_batch)))
        display(HTML(results._repr_html_()))
        del output, results
        non_classified_trainer.train()
        classified_trainer.train()
    model = model.cpu()
experiment()
gc.collect()

In [None]:
for obj in gc.get_objects():
    if torch.is_tensor(obj) or (hasattr(obj, 'data') and torch.is_tensor(obj.data)):
        print(type(obj), obj.size())

In [None]:
dummy_feed = DataFeed(train_datapoints[:100], batchop=batchop, batch_size=1)
indices, (seq,), (target,) = dummy_feed.nth_batch(random.choice(range(dummy_feed.num_batch)))
print(dummy_feed.data_dict[indices[0]])
print([INPUT_VOCAB[i] for i in seq[0]])