In [None]:
#we start by importing our requirements
#Spacy can be pip installed
#See how to also install the english model here
#https://spacy.io/docs/usage/models
import spacy
#http://pytorch.org/
#Need a gpu for this. Preferably >=8GB ram or some things might not fit on your machine
import torch
import torch.autograd as ta
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as opt

import csv
import itertools
import numpy as np
import pylab as plt
from collections import Counter
from sys import stdout


from layers import *
from tools import *

In [None]:
#load the training data. This is pretty straightforward
#the context manager releases the file handle as soon as all rows have been read
with open('train.csv','r') as trainfile:
    data = [row for row in csv.reader(trainfile, delimiter=',')]

#spacy does relatively quick parsing, at least for english. Good for a fast and effortless tokenizing of our text
#It also comes with Glove's 300d already pretrained wordvectors attached to case insensitive tokens
#This comes in handy for our training
#We disable the modules we don't directly care about for speed purposes 
#[it's already really slow because the data is quite big]
#fills up to 14GB ram on my laptop, make sure you have at least that :)
nlp = spacy.load('en', parser=False, tagger=False, entity=False)


#We parse the training sentences, and keep the associated indices and labels
#row 0 is the csv header, columns 3 and 4 hold the two questions, column 5 the label
indata = [[[y.rank for y in nlp(x[3])],
           [y.rank for y in nlp(x[4])]] 
          for x in data[1:]]

outdata = np.array([int(x[5]) for x in data[1:]], dtype=np.int64)

In [None]:
#let's extract the top 99 most common characters from the training set
#and add a dummy token for the non-trainable, "masking" index 0
def tokenize(string, chars):
    """Map every element of `string` to its position in `chars`.

    Elements that do not occur in `chars` are mapped to 0, the index
    reserved for the null/masking token.
    """
    indices = []
    for symbol in string:
        try:
            position = chars.index(symbol)
        except ValueError:
            #unknown symbol: fall back to the null index
            position = 0
        indices.append(position)
    return indices

#get the most common 99 characters + 1 null token
#only the question rows are counted; row 0 is the csv header and is skipped like everywhere else
#NOTE(review): the alphabet is built from lower-cased text while tokenize() below is fed the raw
#strings, so upper-case characters end up on the null index 0. The test set is tokenized the
#same way, so train and test agree - confirm that this is intended.
longstring = u''.join([x[3]+x[4] for x in data[1:]]).lower()
chars = [u'\x00'] + sorted([x[0] for x in Counter(longstring).most_common(99)])

#make the training series of characters
charindata = [[tokenize(x[3], chars), 
               tokenize(x[4], chars)] 
              for x in data[1:]]

In [None]:
#sadly, glove vectors are trained en-masse on corpora irrelevant to our task.
#so about 10% of the tokens(words) in the training set do not have vectors.
#in principle they would just be masked out to 0, but in our case they can represent specialty topics
#so we'll make another series of random vectors (that we will train) out of the top 10k words that glove left behind
#this can take a while
#
#Every question is parsed once. In that single pass we keep the lower-cased token texts and count
#the tokens spacy has no vector for. The texts are lower-cased because the vocabulary keys below
#are lower-cased and the test set is looked up lower-cased as well; looking the training tokens
#up with their original casing would silently map capitalised words to 0.
leftover_counts = Counter()
leftoverdata = []
for x in data[1:]:
    pair = []
    for question in (x[3], x[4]):
        doc = nlp(question)
        pair.append([y.text.lower() for y in doc])
        #find out which words don't already have glove vectors via spacy
        leftover_counts.update(y.text.lower() for y in doc if not y.has_vector)
    leftoverdata.append(pair)

#get the most common 9999 + 1 null token (False occupies index 0 and never matches a word)
leftovers = [False]+[word for word, _ in leftover_counts.most_common(9999)]
#word -> index lookup table, built in one pass
leftovers = {word:idx for idx, word in enumerate(leftovers)}

#make the leftover word indices training series (0 for every word outside the vocabulary)
leftoverdata = [[[leftovers.get(y, 0) for y in x[0]],
                 [leftovers.get(y, 0) for y in x[1]]] 
                for x in leftoverdata]


In [None]:
#load the testing data
#the context manager releases the file handle as soon as all rows have been read
with open('test.csv','r') as testfile:
    testdata = [row for row in csv.reader(testfile, delimiter=',')]

#make the testing series of glove indices and of leftover word indices
#both are read off the same parse, so every question goes through spacy only once
#row 0 is the csv header, columns 1 and 2 hold the two questions
testindata = []
leftovertestdata = []
for x in testdata[1:]:
    docs = [nlp(x[1]), nlp(x[2])]
    testindata.append([[y.rank for y in doc] for doc in docs])
    #0 for every word that is not part of the leftover vocabulary
    leftovertestdata.append([[leftovers.get(y.text.lower(), 0) for y in doc] for doc in docs])

#make the testing series of characters
chartestindata = [[tokenize(x[1], chars), 
                   tokenize(x[2], chars)] 
                   for x in testdata[1:]]


In [None]:
#For GPU training purposes, it's best that we use GPU based embedded vector indices to minimize data transfers by 
#about 300%. For that we set up a function to retrieve the vocabulary weight matrix and transfer it to the GPU later
def get_embeddings(vocab):
    """Collect the word vectors of `vocab` into one float32 matrix indexed by rank.

    vocab: iterable of lexemes exposing `rank`, `has_vector` and `vector`, and
           itself exposing `vectors_length` (the vector dimensionality).

    Returns an array of shape (max_rank + 1, vocab.vectors_length) where row r
    holds the vector of the lexeme with rank r. Rows whose rank has no vector
    are all zeros. If no lexeme has a vector the matrix has zero rows.
    """
    ranks = [lex.rank for lex in vocab if lex.has_vector]
    n_rows = max(ranks) + 1 if ranks else 0
    #np.zeros instead of np.ndarray: the latter hands back uninitialised memory, which would
    #leave arbitrary values in the rows of ranks that carry no vector
    vectors = np.zeros((n_rows, vocab.vectors_length), dtype='float32')
    for lex in vocab:
        if lex.has_vector:
            vectors[lex.rank] = lex.vector
    return vectors

In [None]:
#We might have some memory limitations, so we set an upper limit for how many wordvectors we transfer
#total spacy glove WV size is about ~740k, so not that big of a difference
#a lot is junk but I won't filter it out here. No time.

embsize = 700000
#We create a separate embedding model on the GPU and feed its outputs to the siamese network.
#It makes it easier to make sure we keep these weights frozen this way, as this can be excluded from the optimizer
embedder = nn.Embedding(embsize, 300).cuda()

#we override the random weights with the ones we stole from spacy :>
#I just assume you have a GPU
embedder.weight.data = torch.Tensor(get_embeddings(nlp.vocab)[:embsize]).cuda()

#Leaving the weights out of the optimizer keeps them from being updated, but autograd would still
#compute and accumulate a gradient for the whole matrix on every backward pass.
#Turning gradient tracking off for them avoids that wasted memory and compute.
embedder.weight.requires_grad = False



In [None]:
def call_prediction(model, inseq0, inseq1, linseq0, linseq1, cinseq0, cinseq1, volatile = False):
    """Embed one batch of question pairs on the GPU and run `model` on it.

    model:            network called as model(words0, words1, chars0, chars1, h_0)
    inseq0, inseq1:   numpy index arrays for the two questions, looked up in both the
                      frozen `embedder` and the `trainable_embedder`
    linseq0, linseq1: numpy index arrays looked up in `extraembedder` (leftover words)
    cinseq0, cinseq1: numpy character index arrays, passed to the model as indices
    volatile:         forwarded to every Variable created here; the caller passes True
                      for validation/prediction and False for training

    Relies on the module-level `embedder`, `trainable_embedder` and `extraembedder`.
    Returns whatever `model` returns for the batch.
    """
    def to_gpu(array):
        #wrap a numpy array as a Variable living on the GPU
        return ta.Variable(torch.from_numpy(array).cuda(), volatile = volatile)

    #the word indices feed two embedding tables, so upload each array once and share it
    idx0 = to_gpu(inseq0)
    idx1 = to_gpu(inseq1)

    #get the trainable embeddings for the input sequences
    tinseq0 = trainable_embedder(idx0)
    tinseq1 = trainable_embedder(idx1)
    #get the embeddings for the input sequences
    inseq0 = embedder(idx0)
    inseq1 = embedder(idx1)
    #get the embeddings for the leftover input sequences
    linseq0 = extraembedder(to_gpu(linseq0))
    linseq1 = extraembedder(to_gpu(linseq1))

    #transfer the indices for the character sequences
    cinseq0 = to_gpu(cinseq0)
    cinseq1 = to_gpu(cinseq1)

    #create an initial states variable for the recurrent layers
    #(sized by the first dimension of the embedded batch, 256 units)
    h_0 = ta.Variable(torch.zeros(1, inseq0.size(0), 256).cuda(), volatile = volatile)

    #concatenate all the embeddings together along the last axis:
    #300d fixed from glove
    #50d trainable from us for domain adaptation
    #50d for leftover words
    #not more, we don't have enough data or the right training scheme to train proper "full" vectors for them
    #so we want to limit their capacity
    #400d total, each branch
    inseq0 = torch.cat([inseq0, tinseq0, linseq0],2)
    inseq1 = torch.cat([inseq1, tinseq1, linseq1],2)
    #get the prediction back and return it
    return model(inseq0, inseq1, cinseq0, cinseq1, h_0)


In [None]:
#Sadly I don't have time to go through all of the many tunable parameters 
#One can try to work network depth, dropout rate, input transformations, number of units, etc
#One single model so far gets a LB entry of about .35. Not too bad for just dumping in data, but not impressive either
#(others claim more with similar architectures)
#A quick, ML approved hack is to do model averaging of different models trained with different parameters
#We don't have time to explore many parameter settings with those either :)
#So we'll just vary the optimizer and the data partition
#This should bias each of our models towards a local gradient minimum and a local data representation minimum
#So we'll just make a bunch of optimizer styles that we'll just iterate through
optnames = ['Adam', 'Adamax', 'Adadelta', 'Adagrad', 'ASGD', 'SGD', 'RMSprop']
def optmaker(name):
    """Build the optimizer called `name` (one of `optnames`).

    The optimizer covers the parameters of the module-level `model`, `extraembedder`
    and `trainable_embedder` and always uses weight_decay = 0.0001. SGD and RMSprop
    get explicit settings on top of that; the others keep their defaults.

    Only the requested optimizer is instantiated.
    Raises KeyError for a name that is not listed in the table below.
    """
    #optimizer class + the keyword arguments that differ from the shared ones
    recipes = {'Adam': (opt.Adam, {}),
               'Adamax': (opt.Adamax, {}),
               'Adadelta': (opt.Adadelta, {}),
               'Adagrad': (opt.Adagrad, {}),
               'ASGD': (opt.ASGD, {}),
               'SGD': (opt.SGD, {'lr': 1e-2,
                                 'momentum': 0.9,
                                 'nesterov': True}),
               'RMSprop': (opt.RMSprop, {'lr': 1e-4}),
              }
    optclass, extra = recipes[name]
    params = itertools.chain(model.parameters(),
                             extraembedder.parameters(),
                             trainable_embedder.parameters())
    return optclass(params, weight_decay = 0.0001, **extra)

In [None]:
#We create some training data indices: one entry per training pair, 0 .. len(indata)-1.
#The training loop shuffles this array in place and slices it into the
#training and validation partitions.
sequence = np.arange(len(indata))


In [None]:
#Let's iterate through our optimizers

def weighted_crossentropy(pred, targ):
    """Class-weighted binary cross-entropy, averaged over all elements.

    Positive targets are weighted by 0.472 and negative targets by 1.309.
    This is a bit of a cheat as you're not supposed to look at the test set,
    but kaggle's training and test sets are not drawn from the same distribution:
    an all 1 submission yields a loss of 28.52, an all 0 submission a loss of 6.01,
    which indicates a roughly 20-80 positive-negative split (these guys say about 17%:
    https://www.kaggle.com/c/quora-question-pairs/discussion/31179),
    while the training set is more of a 40-60 split. So we give more weight to the
    negative examples. Either that or we oversample the negatives, but that's just a
    waste of flops.

    NOTE(review): a prediction of exactly 0 or 1 makes torch.log return -inf here.
    """
    return torch.mean(-( 0.472 * targ * torch.log(pred) + 1.309 * (1.0 - targ) * torch.log(1.0 - pred) ))


def make_train_inseqs(indices):
    """Gather the training rows at `indices` and hand them to generate_inseqs.

    Returns generate_inseqs' result, which the loop unpacks as
    (inseq0, inseq1, linseq0, linseq1, cinseq0, cinseq1).
    """
    inset = [indata[i] for i in indices]
    charinset = [charindata[i] for i in indices]
    leftoverinset = [leftoverdata[i] for i in indices]
    return generate_inseqs(embsize, inset, leftoverinset, charinset)


for optname in optnames:
    #Make our siamese model and trainable embedding layers
    model = Siamese().cuda()
    trainable_embedder = nn.Embedding(embsize, 50, padding_idx = 0).cuda()
    extraembedder = nn.Embedding(len(leftovers), 50, padding_idx = 0).cuda()
    #create the optimizer with the above's parameters
    optimizer = optmaker(optname)
    #Shuffle the training data and split it up 90% training, 10% validation
    np.random.shuffle(sequence)
    split = int(sequence.shape[0]*.9)
    trainsequence = sequence[:split]
    validsequence = sequence[split:]

    #Set the batch size to whatever fits into memory [no more than 64 for such a big network with the 300x700k matrix :)
    batch_size = 64
    #We'll do just 10 epochs each
    for epoch in range(10):
        print('===========')
        #Shuffle the training sequence for random sequential ordering, and varying padding lengths
        np.random.shuffle(trainsequence)
        #initialize print losses and counters
        cnt = 0
        avgpredloss = 0

        #set model into training mode
        #This enables dropout, batchnorm, etc
        model.train()
        #start the training loop
        #Note, due to rounding, up to batch_size - 1 data points get omitted in the loop.
        #This shouldn't impact training much in this case, as they would get picked up in the next epoch
        #But in a "proper" deployment implementation it should be taken care of 
        #[or just treat it like more regularization] :]
        for k in range(len(trainsequence)//batch_size):
            cnt += batch_size
            #select a random batch of training data, inputs and labels
            randselect = trainsequence[k*batch_size:k*batch_size+batch_size]
            outdataset = np.expand_dims(outdata[randselect],1)

            #Convert it into a feedable format and generate the input sequences
            (inseq0, inseq1, 
             linseq0, linseq1, 
             cinseq0, cinseq1) = make_train_inseqs(randselect)

            #add some noise to the data. Randomly omit some words/letters (by mapping them to the null one)
            #This case is a 1% chance for each
            #A number too low and the network will overfit as it specializes on certain obvious features
            #A number too high and you don't learn any details
            inseq0 = addnoise(inseq0, 0.01)
            inseq1 = addnoise(inseq1, 0.01)
            linseq0 = addnoise(linseq0, 0.01)
            linseq1 = addnoise(linseq1, 0.01)
            cinseq0 = addnoise(cinseq0, 0.01)
            cinseq1 = addnoise(cinseq1, 0.01)

            #get the prediction for the input sequences
            pred = call_prediction(model, inseq0, inseq1, linseq0, linseq1, cinseq0, cinseq1, False)

            #set the targets, compute the weighted loss and optimize an iteration
            targ = ta.Variable(torch.Tensor(1.*outdataset)).cuda()
            loss = weighted_crossentropy(pred, targ)
            #do the backwards pass, update the weights and clear the buffers
            loss.backward(retain_variables=False)
            optimizer.step()
            optimizer.zero_grad()

            #do some printing of average training losses to keep track of our progress
            avgpredloss += loss.data.cpu().numpy()[0] * batch_size
            stdout.write("\r%d %f" % (cnt, avgpredloss/cnt))
            stdout.flush()
        #guard against a training partition smaller than one batch
        avgpredloss /= max(cnt, 1)
        stdout.write("\n")

        #Done training one epoch, time for testing
        cnt = 0
        validloss = 0
        #Set the model into evaluation mode. This disables dropout, etc
        model.eval()
        #Note: Here as well. Some testing points might get lost. Given the proportionally large number of remaining
        #samples, the effect should be minimal. But in a proper data science analysis, this should be taken care of.
        #we do it here due to lack of time, don't do it at home :)
        for k in range(len(validsequence)//batch_size):
            cnt += batch_size
            #There's no actual need for random sequential order for evaluation. So this time it's continuous
            #but the split is pre-shuffled once
            randselect = validsequence[k*batch_size:k*batch_size+batch_size]
            outdataset = np.expand_dims(outdata[randselect],1)

            #Generate the sequences, feed them through the net and retrieve the prediction
            (inseq0, inseq1, 
             linseq0, linseq1, 
             cinseq0, cinseq1) = make_train_inseqs(randselect)

            pred = call_prediction(model, inseq0, inseq1, linseq0, linseq1, cinseq0, cinseq1, True)

            #Compute the loss and print it out
            #There's no backward pass here
            targ = ta.Variable(torch.Tensor(1.*outdataset)).cuda()
            predloss = weighted_crossentropy(pred, targ)

            #integrate loss and print
            validloss += predloss.data.cpu().numpy()[0]*batch_size
            stdout.write("\r%d %f" % (cnt, validloss/cnt))
            stdout.flush()
        #turn the accumulated sum into the average validation loss
        #(this is the number that ends up in the submission file name)
        validloss /= max(cnt, 1)
        stdout.write("\n")

    #Ok. Measurable things are done, time to run it on kaggle's data
    #collect the per-batch predictions and stack them once at the end
    #(stacking inside the loop would copy the whole result array for every batch)
    chunks = []
    #eval mode is already set but just to remind ourselves
    model.eval()
    #step through the test data batch by batch; the final slice is simply shorter when the
    #number of test rows is not a multiple of batch_size, so no separate "last batch" pass is needed
    for start in range(0, len(testindata), batch_size):
        stop = start + batch_size
        inset = testindata[start:stop]
        charinset = chartestindata[start:stop]
        leftoverinset = leftovertestdata[start:stop]
        (inseq0, inseq1, 
         linseq0, linseq1,
         cinseq0, cinseq1) = generate_inseqs(embsize, inset, leftoverinset, charinset) 

        pred = call_prediction(model, inseq0, inseq1, linseq0, linseq1, cinseq0, cinseq1, True)

        #add the predictions to the result
        chunks.append(pred.data.cpu().numpy())

        stdout.write("\r%d" % min(stop, len(testindata)))
        stdout.flush()

    #the empty float64 seed keeps the dtype/shape of the stacked result independent of the chunks
    results = np.vstack([np.empty([0,1])] + chunks)

    #write out the results to a CSV to be compounded together later
    #I don't save the models because it's not so straightforward, and I'm running low on disk space
    #But you can if you want
    finaldata = [['test_id','is_duplicate']]+[[x[0],y[0]] for x,y in zip(testdata[1:],results)]
    #the context manager flushes and closes the file once all rows are written
    with open(optname+"_"+str(avgpredloss)+"_"+str(validloss)+'submission.csv','w') as outfile:
        csv.writer(outfile).writerows(finaldata)