In [222]:
from __future__ import print_function, division, unicode_literals
import six
import os
from os.path import join
import json
from codecs import open
from collections import defaultdict
from operator import itemgetter
import nltk
import numpy as np
from nltk.corpus import stopwords
import re
import codecs
import pandas as pd
import random
from itertools import islice

import theano
import theano.tensor as T
import lasagne
import lasagne.layers as LL
from lasagne.nonlinearities import softmax, elu

from sklearn.cross_validation import KFold

In [223]:
# Root folder of the challenge data, resolved under the current user's $HOME.
DATA_DIR = join(os.environ['HOME'], 'data/allen-ai-challenge')
# Folder holding the per-model feature files that fn() builds paths into.
FEATURE_DIR = join(DATA_DIR, 'features')

# Cached "question id <TAB> correct letter" file; it is generated from
# TRAINING_SET further down the notebook when it does not exist yet.
TRAINING_CORRECTS = join(FEATURE_DIR, 'correct_answers.tsv')
TRAINING_SET = join(DATA_DIR, 'training_set.tsv')
VALIDATION_SET = join(DATA_DIR, 'validation_set.tsv')

# Output path of the final ensemble submission.
# NOTE(review): the name is a typo of SUBMISSION_FILE; it is referenced under
# this exact spelling by the submission cell, so rename both places together.
SUMBISSION_FILE = join(DATA_DIR, 'submissions', 'ensemble1.csv')

# Each feature set below is described by a pair:
#   <NAME>      -- dataset name; fn() turns it into '<type>_<NAME>.tsv'
#   DIM_<NAME>  -- number of features per answer choice (last axis of the
#                  InputLayer shape of the matching network)

# Earlier n-gram feature set, kept for reference only.
# DIM_NGRAMS = 1
# NGRAMS = 'ngrams_0429_1483'

DIM_NGRAMS = 4
NGRAMS = 'merged_ngrams'

DIM_REPTIL = 1
REPTIL = 'set_reptil_features'

DIM_LUCENE_ROMAN = 50
LUCENE_ROMAN = 'lucene_f5'

DIM_LUCENE_MORE = 20
LUCENE_MORE = 'lucene_f5_more_data'

In [224]:
def fn(dataset_name, dataset_type='training'):
    """Build the path of a feature file inside FEATURE_DIR.

    The file is named '<dataset_type>_<dataset_name>.tsv'.  `dataset_type`
    must be either 'training' or 'validation' (checked with an assert).
    """
    assert dataset_type in ['training', 'validation']
    basename = '%s_%s.tsv' % (dataset_type, dataset_name)
    return join(FEATURE_DIR, basename)

In [225]:
# Build the "qid <TAB> correct answer" cache from the raw training set the
# first time the notebook runs; later runs reuse the existing file.
if not os.path.exists(TRAINING_CORRECTS):
    print(TRAINING_CORRECTS, 'not found. Creating a new one...')
    with open(TRAINING_SET, encoding='utf8') as fi, \
            open(TRAINING_CORRECTS, encoding='utf8', mode='w') as fo:
        fi.readline()  # skip header
        for line in fi:
            # Column 0 is the question id, column 2 the correct answer.
            fields = line.split('\t')
            qid, correct = fields[0], fields[2]
            print(qid, correct, sep='\t', file=fo)

## Prepare data
-------------

In [226]:
# Maps question id -> index (0..3) of the correct answer letter in 'ABCD'.
correct_map = {}
with open(TRAINING_CORRECTS, encoding='utf8') as f:
    for line in f:
        # Every line holds exactly two whitespace-separated fields.
        qid, c = line.strip().split()
        correct_map[int(qid)] = 'ABCD'.index(c)

In [227]:
# Question ids of the validation set, in file order (first TSV column).
idx_valid = []
with open(VALIDATION_SET, encoding='utf8') as f:
    f.readline()  # skip header
    idx_valid.extend(int(line.strip().split('\t')[0]) for line in f)

In [228]:
# Hold out the first of three folds over the sorted labelled question ids;
# the two remaining folds form the training set.
ids = sorted(correct_map)
folds = KFold(len(ids), n_folds=3)
itrain, itest = next(iter(folds))
idx_train = [ids[pos] for pos in itrain]
idx_test = [ids[pos] for pos in itest]

In [229]:
def cycle_rand(seq):
    """Yield the items of `seq` forever, reshuffled on every pass.

    Each pass yields every item exactly once, in a fresh random order drawn
    with `random.shuffle`.

    `seq` is copied once up front, so one-shot iterators are supported as
    well as lists.  An empty `seq` yields nothing and the generator simply
    ends, instead of spinning forever without ever producing a value.
    """
    items = list(seq)
    if not items:
        # Nothing to cycle over: looping would never reach a `yield`.
        return
    while True:
        # Shuffle a copy so every pass starts from the original ordering.
        rseq = list(items)
        random.shuffle(rseq)
        for s in rseq:
            yield s

In [230]:
def take(n, iterable):
    """Collect at most the first `n` items of `iterable` into a list."""
    head = islice(iterable, n)
    return list(head)

In [231]:
def read_dataset_(filename, dims, dict_out):
    """Load one feature TSV file into `dict_out`.

    Each non-blank line has the form
    ``<qid> TAB <f;f;...> TAB <f;f;...> ...``: an integer question id followed
    by one ';'-separated list of floats per tab-separated column.

    Parameters
    ----------
    filename : path of the TSV file to read (decoded as UTF-8).
    dims : accepted for interface compatibility; not used while parsing.
    dict_out : mapping that is updated in place with ``qid -> np.array`` built
        from the parsed columns (one row per column of the line).

    Blank lines (for example a stray empty line at the end of the file) are
    skipped instead of failing on ``int('')``.
    """
    with open(filename, encoding='utf8') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            row = line.split('\t')
            qid = int(row[0])
            dict_out[qid] = np.array([np.fromstring(x, sep=';') for x in row[1:]])

            
def read_dataset(dataset_name, dims=1):
    """Load the training and validation feature files of one dataset.

    Returns a defaultdict mapping question id -> feature array.  Looking up
    an id that is in neither file produces (and stores) an all-zero array of
    shape (4, dims).
    """
    def _missing_row():
        return np.zeros((4, dims))

    data = defaultdict(_missing_row)
    for dataset_type in ('training', 'validation'):
        read_dataset_(fn(dataset_name, dataset_type), dims, data)
    return data


def generate_data(indices, data_x_dict, labels=None):
    """Assemble feature and label arrays for the given question ids.

    Parameters
    ----------
    indices : sequence of question ids; output rows follow this order.
    data_x_dict : mapping ``qid -> array`` with one row per answer choice.
        It is indexed with ``data_x_dict[qid]``, so a defaultdict will insert
        its default entry for ids it does not contain yet.
    labels : optional mapping ``qid -> correct answer index``.  Defaults to
        the notebook-level ``correct_map``.  Ids without a label get 0.

    Returns
    -------
    (x, y) : ``x`` is float32 of shape (len(indices), 4, f_dim) and ``y`` is
        int32 of shape (len(indices),).

    Raises
    ------
    ValueError : if `data_x_dict` is empty, because the feature width is
        inferred from one of its entries.
    """
    if labels is None:
        labels = correct_map
    if len(data_x_dict) == 0:
        raise ValueError('data_x_dict is empty; cannot infer feature dimension')

    # Feature width = length of the first row of an arbitrary entry.
    # next(iter(...)) works on Python 2 lists and Python 3 dict views alike.
    f_dim = next(iter(data_x_dict.values()))[0].shape[0]

    x = np.zeros((len(indices), 4, f_dim), dtype='float32')
    y = np.zeros((len(indices)), dtype='int32')
    for row, i in enumerate(indices):
        x[row] = data_x_dict[i]
        y[row] = labels.get(i, 0)
    return x, y

In [232]:
def shuffle_dataset(dataset_x, dataset_y):
    """Randomly permute the answer choices of every row, keeping labels aligned.

    Parameters
    ----------
    dataset_x : array indexed as [row, choice, feature].
    dataset_y : integer array indexed by row, holding the index of the
        correct choice.

    Returns
    -------
    (rx, ry) : new arrays of the same shapes and dtypes.  For every row a
        fresh permutation is drawn with ``np.random.shuffle``; choice ``k``
        moves to position ``perm[k]`` and the label is remapped the same way,
        so ``rx[r, ry[r]]`` equals ``dataset_x[r, dataset_y[r]]``.

    The number of choices is taken from ``dataset_x.shape[1]`` rather than
    being fixed to 4, and the row loop uses ``range`` so the function also
    runs on Python 3.
    """
    choice_count = dataset_x.shape[1]
    rx = np.zeros_like(dataset_x)
    ry = np.zeros_like(dataset_y)
    for i_row in range(dataset_y.shape[0]):
        rand_order = np.arange(choice_count)
        np.random.shuffle(rand_order)
        # Scatter: original choice k lands at position rand_order[k].
        rx[i_row, rand_order] = dataset_x[i_row]
        ry[i_row] = rand_order[dataset_y[i_row]]
    return rx, ry


def flatten_dataset(dataset_x, dataset_y):
    """Merge the choice and feature axes of `dataset_x` into one axis.

    A 3-D array of shape (rows, choices, features) is reshaped to
    (rows, choices * features); `dataset_y` is returned unchanged.
    """
    n_rows, n_choices, n_features = dataset_x.shape
    flat_shape = (n_rows, n_choices * n_features)
    return dataset_x.reshape(flat_shape), dataset_y

In [233]:
# generate_data([100001], lucene)

In [234]:
# shuffle_dataset(*generate_data([100001], lucene))

## NGrams
-----------

In [235]:
# n-gram features for all training and validation questions, keyed by
# question id; ids missing from the files fall back to zeros of (4, DIM_NGRAMS).
ngrams = read_dataset(NGRAMS, dims=DIM_NGRAMS)

In [236]:
# Weight of the L2 penalty added to the training objective.  Later model
# cells read this notebook-level name as well.
L2 = 0.0001

# Symbolic vector of correct-answer indices, one per batch row.
t_target = T.ivector()

# Input is (batch, 4 answer choices, DIM_NGRAMS features per choice).
# One hidden ELU layer of 30 units, then a bias-free 4-way softmax.
# (Lasagne's DenseLayer presumably flattens the trailing (4, DIM_NGRAMS)
# axes into a single feature vector -- confirm against the Lasagne docs.)
l_in = LL.InputLayer((None, 4, DIM_NGRAMS))
nn = LL.DenseLayer(l_in, 30, nonlinearity=elu)
nn = LL.DenseLayer(nn, 4, nonlinearity=softmax, b=None)
t_output = LL.get_output(nn)

# Mean cross-entropy and mean accuracy over the batch.
t_cost = lasagne.objectives.categorical_crossentropy(t_output, t_target).mean()
t_acc = lasagne.objectives.categorical_accuracy(t_output, t_target).mean()

params = LL.get_all_params(nn)

l2 = lasagne.regularization.regularize_network_params(nn, lasagne.regularization.l2)

# Adam minimises cross-entropy plus the weighted L2 penalty.
updates = lasagne.updates.adam(t_cost + L2*l2, params)

# train_fn applies one Adam step and returns the cross-entropy WITHOUT the
# L2 term; cost_fn only evaluates (cost, accuracy) and updates nothing.
train_fn = theano.function([l_in.input_var, t_target], t_cost, updates=updates)
cost_fn = theano.function([l_in.input_var, t_target], [t_cost, t_acc])

# Softmax probabilities for a batch of inputs.
forward_fn = theano.function([l_in.input_var], t_output)

In [239]:
# Endless, reshuffled stream of training question ids.
id_generator = cycle_rand(idx_train)

test_x, test_y = generate_data(idx_test, ngrams)

# Model-free baseline: pick the choice with the largest feature sum.  It only
# depends on the fixed test set, so it is computed once instead of being
# recomputed on every progress report.
mean_acc = (test_x.sum(axis=2).argmax(axis=1) == test_y).sum() / test_y.shape[0]

BATCH = 100
costs = []
rows_seen = 0
while rows_seen < 500000:
    indices = take(BATCH, id_generator)
    # Permute the answer choices of every row so the network cannot learn a
    # positional bias.
    batch_x, batch_y = shuffle_dataset(*generate_data(indices, ngrams))
    costs.append(train_fn(batch_x, batch_y))
    rows_seen += BATCH
    # Report every 10000 rows: rows seen, mean training cost since the last
    # report, held-out cost, held-out accuracy, baseline accuracy.
    if rows_seen % 10000 < BATCH:
        nll, acc = cost_fn(test_x, test_y)
        print(rows_seen, np.mean(costs), nll, acc, mean_acc)
        costs = []

10000 1.32587 1.34104549885 0.410071942446 0.402877697842
20000 1.32363 1.33034670353 0.408872901679 0.402877697842
30000 1.31468 1.33425927162 0.405275779376 0.402877697842
40000 1.31667 1.37550365925 0.410071942446 0.402877697842
50000 1.32522 1.34450221062 0.401678657074 0.402877697842
60000 1.31717 1.33681488037 0.402877697842 0.402877697842
70000 1.30996 1.34732842445 0.412470023981 0.402877697842
80000 1.31262 1.34604716301 0.39928057554 0.402877697842
90000 1.31091 1.34634280205 0.411270983213 0.402877697842
100000 1.32087 1.36074709892 0.39928057554 0.402877697842
110000 1.3186 1.44600951672 0.39448441247 0.402877697842
120000 1.32495 1.35781598091 0.404076738609 0.402877697842
130000 1.3085 1.35561776161 0.406474820144 0.402877697842
140000 1.30678 1.347671628 0.393285371703 0.402877697842
150000 1.31331 1.36765646935 0.410071942446 0.402877697842
160000 1.30564 1.3350276947 0.398081534772 0.402877697842
170000 1.30277 1.34949028492 0.39928057554 0.402877697842
180000 1.31441 

In [240]:
# Softmax outputs of the n-gram network for the three splits.  forward_fn is
# rebound by each later model cell, so these must be captured before the next
# network is compiled.
ngrams_train_pred = forward_fn(generate_data(idx_train, ngrams)[0])
ngrams_test_pred = forward_fn(generate_data(idx_test, ngrams)[0])
ngrams_valid_pred = forward_fn(generate_data(idx_valid, ngrams)[0])

## REPTIL
--------------------

In [241]:
# REPTIL features keyed by question id.  dims is left at read_dataset's
# default of 1, which matches DIM_REPTIL.
reptil = read_dataset(REPTIL)

In [244]:
# Symbolic vector of correct-answer indices, one per batch row.
t_target = T.ivector()

# Input is (batch, 4 answer choices, DIM_REPTIL features per choice), followed
# by a 30-unit ELU layer and a bias-free 4-way softmax.
l_in = LL.InputLayer((None, 4, DIM_REPTIL))
nn = LL.DenseLayer(l_in, 30, nonlinearity=elu)
nn = LL.DenseLayer(nn, 4, nonlinearity=softmax, b=None)
# Two output expressions: the non-deterministic one drives training, the
# deterministic one drives evaluation and prediction.
t_output = LL.get_output(nn, deterministic=False)
t_output_det = LL.get_output(nn, deterministic=True)

# t_acc is built but not passed to any compiled function in this cell.
t_cost = lasagne.objectives.categorical_crossentropy(t_output, t_target).mean()
t_acc = lasagne.objectives.categorical_accuracy(t_output, t_target).mean()

t_cost_det = lasagne.objectives.categorical_crossentropy(t_output_det, t_target).mean()
t_acc_det = lasagne.objectives.categorical_accuracy(t_output_det, t_target).mean()

params = LL.get_all_params(nn)

# L2 is the notebook-level penalty weight; its value is whatever the most
# recently executed cell assigned to it.
l2 = lasagne.regularization.regularize_network_params(nn, lasagne.regularization.l2)
updates = lasagne.updates.adam(t_cost + L2*l2, params)

# train_fn applies one Adam step and returns the cross-entropy without the
# L2 term; cost_fn evaluates deterministic (cost, accuracy) without updating.
train_fn = theano.function([l_in.input_var, t_target], t_cost, updates=updates)
cost_fn = theano.function([l_in.input_var, t_target], [t_cost_det, t_acc_det])

# Deterministic softmax probabilities for a batch of inputs.
forward_fn = theano.function([l_in.input_var], t_output_det)

In [245]:
# Endless, reshuffled stream of training question ids.
id_generator = cycle_rand(idx_train)

test_x, test_y = generate_data(idx_test, reptil)

# Model-free baseline: pick the choice with the largest value of feature 0.
# It only depends on the fixed test set, so it is computed once instead of
# being recomputed on every progress report.
mean_acc = (test_x[:,:,0].argmax(axis=1) == test_y).sum() / test_y.shape[0]

BATCH = 100
costs = []
rows_seen = 0
while rows_seen < 200000:
    indices = take(BATCH, id_generator)
    # Permute the answer choices of every row so the network cannot learn a
    # positional bias.
    batch_x, batch_y = shuffle_dataset(*generate_data(indices, reptil))
    costs.append(train_fn(batch_x, batch_y))
    rows_seen += BATCH
    # Report every 1000 rows: rows seen, mean training cost since the last
    # report, held-out cost, held-out accuracy, baseline accuracy.
    if rows_seen % 1000 < BATCH:
        nll, acc = cost_fn(test_x, test_y)
        print(rows_seen, np.mean(costs), nll, acc, mean_acc)
        costs = []

1000 1.39045 1.3866648674 0.268585131894 0.434052757794
2000 1.38578 1.3846372366 0.269784172662 0.434052757794
3000 1.38584 1.38259017467 0.244604316547 0.434052757794
4000 1.38541 1.38134741783 0.28896882494 0.434052757794
5000 1.38261 1.37879121304 0.300959232614 0.434052757794
6000 1.38098 1.37630319595 0.310551558753 0.434052757794
7000 1.37674 1.37425780296 0.317745803357 0.434052757794
8000 1.37797 1.37329018116 0.31654676259 0.434052757794
9000 1.37517 1.37300682068 0.315347721823 0.434052757794
10000 1.37595 1.37225270271 0.328537170264 0.434052757794
11000 1.37356 1.3713350296 0.329736211031 0.434052757794
12000 1.36995 1.3695088625 0.324940047962 0.434052757794
13000 1.36833 1.36786532402 0.326139088729 0.434052757794
14000 1.36996 1.36613285542 0.338129496403 0.434052757794
15000 1.36573 1.36420297623 0.334532374101 0.434052757794
16000 1.3637 1.36283802986 0.347721822542 0.434052757794
17000 1.36455 1.36141943932 0.350119904077 0.434052757794
18000 1.35904 1.36014056206 0.

In [248]:
# Deterministic softmax outputs of the REPTIL network for the three splits.
# forward_fn is rebound by each later model cell, so these must be captured
# before the next network is compiled.
reptil_train_pred = forward_fn(generate_data(idx_train, reptil)[0])
reptil_test_pred = forward_fn(generate_data(idx_test, reptil)[0])
reptil_valid_pred = forward_fn(generate_data(idx_valid, reptil)[0])

## Lucene + Roman
-------------

In [249]:
# Pass the real feature width so that the all-zero placeholder created for
# question ids missing from the files has shape (4, DIM_LUCENE_ROMAN).  With
# the default dims=1 the placeholder is (4, 1): it only works through
# broadcasting and breaks generate_data's width inference if such an entry
# happens to be the one it inspects.
lucene_roman = read_dataset(LUCENE_ROMAN, dims=DIM_LUCENE_ROMAN)

In [253]:
# Symbolic vector of correct-answer indices, one per batch row.
t_target = T.ivector()

# Input is (batch, 4 answer choices, DIM_LUCENE_ROMAN features per choice).
l_in = LL.InputLayer((None, 4, DIM_LUCENE_ROMAN))
# Keep only the first 10 features of every choice; the remaining ones are
# not seen by the network.
nn = LL.SliceLayer(l_in, slice(0, 10), axis=2)
# 300-unit ELU hidden layer followed by a bias-free 4-way softmax.
nn = LL.DenseLayer(nn, 300, nonlinearity=elu)
nn = LL.DenseLayer(nn, 4, nonlinearity=softmax, b=None)
# Two output expressions: the non-deterministic one drives training, the
# deterministic one drives evaluation and prediction.
t_output = LL.get_output(nn, deterministic=False)
t_output_det = LL.get_output(nn, deterministic=True)

# t_acc is built but not passed to any compiled function in this cell.
t_cost = lasagne.objectives.categorical_crossentropy(t_output, t_target).mean()
t_acc = lasagne.objectives.categorical_accuracy(t_output, t_target).mean()

t_cost_det = lasagne.objectives.categorical_crossentropy(t_output_det, t_target).mean()
t_acc_det = lasagne.objectives.categorical_accuracy(t_output_det, t_target).mean()

params = LL.get_all_params(nn)

# L2 is the notebook-level penalty weight; its value is whatever the most
# recently executed cell assigned to it.
l2 = lasagne.regularization.regularize_network_params(nn, lasagne.regularization.l2)
updates = lasagne.updates.adam(t_cost + L2*l2, params)

# train_fn applies one Adam step and returns the cross-entropy without the
# L2 term; cost_fn evaluates deterministic (cost, accuracy) without updating.
train_fn = theano.function([l_in.input_var, t_target], t_cost, updates=updates)
cost_fn = theano.function([l_in.input_var, t_target], [t_cost_det, t_acc_det])

# Deterministic softmax probabilities for a batch of inputs.
forward_fn = theano.function([l_in.input_var], t_output_det)

In [255]:
# Endless, reshuffled stream of training question ids.
id_generator = cycle_rand(idx_train)

test_x, test_y = generate_data(idx_test, lucene_roman)

# Model-free baseline: pick the choice with the largest feature sum.  It only
# depends on the fixed test set, so it is computed once instead of being
# recomputed on every progress report.
mean_acc = (test_x.reshape((test_x.shape[0], 4, DIM_LUCENE_ROMAN)).sum(axis=2).argmax(axis=1) == test_y).sum() / test_y.shape[0]

BATCH = 100
# Start from an empty cost history.  Without this the cell silently reuses
# whatever `costs` an earlier (possibly interrupted) training cell left
# behind, or fails with a NameError on a fresh kernel.
costs = []
rows_seen = 0
while rows_seen < 300000:
    indices = take(BATCH, id_generator)
    # Permute the answer choices of every row so the network cannot learn a
    # positional bias.
    batch_x, batch_y = shuffle_dataset(*generate_data(indices, lucene_roman))
    costs.append(train_fn(batch_x, batch_y))
    rows_seen += BATCH
    # Report every 10000 rows: rows seen, mean training cost since the last
    # report, held-out cost, held-out accuracy, baseline accuracy.
    if rows_seen % 10000 < BATCH:
        nll, acc = cost_fn(test_x, test_y)
        print(rows_seen, np.mean(costs), nll, acc, mean_acc)
        costs = []

10000 1.0953 1.11536073685 0.537170263789 0.531175059952
20000 1.09511 1.12386584282 0.535971223022 0.531175059952
30000 1.09625 1.13148450851 0.525179856115 0.531175059952
40000 1.09616 1.11229348183 0.549160671463 0.531175059952
50000 1.10021 1.1151471138 0.539568345324 0.531175059952
60000 1.09585 1.12077796459 0.533573141487 0.531175059952
70000 1.09346 1.11337471008 0.552757793765 0.531175059952
80000 1.09594 1.12249219418 0.547961630695 0.531175059952
90000 1.09494 1.13337540627 0.517985611511 0.531175059952
100000 1.09606 1.13887917995 0.516786570743 0.531175059952
110000 1.09817 1.11757302284 0.541966426859 0.531175059952
120000 1.09585 1.11861991882 0.532374100719 0.531175059952
130000 1.09721 1.12010848522 0.528776978417 0.531175059952
140000 1.09711 1.14166128635 0.519184652278 0.531175059952
150000 1.09805 1.12022030354 0.547961630695 0.531175059952
160000 1.09497 1.1348181963 0.528776978417 0.531175059952
170000 1.09414 1.11387884617 0.526378896882 0.531175059952
180000 1.

In [256]:
# Deterministic softmax outputs of the Lucene+Roman network for the three
# splits.  forward_fn is rebound by the next model cell, so these must be
# captured before that network is compiled.
lucene_train_pred = forward_fn(generate_data(idx_train, lucene_roman)[0])
lucene_test_pred = forward_fn(generate_data(idx_test, lucene_roman)[0])
lucene_valid_pred = forward_fn(generate_data(idx_valid, lucene_roman)[0])

## Lucene More Data

In [257]:
# "Lucene more data" features keyed by question id; ids missing from the
# files fall back to zeros of shape (4, DIM_LUCENE_MORE).
lucene_more = read_dataset(LUCENE_MORE, dims=DIM_LUCENE_MORE)

In [276]:
# Symbolic vector of correct-answer indices, one per batch row.
t_target = T.ivector()

# Input is (batch, 4 answer choices, DIM_LUCENE_MORE features per choice).
l_in = LL.InputLayer((None, 4, DIM_LUCENE_MORE))
# Swap the last two axes -> (batch, DIM_LUCENE_MORE, 4).  The LSTM therefore
# presumably treats the feature index as the sequence axis and the 4 choices
# as the per-step input -- confirm against the Lasagne recurrent-layer docs.
nn = LL.DimshuffleLayer(l_in, (0, 2, 1))
# 5-unit LSTM; only the final step's output is passed on.
nn = LL.LSTMLayer(nn, 5, grad_clipping=10, only_return_final=True)
# Optional extra hidden layer, currently disabled.
# nn = LL.DenseLayer(nn, 10, nonlinearity=elu)
nn = LL.DenseLayer(nn, 4, nonlinearity=softmax, b=None)
# Two output expressions: the non-deterministic one drives training, the
# deterministic one drives evaluation and prediction.
t_output = LL.get_output(nn, deterministic=False)
t_output_det = LL.get_output(nn, deterministic=True)

# t_acc is built but not passed to any compiled function in this cell.
t_cost = lasagne.objectives.categorical_crossentropy(t_output, t_target).mean()
t_acc = lasagne.objectives.categorical_accuracy(t_output, t_target).mean()

t_cost_det = lasagne.objectives.categorical_crossentropy(t_output_det, t_target).mean()
t_acc_det = lasagne.objectives.categorical_accuracy(t_output_det, t_target).mean()

params = LL.get_all_params(nn)

# NOTE(review): this rebinds the notebook-level L2 (0.0001 in the n-gram
# cell) to a 10x larger value.  Earlier model cells that read L2 without
# assigning it will pick up 0.001 if they are re-run after this cell.
L2 = 0.001
l2 = lasagne.regularization.regularize_network_params(nn, lasagne.regularization.l2)
updates = lasagne.updates.adam(t_cost + L2*l2, params)

# train_fn applies one Adam step and returns the cross-entropy without the
# L2 term; cost_fn evaluates deterministic (cost, accuracy) without updating.
train_fn = theano.function([l_in.input_var, t_target], t_cost, updates=updates)
cost_fn = theano.function([l_in.input_var, t_target], [t_cost_det, t_acc_det])

# Deterministic softmax probabilities for a batch of inputs.
forward_fn = theano.function([l_in.input_var], t_output_det)
# Compiled baseline: index of the choice with the largest mean feature value.
# NOTE(review): no visible cell calls mean_fn.
mean_fn = theano.function([l_in.input_var], l_in.input_var.mean(axis=2).argmax(axis=1))

In [277]:
# Endless, reshuffled stream of training question ids.
id_generator = cycle_rand(idx_train)

test_x, test_y = generate_data(idx_test, lucene_more)

# Model-free baseline: pick the choice with the largest feature sum.  It only
# depends on the fixed test set, so it is computed once instead of being
# recomputed on every progress report.
mean_acc = (test_x.sum(axis=2).argmax(axis=1) == test_y).sum() / test_y.shape[0]

BATCH = 100
# Start from an empty cost history.  Without this the cell silently reuses
# whatever `costs` an earlier (possibly interrupted) training cell left
# behind, or fails with a NameError on a fresh kernel.
costs = []
rows_seen = 0
while rows_seen < 3000000:
    indices = take(BATCH, id_generator)
    # Permute the answer choices of every row so the network cannot learn a
    # positional bias.
    batch_x, batch_y = shuffle_dataset(*generate_data(indices, lucene_more))
    costs.append(train_fn(batch_x, batch_y))
    rows_seen += BATCH
    # Report every 10000 rows: rows seen, mean training cost since the last
    # report, held-out cost, held-out accuracy, baseline accuracy.
    if rows_seen % 10000 < BATCH:
        nll, acc = cost_fn(test_x, test_y)
        print(rows_seen, np.mean(costs), nll, acc, mean_acc)
        costs = []

10000 1.27538 1.3837480545 0.227817745803 0.55035971223
20000 1.35043 1.3027755022 0.384892086331 0.55035971223
30000 1.24004 1.23423862457 0.47721822542 0.55035971223
40000 1.22101 1.23795497417 0.488009592326 0.55035971223
50000 1.20105 1.21217191219 0.492805755396 0.55035971223
60000 1.18703 1.2149541378 0.458033573141 0.55035971223
70000 1.1921 1.21274876595 0.490407673861 0.55035971223
80000 1.17808 1.1908916235 0.47721822542 0.55035971223
90000 1.17646 1.18332064152 0.485611510791 0.55035971223
100000 1.16929 1.18165802956 0.494004796163 0.55035971223
110000 1.17254 1.21361863613 0.448441247002 0.55035971223
120000 1.17026 1.178383708 0.49520383693 0.55035971223
130000 1.16431 1.17918097973 0.492805755396 0.55035971223
140000 1.15209 1.15832865238 0.49520383693 0.55035971223
150000 1.15025 1.15513908863 0.516786570743 0.55035971223
160000 1.1445 1.152561903 0.503597122302 0.55035971223
170000 1.13546 1.14107644558 0.514388489209 0.55035971223
180000 1.13093 1.1483720541 0.5131894

KeyboardInterrupt: 

In [278]:
# Deterministic softmax outputs of the LSTM ("Lucene more data") network for
# the three splits, taken from whatever state training had reached when the
# loop above stopped.
lucenem_train_pred = forward_fn(generate_data(idx_train, lucene_more)[0])
lucenem_test_pred = forward_fn(generate_data(idx_test, lucene_more)[0])
lucenem_valid_pred = forward_fn(generate_data(idx_valid, lucene_more)[0])

## Averaging
-------------

In [283]:
# Labels do not depend on the feature set, so any dataset can be used to
# fetch them.  Both label vectors are rebuilt here so the cell does not rely
# on a `test_y` left over from a training cell, and indexing with [1] avoids
# overwriting IPython's `_` (last output) variable.
train_y = generate_data(idx_train, ngrams)[1]
test_y = generate_data(idx_test, ngrams)[1]

# Weighted sum of the four models' softmax outputs; the LSTM model counts
# twice.  Only the argmax is used, so the sum is not normalised.
average_train_pred = 2*lucenem_train_pred + lucene_train_pred + reptil_train_pred + ngrams_train_pred
average_test_pred = 2*lucenem_test_pred + lucene_test_pred + reptil_test_pred + ngrams_test_pred
average_valid_pred = 2*lucenem_valid_pred + lucene_valid_pred + reptil_valid_pred + ngrams_valid_pred

# Accuracy = fraction of rows whose argmax matches the label.  The mean of
# the boolean vector yields a plain scalar; dividing by the `shape` tuple
# produced a one-element array instead.
print('Training average', (average_train_pred.argmax(axis=1) == train_y).mean())
print('Testing average', (average_test_pred.argmax(axis=1) == test_y).mean())


Training average [ 0.55942377]
Testing average [ 0.56354916]


## Submit
----------------

In [286]:
# Make sure the target folder exists; opening the file for writing fails
# otherwise.
submission_dir = os.path.dirname(SUMBISSION_FILE)
if not os.path.isdir(submission_dir):
    os.makedirs(submission_dir)

# Predicted answer index for every validation question, in idx_valid order.
valid_answers = average_valid_pred.argmax(axis=1)
assert len(idx_valid) == valid_answers.shape[0], 'one prediction per validation id expected'

# Write the submission as "id,correctAnswer" with the answer as a letter A-D.
with open(SUMBISSION_FILE, encoding='utf8', mode='w') as f:
    print('id,correctAnswer', file=f)
    for qid, answer in zip(idx_valid, valid_answers):
        print(qid, 'ABCD'[answer], sep=',', file=f)