In [None]:
from __future__ import print_function, division, unicode_literals
import six
import os
from os.path import join
import json
from codecs import open
from collections import defaultdict
from operator import itemgetter
import nltk
import numpy as np
from nltk.corpus import stopwords
import re
import codecs
import pandas as pd
import random
from itertools import islice

import theano
import theano.tensor as T
import lasagne
import lasagne.layers as LL
from lasagne.nonlinearities import softmax, elu

from sklearn.cross_validation import KFold

In [None]:
# Root of the challenge data; resolved from $HOME, so HOME must be set in the environment.
DATA_DIR = join(os.environ['HOME'], 'data/allen-ai-challenge')
# Directory holding the per-model feature TSVs and the cached answer key.
FEATURE_DIR = join(DATA_DIR, 'features')

# Cache of "<qid>\t<correct letter>" lines; generated from TRAINING_SET when it does not exist yet.
TRAINING_CORRECTS = join(FEATURE_DIR, 'correct_answers.tsv')
TRAINING_SET = join(DATA_DIR, 'training_set.tsv')
VALIDATION_SET = join(DATA_DIR, 'validation_set.tsv')

# NOTE(review): the name is misspelled ("SUMBISSION"); it is not referenced in the visible cells,
# but it is left untouched in case cells outside this view use it.
SUMBISSION_FILE = join(DATA_DIR, 'submissions', 'ensemble1.csv')

# Alternative NGrams feature set, kept for reference.
# DIM_NGRAMS = 1
# NGRAMS = 'ngrams_0429_1483'

# For each model: DIM_* is the number of features per answer choice and the
# other constant is the dataset name that fn() turns into a feature file path.
DIM_NGRAMS = 4
NGRAMS = 'merged_ngrams'

# NOTE(review): DIM_REPTIL and DIM_LUCENE_ROMAN are declared here, but the visible
# cells hard-code 1 and 50 instead of reading them.
DIM_REPTIL = 1
REPTIL = 'set_reptil_features'

DIM_LUCENE_ROMAN = 50
LUCENE_ROMAN = 'lucene_f5'

In [None]:
def fn(dataset_name, dataset_type='training'):
    """Return the path of a feature file inside FEATURE_DIR.

    The file is named ``<dataset_type>_<dataset_name>.tsv``; ``dataset_type``
    must be either ``'training'`` or ``'validation'``.
    """
    assert dataset_type in ['training', 'validation']
    basename = '%s_%s.tsv' % (dataset_type, dataset_name)
    return join(FEATURE_DIR, basename)

In [None]:
# Build the answer-key cache on first use: one "<qid>\t<correct>" line per
# training row, taken from the first and third tab-separated columns.
if not os.path.exists(TRAINING_CORRECTS):
    print(TRAINING_CORRECTS, 'not found. Creating a new one...')
    with open(TRAINING_SET, encoding='utf8') as fi:
        fi.readline()  # the first line is a header, not data
        with open(TRAINING_CORRECTS, encoding='utf8', mode='w') as fo:
            for line in fi:
                # Split once and pick both columns in a single step.
                qid, correct = itemgetter(0, 2)(line.split('\t'))
                print(qid, correct, sep='\t', file=fo)

## Prepare data
-------------

In [None]:
# Map each training question id to the index (0-3) of its correct answer letter.
correct_map = {}
with open(TRAINING_CORRECTS, encoding='utf8') as f:
    for line in f:
        # Each line carries exactly two whitespace-separated fields: id and letter.
        qid, c = line.strip().split()
        correct_map[int(qid)] = 'ABCD'.index(c)

In [None]:
# Collect the question ids of the validation set, in file order.
idx_valid = []
with open(VALIDATION_SET, encoding='utf8') as f:
    f.readline()  # discard the first line
    for line in f:
        first_column = line.strip().split('\t')[0]
        idx_valid.append(int(first_column))

In [None]:
# Split the labelled question ids into train / held-out test using the first
# (train, test) index pair produced by a 3-fold KFold over the sorted ids.
ids = sorted(correct_map)
folds = KFold(len(ids), n_folds=3)
itrain, itest = next(iter(folds))
idx_train = [ids[pos] for pos in itrain]
idx_test = [ids[pos] for pos in itest]

In [None]:
def cycle_rand(seq):
    """Yield the items of ``seq`` endlessly, reshuffled on every pass.

    Each pass yields every item exactly once, in an order produced by
    ``random.shuffle``; ``seq`` itself is never modified.

    ``seq`` is snapshotted once up front, so a one-shot iterator can be
    cycled too. An empty ``seq`` produces an empty generator instead of
    spinning forever without yielding, which would hang any consumer
    such as ``take``.
    """
    items = list(seq)
    if not items:
        # Nothing to cycle over: finish immediately rather than busy-loop.
        return
    while True:
        # Shuffle a fresh copy each pass so every pass starts from the
        # original order, independent of the previous permutation.
        rseq = list(items)
        random.shuffle(rseq)
        for s in rseq:
            yield s

In [None]:
def take(n, iterable):
    """Collect at most the first ``n`` items of ``iterable`` into a list."""
    head = islice(iterable, n)
    return [item for item in head]

In [None]:
def read_dataset_(filename, dims, dict_out):
    """Read one feature TSV into ``dict_out`` (modified in place).

    Every line is ``<qid>\\t<v;v;...>\\t<v;v;...>...``: the first column is the
    integer question id, and each remaining column is a ';'-separated vector.
    The vectors of a line are stacked into one array stored under the id.

    ``dims`` is accepted for signature compatibility but is not used here.
    """
    with open(filename, encoding='utf8') as tsv:
        for raw_line in tsv:
            cells = raw_line.strip().split('\t')
            key = int(cells[0])
            vectors = [np.fromstring(cell, sep=';') for cell in cells[1:]]
            dict_out[key] = np.array(vectors)

            
def read_dataset(dataset_name, dims=1):
    """Load the training and validation feature files of one dataset.

    Returns a ``defaultdict`` keyed by question id. Ids that appear in neither
    file resolve to a fresh all-zero array of shape ``(4, dims)`` on lookup.
    """
    def _blank():
        return np.zeros((4, dims))

    data = defaultdict(_blank)
    # Training file first, then validation, both into the same mapping.
    for dataset_type in ('training', 'validation'):
        read_dataset_(fn(dataset_name, dataset_type), dims, data)
    return data


def generate_data(indices, data_x_dict):
    """Assemble feature and label arrays for the given question ids.

    Parameters
    ----------
    indices : sequence of question ids, in the desired row order.
    data_x_dict : mapping from question id to a per-question feature array
        (one row per answer choice). The feature width is taken from the
        first value in the mapping. Each id is looked up with
        ``data_x_dict[i]``, so with a ``defaultdict`` (as ``read_dataset``
        returns) unknown ids are filled, and stored, by its default factory.

    Returns
    -------
    (x, y) : ``x`` is float32 with shape ``(len(indices), 4, f_dim)`` and
        ``y`` is int32 with shape ``(len(indices),)``. Labels come from the
        notebook-global ``correct_map``; ids without an entry get label 0.

    Raises
    ------
    ValueError : if ``data_x_dict`` is empty, since the width cannot be inferred.
    """
    # dict.values() is a non-indexable view on Python 3, so take the first
    # value through an iterator; this works on Python 2 and 3 alike.
    first_value = next(iter(data_x_dict.values()), None)
    if first_value is None:
        raise ValueError('data_x_dict is empty; cannot infer the feature dimension')
    f_dim = first_value[0].shape[0]

    x = np.zeros((len(indices), 4, f_dim), dtype='float32')
    y = np.zeros((len(indices)), dtype='int32')
    for row, i in enumerate(indices):
        x[row] = data_x_dict[i]
        y[row] = correct_map.get(i, 0)
    return x, y

In [None]:
def shuffle_dataset(dataset_x, dataset_y):
    """Randomly permute the answer choices of every row, keeping labels in sync.

    Parameters
    ----------
    dataset_x : array of shape ``(rows, choices, features)``.
    dataset_y : integer array of shape ``(rows,)`` holding, per row, the index
        of the correct choice in ``dataset_x``.

    Returns
    -------
    (rx, ry) : new arrays with the same shapes and dtypes as the inputs. For
        each row, choice ``k`` of the input is moved to position
        ``rand_order[k]``, and ``ry`` holds the new position of the correct
        choice. The inputs are not modified.

    One independent permutation is drawn per row with ``np.random.shuffle``.
    """
    n_choices = dataset_x.shape[1]
    rx = np.zeros_like(dataset_x)
    ry = np.zeros_like(dataset_y)
    # `range` instead of `xrange`, so the function also runs on Python 3.
    for i_row in range(dataset_y.shape[0]):
        rand_order = np.arange(n_choices)
        np.random.shuffle(rand_order)
        # Scatter: input choice k lands at output position rand_order[k].
        rx[i_row, rand_order] = dataset_x[i_row]
        ry[i_row] = rand_order[dataset_y[i_row]]
    return rx, ry


def flatten_dataset(dataset_x, dataset_y):
    """Collapse the (choice, feature) axes of ``dataset_x`` into one axis.

    ``dataset_x`` of shape ``(rows, choices, features)`` becomes
    ``(rows, choices * features)``; ``dataset_y`` is returned unchanged.
    """
    n_rows, n_choices, n_features = dataset_x.shape
    flat_x = dataset_x.reshape(n_rows, n_choices * n_features)
    return flat_x, dataset_y

In [None]:
# generate_data([100001], lucene)

In [None]:
# shuffle_dataset(*generate_data([100001], lucene))

## NGrams
-----------

In [None]:
# Load the NGrams features of the training and validation files into one mapping;
# ids missing from both files resolve to a zero block of shape (4, DIM_NGRAMS).
ngrams = read_dataset(NGRAMS, dims=DIM_NGRAMS)

In [None]:
# Build and compile the NGrams network. Everything defined in this cell is a
# notebook global: the training / prediction cells below read train_fn, cost_fn
# and forward_fn, and the REPTIL and Lucene cells later rebind the same names.

# Weight of the L2 penalty in the training objective; also read by the later network cells.
L2 = 0.0001

# Symbolic int vector of target class indices, one per row.
t_target = T.ivector()

# Input rows are shaped (batch, 4, DIM_NGRAMS); a 30-unit ELU layer feeds a
# 4-way softmax output layer that is created with b=None.
l_in = LL.InputLayer((None, 4, DIM_NGRAMS))
nn = LL.DenseLayer(l_in, 30, nonlinearity=elu)
nn = LL.DenseLayer(nn, 4, nonlinearity=softmax, b=None)
t_output = LL.get_output(nn)

# Mean cross-entropy and mean accuracy of the network output against the targets.
t_cost = lasagne.objectives.categorical_crossentropy(t_output, t_target).mean()
t_acc = lasagne.objectives.categorical_accuracy(t_output, t_target).mean()

params = LL.get_all_params(nn)

l2 = lasagne.regularization.regularize_network_params(nn, lasagne.regularization.l2)

# Adam minimises the cross-entropy plus the weighted L2 penalty.
updates = lasagne.updates.adam(t_cost + L2*l2, params)

# train_fn applies one update and returns the (unpenalised) cost;
# cost_fn only evaluates cost and accuracy, without updating parameters.
train_fn = theano.function([l_in.input_var, t_target], t_cost, updates=updates)
cost_fn = theano.function([l_in.input_var, t_target], [t_cost, t_acc])

# forward_fn returns the softmax output for a batch of inputs.
forward_fn = theano.function([l_in.input_var], t_output)

In [None]:
# Train the NGrams network on mini-batches of training ids drawn in shuffled
# passes, and periodically report metrics on the held-out ids.
id_generator = cycle_rand(idx_train)

# Held-out data is built once, in its original answer order.
test_x, test_y = generate_data(idx_test, ngrams)

BATCH = 100
costs = []  # training costs collected since the last report
rows_seen = 0
while rows_seen < 500000:
    indices = take(BATCH, id_generator)
    # The answer choices of each batch row are randomly permuted before the update.
    batch_x, batch_y = shuffle_dataset(*generate_data(indices, ngrams))
    costs.append(train_fn(batch_x, batch_y))
    rows_seen += BATCH
    # rows_seen advances in steps of BATCH, so this fires once per 10000 rows.
    if rows_seen % 10000 < BATCH:
        nll, acc = cost_fn(test_x, test_y)
        # Network-free baseline: choose the answer whose raw features sum highest.
        mean_acc = (test_x.sum(axis=2).argmax(axis=1) == test_y).sum() / test_y.shape[0]
        # Columns: rows seen, mean training cost since last report, held-out cost,
        # held-out accuracy, baseline accuracy.
        print(rows_seen, np.mean(costs), nll, acc, mean_acc)
        costs = []

In [None]:
# Softmax outputs of the NGrams network for the train, held-out and validation ids.
# NOTE(review): forward_fn is rebound by the later network cells, so this cell
# must run while forward_fn still refers to the NGrams network.
ngrams_train_pred = forward_fn(generate_data(idx_train, ngrams)[0])
ngrams_test_pred = forward_fn(generate_data(idx_test, ngrams)[0])
ngrams_valid_pred = forward_fn(generate_data(idx_valid, ngrams)[0])

## REPTIL
--------------------

In [None]:
# Load the REPTIL features of the training and validation files. The width is
# passed explicitly from the config constant (as the NGrams cell does) instead of
# relying on read_dataset's default, so the zero placeholder used for missing ids
# always matches the declared feature width.
reptil = read_dataset(REPTIL, dims=DIM_REPTIL)

In [None]:
# Build and compile the REPTIL network. This rebinds the notebook-global names
# (t_target, l_in, nn, train_fn, cost_fn, forward_fn, ...) used by the training
# and prediction cells that follow. L2 is the penalty weight defined in the
# NGrams network cell.

# Symbolic int vector of target class indices, one per row.
t_target = T.ivector()

# The input width comes from the config constant rather than a literal, so it
# stays in step with the feature files if DIM_REPTIL is ever changed.
l_in = LL.InputLayer((None, 4, DIM_REPTIL))
nn = LL.DenseLayer(l_in, 30, nonlinearity=elu)
nn = LL.DenseLayer(nn, 4, nonlinearity=softmax, b=None)
# Two output expressions: the non-deterministic one drives training, the
# deterministic one is used for evaluation and prediction.
t_output = LL.get_output(nn, deterministic=False)
t_output_det = LL.get_output(nn, deterministic=True)

t_cost = lasagne.objectives.categorical_crossentropy(t_output, t_target).mean()
t_acc = lasagne.objectives.categorical_accuracy(t_output, t_target).mean()

t_cost_det = lasagne.objectives.categorical_crossentropy(t_output_det, t_target).mean()
t_acc_det = lasagne.objectives.categorical_accuracy(t_output_det, t_target).mean()

params = LL.get_all_params(nn)

# Adam minimises the training cross-entropy plus the weighted L2 penalty.
l2 = lasagne.regularization.regularize_network_params(nn, lasagne.regularization.l2)
updates = lasagne.updates.adam(t_cost + L2*l2, params)

# train_fn applies one update and returns the (unpenalised) training cost;
# cost_fn evaluates the deterministic cost and accuracy without updating.
train_fn = theano.function([l_in.input_var, t_target], t_cost, updates=updates)
cost_fn = theano.function([l_in.input_var, t_target], [t_cost_det, t_acc_det])

# forward_fn returns the deterministic softmax output for a batch of inputs.
forward_fn = theano.function([l_in.input_var], t_output_det)

In [None]:
# Train the REPTIL network on mini-batches of training ids drawn in shuffled
# passes, and periodically report metrics on the held-out ids.
id_generator = cycle_rand(idx_train)

# Held-out data is built once, in its original answer order.
test_x, test_y = generate_data(idx_test, reptil)

BATCH = 100
costs = []  # training costs collected since the last report
rows_seen = 0
while rows_seen < 200000:
    indices = take(BATCH, id_generator)
    # The answer choices of each batch row are randomly permuted before the update.
    batch_x, batch_y = shuffle_dataset(*generate_data(indices, reptil))
    costs.append(train_fn(batch_x, batch_y))
    rows_seen += BATCH
    # rows_seen advances in steps of BATCH, so this fires once per 1000 rows.
    if rows_seen % 1000 < BATCH:
        nll, acc = cost_fn(test_x, test_y)
        # Network-free baseline: choose the answer with the highest first feature.
        mean_acc = (test_x[:,:,0].argmax(axis=1) == test_y).sum() / test_y.shape[0]
        # Columns: rows seen, mean training cost since last report, held-out cost,
        # held-out accuracy, baseline accuracy.
        print(rows_seen, np.mean(costs), nll, acc, mean_acc)
        costs = []

In [None]:
# Softmax outputs of the REPTIL network for the train, held-out and validation ids.
# NOTE(review): forward_fn is rebound by the later network cell, so this cell
# must run while forward_fn still refers to the REPTIL network.
reptil_train_pred = forward_fn(generate_data(idx_train, reptil)[0])
reptil_test_pred = forward_fn(generate_data(idx_test, reptil)[0])
reptil_valid_pred = forward_fn(generate_data(idx_valid, reptil)[0])

## Lucene + Roman
-------------

In [None]:
# Load the Lucene + Roman features of the training and validation files.
# The width must be passed explicitly: with read_dataset's default of dims=1 the
# zero placeholder created for a missing id has shape (4, 1), which does not match
# the declared DIM_LUCENE_ROMAN width and, once stored in the mapping, can be
# picked up by generate_data as the row it infers the feature width from.
lucene_roman = read_dataset(LUCENE_ROMAN, dims=DIM_LUCENE_ROMAN)

In [None]:
# Build and compile the Lucene + Roman network. This rebinds the notebook-global
# names (t_target, l_in, nn, train_fn, cost_fn, forward_fn, ...) used by the
# training and prediction cells that follow. L2 is the penalty weight defined in
# the NGrams network cell.

# Symbolic int vector of target class indices, one per row.
t_target = T.ivector()

# The input width comes from the config constant rather than a literal 50, so it
# stays in step with the feature files if DIM_LUCENE_ROMAN is ever changed.
l_in = LL.InputLayer((None, 4, DIM_LUCENE_ROMAN))
nn = LL.DenseLayer(l_in, 300, nonlinearity=elu)
nn = LL.DenseLayer(nn, 4, nonlinearity=softmax, b=None)
# Two output expressions: the non-deterministic one drives training, the
# deterministic one is used for evaluation and prediction.
t_output = LL.get_output(nn, deterministic=False)
t_output_det = LL.get_output(nn, deterministic=True)

t_cost = lasagne.objectives.categorical_crossentropy(t_output, t_target).mean()
t_acc = lasagne.objectives.categorical_accuracy(t_output, t_target).mean()

t_cost_det = lasagne.objectives.categorical_crossentropy(t_output_det, t_target).mean()
t_acc_det = lasagne.objectives.categorical_accuracy(t_output_det, t_target).mean()

params = LL.get_all_params(nn)

# Adam minimises the training cross-entropy plus the weighted L2 penalty.
l2 = lasagne.regularization.regularize_network_params(nn, lasagne.regularization.l2)
updates = lasagne.updates.adam(t_cost + L2*l2, params)

# train_fn applies one update and returns the (unpenalised) training cost;
# cost_fn evaluates the deterministic cost and accuracy without updating.
train_fn = theano.function([l_in.input_var, t_target], t_cost, updates=updates)
cost_fn = theano.function([l_in.input_var, t_target], [t_cost_det, t_acc_det])

# forward_fn returns the deterministic softmax output for a batch of inputs.
forward_fn = theano.function([l_in.input_var], t_output_det)

In [None]:
# Train the Lucene + Roman network on mini-batches of training ids drawn in
# shuffled passes, and periodically report metrics on the held-out ids.
id_generator = cycle_rand(idx_train)

# Held-out data is built once, in its original answer order.
test_x, test_y = generate_data(idx_test, lucene_roman)

BATCH = 100
# Start from an empty list in this cell, as the other training cells do, instead
# of depending on whatever `costs` a previously executed cell left behind.
costs = []
rows_seen = 0
while rows_seen < 300000:
    indices = take(BATCH, id_generator)
    # The answer choices of each batch row are randomly permuted before the update.
    batch_x, batch_y = shuffle_dataset(*generate_data(indices, lucene_roman))
    costs.append(train_fn(batch_x, batch_y))
    rows_seen += BATCH
    # rows_seen advances in steps of BATCH, so this fires once per 10000 rows.
    if rows_seen % 10000 < BATCH:
        nll, acc = cost_fn(test_x, test_y)
        # Network-free baseline: choose the answer whose raw features sum highest.
        # The reshape uses the config constant for the feature width.
        mean_acc = (test_x.reshape((test_x.shape[0], 4, DIM_LUCENE_ROMAN)).sum(axis=2).argmax(axis=1) == test_y).sum() / test_y.shape[0]
        # Columns: rows seen, mean training cost since last report, held-out cost,
        # held-out accuracy, baseline accuracy.
        print(rows_seen, np.mean(costs), nll, acc, mean_acc)
        costs = []

In [None]:
# Softmax outputs of the Lucene + Roman network for the train, held-out and validation ids.
# NOTE(review): forward_fn is a shared notebook global, so this cell must run
# while forward_fn still refers to the Lucene + Roman network.
lucene_train_pred = forward_fn(generate_data(idx_train, lucene_roman)[0])
lucene_test_pred = forward_fn(generate_data(idx_test, lucene_roman)[0])
lucene_valid_pred = forward_fn(generate_data(idx_valid, lucene_roman)[0])

## Averaging
-------------

In [None]:
# Grid-search the weights of a linear blend of the three models' predictions and
# report the held-out accuracy of each blend. The NGrams predictions keep a fixed
# weight of 1; the Lucene and REPTIL weights each sweep five values from 1 to 2.
_, train_y = generate_data(idx_train, ngrams)
# Build the held-out labels here instead of depending on the `test_y` left behind
# by whichever training cell happened to run last. Labels are looked up by id, so
# any feature set yields the same vector.
test_y = generate_data(idx_test, ngrams)[1]

for k_lucene in np.linspace(1, 2, 5):
    for k_reptil in np.linspace(1, 2, 5):

        average_train_pred = lucene_train_pred*k_lucene + reptil_train_pred*k_reptil + ngrams_train_pred
        average_test_pred = lucene_test_pred*k_lucene + reptil_test_pred*k_reptil + ngrams_test_pred

        c = (k_lucene, k_reptil)
        # Divide by the row count (shape[0]); dividing by the shape tuple produced
        # a one-element array instead of a scalar accuracy.
#         print('Training average', c, (average_train_pred.argmax(axis=1) == train_y).sum() / train_y.shape[0])
        print('Testing average', c, (average_test_pred.argmax(axis=1) == test_y).sum() / test_y.shape[0])


## Stacking
----------------

In [None]:
# Concatenate the three models' training predictions column-wise, one row per
# training id. The result is only displayed as the cell output, not stored.
np.hstack([lucene_train_pred, reptil_train_pred, ngrams_train_pred])