In [None]:
from __future__ import print_function, division, unicode_literals
import six
import os, sys
from os.path import join
import json
from codecs import open
from collections import defaultdict
from operator import itemgetter
import nltk
import numpy as np
from nltk.corpus import stopwords
import re
import codecs
import random
from time import time

from nltk.stem import SnowballStemmer
stemmer = SnowballStemmer("english")

from nltk.corpus import stopwords

In [None]:
import lucene
from org.apache.lucene.analysis.standard import StandardAnalyzer
from org.apache.lucene.document import Document, Field
from org.apache.lucene.index import IndexWriter, IndexWriterConfig, IndexReader
from org.apache.lucene.search import IndexSearcher
from org.apache.lucene.search import Sort, SortField
from org.apache.lucene.queryparser.classic import QueryParser
from org.apache.lucene.store import SimpleFSDirectory
from org.apache.lucene.util import Version
from java.io import File
lucene.initVM()

In [None]:
DATA_DIR = join(os.environ['HOME'], 'data/allen-ai-challenge')
WIKI_DIR = join(DATA_DIR, 'wiki_dump')
CK12_DIR = join(DATA_DIR, 'ck12_dump')
TRAINING_SET = join(DATA_DIR, 'training_set.tsv')
VALIDATION_SET = join(DATA_DIR, 'validation_set.tsv')
TRAINING_SET_MERGED = join(DATA_DIR, 'training_set_merged.tsv')
# INDEX_DIR = join(DATA_DIR, 'index-wiki-ck12')
# INDEX_DIR = join(DATA_DIR, 'index-ck12-stem')
# INDEX_DIR = join(DATA_DIR, 'index-all-l_stem_summ')
INDEX_DIR = join(DATA_DIR, 'index-ck12-stem')
SUBMISSION = join(DATA_DIR, 'submissions/lucene_wiki_ck12_17jan.tsv')
VOCABULARY = join(DATA_DIR, 'vocabulary', 'w2v_a2_5.tsv')
SENT_DELIM = ' | '

In [None]:
from nltk.corpus import stopwords
stopwords = set(stopwords.words('english') + '. , ! ? !? ?! ... ; : - â€”'.split())
def cleanup_text(text):
    sents = []
    for s in nltk.sent_tokenize(text):        
#         words = [stemmer.stem(w) for w in nltk.word_tokenize(s.lower()) if w not in stopwords]
        words = [w for w in nltk.word_tokenize(s.lower()) if w not in stopwords]
        if words:
            sents.append(words)
    return SENT_DELIM.join([' '.join(s) for s in sents])
#     return [stemmer.stem(w.text) for w in nlp(text) if not w.is_stop and not w.is_punct and not w.is_space]

In [None]:
%%time
vocab = {}
with open(VOCABULARY, encoding='utf8') as f:
    for word, vec in (line.strip().split('\t', 1) for line in f):
        vocab[word] = np.fromstring(vec, sep='\t')
vocab_dim = 300

In [None]:
vocab['earth'].shape

In [None]:
cleanup_text('Here is some text. What the heck more place?')

Index Creation
----------

In [None]:
analyzer = StandardAnalyzer(Version.LUCENE_4_10_1)
writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, StandardAnalyzer())
writer = IndexWriter(SimpleFSDirectory(File(INDEX_DIR)), writerConfig)

In [None]:
def add_document(doc_text):
    doc = Document()
    doc.add(Field("text", cleanup_text(doc_text), Field.Store.YES, Field.Index.ANALYZED))
    writer.addDocument(doc)

In [None]:
%%time

# CK12
for i, fn_short in enumerate(os.listdir(CK12_DIR)):
    fn = join(CK12_DIR, fn_short)
    with open(fn, encoding='utf-8', errors='ignore') as f:
        ck12_article = json.load(f)
#         content = []
#         for subtitle, paragraph in ck12_article['contents'].items():
#             content.append(subtitle + '. ' + paragraph)
#         add_document(' '.join(content))    

        for subtitle, paragraph in ck12_article['contents'].items():
            add_document(subtitle + '. ' + paragraph)

In [None]:
%%time

# Wiki
for i, fn_short in enumerate(os.listdir(WIKI_DIR)):
    fn = join(WIKI_DIR, fn_short)
    with open(fn, encoding='utf-8', errors='ignore') as f:
        wiki_article = json.load(f)
        _, summary, content = wiki_article
        add_document(summary)
        for p in re.sub('\[ \d* \]', ' ', content).replace('\n\n\n', '.\n ').split('\n\n'):
            add_document(p)

In [None]:
doc_count = writer.numDocs()
writer.close()

Build NN
----------

In [None]:
# %%time
# VERSION 0
# import theano
# import theano.tensor as T
# import lasagne
# import lasagne.layers as LL
# from lasagne.nonlinearities import elu, rectify

# M = 1.0

# input_context = LL.InputLayer((None, 300))
# input_hyp = LL.InputLayer((None, 300))

# l_diff = LL.ElemwiseMergeLayer([input_context, input_hyp], merge_function=T.sub)
# l_mult = LL.ElemwiseMergeLayer([input_context, input_hyp], merge_function=T.mul)

# nn = LL.concat([l_diff, l_mult])
# nn = LL.DenseLayer(nn, 100, nonlinearity=elu)
# nn = LL.DenseLayer(nn, 1, nonlinearity=rectify)
# t_output = LL.get_output(nn)[:, 0]

# t_must_be_less = t_output[0::2]
# t_must_be_more = t_output[1::2]

# # t_cost = (T.maximum(0, M + t_must_be_less - t_must_be_more)).mean()
# t_cost = (T.maximum(0, M + T.sqr(t_must_be_less) - T.sqr(t_must_be_more))).mean()
# # t_cost = (T.sqr(t_must_be_less) + T.sqr(T.maximum(0, M - t_must_be_more))).mean()  # square-square
# # t_cost = (T.sqr(t_must_be_less) + T.exp(-t_must_be_more)).mean()  # square-exponential

# params = LL.get_all_params(nn)

# updates = lasagne.updates.adam(t_cost, params)

# train_fn = theano.function([input_hyp.input_var, input_context.input_var], t_cost, updates=updates)
# cost_fn = theano.function([input_hyp.input_var, input_context.input_var], t_cost)
# energy_fn = theano.function([input_hyp.input_var, input_context.input_var], t_output)

In [None]:
# %%time
# # VERSION 1
# import theano
# import theano.tensor as T
# import lasagne
# import lasagne.layers as LL
# from lasagne.nonlinearities import elu, rectify

# M = 1.0

# LEN_CONTEXT = 50
# LEN_HYP = 20

# input_context = LL.InputLayer((None, LEN_CONTEXT, 300))
# input_context_mask = LL.InputLayer((None, LEN_CONTEXT))
# l_c = LL.LSTMLayer(input_context, 300, only_return_final=True, mask_input=input_context_mask)

# input_hyp = LL.InputLayer((None, LEN_HYP, 300))
# input_hyp_mask = LL.InputLayer((None, LEN_HYP))
# l_h = LL.LSTMLayer(input_hyp, 300, only_return_final=True, mask_input=input_hyp_mask)

# nn = LL.concat([l_c, l_h])
# nn = LL.DenseLayer(nn, 100, nonlinearity=elu)
# nn = LL.DenseLayer(nn, 1, nonlinearity=rectify)
# t_output = LL.get_output(nn)[:, 0]

# t_must_be_less = t_output[0::2]
# t_must_be_more = t_output[1::2]

# # t_cost = (T.maximum(0, M + t_must_be_less - t_must_be_more)).mean()
# t_cost = (T.maximum(0, M + T.sqr(t_must_be_less) - T.sqr(t_must_be_more))).mean()
# # t_cost = (T.sqr(t_must_be_less) + T.sqr(T.maximum(0, M - t_must_be_more))).mean()  # square-square
# # t_cost = (T.sqr(t_must_be_less) + T.exp(-t_must_be_more)).mean()  # square-exponential

# params = LL.get_all_params(nn)

# updates = lasagne.updates.adam(t_cost, params)

# train_fn = theano.function([input_hyp.input_var, input_hyp_mask.input_var,
#                             input_context.input_var, input_context_mask.input_var], t_cost, updates=updates)
# cost_fn = theano.function([input_hyp.input_var, input_hyp_mask.input_var,
#                             input_context.input_var, input_context_mask.input_var], t_cost)
# energy_fn = theano.function([input_hyp.input_var, input_hyp_mask.input_var,
#                             input_context.input_var, input_context_mask.input_var], t_output)

In [None]:
%%time
# VERSION 2
import theano
import theano.tensor as T
import lasagne
import lasagne.layers as LL
from lasagne.nonlinearities import elu, rectify

M = 1.0

LEN_CONTEXT = 50
LEN_HYP = 20

input_context = LL.InputLayer((None, LEN_CONTEXT, 300))
input_context_mask = LL.InputLayer((None, LEN_CONTEXT))
l_c = LL.LSTMLayer(input_context, 300, only_return_final=True, mask_input=input_context_mask)

input_hyp = LL.InputLayer((None, LEN_HYP, 300))
input_hyp_mask = LL.InputLayer((None, LEN_HYP))
l_h = LL.LSTMLayer(input_hyp, 300, only_return_final=True, mask_input=input_hyp_mask)

nn = LL.concat([l_c, l_h])
nn = LL.DenseLayer(nn, 100, nonlinearity=elu)
nn = LL.DenseLayer(nn, 1, nonlinearity=rectify)
t_output = LL.get_output(nn)[:, 0]

t_must_be_less = t_output[0::2]
t_must_be_more = t_output[1::2]

# t_cost = (T.maximum(0, M + t_must_be_less - t_must_be_more)).mean()
t_cost = (T.maximum(0, M + T.sqr(t_must_be_less) - T.sqr(t_must_be_more))).mean()
# t_cost = (T.sqr(t_must_be_less) + T.sqr(T.maximum(0, M - t_must_be_more))).mean()  # square-square
# t_cost = (T.sqr(t_must_be_less) + T.exp(-t_must_be_more)).mean()  # square-exponential

params = LL.get_all_params(nn)

updates = lasagne.updates.adam(t_cost, params)

train_fn = theano.function([input_hyp.input_var, input_hyp_mask.input_var,
                            input_context.input_var, input_context_mask.input_var], t_cost, updates=updates)
cost_fn = theano.function([input_hyp.input_var, input_hyp_mask.input_var,
                            input_context.input_var, input_context_mask.input_var], t_cost)
energy_fn = theano.function([input_hyp.input_var, input_hyp_mask.input_var,
                            input_context.input_var, input_context_mask.input_var], t_output)

Read index
-----------

In [None]:
analyzer = StandardAnalyzer(Version.LUCENE_4_10_1)
reader = IndexReader.open(SimpleFSDirectory(File(INDEX_DIR)))
searcher = IndexSearcher(reader)

doc_count = reader.maxDoc()

In [None]:
def take_random(seq, drop=True):
    '''Changes seq!'''
    idx = random.randint(0, len(seq)-1)
    d = seq[idx]
    if drop:
        del seq[idx]
    return d, seq    

def corrupt_context(index, window_nearest, window_farest, target='context', modify_context=True):    
    artice = reader.document(index)['text']
    sentsA = artice.split(SENT_DELIM)
    if len(sentsA) < 2:
        return None, None
    hypA, restA = take_random(sentsA, drop=modify_context)
    others = [i for i in range(index - window_farest, index + window_farest + 1) 
              if (np.abs(index-i) > window_nearest) and 
                 (i >= 0) and (i < doc_count)]
    idxB = random.choice(others)
    sentsB = reader.document(idxB)['text'].split(SENT_DELIM)
    if target == 'context':
        return [(hypA, ' '.join(restA)), (hypA, ' '.join(sentsB))]
    elif target == 'hyp':
        hypB, restB = take_random(sentsB, drop=False)
        return [(hypA, ' '.join(restA)), (hypB, ' '.join(restA))]
    else:
        raise RuntimeError('Unknown corruption target = %s' % context)


def mean_w2v(text):
    vec = np.zeros((vocab_dim,), dtype='float64')
    c = 1
    for w in nltk.word_tokenize(text):
        if w in vocab:
            vec += vocab[w]
            c += 1
    return (vec/c).astype('float32'), None


def seq_w2v(text, max_len):
    row = 0
    words = list(nltk.word_tokenize(text))
    n = max(max_len, len(words))
    
    vec = np.zeros((n, vocab_dim), dtype='float32')
    mask = np.zeros((n,), dtype='float32')

    for w in words:
        if w in vocab:
            vec[row] = vocab[w]
            mask[row] = 1
            row += 1
    return vec[-max_len:], mask[-max_len:]

In [None]:
# %%time
# questions = []
# with open(TRAINING_SET, encoding='utf8') as f:
#     f.readline()
#     for line in f:
#         qid, q, correct, aa, ab, ac, ad = line.strip().split('\t')

#         vecs_context = np.zeros((4, LEN_CONTEXT, vocab_dim), dtype='float32')
#         vecs_context_mask = np.zeros((4, LEN_CONTEXT), dtype='float32')
#         vecs_hyp = np.zeros((4, LEN_HYP, vocab_dim), dtype='float32')
#         vecs_hyp_mask = np.zeros((4, LEN_HYP), dtype='float32')
#         lucene_score = np.zeros((4,), dtype='float32')
#         for i, a in enumerate([aa, ab, ac, ad]):
#             query_text = cleanup_text(q + '. ' + a)
#             try:
#                 query = QueryParser(Version.LUCENE_4_10_1, "text", analyzer).parse(re.sub("[^a-zA-Z0-9]"," ", query_text))
#                 hits = searcher.search(query, 20).scoreDocs
#                 doc = reader.document(hits[0].doc)['text']
#             except Excetion as ex:
#                 print(ex)
#                 doc = ''
#             vecs_hyp[i], vecs_hyp_mask[i] = seq_w2v(query_text, LEN_HYP)
#             vecs_context[i], vecs_context_mask[i] = seq_w2v(doc, LEN_CONTEXT)
#             lucene_score[i] = hits[0].score
#         questions.append((qid, 'ABCD'.index(correct), vecs_hyp, vecs_hyp_mask, vecs_context, vecs_context_mask, lucene_score))

In [44]:
%%time
questions2 = []
docs_to_consider = 20
with open(TRAINING_SET, encoding='utf8') as f:
    f.readline()
    for line in f:
        qid, q, correct, aa, ab, ac, ad = line.strip().split('\t')

        vecs_context = np.zeros((4*docs_to_consider, LEN_CONTEXT, vocab_dim), dtype='float32')
        vecs_context_mask = np.zeros((4*docs_to_consider, LEN_CONTEXT), dtype='float32')
        vecs_hyp = np.zeros((4*docs_to_consider, LEN_HYP, vocab_dim), dtype='float32')
        vecs_hyp_mask = np.zeros((4*docs_to_consider, LEN_HYP), dtype='float32')
        lucene_score = np.zeros((4*docs_to_consider,), dtype='float32')
        
        for i, a in enumerate([aa, ab, ac, ad]):
            query_text = cleanup_text(q + '. ' + a)
            try:
                query = QueryParser(Version.LUCENE_4_10_1, "text", analyzer).parse(re.sub("[^a-zA-Z0-9]"," ", query_text))
                hits = searcher.search(query, docs_to_consider).scoreDocs
                docs = [reader.document(hits[d].doc)['text'] for d in range(len(hits))]
            except Excetion as ex:
                print(ex)
                docs = [''] * 20
                
            for di, doc in enumerate(docs):
                idx = i*docs_to_consider + di
                vecs_hyp[idx], vecs_hyp_mask[idx] = seq_w2v(query_text, LEN_HYP)
                vecs_context[idx], vecs_context_mask[i] = seq_w2v(doc, LEN_CONTEXT)
                lucene_score[idx] = hits[di].score
                
        questions2.append((qid, 'ABCD'.index(correct), vecs_hyp, vecs_hyp_mask, vecs_context, vecs_context_mask, lucene_score))

CPU times: user 3min 28s, sys: 7.61 s, total: 3min 35s
Wall time: 3min 19s


In [65]:
def check2(questions2):
    scores = []
    for qid, idx_correct, vecs_hyp, vecs_hyp_mask, vecs_context, vecs_context_mask, lucene_score in questions2:
        energies = energy_fn(vecs_hyp, vecs_hyp_mask, vecs_context, vecs_context_mask)
        score = lucene_score / energies
        score = (lucene_score / energies).reshape((20, 4))
        scores.append(score.sum(axis=0).argmax() == idx_correct)
    return np.mean(scores)

In [46]:
print('Am I alive')
print('Am I alive?', file=tele)

Am I alive


In [47]:
print(check2(questions2))

0.2336


In [74]:
lucene_score.reshape((4, 20)).sum(axis=1)

array([ 10.38587189,  10.41579437,  10.99467087,  11.09564781], dtype=float32)

In [105]:
-np.argsort([0, 10, 2])

array([ 0, -2, -1])

In [112]:
lucene_scores = []
nn_scores = []

luc1nn0 = 0
luc1nn1 = 0
luc0nn0 = 0
luc0nn1 = 0
ranks = 0

for qid, idx_correct, vecs_hyp, vecs_hyp_mask, vecs_context, vecs_context_mask, lucene_score in questions2:
    energies = energy_fn(vecs_hyp, vecs_hyp_mask, vecs_context, vecs_context_mask)
    
    is_nn_correct = (lucene_score / energies).reshape((4, 20)).mean(axis=1).argmin() == idx_correct
    nn_scores.append(is_nn_correct)
    
    is_lucene_correct = lucene_score.reshape((4, 20)).mean(axis=1).argmax() == idx_correct
    lucene_scores.append(is_lucene_correct)
    
    luc1nn0 += is_lucene_correct and (not is_nn_correct)
    luc1nn1 += is_lucene_correct and is_nn_correct
    luc0nn0 += (not is_lucene_correct) and (not is_nn_correct)
    luc0nn1 += (not is_lucene_correct) and is_nn_correct
    
    nn_ranks = np.argsort((lucene_score / energies).reshape((4, 20)).mean(axis=1))
    lucene_ranks = np.argsort(lucene_score.reshape((4, 20)).mean(axis=1))
    ranks += (nn_ranks - lucene_ranks).argmin() == idx_correct
    
    if (not is_lucene_correct) and is_nn_correct:
        print(qid, idx_correct, energies.reshape((4, 20)).mean(axis=1), lucene_score.reshape((4, 20)).mean(axis=1))
#     if int(qid) > 100010:
#         break
    
    
    
print(np.mean(lucene_scores), np.mean(nn_scores))
print('luc1nn0', luc1nn0)
print('luc1nn1', luc1nn1)
print('luc0nn0', luc0nn0)
print('luc0nn1', luc0nn1)
print('ranks', ranks)

# energy_fn(vecs_hyp, vecs_hyp_mask, vecs_context, vecs_context_mask)

100006 1 [ 2.40606761  2.32350206  2.25700712  2.34429169] [ 0.42701712  0.3943359   0.38990322  0.40243354]
100010 2 [ 2.73563075  2.71240354  2.81222796  2.70209932] [ 0.90636921  0.90851295  0.800093    0.78100979]
100019 1 [ 2.3258152   2.43140554  2.4482739   2.3743968 ] [ 0.4250927   0.39706564  0.70291793  0.72880572]
100020 1 [ 2.40448141  2.30290222  2.37311935  2.37647939] [ 0.22957969  0.17810591  0.19246739  0.20024486]
100021 2 [ 2.33364367  2.25555277  2.35502291  2.2894578 ] [ 0.2595945   0.25835955  0.25381538  0.26988846]
100028 3 [ 2.28671598  2.36374593  2.30268097  2.32143903] [ 1.20832705  1.227036    1.13999009  1.14275074]
100032 1 [ 2.39864993  2.52707529  2.53969836  2.63534546] [ 0.72308552  0.65294617  0.8203122   0.73846531]
100038 1 [ 2.42538142  2.59904742  2.56720018  2.38319826] [ 0.37457392  0.37439176  0.42190105  0.3721208 ]
100044 0 [ 2.40987849  2.4999516   2.41515684  2.52116728] [ 0.67782438  1.05943894  1.15683889  1.32138669]
100056 3 [ 2.521041

In [None]:
# def check(questions):
#     scores = []
#     for qid, idx_correct, vecs_hyp, vecs_hyp_mask, vecs_context, vecs_context_mask, lucene_score in questions:
#         energies = energy_fn(vecs_hyp, vecs_hyp_mask, vecs_context, vecs_context_mask)
#         scores.append(np.argmax(lucene_score / energies) == idx_correct)
#     return np.mean(scores)

In [None]:
def randomized_cyclic_generator(seq):
    rseq = list(seq)
    while True:
        random.shuffle(rseq)
        for s in rseq:
            yield s

In [None]:
%time check(questions)

In [None]:
import telepot
class TelegramStream:
    def __init__(self, token, reciever_id):
        self.bot = telepot.Bot(token)
        self.id = reciever_id
        self.buffer = []
    def write(self, txt):
        self.buffer.append(txt)
        if txt.endswith('\n'):
            msg = ''.join(self.buffer)
            if msg:
                self.bot.sendMessage(self.id, msg)
            self.buffer = []
    
tele = TelegramStream("123209868:AAHGkFt5NPUNnkjW8VbG4wUqpMHmhwPGarM", "87799679")
print('Telegram bot started', file=tele)

In [None]:
# %%time

BATCH = 20
EPOCHS = 100
            
id_generator = iter(randomized_cyclic_generator(np.arange(doc_count, dtype=int)))

corruption_targets = ['context'] + ['hyp']
drop_hyp_from_context = False
win_min = 5
win_width = 5

print_each_sec = 600

time_started = time()

acc = 0
acc = check(questions)

costs = []
rows_seen = 0
while True:
    batch_hyp = np.zeros((BATCH*2, LEN_HYP, vocab_dim), dtype='float32')
    batch_hyp_m = np.zeros((BATCH*2, LEN_HYP), dtype='float32')
    batch_context = np.zeros((BATCH*2, LEN_CONTEXT, vocab_dim), dtype='float32')
    batch_context_m = np.zeros((BATCH*2, LEN_CONTEXT), dtype='float32')

    b = 0
    while b < BATCH:
        idx_row_in_db = int(next(id_generator))
        right, corrupted = corrupt_context(idx_row_in_db, win_min, win_min + win_width, 
                                           random.choice(corruption_targets), modify_context=drop_hyp_from_context)
        if right is None:
            continue

        batch_hyp[b*2], batch_hyp_m[b*2] = seq_w2v(right[0], LEN_HYP)
        batch_context[b*2], batch_context_m[b*2] = seq_w2v(right[1], LEN_CONTEXT)

        batch_hyp[b*2+1], batch_hyp_m[b*2+1] = seq_w2v(corrupted[0], LEN_HYP)
        batch_context[b*2+1], batch_context_m[b*2+1] = seq_w2v(corrupted[1], LEN_CONTEXT)
        b += 1
            
    batch_cost = train_fn(batch_hyp, batch_hyp_m, batch_context, batch_context_m)
#     print(batch_cost)
    costs.append(batch_cost)
    rows_seen += BATCH
    if time() - time_started > print_each_sec:
        print('%d: %.3f (%.2f%%) in %.0fs' % (rows_seen, np.mean(costs), acc * 100, time() - time_started), file=tele)
#         sys.stdout.flush()
        time_started = time()
        acc = check(questions)
        costs = []