In [1]:
from __future__ import print_function, division, unicode_literals
import six
import os
from os.path import join
import json
from codecs import open
from collections import defaultdict
from operator import itemgetter
import nltk
import numpy as np
from nltk.corpus import stopwords
import re

In [2]:
# Display the dynamic-library search path of the kernel process; the recorded
# output lists the CUDA 7.5 and AdaGram.jl library directories.
os.environ["LD_LIBRARY_PATH"]

'/usr/local/cuda-7.5/lib64:/root/reps/AdaGram.jl/lib'

In [17]:
# Every input and output of this notebook lives under $HOME/data/allen-ai-challenge.
DATA_DIR = join(os.environ['HOME'], 'data/allen-ai-challenge')
# Text corpus; the indexing cell adds each of its lines to Lucene as one document.
CORPUS = join(DATA_DIR, 'corpus_paragraph_roman_2_short150-100.txt')

# Tab-separated question files read by iter_data().
TRAINING_SET = join(DATA_DIR, 'training_set_cleaned.tsv')
VALIDATION_SET = join(DATA_DIR, 'validation_set_cleaned.tsv')

# Directory of the Lucene index: written by the index-creation cells, read by the search cells.
INDEX_DIR = join(DATA_DIR, 'index_merged_corpus_long_topkek_x_15')
# File that receives the predictions written in the "Submit" section.
SUBMISSION = join(DATA_DIR, 'submissions/lucene_more_data_plus_topkek_cleaned.tsv')

w2v
----

In [8]:
import julia

class AdaGramModel(object):
    """Thin wrapper around an AdaGram.jl model driven through an embedded Julia session.

    The model is loaded into the Julia globals ``vm`` and ``dict``. Every method
    builds a Julia expression as text and evaluates it in that session, so all
    Python strings are escaped with ``_jl_str`` before being embedded.

    Attributes:
        j: the ``julia.Julia`` session used for every call.
        d, m, n: sizes of the first, second and third dimension of ``vm.In``.
        dictionary: ``dict`` mapping each word of the dictionary file to the
            integer in its second column (presumably the word's corpus
            frequency -- confirm against the AdaGram dictionary format).
    """

    @staticmethod
    def _jl_str(text):
        """Escape ``text`` for use inside a double-quoted Julia string literal.

        Backslash, double quote and ``$`` (Julia's interpolation marker) are
        backslash-escaped, so an arbitrary word cannot terminate the literal or
        trigger interpolation inside the evaluated expression.
        """
        # Backslashes must be handled first, otherwise the escapes added for
        # the other two characters would be escaped again.
        return text.replace('\\', '\\\\').replace('"', '\\"').replace('$', '\\$')

    def __init__(self, path_to_model, path_to_dict):
        """Start Julia, load the model and read the word dictionary.

        Args:
            path_to_model: path passed to AdaGram's ``load_model``.
            path_to_dict: text file with one ``word value`` pair per line; its
                first line is skipped.
        """
        self.j = julia.Julia()
        self.j.eval("using AdaGram")
        self.j.eval('vm, dict = load_model("%s")' % self._jl_str(path_to_model))
        self.d = self.j.eval('size(vm.In, 1)')  # size of word vectors
        self.m = self.j.eval('size(vm.In, 2)')  # number of prototypes per word (upper bound checked in vec())
        self.n = self.j.eval('size(vm.In, 3)')  # number of vectors
        # TODO: AdaGram.Dictionary -> Python dict()?
        self.dictionary = dict()
        with open(path_to_dict) as f:
            next(f)  # skip strange empty token
            for l in f:
                r = l.strip().split()
                if len(r) < 2:
                    # Blank or truncated line: nothing to record.
                    continue
                self.dictionary[r[0]] = int(r[1])

    def expected_pi(self, word):
        """Return the result of AdaGram's ``expected_pi`` for ``word``."""
        return self.j.eval('expected_pi(vm, dict.word2id["%s"])' % self._jl_str(word))

    def disambiguate(self, word, context):
        """Return the result of AdaGram's ``disambiguate`` for ``word``.

        ``context`` is a single string; it is split on whitespace on the Julia
        side to obtain the context words.
        """
        return self.j.eval('disambiguate(vm, dict, "%s", split("%s"))'
                           % (self._jl_str(word), self._jl_str(context)))

    def vec(self, word, pi):
        """Return the vector of prototype ``pi`` of ``word``.

        ``pi`` is 0-based on the Python side; Julia indexing is 1-based, hence
        the ``pi + 1`` sent to Julia.

        Raises:
            AssertionError: if ``pi`` is not below the number of prototypes.
        """
        assert pi + 1 <= self.m, "n of prototypes mismatch"
        return self.j.eval('vec(vm, dict, "%s", %d)' % (self._jl_str(word), pi + 1))

    def nearest_neighbors(self, word, pi):
        """Return AdaGram's ``nearest_neighbors`` for prototype ``pi`` (0-based) of ``word``."""
        return self.j.eval('nearest_neighbors(vm, dict, "%s", %d)' % (self._jl_str(word), pi + 1))

In [9]:
# Files handed to AdaGramModel: the model to load and its word dictionary.
ADAGRAM_MODEL = join(DATA_DIR, "adam.model")
ADAGRAM_DICT = join(DATA_DIR, "adam.dict")

In [10]:
# Module-level model used by filter_dict() and sent_to_vec(); constructing it
# starts a Julia session, loads the model and reads the dictionary file.
am = AdaGramModel(ADAGRAM_MODEL, ADAGRAM_DICT)

In [11]:
def filter_dict(t, min_count=20):
    """Return the whitespace-separated tokens of ``t`` that the AdaGram dictionary knows.

    Args:
        t: text to tokenize with ``str.split()``.
        min_count: smallest value a token must have in ``am.dictionary`` to be
            kept. Defaults to 20, the threshold that used to be hard-coded.

    Returns:
        list of the kept tokens, in their original order (duplicates included).
    """
    counts = am.dictionary
    return [w for w in t.split() if w in counts and counts[w] >= min_count]

In [12]:
# Length of the zero vector allocated by sent_to_vec(); presumably has to equal
# the AdaGram vector size (am.d) -- TODO confirm.
N = 300

In [13]:
def sent_to_vec(sent, context):
    """Average the sense-disambiguated AdaGram vectors of the words in ``sent``.

    Both ``sent`` and ``context`` are first reduced with ``filter_dict``. For
    every remaining word of ``sent`` the prototype with the highest
    ``am.disambiguate`` score given the filtered context is selected, and the
    vectors of the selected prototypes are averaged.

    Args:
        sent: text whose words are embedded.
        context: text used to disambiguate each word of ``sent``.

    Returns:
        numpy array of length ``N``; all zeros when no word of ``sent``
        survives the filtering.
    """
    v = np.zeros((N,), dtype='float32')
    fs = filter_dict(sent)
    if not fs:
        return v
    # The context is identical for every word, so build the string only once.
    context_str = " ".join(filter_dict(context))
    for w in fs:
        pi = am.disambiguate(w, context_str).argmax()
        v += am.vec(w, pi)
    return v / len(fs)

Search
-----

In [6]:
import lucene
lucene.initVM()

from org.apache.lucene.analysis.standard import StandardAnalyzer
from org.apache.lucene.analysis.shingle import ShingleAnalyzerWrapper
from org.apache.lucene.document import Document, Field
from org.apache.lucene.index import IndexWriter, IndexWriterConfig, IndexReader
from org.apache.lucene.search import IndexSearcher
from org.apache.lucene.search import Sort, SortField
from org.apache.lucene.queryparser.classic import QueryParser
from org.apache.lucene.store import SimpleFSDirectory
from org.apache.lucene.util import Version

from java.io import File

Lucene Index Creation
-----------

In [8]:
# Analyzer handed to the IndexWriterConfig that builds the index.
analyzer = StandardAnalyzer(Version.LUCENE_4_10_1)

In [9]:
# Shingle (word n-gram) wrapper around the analyzer; 2 and 3 are presumably the
# minimum and maximum shingle size -- confirm against the Lucene API.
# NOTE(review): not referenced by any other cell visible in this notebook.
shingle_analyzer = ShingleAnalyzerWrapper(analyzer, 2, 3)

In [10]:
# Index writer over INDEX_DIR, configured with the plain (non-shingle) analyzer.
# add_document() appends to this module-level writer.
writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, analyzer)
writer = IndexWriter(SimpleFSDirectory(File(INDEX_DIR)), writerConfig)

def add_document(doc_text):
    """Add ``doc_text`` to the index as one Lucene document.

    The text is placed in a single stored, analyzed field named "text" and the
    resulting document is passed to the module-level ``writer``.
    """
    text_field = Field("text", doc_text, Field.Store.YES, Field.Index.ANALYZED)
    lucene_doc = Document()
    lucene_doc.add(text_field)
    writer.addDocument(lucene_doc)

In [11]:
%%time
# Index the corpus: every line, stripped of surrounding whitespace, becomes one document.
with open(CORPUS, encoding='utf8') as corpus_file:
    for raw_line in corpus_file:
        add_document(raw_line.strip())

CPU times: user 2min 8s, sys: 2.05 s, total: 2min 10s
Wall time: 1min 48s


In [12]:
# Report the document count of the writer, then close it.
n_indexed = writer.numDocs()
print(n_indexed)
writer.close()

3693751


## Make prediction 

In [13]:
def iter_data(datafile, has_correct_answer=True, skip_first_line=False):
    """Yield one dict per line of a tab-separated question file.

    Args:
        datafile: path of the file; read as UTF-8, undecodable bytes are ignored.
        has_correct_answer: when true every line must hold seven fields
            (id, question, correct answer, four answers); otherwise six fields
            (no correct answer) and ``"correct"`` is set to ``"no"``.
        skip_first_line: NOTE -- the first line of the file is dropped when
            this flag is *False* (the default) and kept when it is True.
            Existing callers rely on exactly this behaviour.

    Yields:
        dict with the keys ``idd``, ``q``, ``correct``, ``aa``, ``ab``, ``ac``, ``ad``.

    Raises:
        ValueError: if a line does not split into the expected number of fields.
    """
    with open(datafile, encoding='utf-8', errors='ignore') as source:
        if not skip_first_line:
            next(source)
        for raw in source:
            if has_correct_answer:
                # Surrounding whitespace of any kind is removed before splitting.
                idd, q, correct, aa, ab, ac, ad = raw.strip().split("\t")
            else:
                # Only newline characters are removed here, so empty edge fields survive.
                correct = "no"
                idd, q, aa, ab, ac, ad = raw.strip('\n').split("\t")
            yield {
                "idd": idd,
                "q": q,
                "correct": correct,
                "aa": aa,
                "ab": ab,
                "ac": ac,
                "ad": ad,
            }

In [14]:
from collections import defaultdict

In [15]:
%%time
# Accuracy sweep on the training set: for every cut-off n, res[n] collects one
# boolean per question telling whether the answer whose top-n Lucene hits have
# the largest summed score is the correct one.
res = defaultdict(list)
MAX = 100
# Cut-offs 1..MAX inclusive, so the MAX-th fetched hit is evaluated as well.
docs_per_q = range(1, MAX + 1)

analyzer = StandardAnalyzer(Version.LUCENE_4_10_1)
reader = IndexReader.open(SimpleFSDirectory(File(INDEX_DIR)))
searcher = IndexSearcher(reader)
# Version, field and analyzer never change, so a single parser serves every query.
parser = QueryParser(Version.LUCENE_4_10_1, "text", analyzer)

try:
    for row in iter_data(TRAINING_SET):
#         vq = sent_to_vec(row['q'], row['q'])
#         va = sent_to_vec(row['aa'], row['q'] + " " + row['aa'])
#         vb = sent_to_vec(row['ab'], row['q'] + " " + row['ab'])
#         vc = sent_to_vec(row['ac'], row['q'] + " " + row['ac'])
#         vd = sent_to_vec(row['ad'], row['q'] + " " + row['ad'])
#         adam_scores = [np.dot(x, vq) for x in [va, vb, vc, vd]]
        # One query per candidate answer: the question text followed by the answer text.
        queries = [row['aa'], row['ab'], row['ac'], row['ad']]
        queries = [row['q'] + ' ' + q for q in queries]
        scores = defaultdict(list)
        for q in queries:
            hits = searcher.search(parser.parse(q), MAX)
            # Running totals: cumulative[k] is the summed score of the k + 1 best hits.
            cumulative = []
            running = 0
            for hit in hits.scoreDocs:
                running += hit.score
                cumulative.append(running)
            for n in docs_per_q:
                # With fewer than n hits the total of all hits is used (0 when there are none).
                scores[n].append(cumulative[min(n, len(cumulative)) - 1] if cumulative else 0)
        for n in docs_per_q:
#             res[n].append(['A','B','C','D'][np.argmax(adam_scores)] == row["correct"])
            res[n].append(['A','B','C','D'][np.argmax(scores[n])] == row["correct"])
#             res[n].append(['A','B','C','D'][np.argmax(np.mean([adam_scores, scores[n]], axis=0))] == row["correct"])
finally:
    # Release the index files even if a query fails part-way through.
    reader.close()

CPU times: user 3min 30s, sys: 5.53 s, total: 3min 36s
Wall time: 3min 31s


new reptil cleaned topkek 0.3
`1 0.521008403361
2 0.525410164066
3 0.52781112445
4 0.534213685474
5 0.535414165666
6 0.530612244898
7 0.53181272509
8 0.53381352541
9 0.532212885154
10 0.52981192477
11 0.529411764706
12 0.527010804322
13 0.52581032413
14 0.526210484194`

#topkek cleaned filt
`1 0.512204881953
2 0.519807923169
3 0.52380952381
4 0.532613045218
5 0.531012404962
6 0.52781112445
7 0.528611444578
8 0.532212885154
9 0.530212084834
10 0.528611444578
11 0.526610644258
12 0.524609843938
13 0.523409363745`

#topkek adagram 150-100 reptil roman 2 more
`1 0.510604241697
2 0.519407763105
3 0.52380952381
4 0.532212885154
5 0.53181272509
6 0.528211284514
7 0.529411764706
8 0.533013205282
9 0.531412565026
10 0.528211284514
11 0.527410964386
12 0.524209683874
13 0.524209683874`

In [16]:
# One line per cut-off: the cut-off followed by the mean of its per-question outcomes.
for n_docs, outcomes in sorted(res.items()):
    print(n_docs, np.mean(outcomes))

1 0.510604241697
2 0.519407763105
3 0.52380952381
4 0.532212885154
5 0.53181272509
6 0.528211284514
7 0.529411764706
8 0.533013205282
9 0.531412565026
10 0.528211284514
11 0.527410964386
12 0.524209683874
13 0.524209683874
14 0.525410164066
15 0.525010004002
16 0.524209683874
17 0.521808723489
18 0.521808723489
19 0.521808723489
20 0.520208083233
21 0.521808723489
22 0.521808723489
23 0.520608243297
24 0.522208883553
25 0.521008403361
26 0.522208883553
27 0.522208883553
28 0.521808723489
29 0.520608243297
30 0.520208083233
31 0.521808723489
32 0.521808723489
33 0.521008403361
34 0.521408563425
35 0.521008403361
36 0.522609043617
37 0.522208883553
38 0.522609043617
39 0.523009203681
40 0.523009203681
41 0.521808723489
42 0.522208883553
43 0.522609043617
44 0.52380952381
45 0.524209683874
46 0.52380952381
47 0.524209683874
48 0.523409363745
49 0.523009203681
50 0.522609043617
51 0.522208883553
52 0.52380952381
53 0.523409363745
54 0.522208883553
55 0.521808723489
56 0.522208883553
57 0.5

Results
-------

In [87]:
#topkek reptil cleaned

`1 0.518607442977
2 0.521008403361
3 0.523409363745
4 0.534213685474
5 0.533413365346
6 0.527410964386
7 0.529411764706
8 0.530612244898
9 0.528211284514
10 0.525410164066
11 0.526610644258
12 0.524609843938
13 0.521008403361
14 0.523409363745
15 0.521408563425
16 0.522208883553
17 0.521808723489
18 0.519807923169
19 0.521808723489`

In [79]:
#topkek cleaned

`
1 0.509403761505
2 0.517807122849
3 0.522208883553
4 0.532613045218
5 0.531412565026
6 0.525010004002
7 0.52781112445
8 0.52981192477
9 0.527410964386
10 0.525010004002
`

In [40]:
#topkek - adm 0.3

`
1 0.517006802721
2 0.520208083233
3 0.523009203681
4 0.523409363745
5 0.52781112445
6 0.526210484194
7 0.527010804322
8 0.526210484194
9 0.528211284514
10 0.527410964386
11 0.529011604642
12 0.52581032413
13 0.525010004002
14 0.524609843938
15 0.523009203681
16 0.521408563425
17 0.522208883553`

In [39]:
#topkek luc

`1 0.513805522209
2 0.519007603041
3 0.52380952381
4 0.522609043617
5 0.525010004002
6 0.525410164066
7 0.527010804322
8 0.526610644258
9 0.529411764706
10 0.527010804322
11 0.52781112445
12 0.524209683874
13 0.525410164066
14 0.523409363745
15 0.522609043617
16 0.521808723489
17 0.521408563425`

In [42]:
#backup luc #kek

`1 0.488995598239
2 0.509803921569
3 0.511804721889
4 0.512204881953
5 0.513405362145
6 0.508603441377
7 0.511804721889
8 0.510204081633
9 0.511804721889
10 0.511004401761
11 0.512605042017
12 0.513805522209
13 0.514605842337
14 0.513005202081
15 0.515806322529
16 0.515406162465
17 0.515006002401
18 0.516206482593
19 0.516206482593
20 0.516606642657
21 0.517406962785
22 0.519007603041
23 0.516606642657
24 0.513005202081
25 0.512605042017
26 0.513405362145
27 0.513405362145
28 0.512605042017
29 0.511004401761
30 0.510204081633
31 0.508603441377
32 0.507803121248
33 0.50700280112
34 0.504601840736
35 0.504201680672
36 0.504201680672
37 0.5050020008
38 0.504601840736
39 0.503801520608
40 0.504201680672
41 0.503401360544
42 0.503401360544
43 0.503401360544
44 0.502601040416
45 0.501800720288
46 0.502200880352
47 0.503401360544
48 0.502601040416
49 0.502200880352`

`1 0.47619047619
2 0.495398159264
3 0.4949979992
4 0.500200080032
5 0.497398959584
6 0.498599439776
7 0.499799919968
8 0.495798319328
9 0.493397358944
10 0.49299719888
11 0.490996398559
12 0.491396558623
13 0.493797519008
14 0.491396558623
15 0.491796718687
16 0.49299719888
17 0.492597038816
18 0.491396558623
19 0.491396558623`

Submit
-----

In [19]:
%%time
# Write the submission: for every validation question pick the answer whose
# top `docs_to_consider` Lucene hits have the largest summed score.
docs_to_consider = 8

analyzer = StandardAnalyzer(Version.LUCENE_4_10_1)
reader = IndexReader.open(SimpleFSDirectory(File(INDEX_DIR)))
searcher = IndexSearcher(reader)
# Version, field and analyzer never change, so a single parser serves every query.
parser = QueryParser(Version.LUCENE_4_10_1, "text", analyzer)

try:
    # NOTE: header and rows are comma-separated although SUBMISSION ends in ".tsv".
    with open(SUBMISSION, "w") as s:
        print("id,correctAnswer", file=s)
        # skip_first_line=True makes iter_data keep the first line of the file.
        for row in iter_data(VALIDATION_SET, False, skip_first_line=True):
#             vq = sent_to_vec(row['q'], row['q'])
#             va = sent_to_vec(row['aa'], row['q'] + " " + row['aa'])
#             vb = sent_to_vec(row['ab'], row['q'] + " " + row['ab'])
#             vc = sent_to_vec(row['ac'], row['q'] + " " + row['ac'])
#             vd = sent_to_vec(row['ad'], row['q'] + " " + row['ad'])
#             adam_scores = [0.3 * np.dot(x, vq) for x in [va, vb, vc, vd]]
            # One query per candidate answer: the question text followed by the answer text.
            queries = [row['aa'], row['ab'], row['ac'], row['ad']]
            queries = [row['q'] + ' ' + q for q in queries]
            scores = []
            for q in queries:
                hits = searcher.search(parser.parse(q), docs_to_consider)
                scores.append(sum(hit.score for hit in hits.scoreDocs))
#             guess = "ABCD"[np.argmax(np.mean([adam_scores, scores], axis=0))]
            guess = "ABCD"[np.argmax(scores)]
            s.write("%s,%s\n" % (row["idd"], guess))
finally:
    # Release the index files even if a query fails part-way through.
    reader.close()

CPU times: user 7min 26s, sys: 12.1 s, total: 7min 38s
Wall time: 7min 41s


Features
-----

In [None]:
# Output path opened for writing by the all-scores feature cell further down.
FEATURES_LUCENE_ALL_SCORES = join(DATA_DIR, 'features/lucene_all.tsv')

In [None]:
%%time
# Feature dump for the training set: one row per question holding the id, the
# correct answer and, per candidate answer, the ";"-joined scores of its top MAX hits.
MAX = 10
analyzer = StandardAnalyzer(Version.LUCENE_4_10_1)
reader = IndexReader.open(SimpleFSDirectory(File(INDEX_DIR)))
searcher = IndexSearcher(reader)
# Version, field and analyzer never change, so a single parser serves every query.
parser = QueryParser(Version.LUCENE_4_10_1, "text", analyzer)

# NOTE(review): the file name says "cumsum" but the raw per-hit scores are
# written below -- confirm which of the two is intended.
output_file = join(DATA_DIR, 'features', 'lucene_cumsum%d.tsv' % MAX)
try:
    with open(output_file, "w") as fs:
        for row in iter_data(TRAINING_SET):
            # One query per candidate answer: the question text followed by the answer text.
            queries = [row['aa'], row['ab'], row['ac'], row['ad']]
            queries = [row['q'] + ' ' + q for q in queries]
            features = []
            for q in queries:
                # Everything except ASCII letters and digits becomes a space
                # before the text is handed to the query parser.
                query = parser.parse(re.sub("[^a-zA-Z0-9]"," ", q))
                hits = searcher.search(query, MAX)
                doc_importances = [hit.score for hit in hits.scoreDocs]
                features.append(";".join(str(d) for d in doc_importances))
            print(row["idd"], row["correct"], *features, file=fs, sep="\t")
finally:
    # Release the index files even if a query fails part-way through.
    reader.close()

In [None]:
%%time

###### OUTPUT VECTORS
# Per question and candidate answer, write the cumulative sums of the scores of
# the top `out_vec_dim` Lucene hits, for either the training or the validation set.
out_vec_dim = 7
training = True

if training:
    working_dataset = TRAINING_SET
    output_filename = 'lucene_vecs%d.tsv' % out_vec_dim
else:
    working_dataset = VALIDATION_SET
    output_filename = 'lucene_vecs%d_submission.tsv' % out_vec_dim

analyzer = StandardAnalyzer(Version.LUCENE_4_10_1)
reader = IndexReader.open(SimpleFSDirectory(File(INDEX_DIR)))
searcher = IndexSearcher(reader)
# Version, field and analyzer never change, so a single parser serves every query.
parser = QueryParser(Version.LUCENE_4_10_1, "text", analyzer)

output_file = join(DATA_DIR, 'features', output_filename)

try:
    with open(output_file, "w") as fs:
        # iter_data drops the first line unless skip_first_line is True. The
        # submission cell reads VALIDATION_SET with skip_first_line=True, so do
        # the same here; otherwise the first validation row would be lost.
        for row in iter_data(working_dataset, training, skip_first_line=not training):
            # One query per candidate answer: the question text followed by the answer text.
            queries = [row['aa'], row['ab'], row['ac'], row['ad']]
            queries = [row['q'] + ' ' + q for q in queries]
            features = []
            for q in queries:
                # Everything except ASCII letters and digits becomes a space
                # before the text is handed to the query parser.
                query = parser.parse(re.sub("[^a-zA-Z0-9]"," ", q))
                hits = searcher.search(query, out_vec_dim)
                doc_importances = [hit.score for hit in hits.scoreDocs]
                features.append(";".join(str(d) for d in np.cumsum(doc_importances)))
            print(row["idd"], row["correct"], *features, file=fs, sep="\t")
finally:
    # Release the index files even if a query fails part-way through.
    reader.close()

In [None]:
# Display `features` as left behind by whichever feature cell ran last
# (the list built for the final row that cell processed).
features

In [None]:
%%time
# Work-in-progress probe: runs only the first candidate-answer query of the
# first training row (both loops end with `break`) and keeps a mapping of
# document id -> score in `doc_importances` for inspection in the next cell.
# NOTE(review): opening FEATURES_LUCENE_ALL_SCORES with "w" creates/truncates
# that file although nothing is written while the print below is commented out.
# NOTE(review): the reader opened here is never closed.
MAX = 13000
analyzer = StandardAnalyzer(Version.LUCENE_4_10_1)
reader = IndexReader.open(SimpleFSDirectory(File(INDEX_DIR)))
searcher = IndexSearcher(reader)

with open(FEATURES_LUCENE_ALL_SCORES, "w") as fs:
    for row in iter_data(TRAINING_SET):
        # One query per candidate answer: the question text followed by the answer text.
        queries = [row['aa'], row['ab'], row['ac'], row['ad']]
        queries = [row['q'] + ' ' + q  for q in queries]
        features = []
        for q in queries:
            # Everything except ASCII letters and digits becomes a space before parsing.
            query = QueryParser(Version.LUCENE_4_10_1, "text", analyzer).parse(re.sub("[^a-zA-Z0-9]"," ", q))      
            hits = searcher.search(query, MAX)
            doc_importances = {hit.doc: hit.score for hit in hits.scoreDocs}
#             features.append(";".join(doc_importances))
#             print(doc_importances)
            break
#         print(row["idd"], row["correct"], *features, file=fs, sep="\t")
        break

In [None]:
# Scores collected by the probe cell above, in ascending order.
sorted(doc_importances.values())