Проверка работы Reptil и его параметры
---------

In [1]:
from __future__ import unicode_literals, print_function, division
import os, sys
from os.path import join
import json
from codecs import open
import numpy as np

In [2]:
# Display the dynamic-linker search path seen by the kernel process.
# NOTE(review): presumably checked because the AdaGram.jl lib directory has to
# be on it before Julia is started -- confirm.
os.environ["LD_LIBRARY_PATH"]

'/usr/local/cuda-7.5/lib64:/root/reps/AdaGram.jl/lib'

In [3]:
import julia

In [14]:
class AdaGramModel(object):
    """Python-side wrapper around an AdaGram model held in an embedded Julia session.

    The model and its dictionary are loaded into the Julia globals ``vm`` and
    ``dict``; every method builds a Julia expression as text and evaluates it.
    Prototype (sense) indices are 0-based on the Python side and are shifted
    by one before being handed to Julia.
    """

    def __init__(self, path_to_model, path_to_dict):
        """Start Julia, load the model and read the word-count dictionary.

        path_to_model -- file given to AdaGram's ``load_model``.
        path_to_dict  -- UTF-8 text file with whitespace-separated
                         ``<word> <count>`` lines; its first line is skipped.
        """
        self.j = julia.Julia()
        self.j.eval("using AdaGram")
        self.j.eval('vm, dict = load_model("%s")' % self._jl_escape(path_to_model))
        self.d = self.j.eval('size(vm.In, 1)')  # size of word vectors
        self.m = self.j.eval('size(vm.In, 2)')  # number of prototypes per word (bound used by vec)
        self.n = self.j.eval('size(vm.In, 3)')  # number of vectors
        # TODO: AdaGram.Dictionary -> Python dict()?
        self.dictionary = dict()
        # Decode explicitly so the keys are text, like the tokens looked up later.
        with open(path_to_dict, encoding="utf-8") as f:
            next(f, None)  # skip strange empty token (and tolerate an empty file)
            for l in f:
                r = l.strip().split()
                if not r:
                    continue  # blank line, e.g. at the end of the file
                self.dictionary[r[0]] = int(r[1])

    @staticmethod
    def _jl_escape(text):
        """Escape ``text`` for use inside a double-quoted Julia string literal."""
        # Backslash first, otherwise the escapes added below would be doubled.
        return text.replace('\\', '\\\\').replace('"', '\\"').replace('$', '\\$')

    def _check_prototype(self, pi):
        """Assert that the 0-based prototype index ``pi`` is within ``[0, self.m)``."""
        assert 0 <= pi and pi + 1 <= self.m, "n of prototypes mismatch"

    def expected_pi(self, word):
        """Evaluate AdaGram's ``expected_pi`` for ``word`` and return its result."""
        return self.j.eval('expected_pi(vm, dict.word2id["%s"])' % self._jl_escape(word))

    def disambiguate(self, word, context):
        """Evaluate AdaGram's ``disambiguate`` for ``word`` in ``context``.

        ``context`` is one string; it is split into tokens on the Julia side.
        """
        return self.j.eval('disambiguate(vm, dict, "%s", split("%s"))'
                           % (self._jl_escape(word), self._jl_escape(context)))

    def vec(self, word, pi):
        """Return the vector of prototype ``pi`` (0-based) of ``word``.

        Raises AssertionError when ``pi`` is outside the model's prototype range.
        """
        self._check_prototype(pi)
        return self.j.eval('vec(vm, dict, "%s", %d)' % (self._jl_escape(word), pi + 1))

    def nearest_neighbors(self, word, pi):
        """Evaluate AdaGram's ``nearest_neighbors`` for prototype ``pi`` (0-based) of ``word``.

        Raises AssertionError when ``pi`` is outside the model's prototype range.
        """
        self._check_prototype(pi)
        return self.j.eval('nearest_neighbors(vm, dict, "%s", %d)'
                           % (self._jl_escape(word), pi + 1))

In [15]:
# Every input file lives under $HOME/data/allen-ai-challenge.
ROOT_DATA = join(os.environ["HOME"], "data/allen-ai-challenge")
# Question set scored in the evaluation cell (read there as tab-separated text).
TRAINING_CLEANED = join(ROOT_DATA, "training_set_cleaned.tsv")
# NOTE(review): the next two paths are not referenced by any cell shown here.
VALIDATION_CLEANED = join(ROOT_DATA, "validation_set_cleaned.tsv")
MERGED = join(ROOT_DATA, "corpus_paragraph_roman_2_short150-100.txt")

In [16]:
# Length of the sentence vectors accumulated by sent_to_vec.
# NOTE(review): presumably has to equal the --dim value the model was trained with -- confirm.
N = 300

In [17]:
# Trained AdaGram model and its dictionary file, both handed to AdaGramModel.
ADAGRAM_MODEL = join(ROOT_DATA, "adam.model")
ADAGRAM_DICT = join(ROOT_DATA, "adam.dict")

In [18]:
am = AdaGramModel(ADAGRAM_MODEL, ADAGRAM_DICT)

In [19]:
# %%sh
# /root/reps/AdaGram.jl/utils/dictionary.sh /root/data/allen-ai-challenge/corpus_paragraph_unstable_short150-100.txt /root/data/allen-ai-challenge/adam.dict

In [20]:
#corpus_paragraph_unstable_short150-100.txt

sh train.sh --min-freq 20 --window 5 --workers 40 --epochs 5 --dim 300 --alpha 0.1 /root/data/allen-ai-challenge/corpus_paragraph_unstable_short150-100.txt /root/data/allen-ai-challenge/adam.dict /root/data/allen-ai-challenge/adam.model

In [21]:
# %%time
# %%sh
# /root/reps/AdaGram.jl/train.sh --min-freq 5 --window 5 --workers 47 --epochs 10 --dim 300 --alpha 0.15 /root/data/allen-ai-challenge/merged_corpus.txt /root/data/allen-ai-challenge/adam.dict /root/data/allen-ai-challenge/adam.model  

In [22]:
def filter_dict(t, min_count=20):
    """Keep the tokens of ``t`` that occur in the model dictionary often enough.

    t         -- text, tokenised with ``split()`` on whitespace.
    min_count -- smallest value a token must have in ``am.dictionary`` to be
                 kept (default 20, the threshold previously hard-coded here).

    Returns the surviving tokens as a list, in their original order and with
    repeats preserved. Tokens missing from the dictionary are always dropped.
    """
    counts = am.dictionary
    return [w for w in t.split() if w in counts and counts[w] >= min_count]

In [23]:
def sent_to_vec(sent, context):
    """Embed ``sent`` as the mean of its sense-disambiguated word vectors.

    sent    -- text whose tokens are embedded.
    context -- text whose tokens are given to the model to choose each word's
               prototype.

    Both texts are first reduced with ``filter_dict``. For every remaining
    word of ``sent`` the prototype with the highest ``am.disambiguate`` score
    is looked up with ``am.vec`` and the vectors are averaged.

    Returns a float32 array of length ``N``; it is all zeros when no token of
    ``sent`` survives the filter.
    """
    v = np.zeros((N,), dtype='float32')
    fs = filter_dict(sent)
    if not fs:
        return v
    # The context string is identical for every word, so build it once.
    # (Dropping the word itself from its own context was tried: not helping.)
    ctx = " ".join(filter_dict(context))
    for w in fs:
        pi = am.disambiguate(w, ctx).argmax()
        v += am.vec(w, pi)
    return v / len(fs)

In [24]:
%%time
tries = []
with open(TRAINING_CLEANED, encoding="utf-8") as f:
    for i, l in enumerate(f):
        [qid, q, r, aa, ab, ac, ad] = l.strip().split("\t")
        vq = sent_to_vec(q, q)
        va = sent_to_vec(aa, q + " " + aa)
        vb = sent_to_vec(ab, q + " " + ab)
        vc = sent_to_vec(ac, q + " " + ac)
        vd = sent_to_vec(ad, q + " " + ad)
#         va = sent_to_vec(aa, q)
#         vb = sent_to_vec(ab, q)
#         vc = sent_to_vec(ac, q)
#         vd = sent_to_vec(ad, q)        
        scores = [np.dot(x, vq) for x in [va, vb, vc, vd]]
        g = "ABCD"[np.argmax(scores)]
        tries.append(1 if g == r else 0)

CPU times: user 1min 1s, sys: 114 ms, total: 1min 1s
Wall time: 1min 1s


In [25]:
# Share of questions answered correctly (tries holds 1 per hit, 0 per miss).
np.mean(tries)

0.42520000000000002

```
sh train.sh --min-freq 10 --window 4 --workers 48 --epochs 5 --dim 300 --alpha 0.1 /root/data/allen-ai-challenge/merged_corpus.txt /root/data/allen-ai-challenge/adam.dict /root/data/allen-ai-challenge/adam.model
0.3972

sh train.sh --min-freq 5 --window 4 --workers 48 --epochs 5 --dim 300 --alpha 0.1 /root/data/allen-ai-challenge/merged_corpus.txt /root/data/allen-ai-challenge/adam.dict /root/data/allen-ai-challenge/adam.model
0.39119

sh train.sh --min-freq 20 --window 5 --workers 48 --epochs 5 --dim 300 --alpha 0.1 /root/data/allen-ai-challenge/merged_corpus.txt /root/data/allen-ai-challenge/adam.dict /root/data/allen-ai-challenge/adam.model
0.413

sh train.sh --min-freq 30 --window 5 --workers 48 --epochs 5 --dim 300 --alpha 0.1 /root/data/allen-ai-challenge/merged_corpus.txt /root/data/allen-ai-challenge/adam.dict /root/data/allen-ai-challenge/adam.model
0.3992
```