In [60]:
import gensim
import re
import multiprocessing
import numpy as np
from collections import Counter

In [61]:
# Record the gensim version this notebook was executed with.
print(gensim.__version__)

0.12.1


In [62]:
import os
from os.path import join

In [63]:
# Root folder of the challenge data, located under the current user's home
# directory. expanduser("~") is used rather than os.environ['HOME'] so the
# lookup does not raise KeyError on systems where HOME is not set.
DATA_DIR = join(os.path.expanduser("~"), "data/allen-ai-challenge")

In [64]:
# Directory whose files are read line by line (via iter_text) to build the
# word2vec training corpus.
WIKI_DIR = join(DATA_DIR, "parsed_wiki_data")

In [65]:
# Tab-separated question file used for evaluation: a header row, then one row
# per question with id, question, correct letter and the four answer texts.
TRAIN_SET = join(DATA_DIR, "training_set.tsv")

In [66]:
def parse_text(t):
    """Tokenise a string into lowercase words.

    Every character that is neither a word character nor whitespace is
    deleted (not replaced by a space), so "year-long" becomes "yearlong".
    The remainder is lowercased and split on runs of whitespace.

    Returns a list of tokens, which is empty when nothing is left.
    """
    return re.sub(r'[^\w\s]', '', t).lower().split()

In [67]:
def iter_text(directory):
    """Yield one token list per non-empty line of every file in `directory`.

    Files are visited in sorted name order so the corpus is produced in the
    same order on every run; os.listdir alone gives an arbitrary order.
    Entries that are not regular files (e.g. sub-directories) are skipped,
    because open() would raise on them.

    Each line is tokenised with parse_text; lines that yield no tokens are
    dropped.
    """
    for fname in sorted(os.listdir(directory)):
        path = os.path.join(directory, fname)
        if not os.path.isfile(path):
            continue
        # errors="ignore" drops undecodable bytes instead of aborting the scan.
        with open(path, encoding="utf-8", errors="ignore") as f:
            for l in f:
                r = parse_text(l)
                if r:
                    yield r

In [68]:
# Sanity check: print the first 12 tokenised lines (indices 0..11) of the
# corpus, then stop without reading the rest.
for i, l in enumerate(iter_text(WIKI_DIR)):
    print(l)
    if i > 10:
        break

['39', 'is', 'a', 'song', 'by', 'british', 'rock', 'band', 'queen', 'composed', 'by', 'lead', 'guitarist', 'brian', 'may', 'it', 'is', 'the', 'fifth', 'track', 'on', 'their', 'fourth', 'studio', 'album', 'a', 'night', 'at', 'the', 'opera']
['the', 'song', 'relates', 'the', 'tale', 'of', 'a', 'group', 'of', 'space', 'explorers', 'who', 'embark', 'on', 'what', 'is', 'from', 'their', 'perspective', 'a', 'yearlong', 'voyage', 'upon', 'their', 'return', 'however', 'they', 'realise', 'that', 'a', 'hundred', 'years', 'have', 'passed', 'because', 'of', 'the', 'time', 'dilation', 'effect', 'in', 'einsteins', 'special', 'theory', 'of', 'relativity', 'and', 'the', 'loved', 'ones', 'they', 'left', 'behind', 'are', 'now', 'all', 'dead', 'or', 'aged']
['the', 'line', 'your', 'mothers', 'eyes', 'from', 'your', 'eyes', 'cry', 'to', 'me', 'refers', 'to', 'his', 'sense', 'of', 'loss', 'at', 'seeing', 'his', 'daughters', 'eyes', 'in', 'his', 'aged', 'granddaughters', 'eyes']
['the', 'song', 'was', 'also'

In [69]:
class Sentences:
    """Re-iterable view over the tokenised lines of a directory.

    Every call to iter() starts a fresh pass over the files through
    iter_text, so the same object can be consumed more than once.
    """

    def __init__(self, directory):
        # Only the path is stored; no file is touched until iteration starts.
        self.directory = directory

    def __iter__(self):
        yield from iter_text(self.directory)

In [70]:
# Corpus handle over WIKI_DIR; this is what gets handed to Word2Vec.
sentences = Sentences(WIKI_DIR)

In [71]:
# Number of CPUs reported by the OS; the same value is used as the worker
# count when training the model.
multiprocessing.cpu_count()

4

In [72]:
%%time
# Train word vectors on the corpus with one worker per CPU.
# NOTE(review): size=40 / iter=4 are presumably the vector dimensionality and
# the number of passes over the corpus in this gensim version; confirm
# against the gensim Word2Vec documentation.
word_model = gensim.models.Word2Vec(sentences, workers=multiprocessing.cpu_count(),
                                    size=40, iter=4)

CPU times: user 13min 2s, sys: 49.8 s, total: 13min 52s
Wall time: 7min 4s


In [73]:
# Number of distinct words in the trained model's vocabulary.
len(word_model.vocab)

95708

In [74]:
# Spot-check the embedding by listing the words most similar to "dwarf".
word_model.most_similar("dwarf")

[('dwarfs', 0.8680418729782104),
 ('jupiterlike', 0.8368364572525024),
 ('starforming', 0.8171519637107849),
 ('mainsequence', 0.8158100843429565),
 ('stars', 0.8151733875274658),
 ('subbrown', 0.8139293193817139),
 ('sunlike', 0.812651515007019),
 ('supergiant', 0.8106808662414551),
 ('plutoids', 0.80396568775177),
 ('galaxies', 0.8015128374099731)]

In [46]:
def quantify_text(t, model):
    """Embed a text as the mean of the vectors of its in-vocabulary words.

    Parameters
    ----------
    t : str
        Raw text; it is tokenised with parse_text.
    model
        Word-vector model exposing `vocab` (membership test), item lookup
        `model[word]` and `vector_size`.

    Returns
    -------
    numpy.ndarray
        Element-wise mean of the word vectors, or an all-zero vector of
        length `model.vector_size` when no token is in the vocabulary.
    """
    # Look words up in the model that was passed in. The previous version
    # read the notebook-global `word_model`, so any other model given by the
    # caller was silently ignored.
    emb = [model[w] for w in parse_text(t) if w in model.vocab]
    if emb:
        return np.mean(emb, axis=0)
    return np.zeros(model.vector_size)

In [47]:
from gensim import matutils
def similarity(v1, v2):
    """Return the dot product of the two inputs after unit-normalisation.

    Both arguments are converted to numpy arrays and scaled with
    matutils.unitvec before being multiplied.
    """
    unit_a = matutils.unitvec(np.array(v1))
    unit_b = matutils.unitvec(np.array(v2))
    return np.dot(unit_a, unit_b)

In [48]:
def range_answers(q, answers, models):
    """Score candidate answers by their similarity to the question.

    Parameters
    ----------
    q : str
        Question text.
    answers : list of str
        Candidate answer texts.
    models : iterable
        Word-vector models; the per-model scores are averaged.

    Returns
    -------
    numpy.ndarray or None
        One score per answer, averaged over `models`. An answer with no
        in-vocabulary word scores 0 for that model. Returns None as soon as
        one model cannot embed the question at all.
    """
    scores = []
    for model in models:
        question = quantify_text(q, model)
        # quantify_text signals "no known word" with an all-zero vector.
        if (question == 0).all():
            return None
        scores_model = []
        for a in answers:
            a_q = quantify_text(a, model)
            if (a_q == 0).all():
                # Nothing to compare against: give the answer a neutral 0.
                scores_model.append(0)
            else:
                scores_model.append(similarity(question, a_q))
        scores.append(scores_model)
    return np.mean(scores, axis=0)

In [75]:
# Evaluate the model on the training questions: for each row pick the answer
# with the highest score and record 1 for a correct guess, 0 otherwise.
tries = []
# Explicit encoding, matching the corpus reader, so parsing does not depend
# on the platform's default locale.
with open(TRAIN_SET, encoding="utf-8") as f:
    next(f)  # skip the header row
    for l in f:
        [qid, q, r, aa, ab, ac, ad] = l.strip().split("\t")
        scores = range_answers(q, [aa, ab, ac, ad],
                               [word_model])
        # range_answers returns None when the question cannot be embedded and
        # all zeros when none of the answers can; neither gives a usable
        # ranking.
        if scores is None or (scores == 0).all():
            print(q, aa, ab, ac, ad)
            # Count the row as a miss instead of reusing the guess left over
            # from the previous row.
            guess = None
        else:
            guess = "ABCD"[np.argmax(scores)]
        tries.append(1 if guess == r else 0)

A sperm that contains alleles HqT fuses with an egg that contains alleles hqt. Which of the following genotypes will form in the offspring? HHqqTt HhQqTt Hhqqtt HhqqTt


In [76]:
np.mean(tries)

0.3236