In [207]:
import gensim 
import re
import multiprocessing
import os
import numpy as np
from collections import Counter

In [208]:
# Log the installed gensim version so the run can be reproduced later.
print(gensim.__version__)

0.11.1.post1


In [209]:
# Display gensim's FAST_VERSION flag.
# NOTE(review): presumably a non-negative value means the compiled training
# routines are active (and -1 the slow pure-Python path) - confirm in gensim docs.
gensim.models.word2vec.FAST_VERSION

1

In [181]:
# Path of the plain-text corpus; it is read line by line by iter_lines() and
# is the input used to train the Word2Vec model.
DATA = "/data/datasets/ck12.txt"

In [182]:
!wc -l /data/datasets/ck12.txt

272578 /data/datasets/ck12.txt


In [183]:
def parse_text(t):
    """Tokenize a piece of text into lowercase words.

    Every character that is neither a word character nor whitespace is
    deleted (not replaced by a space), so "k-12" becomes "k12". The cleaned
    text is then lowercased and split on runs of whitespace.

    Returns a list of tokens; the list is empty when ``t`` contains no word
    characters.
    """
    return re.sub(r'[^\w\s]', '', t).lower().split()

In [184]:
def iter_lines(fname):
    """Lazily yield the token list of every non-empty line in ``fname``.

    Each line is tokenized with ``parse_text``; lines that produce no tokens
    (blank or punctuation-only lines) are skipped. The file stays open only
    while the generator is being consumed.
    """
    with open(fname) as handle:
        for raw_line in handle:
            tokens = parse_text(raw_line)
            if not tokens:
                continue
            yield tokens

In [185]:
# Preview the first three tokenized lines of the corpus as a sanity check.
for i, t in zip(range(3), iter_lines(DATA)):
    print(t)

['to', 'access', 'a', 'customizable', 'version', 'of', 'this', 'book', 'as', 'well', 'as', 'other', 'interactive', 'content', 'visit', 'wwwck12org']
['ck12', 'foundation', 'is', 'a', 'nonprofit', 'organization', 'with', 'a', 'mission', 'to', 'reduce', 'the', 'cost', 'of', 'textbook', 'materials', 'for', 'the', 'k12', 'market', 'both', 'in', 'the', 'us', 'and', 'worldwide', 'using', 'an', 'opencontent', 'webbased', 'collaborative', 'model', 'termed', 'the', 'flexbook', 'ck12', 'intends', 'to', 'pioneer', 'the', 'generation', 'and', 'distribution', 'of', 'highquality', 'educational', 'content', 'that', 'will', 'serve', 'both', 'as', 'core', 'text', 'as', 'well', 'as', 'provide', 'an', 'adaptive', 'environment', 'for', 'learning', 'powered', 'through', 'the', 'flexbook', 'platform']
['copyright', '2014', 'ck12', 'foundation', 'wwwck12org']


In [186]:
class Sentences(object):
    """Re-iterable view over the tokenized lines of a text file.

    Every call to ``iter()`` starts a fresh pass over ``filename`` through
    ``iter_lines``, so one instance can be traversed any number of times.
    """

    def __init__(self, filename):
        # Only the path is stored; the file is opened on each iteration.
        self.filename = filename

    def __iter__(self):
        # iter_lines() is itself a generator, so hand it back directly.
        return iter_lines(self.filename)

In [187]:
# Re-iterable stream of tokenized corpus lines, passed to Word2Vec below.
sentences = Sentences(DATA)

In [188]:
%%time
# Train the embeddings on the corpus using one worker per CPU core.
# size=300 requests 300-dimensional vectors and iter=10 requests ten passes.
# NOTE(review): these keyword names belong to the gensim version printed
# above; newer releases are believed to rename them - confirm before upgrading.
word_model = gensim.models.Word2Vec(sentences, workers=multiprocessing.cpu_count(), size=300, iter=10)

CPU times: user 13min 15s, sys: 9.24 s, total: 13min 24s
Wall time: 2min 27s


In [189]:
# Persist the trained model to disk so it does not have to be retrained.
word_model.save("/data/datasets/a2_w2v.model")

In [192]:
# Sanity check: list the words the model ranks as most similar to "dwarf",
# each paired with its similarity score.
word_model.most_similar("dwarf")

[('dwarfs', 0.5507678985595703),
 ('eris', 0.5450934171676636),
 ('makemake', 0.5250632166862488),
 ('ceres', 0.5198060274124146),
 ('pluto', 0.49868565797805786),
 ('haumea', 0.46100103855133057),
 ('galaxies', 0.4567176103591919),
 ('irregular', 0.43478572368621826),
 ('supergiants', 0.4344647526741028),
 ('supergiant', 0.41168296337127686)]

In [None]:
# NOTE(review): this cell has no execution count, GLOVE is not referenced by
# any other visible cell, and the value is a directory rather than a file -
# it looks like an unfinished placeholder; complete or remove it.
GLOVE = "/data/datasets/"

In [193]:
# Labelled questions, tab-separated. The evaluation cell skips the first line
# and unpacks each row as: id, question, correct answer letter, answers A-D.
TRAIN_SET = "/data/datasets/training_set.tsv"

In [194]:
def quantify_text(t):
    """Tokenize ``t`` and keep only the tokens usable for scoring.

    A token is kept when it is present in ``word_model.vocab`` and is longer
    than two characters. Order and duplicates are preserved.
    """
    kept = []
    for token in parse_text(t):
        if token in word_model.vocab and len(token) > 2:
            kept.append(token)
    return kept

In [201]:
def range_answers(q, answers):
    """Score every candidate answer by its similarity to the question.

    Both the question and each answer are reduced to their in-vocabulary
    tokens with ``quantify_text``; the score of an answer is
    ``word_model.n_similarity`` between the two token lists.

    Parameters:
        q: question text.
        answers: iterable of answer texts.

    Returns:
        A list with one score per answer, in the same order as ``answers``.
        An answer with no usable tokens scores 0. If the question itself has
        no usable tokens, every answer scores 0.
    """
    question = quantify_text(q)
    if not question:
        # n_similarity is not meaningful for an empty word list (depending on
        # the gensim version it yields NaN or raises), so fall back to a
        # neutral score for every answer instead of calling it.
        return [0 for _ in answers]

    scores = []
    for a in answers:
        a_q = quantify_text(a)
        # Same guard on the answer side: no usable tokens means score 0.
        scores.append(word_model.n_similarity(question, a_q) if a_q else 0)
    return scores

In [202]:
# Evaluate on the labelled training set: tries gets 1 for every question whose
# best-scoring answer matches the expected letter and 0 otherwise.
tries = []
with open(TRAIN_SET) as f:
    next(f)  # skip the first line (presumably the column header)
    for line in f:
        # Remove only the line terminator. A bare strip() would also eat
        # trailing tabs, so a row with an empty last answer would unpack to
        # six fields instead of seven and raise ValueError.
        qid, q, r, aa, ab, ac, ad = line.rstrip("\r\n").split("\t")
        scores = range_answers(q, [aa, ab, ac, ad])
        # Ties (including all-zero scores) resolve to the first option.
        guess = "ABCD"[scores.index(max(scores))]
        tries.append(1 if guess == r else 0)

In [203]:
# Accuracy on the training set: tries holds 1 per correct guess and 0 per
# miss, so the mean is the fraction answered correctly (four options each).
np.mean(tries)

0.33119999999999999

In [204]:
# Number of training questions that were evaluated.
len(tries)

2500

In [205]:
# Peek at the first ten outcomes (1 = correct guess, 0 = wrong guess).
tries[0:10]

[0, 1, 0, 0, 0, 1, 0, 1, 0, 1]

In [206]:
# Number of distinct words in the trained model's vocabulary.
len(word_model.vocab)

11553

In [175]:
# Dump the embeddings as TSV: one row per vocabulary word, holding the word
# followed by the components of its vector.
with open("/data/datasets/w2v_a2.tsv", "w") as out:
    for word in word_model.vocab:
        components = [str(value) for value in word_model[word]]
        row = "%s\t%s\n" % (word, "\t".join(components))
        out.write(row)

In [172]:
# Unlabelled questions, tab-separated; the submission cell unpacks each row as
# id, question and answers A-D.
VALIDATION_SET = "/data/datasets/validation_set.tsv"
# Output file: a CSV with an "id,correctAnswer" header and one guess per row.
SUBMISSION = "/data/datasets/submission.txt"

In [173]:
# Write one predicted answer letter per validation question as CSV.
with open(SUBMISSION, "w") as s:
    s.write("id,correctAnswer\n")
    with open(VALIDATION_SET) as fv:
        next(fv)  # skip the first line (presumably the column header)
        for line in fv:
            # Remove only the line terminator. A bare strip() would also eat
            # trailing tabs, so a row with an empty last answer would unpack
            # to five fields instead of six and abort the whole submission.
            qid, q, aa, ab, ac, ad = line.rstrip("\r\n").split("\t")
            scores = range_answers(q, [aa, ab, ac, ad])
            # Ties (including all-zero scores) resolve to the first option.
            guess = "ABCD"[scores.index(max(scores))]
            s.write("%s,%s\n" % (qid, guess))