In [15]:
import csv
import bz2
import gensim.models.keyedvectors as kv

In [16]:
# Read every line of the bz2-compressed vector file into memory as text.
# The context manager closes the handle as soon as the read is done instead
# of leaving it open until garbage collection.
with bz2.open("../models/poetry-56817.w2v.bz2", 'rt') as vec_file:
    raw_vecs = vec_file.readlines()

In [17]:
# report how many lines were read from the vector file
n_raw_lines = len(raw_vecs)
print("loaded {0} total vectors".format(n_raw_lines))

loaded 874564 total vectors


In [18]:
# Load Harvard Inquirer terms from the tab-separated lexicon file.
# newline='' is what the csv module expects for files handed to csv.reader.
with open('../lexicons/Harvard_Inquirer-inqtabs.txt', 'rt', newline='') as csvfile:
    reader = csv.reader(csvfile, delimiter='\t')
    # the first row is the column header (None if the file is empty)
    header = next(reader, None)
    # csv.reader yields [] for blank lines; drop those so row[0] below
    # cannot raise IndexError
    rows = [row for row in reader if row]

# Take the first column of each row, drop to lowercase, and cut everything
# from "#" onward (alternate-meaning suffix); the set removes the duplicates
# that this produces.
inquirer = list({row[0].lower().split("#")[0] for row in rows})

In [19]:
# Load the macOS system dictionary terms, one per line, stripped of trailing
# whitespace and lowercased. The `with` block closes the file handle, which
# the bare open() in a comprehension never did explicitly.
with open('/usr/share/dict/words') as dict_file:
    dict_terms = [line.rstrip().lower() for line in dict_file]

In [20]:
# merge the dictionary terms and the Inquirer terms, keeping each word once
unique_terms = {*dict_terms, *inquirer}
combined_vocab = list(unique_terms)

In [21]:
# report the size of the merged vocabulary
n_vocab_terms = len(combined_vocab)
print("loaded {0} total vocabulary terms".format(n_vocab_terms))

loaded 234795 total vocabulary terms


In [None]:
# now create just vectors for vocabulary terms
# combined_vocab is a list, so testing `in` against it directly is a linear
# scan repeated for every vector line. Build a set once so each membership
# test is a constant-time hash lookup.
vocab_lookup = set(combined_vocab)

newvecs = list()
for v in raw_vecs:
    # Only the first whitespace-separated token (the word) is needed, so
    # stop splitting after it rather than tokenising the whole line.
    fields = v.split(None, 1)
    # `fields` is empty for a blank line; skip it instead of raising IndexError
    if fields and fields[0] in vocab_lookup:
        newvecs.append(v)

In [None]:
# create and write header
# preserve hard-coded vector size
nv = len(newvecs)
nd = 300

# assemble into header line ("<count> <dimensions>") with newline character
header = str(nv) + " " + str(nd) + "\n"

# The context manager closes (and thereby flushes) the compressed file even
# if one of the writes raises.
with bz2.open("../models/poetry-56817-reduced.w2v.bz2", 'wt') as fp:
    # header is a single string, so write() is the right call; writelines()
    # would iterate over it and write one character at a time
    fp.write(header)

    # write data for the vectors (each entry already ends with its newline,
    # as produced by readlines())
    fp.writelines(newvecs)

In [10]:
# now load and verify
# NOTE(review): this reads the "drama-17412" reduced model, while the write
# step in this notebook produces "poetry-56817-reduced.w2v.bz2" -- confirm
# which file is meant to be verified here.
# The `with` block makes sure the bz2 handle is closed once loading finishes.
with bz2.open("../models/drama-17412-reduced.w2v.bz2") as reduced_file:
    model = kv.KeyedVectors.load_word2vec_format(reduced_file)

In [11]:
# verify vocab
# Gensim 4 removed KeyedVectors.vocab in favour of key_to_index; try the new
# attribute first and fall back to .vocab so older Gensim releases keep working.
try:
    vocab = list(model.key_to_index)
except AttributeError:
    vocab = list(model.vocab)
print("loaded {0} total vectors".format(len(vocab)))

loaded 63195 total vectors


In [14]:
# show the 25 nearest neighbours of "green" in the loaded model
model.most_similar(positive=["green"], topn=25)

[('blue', 0.44744181632995605),
 ('gray', 0.372827410697937),
 ('grass', 0.36947953701019287),
 ('incarnadine', 0.35026347637176514),
 ('at', 0.3484750986099243),
 ('day', 0.347786009311676),
 ('four', 0.3458620607852936),
 ('hill', 0.3415071964263916),
 ('by', 0.3383297920227051),
 ('eye', 0.32951465249061584),
 ('black', 0.3286947011947632),
 ('golden', 0.3272470235824585),
 ('down', 0.3247103989124298),
 ('lawn', 0.3242352604866028),
 ('cockatoo', 0.3230116665363312),
 ('plantago', 0.32233089208602905),
 ('cerise', 0.32187095284461975),
 ('bushy', 0.32179051637649536),
 ('and', 0.3210379481315613),
 ('hall', 0.3174566626548767),
 ('as', 0.3154778480529785),
 ('fair', 0.31520888209342957),
 ('each', 0.3150211572647095),
 ('meadow', 0.31499573588371277),
 ('are', 0.30976858735084534)]