In [156]:
import copy
import json
import pickle

import numpy as np
import pandas as pd
from gensim.corpora import Dictionary
from gensim.models import KeyedVectors
from gensim.models import Word2Vec
from gensim.models.callbacks import CallbackAny2Vec

In [3]:
# Article metadata table; its "id" and "publicationYear" columns are used below.
jstor_df = pd.read_feather("../data/large_files/jstor_df_v1.feather")


def load_pickle(path):
    """Unpickle and return the object stored at ``path``.

    The file is opened in a ``with`` block so the handle is closed as soon as
    the object has been read.

    NOTE(review): unpickling can execute arbitrary code, so only use this on
    files produced by this project.
    """
    with open(path, "rb") as handle:
        return pickle.load(handle)


# Maps a full article id to the short id used in the per-article file names.
article_ids_dict = load_pickle("../data/article_ids_dict.pickle")

# Lookup from the first element of each stored document tuple to the value that
# is fed to word2vec (presumably a gensim Dictionary, id -> token; verify).
dictionary = load_pickle("../data/dictionary_main.pickle")
len(dictionary)

# Sequence of tuples starting with the word; later converted with dict(...) to
# obtain the word -> frequency table for the model vocabulary.
freqs_tups = load_pickle("../data/freqs_tups.pickle")

In [4]:
class FriendlyCorpus:
    """Re-iterable stream of documents read from the per-article pickle files.

    Every call to ``iter()`` starts a fresh pass over the articles, so the same
    object can be consumed several times (e.g. once per training epoch).

    Parameters
    ----------
    article_ids : iterable, optional
        Article ids to stream. When omitted, the notebook-level variable
        ``ids`` is read each time iteration starts, so rebinding ``ids``
        changes what an existing corpus object yields.

    Yields
    ------
    list
        One list per stored document, holding ``dictionary[tup[0]]`` for every
        tuple in that document (presumably the token strings; verify).
    """

    def __init__(self, article_ids=None):
        self.article_ids = article_ids

    def __iter__(self):
        # Resolve the fallback lazily to keep the "current value of `ids`" behaviour.
        selected_ids = ids if self.article_ids is None else self.article_ids
        for article_id in selected_ids:
            short_id_str = str(article_ids_dict[article_id])
            id_filepath = "../data/large_files/article_docs/{}.pickle".format(short_id_str)
            # Close each article file right after reading it; thousands of
            # articles times several epochs would otherwise leak file handles.
            with open(id_filepath, "rb") as handle:
                article_docs = pickle.load(handle)
            for doc in article_docs:
                yield [dictionary[tup[0]] for tup in doc]

# word2vec model development

In [5]:
# Development subset: the first 1000 article ids of the table.
ids = jstor_df["id"].tolist()[:1000]
# FriendlyCorpus() without arguments reads the notebook-level `ids` set above.
corpus = FriendlyCorpus()
# Count the documents by streaming over them once; building a list of every
# document only to take its length would hold the whole corpus in memory.
corpus_len = sum(1 for doc in corpus)

In [109]:
# CBOW model (sg=0) with 100-dimensional vectors and a context window of 3.
# Negative sampling uses 5 noise words; ns_exponent=1 draws them in proportion
# to word frequency. No seed is set and workers=8, so repeated runs are not
# guaranteed to give identical vectors.
model = Word2Vec(vector_size=100, window=3, negative=5, ns_exponent=1, sg=0, workers=8)

In [110]:
# Build the vocabulary from the precomputed frequency pairs instead of scanning
# the corpus, so the vocabulary does not depend on which articles are in `ids`.
model.build_vocab_from_freq(word_freq=dict(freqs_tups))

In [111]:
%%time
# Five passes over the streamed corpus; total_examples is the document count
# measured beforehand because the corpus object has no len().
model.train(corpus_iterable=corpus, total_examples=corpus_len, epochs=5)

CPU times: user 47.1 s, sys: 589 ms, total: 47.7 s
Wall time: 29.2 s


(22240101, 22783065)

In [114]:
# Nearest neighbours of "Paul" in the trained vectors (cosine similarity).
model.wv.most_similar("Paul")

[('Pauline', 0.807790994644165),
 ('Branick', 0.5509141683578491),
 ('Krister', 0.5115583539009094),
 ('undisputed', 0.4922533631324768),
 ('deutero_Pauline', 0.4876696467399597),
 ('erature', 0.4874403774738312),
 ('thi', 0.48649412393569946),
 ('colossian', 0.4637281894683838),
 ('spe', 0.45533061027526855),
 ('Finlan', 0.45361194014549255)]

In [112]:
# Nearest neighbours of "Romans" in the trained vectors (cosine similarity).
model.wv.most_similar("Romans")

[('Galatians', 0.746419370174408),
 ('Thessalonians', 0.6330987215042114),
 ('Ephesians', 0.6219400763511658),
 ('Eph', 0.6069772839546204),
 ('Corinthians', 0.6063165664672852),
 ('Philippians', 0.5878782272338867),
 ('Matera', 0.5725627541542053),
 ('521', 0.563494861125946),
 ('Colossians', 0.5631004571914673),
 ('Brendan', 0.5271100401878357)]

In [113]:
# Nearest neighbours of "Jesus" in the trained vectors (cosine similarity).
model.wv.most_similar("Jesus")

[('crucified', 0.6054939031600952),
 ('risen', 0.5864917039871216),
 ('Ali', 0.5596235394477844),
 ('parousia', 0.5448322892189026),
 ('supper', 0.5399717688560486),
 ('crucify', 0.539387583732605),
 ('Philem', 0.5347006916999817),
 ('Grillmeier', 0.5329602360725403),
 ('Epaphras', 0.5325378179550171),
 ('Nicodemus', 0.5318003296852112)]

In [115]:
# Number of words in the model vocabulary.
len(model.wv.index_to_key)

41385

In [117]:
# Words taken from the first 3000 entries of the frequency table
# (presumably the most frequent ones; confirm that freqs_tups is sorted).
top_entries = freqs_tups[:3000]
words_to_keep = [entry[0] for entry in top_entries]

In [118]:
def filter_wv(wv, words_to_keep):
    """Trim a KeyedVectors-like object down to ``words_to_keep``, in place.

    Keys of ``wv`` that are not listed in ``words_to_keep`` are removed together
    with their vector rows. Surviving keys keep their relative order, and
    ``key_to_index`` is re-numbered so that every key points at its new row.
    Entries of ``words_to_keep`` that ``wv`` does not contain are ignored.

    Parameters
    ----------
    wv : object
        Must expose ``index_to_key`` (list), ``key_to_index`` (dict),
        ``vectors`` (2-D array with one row per key) and
        ``fill_norms(force=...)``, as gensim 4 ``KeyedVectors`` does.
    words_to_keep : iterable of str
        Vocabulary to retain.

    Returns
    -------
    object
        The same ``wv`` instance, now trimmed (it is mutated, not copied).
    """
    keep = set(words_to_keep)
    keep_mask = np.array([key in keep for key in wv.index_to_key], dtype=bool)
    kept_keys = [key for key in wv.index_to_key if key in keep]

    wv.vectors = wv.vectors[keep_mask]

    # Update list and dict in place so existing references stay valid.
    wv.index_to_key[:] = kept_keys
    # Row positions shift once rows are removed; the old indices would point at
    # the wrong (or a non-existent) row, so the mapping is rebuilt from scratch.
    wv.key_to_index.clear()
    wv.key_to_index.update((key, row) for row, key in enumerate(kept_keys))

    # NOTE(review): gensim 4 keeps per-key attributes (e.g. counts) in parallel
    # arrays under `wv.expandos`; trim them too so they stay aligned with the
    # new row order. Confirm against the installed gensim version.
    expandos = getattr(wv, "expandos", None)
    if expandos:
        for attr in list(expandos):
            values = expandos[attr]
            if values is not None and len(values) == len(keep_mask):
                expandos[attr] = values[keep_mask]

    # Force recomputation: cached norms from earlier similarity queries still
    # describe the untrimmed matrix.
    wv.fill_norms(force=True)
    return wv

In [119]:
# Work on an independent copy: filter_wv trims its argument in place, so a
# plain alias of model.wv would silently shrink the trained model as well.
test_wv = copy.deepcopy(model.wv)

In [120]:
# Reduce the vectors to the words in words_to_keep; filter_wv modifies test_wv
# itself and returns that same object.
model_vw_filtered = filter_wv(test_wv, words_to_keep)

In [None]:
# A much more straightforward alternative: let gensim build the reduced set of
# vectors for the given words itself.
model_vw_filtered = model.wv.vectors_for_all(words_to_keep)

In [121]:
# Number of words left after filtering.
len(model_vw_filtered)

3000

In [124]:
# Nearest neighbours of "Paul" among the filtered vocabulary only.
model_vw_filtered.most_similar("Paul")

[('Pauline', 0.807790994644165),
 ('colossian', 0.4637281894683838),
 ('athanasius', 0.43054693937301636),
 ('contributor', 0.4175826907157898),
 ('ephesian', 0.41415196657180786),
 ('inspire', 0.4122663140296936),
 ('Titus', 0.40873152017593384),
 ('shape', 0.4032384753227234),
 ('Robinson', 0.3954716622829437),
 ('Hebrews', 0.39504387974739075)]

In [134]:
def ids_from_colvals(df_name, col, matchstring):
    """Return the "id" values of the rows of a dataframe that match a condition.

    The row filter is assembled as source text and evaluated:
    ``<df_name>[<df_name>["<col>"]<matchstring>]``.

    Parameters
    ----------
    df_name : str
        Name of (or expression for) the dataframe, as source text.
    col : str
        Column the condition is applied to.
    matchstring : str
        Source text appended to the column selection, e.g.
        ``".between(1980,1999)"`` or ``" > 1940"``.

    Returns
    -------
    list
        The "id" column of the selected rows.
    """
    # NOTE(review): eval runs whatever code the three strings spell out, so
    # never pass text that comes from outside this notebook.
    selection_expr = '{0}[{0}["{1}"]{2}]'.format(df_name, col, matchstring)
    selected_rows = eval(selection_expr)
    return selected_rows["id"].tolist()

In [137]:
# "start,end" labels for 20-year periods starting in 1900, 1920, ..., 2000.
# The comma-separated form is later inserted verbatim into ".between({})".
generations = ["{},{}".format(start, start + 19) for start in range(1900, 2001, 20)]
generations

['1900,1919', '1920,1939', '1940,1959', '1960,1979', '1980,1999', '2000,2019']

In [139]:
# Second-to-last period, used for the trial run below.
test_gen = generations[-2]
test_gen

'1980,1999'

In [140]:
# Ids of the articles whose publicationYear lies in the trial period
# (Series.between includes both end points by default). This rebinds the
# notebook-level `ids` that FriendlyCorpus iterates over.
ids = ids_from_colvals("jstor_df", "publicationYear", ".between({})".format(test_gen))
len(ids)

2520

In [141]:
# Trial run for a single period: stream the articles currently selected in `ids`.
corpus = FriendlyCorpus()
# Count the documents by streaming over them once rather than collecting every
# document in a list just to take its length.
corpus_len = sum(1 for doc in corpus)
# Same settings as the development model, but with 150-dimensional vectors.
model = Word2Vec(vector_size=150, window=3, negative=5, ns_exponent=1, sg=0, workers=8)
# Vocabulary comes from the precomputed frequency table, not from this sub-corpus.
model.build_vocab_from_freq(word_freq=dict(freqs_tups))
model.train(corpus_iterable=corpus, total_examples=corpus_len, epochs=5)
# Keep only the word vectors; they are all that gets saved afterwards.
model_vectors = model.wv

(39413004, 40391550)

In [149]:
# Persist the vectors; the period label (including its comma) becomes part of the file name.
model_vectors.save("../data/large_files/model_vectors_{}.wv".format(test_gen))

In [154]:
# Read the saved vectors back from the same path.
model_vectors_loaded = KeyedVectors.load("../data/large_files/model_vectors_{}.wv".format(test_gen))

# word2vec main application

In [157]:
# Period labels "first,last" for the six 20-year generations 1900-1919 ... 2000-2019.
generations = [
    ",".join((str(first_year), str(first_year + 19)))
    for first_year in range(1900, 2001, 20)
]
generations

['1900,1919', '1920,1939', '1940,1959', '1960,1979', '1980,1999', '2000,2019']

In [158]:
# Train and save one set of word vectors per 20-year generation.
for gen in generations:
    print("generation {} model training".format(gen))
    # Filter by the current period `gen` (not the trial variable `test_gen`),
    # so that each model is trained on its own generation's articles.
    # `ids` is the notebook-level variable that FriendlyCorpus() iterates over.
    ids = ids_from_colvals("jstor_df", "publicationYear", ".between({})".format(gen))
    corpus = FriendlyCorpus()
    # Count the documents by streaming over them once rather than collecting
    # every document in a list just to take its length.
    corpus_len = sum(1 for doc in corpus)
    model = Word2Vec(vector_size=150, window=3, negative=5, ns_exponent=1, sg=0, workers=8)
    # Every generation shares the vocabulary from the precomputed frequency table.
    model.build_vocab_from_freq(word_freq=dict(freqs_tups))
    model.train(corpus_iterable=corpus, total_examples=corpus_len, epochs=5)
    model_vectors = model.wv
    print("generation {} ready. saving...".format(gen))
    model_vectors.save("../data/large_files/model_vectors_{}.wv".format(gen))

generation 1900,1919 model training
generation 1900,1919 ready. saving...
generation 1920,1939 model training
generation 1920,1939 ready. saving...
generation 1940,1959 model training
generation 1940,1959 ready. saving...
generation 1960,1979 model training
generation 1960,1979 ready. saving...
generation 1980,1999 model training
generation 1980,1999 ready. saving...
generation 2000,2019 model training
generation 2000,2019 ready. saving...
