In [None]:
import nltk
from htrc_features import FeatureReader
import numpy as np
import pandas as pd

from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import gensim.models.keyedvectors as kv

from sklearn.metrics.pairwise import cosine_similarity

from sklearn.decomposition import PCA
from sklearn.manifold import MDS

from matplotlib import pyplot as plt
%matplotlib inline

In [None]:
# Volume identifiers of the collection; each one is handed to FeatureReader
# by get_pages() to pull that volume's extracted features.
documents = [
    "mdp.39015004308212",
    "mdp.39015024849682",
    "coo.31924014522449",
    "uc1.32106015388678",
    "uc2.ark:/13960/t9w08xb8w",
    "uiug.30112012873441",
]

In [None]:
def get_pages(document):
    """Return the alphabetic tokens of every page of one volume as word lists.

    Parameters
    ----------
    document : str
        Volume identifier passed to ``FeatureReader``.

    Returns
    -------
    list of list of str
        One list per page present in the volume's token list, in ascending
        page order. Every token that passes ``str.isalpha`` is repeated as
        many times as its count on that page; all other tokens are dropped,
        so a page holding only such tokens yields an empty list.
    """
    fr = FeatureReader([document])
    vol = next(fr.volumes())
    ptc = vol.tokenlist(pos=False, case=False).reset_index().drop(['section'], axis=1)

    rows = []
    # A single sorted groupby pass replaces "filter the whole frame once per
    # page": every row is visited once and the page order is deterministic
    # (iterating a set of page numbers gives no ordering guarantee).
    for _, page_tokens in ptc.groupby('page', sort=True):
        words = []
        # Columns 1 and 2 are the token and its count. They are selected by
        # position with .iloc because a bare integer key on a string-labelled
        # Series is deprecated in pandas 2.x and treated as a label afterwards.
        for token, count in zip(page_tokens.iloc[:, 1], page_tokens.iloc[:, 2]):
            if token.isalpha():
                words.extend([token] * int(count))

        # Doc2Vec expects each document as a list of word tokens
        rows.append(words)
    return rows

In [None]:
# Flatten the per-volume page lists into one corpus-wide list of pages.
pages = []
for volume_id in documents:
    pages.extend(get_pages(volume_id))

# Wrap every page as a TaggedDocument; the tag is the page's running index
# across the whole corpus, rendered as a string.
tagged_data = [
    TaggedDocument(words=page_words, tags=[str(page_no)])
    for page_no, page_words in enumerate(pages)
]

In [None]:
# Hyper-parameters for the page-level Doc2Vec model.
doc2vec_params = dict(
    dm=1,             # distributed memory model; each page plays the role of a "paragraph"
    vector_size=300,  # a larger vector size might produce better results
    min_count=5,      # drop words with very few repetitions
    window=150,       # larger window size needed because of extracted features
    workers=2,
)
model = Doc2Vec(tagged_data, **doc2vec_params)

In [None]:
# "Interview" the model
# Rows of the word-vector matrix are vocabulary entries, columns are dimensions.
vocab_size = model.wv.vectors.shape[0]
dim = model.wv.vectors.shape[1]
print("vocab:", vocab_size)
print("depth:", dim)

In [None]:
# Save the trained model to disk with save_word2vec_format so the next cell can
# reload it through KeyedVectors.
# NOTE(review): presumably the ../models directory must already exist - confirm.
model.save_word2vec_format("../models/doc2vec-htrc-collection.w2v")

In [None]:
# Reload the file written above as KeyedVectors and rebind `model` to it.
# From this point on `model` is no longer the Doc2Vec object created earlier,
# so re-running earlier cells that use `model.wv` requires retraining first.
model =  kv.KeyedVectors.load_word2vec_format("../models/doc2vec-htrc-collection.w2v")

In [None]:
# Nearest neighbours of "mother" (default number of results); shown as the cell output.
model.most_similar("mother")

In [None]:
# The 25 nearest neighbours of "schooner"; shown as the cell output.
model.most_similar("schooner",topn=25)

In [None]:
def scatter_terms_pca(term, topn=15):
    """Scatter-plot the nearest neighbours of ``term`` after a 2-D PCA projection.

    Reads the module-level ``model`` for both the neighbour lookup and the
    vectors, and switches the global matplotlib style to ``'ggplot'``.

    Parameters
    ----------
    term : str
        Query word looked up with ``model.most_similar``.
    topn : int, optional
        Number of neighbours to plot. Defaults to 15, the value that was
        previously hard-coded.

    Raises
    ------
    ValueError
        If fewer than two neighbours are returned, since a two-component
        projection cannot be fitted to fewer than two points.
    """
    neighbor_words = [word for word, _ in model.most_similar(term, topn=topn)]
    if len(neighbor_words) < 2:
        raise ValueError(
            "need at least 2 neighbouring terms for a 2-D PCA plot, got %d"
            % len(neighbor_words)
        )
    neighbor_vectors = [model[word] for word in neighbor_words]

    plot_data = PCA(n_components=2).fit_transform(neighbor_vectors)
    xs, ys = plot_data[:, 0], plot_data[:, 1]

    # Apply the style BEFORE the figure and axes are created; a style set
    # afterwards only affects artists created later, so the first call used
    # to render differently from every subsequent one.
    plt.style.use('ggplot')
    fig, ax = plt.subplots(figsize=(20, 15))
    ax.set_title("PCA Neighboring Terms for: " + term)
    ax.scatter(xs, ys, marker='^')
    for word, x, y in zip(neighbor_words, xs, ys):
        ax.annotate(word, xy=(x, y), xytext=(3, 3),
                    textcoords='offset points', ha='left', va='top')
    plt.show()

In [None]:
# Plot the PCA projection of the terms closest to "father".
scatter_terms_pca("father")