In [None]:
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from gensim.models import FastText
import numpy as np

In [None]:
# Read the whole corpus into memory. The context manager closes the file
# handle as soon as the read finishes; a bare open(...).read() leaves the
# handle open until the garbage collector gets to it.
# NOTE: despite its name, `fp` holds the file's text, not a file object.
with open("../texts/deephaven.txt") as text_file:
    fp = text_file.read()

# Split into sentences, tokenize each sentence into words, and lowercase every
# token, so `sentences` is a list of lists of lowercase token strings.
sentences = [
    [t.lower() for t in word_tokenize(s)]
    for s in sent_tokenize(fp)
]

In [None]:
# Train a FastText model on the tokenized corpus; min_count=2 is passed so
# that very rare tokens are excluded from the vocabulary.
model = FastText(
    sentences=sentences,
    min_count=2,
)

In [None]:
# Nearest neighbours of "town" in the corpus-trained FastText space.
# Left as the last expression so the notebook displays the result.
model.wv.most_similar(positive=["town"], topn=10)

In [None]:
from gensim.models import KeyedVectors
# Load pretrained vectors stored in the binary word2vec format.
google_model = KeyedVectors.load_word2vec_format(
    fname="../models/google-vectors.w2v",
    binary=True,
)

In [None]:
# Every key known to the pretrained model, as a plain list
# (iterating a dict yields its keys).
vocab = list(google_model.key_to_index)

In [None]:
# Distinct tokens (types) in the corpus. Sorting gives a stable order, so the
# row/column indices of anything built from `types` are reproducible across
# kernel restarts (plain set iteration order for strings is not).
types = sorted({t for s in sentences for t in s})

# Keep only the types the pretrained model has a vector for. Membership is
# tested against the model's key_to_index mapping (a hash lookup) rather than
# a list of all keys, which would be scanned linearly for every type.
types = [t for t in types if t in google_model.key_to_index]

# One pretrained embedding per surviving type, aligned with `types` by position.
embs = [google_model[t] for t in types]

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
# Pairwise cosine distance between all type embeddings:
# distance = 1 - similarity, so row i / column j refers to types[i] / types[j].
dist_matrix = 1 - cosine_similarity(X=embs)

In [None]:
# Print the 25 types with the smallest cosine distance to "town",
# closest first, one per line.
town_idx = types.index("town")
nearest = np.argsort(dist_matrix[town_idx])[:25]
for neighbor in (types[i] for i in nearest):
    print(neighbor)

In [None]:
from sklearn.decomposition import PCA
from matplotlib import pyplot as plt

In [None]:
# Select the style before any figure or axes are created. Style settings are
# picked up when artists are created, so applying it after the figure and
# title exist leaves the first run only partly styled (and a re-run of the
# cell looking different).
plt.style.use('ggplot')

pca = PCA(n_components=2)

# The 25 types closest to "town" by cosine distance, and their embeddings.
town_distances = dist_matrix[types.index("town")]
neighbor_types = [types[t] for t in np.argsort(town_distances)[:25]]
neighbor_embs = [google_model[t] for t in neighbor_types]

# Project the neighbour embeddings onto their first two principal components.
plot_data = pca.fit_transform(neighbor_embs)
xs, ys = plot_data[:, 0], plot_data[:, 1]

# Explicit figure/axes interface; a freshly created figure needs no clf().
fig, ax = plt.subplots(figsize=(20, 15))
ax.set_title("PCA Neighboring Terms for 'town'")
ax.scatter(xs, ys, marker='^')
# Label each point with its word, nudged slightly off the marker.
for i, w in enumerate(neighbor_types):
    ax.annotate(w, xy=(xs[i], ys[i]), xytext=(3, 3),
                textcoords='offset points', ha='left', va='top')
plt.show()