In [30]:
import os

import gensim
from gensim.utils import simple_preprocess
from nltk import sent_tokenize

from sklearn.decomposition import PCA

import pandas as pd
import numpy as np

import plotly.express as px

In [17]:
# Build the training corpus: `story` holds one token list per sentence,
# gathered from every regular file inside the "data" directory.
story = []
# os.listdir returns entries in arbitrary order; sorting keeps the sentence
# order (and therefore the training input) stable between runs and machines.
for filename in sorted(os.listdir("data")):
    path = os.path.join("data", filename)
    # Skip sub-directories and other non-file entries, which open() cannot read.
    if not os.path.isfile(path):
        continue
    with open(path, encoding="cp1252") as f:
        corpus = f.read()
    # Tokenize after the file is closed so the handle is not held open
    # during sentence splitting and preprocessing.
    raw_sent = sent_tokenize(corpus)
    story.extend(simple_preprocess(sent) for sent in raw_sent)

In [18]:
# Sanity check: number of tokenized sentences collected in `story`.
len(story)

145020

In [19]:
# Create the (still untrained) Word2Vec model. The hyper-parameters are kept
# together in one mapping so they are easy to inspect and adjust.
w2v_params = {
    "window": 10,    # context window size
    "min_count": 2,  # minimum word frequency threshold
    "workers": 4,    # number of worker threads
}
model = gensim.models.Word2Vec(**w2v_params)

In [20]:
# Build the model's vocabulary from the tokenized sentences; this must run
# before training.
model.build_vocab(story)

In [21]:
# Train on the same sentences, reusing the example count and epoch setting
# already stored on the model. The returned tuple is shown as the cell output.
model.train(story, total_examples=model.corpus_count, epochs=model.epochs)

(6569648, 8628190)

In [22]:
# Qualitative check: nearest neighbours of a character name in embedding space.
model.wv.most_similar("stannis")

[('renly', 0.8140129446983337),
 ('robert', 0.7864481806755066),
 ('euron', 0.7132577896118164),
 ('aerys', 0.7054409980773926),
 ('peace', 0.7043545246124268),
 ('victarion', 0.6935116648674011),
 ('tommen', 0.6920387744903564),
 ('tywin', 0.6842411160469055),
 ('battle', 0.6764733195304871),
 ('dragonstone', 0.6693668961524963)]

In [23]:
# Qualitative check: nearest neighbours of a common noun in embedding space.
model.wv.most_similar("king")

[('realm', 0.6891324520111084),
 ('baratheon', 0.688623309135437),
 ('throne', 0.6663892865180969),
 ('prince', 0.6445778012275696),
 ('tourney', 0.6205719709396362),
 ('battle', 0.6103635430335999),
 ('site', 0.6101223230361938),
 ('council', 0.6100698709487915),
 ('victory', 0.6070058345794678),
 ('foreswore', 0.5999483466148376)]

In [24]:
# Pairwise similarity score between a character and a word describing him.
model.wv.similarity(w1="tyrion", w2="dwarf")

0.6933798

In [25]:
# Pairwise similarity score between two character names, for comparison with
# the tyrion/dwarf score.
model.wv.similarity(w1="tyrion", w2="tywin")

0.45910305

In [27]:
# Vocabulary words in index order; used later as labels for the projected
# vectors (presumably row i of the vector matrix belongs to y[i] — confirm
# against the gensim KeyedVectors docs).
y = model.wv.index_to_key

In [28]:
# Reduce the embeddings to three components so they can be drawn in a 3-D plot.
pca = PCA(n_components=3)

In [29]:
# Fit PCA on the model's normalized word vectors and project them; X has one
# row per vocabulary word and three columns.
X = pca.fit_transform(model.wv.get_normed_vectors())

In [32]:
# Plot only the first N_WORDS vocabulary entries so the 3-D scatter stays
# readable. Defining the limit once keeps the coordinate and label slices in sync.
N_WORDS = 100

# Combine coordinates and labels in one frame so the axes and legend get
# meaningful names instead of the bare column indices 0, 1 and 2.
embedding_df = pd.DataFrame(X[:N_WORDS], columns=["PC1", "PC2", "PC3"])
embedding_df["word"] = y[:N_WORDS]

fig = px.scatter_3d(
    embedding_df,
    x="PC1",
    y="PC2",
    z="PC3",
    color="word",
    hover_name="word",  # show the word itself when hovering over a point
    title=f"PCA projection of the first {N_WORDS} Word2Vec vocabulary words",
)
fig.show()

This would have worked even better if we had removed stopwords, but we can still see that related words cluster together.