## Load `subtitles_lines` from a pickle file

In [1]:
import pickle

# Deserialize `subtitles_lines` from the pickle file under ./results.
# NOTE(review): pickle.load can execute arbitrary code while loading — only use it
# on files you produced yourself.
with open('./results/subtitles_lines.pkl', 'rb') as pickle_file:
    subtitles_lines = pickle.load(pickle_file)

In [2]:
%time
from nltk.tokenize import word_tokenize

sentences = []
for subtitle_lines in subtitles_lines:
    # first three sentences and the last one are not to be used
    for line in subtitle_lines[3:-1]:
        sentences.append(word_tokenize(line))

CPU times: user 1 µs, sys: 0 ns, total: 1 µs
Wall time: 2.86 µs


In [3]:
# Number of tokenized lines held in `sentences`.
len(sentences)

112094

## Word2Vec model

In [4]:
from gensim.models import Word2Vec
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize

In [5]:
# Example (alternative corpus): uncomment to build `sentences` from NLTK's movie_reviews instead
# from nltk.corpus import movie_reviews
# sentences = [list(s) for s in movie_reviews.sents()]

In [21]:
%%time
# Train Word2Vec on the tokenized lines. Per the gensim docs, min_count=1 keeps every
# token (even ones seen only once) and `size` is the embedding dimensionality.
# NOTE(review): `size` is the gensim < 4.0 keyword (renamed `vector_size` in 4.0) —
# confirm the installed gensim version before upgrading.
model = Word2Vec(sentences, size=100, window=5, min_count=1, workers=4)

CPU times: user 4.16 s, sys: 30.8 ms, total: 4.19 s
Wall time: 1.75 s


In [22]:
# Precompute L2-normalized vectors. Per the gensim 3.x docs, replace=True discards the
# raw vectors to save memory, so the model should not be trained further afterwards.
model.init_sims(replace=True)

In [23]:
# Analogy-style query: words most similar to 'Ross' and 'Emily' combined while
# being dissimilar to 'Mark'.
model.wv.most_similar(positive=['Ross', 'Emily'], negative=['Mark'])

[('Rachel', 0.8477544784545898),
 ('Joey', 0.8386672139167786),
 ('Chandler', 0.8252462148666382),
 ('Monica', 0.8095682263374329),
 ('Phoebe', 0.7773568034172058),
 ('Ben', 0.6479400396347046),
 ('Emma', 0.640067458152771),
 ('jumpy', 0.6378891468048096),
 ('everybody', 0.634537398815155),
 ('it', 0.6272266507148743)]

## Embedding similarity between named-entity (NE) objects

In [9]:
import json

In [10]:
# Only the PERSON and GPE classes are extracted because the other classes contain no objects.
person_objects = None
gpe_objects = None

# Each non-empty line of the file is parsed as one JSON document describing a class;
# its "objects" field is itself a JSON-encoded string, hence the second json.loads.
with open('./results/object_class.json', 'r') as f:
    for line in f:  # stream the file instead of materializing it with readlines()
        if not line.strip():
            continue  # tolerate blank / trailing empty lines, which json.loads rejects
        obj = json.loads(line)
        if obj["title"] == "PERSON":
            person_objects = json.loads(obj["objects"])
        elif obj["title"] == "GPE":
            gpe_objects = json.loads(obj["objects"])

# Fail here with a clear message rather than later with
# "'NoneType' object is not iterable" when the lists are consumed.
if person_objects is None or gpe_objects is None:
    raise ValueError("object_class.json must contain both a PERSON and a GPE class entry")

### For PERSON objects

In [32]:
# Pull the "title" field out of every PERSON and GPE object.
persons = [person['title'] for person in person_objects]
gpes = [place['title'] for place in gpe_objects]

In [52]:
# Build one "relation" record for every ordered pair of distinct PERSON objects and
# write the whole collection to ./results/person_object_relations.json.
# NOTE(review): the title says "Distances", but `model.wv.similarity` yields a
# similarity score (per the gensim docs) — confirm which one consumers expect.
# NOTE(review): `similarity` raises KeyError for a title missing from the model
# vocabulary (e.g. a multi-word name) — verify all titles are single known tokens.
d = {
    "id": "P1",
    "type": "relations",
    "title": "Embedding Distances Between PERSON objects",
    "objects": [],
}

idx = 1
for i, a in enumerate(persons):
    for j, b in enumerate(persons):
        # Compare positions by value. `is not` tests object identity, which only
        # happens to work for CPython's cached small ints and would let self-pairs
        # through once the indices exceed that cache.
        if i != j:
            d["objects"].append({
                "id": "R" + str(idx),
                "class": "P1",
                "type": "relation",
                "source": "O" + str(i + 1),
                "target": "O" + str(j + 1),
                # Cast to a built-in float: the numpy float32 returned by gensim
                # is not JSON serializable.
                "value": float(model.wv.similarity(a, b)),
            })
            idx += 1

# Open the output only after all values are computed, so a failure above does not
# leave a truncated file behind; `with` guarantees the handle is closed.
with open('./results/person_object_relations.json', 'w') as f:
    json.dump(d, f)

### for GPE objects

In [None]:
# Build one "relation" record for every ordered pair of distinct GPE objects and
# write the whole collection to ./results/gpe_object_relations.json.
# NOTE(review): the title says "Distances", but `model.wv.similarity` yields a
# similarity score (per the gensim docs) — confirm which one consumers expect.
# NOTE(review): `similarity` raises KeyError for a title missing from the model
# vocabulary (e.g. a multi-word place name) — verify all titles are single known tokens.
d = {
    "id": "P2",
    "type": "relations",
    "title": "Embedding Distances Between GPE objects",
    "objects": [],
}

idx = 1
for i, a in enumerate(gpes):
    for j, b in enumerate(gpes):
        # Compare positions by value. `is not` tests object identity, which only
        # happens to work for CPython's cached small ints and would let self-pairs
        # through once the indices exceed that cache.
        if i != j:
            d["objects"].append({
                "id": "R" + str(idx),
                # Must match this document's own id ("P2"); the original carried
                # over "P1" from the PERSON cell.
                "class": "P2",
                "type": "relation",
                "source": "O" + str(i + 1),
                "target": "O" + str(j + 1),
                # Cast to a built-in float: the numpy float32 returned by gensim
                # is not JSON serializable.
                "value": float(model.wv.similarity(a, b)),
            })
            idx += 1

# Open the output only after all values are computed, so a failure above does not
# leave a truncated file behind; `with` guarantees the handle is closed.
with open('./results/gpe_object_relations.json', 'w') as f:
    json.dump(d, f)

## Visualize word2vec vectors (optional)

In [None]:
# visualization
from sklearn.manifold import TSNE
import pandas as pd
import re
import matplotlib.pyplot as plt

In [None]:
# All words known to the model, and their embedding vectors in the same order.
vocab = list(model.wv.vocab)
# Index the KeyedVectors (`model.wv`) rather than the model itself: `model[...]` is a
# deprecated shortcut in gensim 3.x and is removed in gensim 4.
X = model.wv[vocab]

In [None]:
# Project the embeddings down to 2-D for plotting. t-SNE is stochastic, so pin
# random_state to get the same layout on every re-run.
tsne = TSNE(n_components=2, random_state=42)
X_tsne = tsne.fit_transform(X)

In [None]:
# 2-D coordinates indexed by word; only the first 100 rows are plotted
# (presumably to keep the annotated scatter readable).
df = pd.DataFrame(X_tsne, index=vocab, columns=['x', 'y']).iloc[:100]

# A single axes on a 10x10-inch figure.
fig, ax = plt.subplots(figsize=(10, 10))
ax.scatter(df['x'], df['y'])

# Label every point with its word.
for row in df.itertuples():
    ax.annotate(row.Index, (row.x, row.y))