In [1]:
from gensim.models import FastText
from gensim.models import Word2Vec

from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import seaborn as sns
import pandas as pd

%matplotlib inline
import matplotlib.pyplot as plt

## Check models

In [11]:
# Load the four pre-trained embedding models from the local `models/` folder:
# one FastText and one Word2Vec model for each of the two training variants.
# NOTE(review): the `_doc` / `_sen` suffixes presumably mean document-level and
# sentence-level training units — confirm against the training notebook.
model_ft_doc, model_ft_sen = (
    FastText.load(path)
    for path in ("models/fasttext_doc", "models/fasttext_sen")
)
model_w2v_doc, model_w2v_sen = (
    Word2Vec.load(path)
    for path in ("models/w2v_doc", "models/w2v_sen")
)

In [15]:
# Sanity check: print the nearest neighbours of 'laki' (Finnish for "law")
# for each loaded model, Word2Vec variants first and FastText variants second.
for embedding_model in (model_w2v_doc, model_w2v_sen, model_ft_doc, model_ft_sen):
    print(embedding_model.wv.most_similar('laki'))

[('asetus', 0.6654728055000305), ('voimaantulosäännöksen', 0.4646611511707306), ('valkuaisveron', 0.44757983088493347), ('laiksi', 0.44592708349227905), ('muutetaan', 0.4403388202190399), ('väliaikaisesta', 0.43962162733078003), ('päätös', 0.4387626647949219), ('säätämisestä', 0.4273509383201599), ('ilmoitusperusteisesta', 0.41885361075401306), ('voimaantulosäännösten', 0.40928956866264343)]
[('asetus', 0.6453289985656738), ('laiksi', 0.6009200811386108), ('väliaikaisesta', 0.5712264776229858), ('muutetaan', 0.5249903202056885), ('moittimisajasta', 0.4956638813018799), ('momlaki', 0.4774267077445984), ('eduskunnan', 0.4773203134536743), ('kumotaan', 0.471310019493103), ('kiinteistönsaannon', 0.46193408966064453), ('säätämisestä', 0.447407066822052)]
[('aki', 0.8580690622329712), ('yvalaki', 0.8556310534477234), ('jalaki', 0.8551520705223083), ('momlaki', 0.8549357652664185), ('adoptiolaki', 0.8112454414367676), ('rehulaki', 0.8036662936210632), ('puitelaki', 0.7919964790344238), ('kudo

## Visualize selected model

In [66]:
# Select the embedding model to visualize. `model` was never assigned in the
# notebook, so this cell raised NameError after Restart & Run All; point it at
# any of the loaded models (model_w2v_doc, model_w2v_sen, model_ft_doc,
# model_ft_sen) to plot that one instead.
# NOTE(review): which model produced the saved outputs below is not recorded —
# confirm the intended selection.
model = model_w2v_doc

# Wrap the model's embedding matrix in a DataFrame (one row per vector).
df = pd.DataFrame(model.wv.vectors)

# Work on an independent copy of the first 20,000 rows. `.iloc[:20000]` picks
# the same rows as the label slice `.loc[0:19999,]` on the default RangeIndex,
# and likewise returns everything when fewer rows exist.
data = df.iloc[:20000].copy()

In [67]:
# Project the embeddings onto their first 50 principal components and show how
# much of the total variance those components retain.
#
# A later cell adds `tsne1`/`tsne2` columns to `data` in place. Drop them here
# (a no-op when they are absent) so that re-running this cell never treats the
# plotting coordinates as input features.
pca_features = data.drop(columns=["tsne1", "tsne2"], errors="ignore")

pca = PCA(n_components=50)
pca_res = pca.fit_transform(pca_features)
# NOTE(review): `pca_res` is not consumed by the t-SNE cell below, which fits
# on `data` directly — confirm whether t-SNE was meant to run on `pca_res`.
sum(pca.explained_variance_ratio_)

0.7614675299264491

In [68]:
import time

# Fit t-SNE on the embedding dimensions only. A later cell writes the result
# back into `data` as `tsne1`/`tsne2`; dropping those columns (a no-op when
# they are absent) keeps this cell idempotent when it is re-run.
tsne_features = data.drop(columns=["tsne1", "tsne2"], errors="ignore")

# perf_counter is monotonic, so the elapsed time cannot be skewed by system
# clock adjustments during this long-running fit.
time_start = time.perf_counter()
# random_state pins the stochastic initialisation so the layout is reproducible
# across runs.
# NOTE(review): recent scikit-learn releases rename `n_iter` to `max_iter` —
# update the keyword if the environment is upgraded.
tsne = TSNE(n_components=2, verbose=1, perplexity=40, n_iter=300, random_state=42)
tsne_res = tsne.fit_transform(tsne_features)
print('t-SNE done! Time elapsed: {} seconds'.format(time.perf_counter() - time_start))

[t-SNE] Computing 121 nearest neighbors...
[t-SNE] Indexed 20000 samples in 0.123s...
[t-SNE] Computed neighbors for 20000 samples in 90.814s...
[t-SNE] Computed conditional probabilities for sample 1000 / 20000
[t-SNE] Computed conditional probabilities for sample 2000 / 20000
[t-SNE] Computed conditional probabilities for sample 3000 / 20000
[t-SNE] Computed conditional probabilities for sample 4000 / 20000
[t-SNE] Computed conditional probabilities for sample 5000 / 20000
[t-SNE] Computed conditional probabilities for sample 6000 / 20000
[t-SNE] Computed conditional probabilities for sample 7000 / 20000
[t-SNE] Computed conditional probabilities for sample 8000 / 20000
[t-SNE] Computed conditional probabilities for sample 9000 / 20000
[t-SNE] Computed conditional probabilities for sample 10000 / 20000
[t-SNE] Computed conditional probabilities for sample 11000 / 20000
[t-SNE] Computed conditional probabilities for sample 12000 / 20000
[t-SNE] Computed conditional probabilities for s

In [69]:
# Store the two t-SNE coordinates next to the embedding columns so the plotting
# cell can address them by name.
for axis, column in enumerate(('tsne1', 'tsne2')):
    data[column] = tsne_res[:, axis]

In [70]:
# Words ordered by their row index in `model.wv.vectors`, so that `vocab[i]`
# labels the i-th embedding row.
#
# The previous `list(model.wv.vocab.keys())` returned words in dict insertion
# order, which is not the vector-row order (gensim re-sorts the index list by
# frequency without reordering the dict), so plot labels landed on the wrong
# points. `index2word` (gensim 3.x) / `index_to_key` (gensim >= 4) is the
# index-aligned list.
keyed_vectors = model.wv
vocab = list(
    keyed_vectors.index_to_key
    if hasattr(keyed_vectors, "index_to_key")
    else keyed_vectors.index2word
)

In [None]:
# Scatter every embedded word in the 2-D t-SNE plane and annotate a sparse
# sample of the points with their vocabulary entry.
fig, ax = plt.subplots(figsize=(50, 50))
p = sns.scatterplot(x="tsne1", y="tsne2", data=data, alpha=0.3, ax=ax)
ax.set_title("t-SNE projection of word embeddings")

# Label every 100th point. The upper bound is derived from the data instead of
# a hard-coded 19999, so the loop cannot run past the end when fewer rows or
# words are available; for 20,000 rows it labels exactly the same points.
label_step = 100
n_labelable = min(len(data), len(vocab))
for i in range(0, n_labelable, label_step):
    # Positional access keeps row i and vocab[i] aligned regardless of the
    # frame's index labels.
    p.text(
        data["tsne1"].iloc[i],
        data["tsne2"].iloc[i],
        vocab[i],
        size='medium',
        color='black',
        weight='semibold',
    )