# Poincaré Embedding with Japanese Wordnet 

### Download Japanese Wordnet

In [None]:
# Fetch the Japanese WordNet database archive (-c resumes a partial download)
# and unpack it, overwriting any earlier copy (-f) while keeping the .gz (-k).
! wget -c http://compling.hss.ntu.edu.sg/wnja/data/1.1/wnjpn.db.gz
! gunzip -fk wnjpn.db.gz

### Download library

In [None]:
# Download the wn.py helper module from a gist pinned to a fixed revision;
# it is the module imported below with `from wn import *`.
! wget -c https://gist.githubusercontent.com/sambaiz/a0508c9ed379b3218e30/raw/525b1a59acef1a50c04e71ecf1dfe17d84a4f1f6/wn.py

### Import libraries

In [None]:
from wn import *
from gensim.models.poincare import PoincareModel
import random
import gensim.viz.poincare
import plotly

### Japanese Wordnet connection

In [None]:
def getWords2(wordid):
    """Return the ``word`` table rows whose ``wordid`` equals the argument.

    Every matching row is wrapped in a ``Word`` record. The result is an
    empty list when no row matches.
    """
    query = "select * from word where wordid=?"
    matches = []
    for fields in conn.execute(query, (wordid,)):
        matches.append(Word(*fields))
    return matches

In [None]:
def getSenses2(synset):
    """Return the ``sense`` table rows that belong to the given synset.

    Every matching row is wrapped in a ``Sense`` record. The result is an
    empty list when the synset has no senses.
    """
    rows = conn.execute("select * from sense where synset=?", (synset,))
    senses = [Sense(*fields) for fields in rows]
    return senses

In [None]:
def getAllWords():
    """Return every row of the ``word`` table as a list of ``Word`` records.

    No filtering is applied, so the whole table is loaded into memory.
    """
    every_row = conn.execute("select * from word")
    all_words = []
    for fields in every_row:
        all_words.append(Word(*fields))
    return all_words

In [None]:
def abstract_word(lemma):
    """Collect the Japanese noun lemmas linked to ``lemma`` via 'hype' links.

    For every word row matching ``lemma`` and each of its senses whose
    ``src`` is ``'hand'`` (presumably hand-curated entries; confirm against
    the Japanese WordNet documentation), the ``'hype'`` (hypernym) synset
    links are followed. The Japanese nouns attached to the linked synsets
    are gathered, excluding the source word itself.

    Args:
        lemma: Surface form to look up.

    Returns:
        A list of lemmas. It may contain duplicates when the same target is
        reachable through several senses or links, and it is empty when
        nothing qualifies.
    """
    result = []
    for word in getWords(lemma):
        for sense in getSenses(word):
            if sense.src != 'hand':
                continue
            for synlink in getSynLinks(sense, 'hype'):
                for abst_sense in getSenses2(synlink.synset2):
                    if not abst_sense or abst_sense.lang != "jpn":
                        continue
                    # Never report a word as its own abstraction.
                    if abst_sense.wordid == word.wordid:
                        continue
                    candidates = getWords2(abst_sense.wordid)
                    if not candidates:
                        # A sense pointing at a missing word row is skipped
                        # instead of raising IndexError on the [0] lookup.
                        continue
                    w2 = candidates[0]
                    if w2.pos == "n":
                        result.append(w2.lemma)

    return result

### Connection test

In [None]:
# Total number of rows in the word table (no language or part-of-speech filter)
len(getAllWords())

In [None]:
# Sanity check: look up the entries for the lemma "幸せ" with the wn.py helper
getWords("幸せ")

In [None]:
# Sanity check: look up the entries for the lemma "ヨーグルト" with the wn.py helper
getWords("ヨーグルト")

In [None]:
# Lemmas that abstract_word finds for "ヨーグルト" via 'hype' links
abstract_word("ヨーグルト")

In [None]:
# Lemmas that abstract_word finds for "本棚" via 'hype' links
abstract_word("本棚")

### Generate word pairs

In [None]:
# Restrict the vocabulary to Japanese nouns; the trailing expression shows
# how many lemmas remain.
wordlist = [
    word.lemma
    for word in getAllWords()
    if word.pos == "n" and word.lang == "jpn"
]
len(wordlist)

In [None]:
# Build the (word, more-abstract word) relations used as training data.
# A word contributes one pair per lemma returned by abstract_word.
pairs = [
    (lemma, abstraction)
    for lemma in wordlist
    for abstraction in abstract_word(lemma=lemma)
]
len(pairs)

In [None]:
# Preview the first 20 relations
pairs[:20]

### Train model 

In [None]:
# Hyperparameters for the full-size embedding
dimension = 100  # embedding size passed to PoincareModel
negative = 10  # number of negative samples
seed = 25252  # fixed seed ("nico nico ni-")
epochs = 1000  # number of training epochs

In [None]:
# Set up the Poincaré model over the word relations; training happens in
# the next cell.
model = PoincareModel(
    pairs,
    size=dimension,
    negative=negative,
    seed=seed,
)

In [None]:
%%time
# Train for `epochs` epochs. The %%time cell magic must stay on the first
# line of the cell; it reports how long the training took.
model.train(epochs=epochs)

In [None]:
# Persist the trained model. The hyperparameters are encoded in the file
# name so that differently configured runs do not overwrite each other.
model_name = "poincare-dim{}-negative{}-epoch{}.model".format(
    dimension,
    negative,
    epochs,
)
model.save(model_name)

In [None]:
# load a previously saved model instead of retraining
# model = PoincareModel.load(model_name)

### Test the model

In [None]:
# Closest vocabulary entries to "牛乳" in the trained embedding
model.kv.most_similar("牛乳")

In [None]:
# Distance between "ヨーグルト" and "牛乳" in the embedding space
model.kv.distance("ヨーグルト", "牛乳")

In [None]:
# Relative hierarchy position of "生物" with respect to "動物".
# NOTE(review): per the gensim docs a positive value should mean the first
# argument sits higher in the hierarchy; confirm for the installed version.
model.kv.difference_in_hierarchy("生物", "動物")

### Train 2D model

In [None]:
# Hyperparameters for the 2D embedding (the dimension is fixed to 2 below)
negative = 10  # number of negative samples
seed = 25252  # fixed seed ("nico nico ni-")
epochs = 1000  # number of training epochs

In [None]:
# Two-dimensional model trained on the same relations, so that the result
# can be passed to the 2D visualization.
model_2d = PoincareModel(
    pairs,
    size=2,
    negative=negative,
    seed=seed,
)

In [None]:
%%time
# Train the 2D model for `epochs` epochs. The %%time cell magic must stay on
# the first line of the cell; it reports how long the training took.
model_2d.train(epochs=epochs)

In [None]:
# Persist the 2D model. The file name follows the same pattern as the
# full-size model, with the dimension written as 2.
model_2d_name = "poincare-dim{}-negative{}-epoch{}.model".format(
    2,
    negative,
    epochs,
)
model_2d.save(model_2d_name)

In [None]:
# load a previously saved 2D model instead of retraining
# model_2d = PoincareModel.load(model_2d_name)

### Visualize the model

In [None]:
# Settings for the visualization sample
samples = 200  # how many vocabulary words to draw
seed = 25252  # fixed seed ("nico nico ni-")

In [None]:
# Draw `samples` distinct words from the vocabulary of the 2D model, which is
# the model that gets plotted. random.sample draws without replacement, so no
# words are lost to de-duplication and the result order is reproducible for a
# fixed seed. The size is capped at the vocabulary size so that a small
# vocabulary does not raise ValueError.
random.seed(seed)
vocabulary = list(model_2d.kv.vocab.keys())
sample_words = random.sample(vocabulary, min(samples, len(vocabulary)))

In [None]:
# Keep every relation that touches at least one sampled word. Membership is
# tested against a set built once, instead of scanning the list for each pair.
sample_word_set = set(sample_words)
sample_pairs = [
    (w1, w2)
    for w1, w2 in pairs
    if w1 in sample_word_set or w2 in sample_word_set
]

In [None]:
# Build the 2D Poincaré figure for the sampled relations and show it inline.
# Labels are shown only for the sampled words.
plotly.offline.init_notebook_mode(connected=False)
prefecutre_map = gensim.viz.poincare.poincare_2d_visualization(
    model=model_2d,
    tree=sample_pairs,
    num_nodes=10,
    figure_title="Japanese Wordnet",
    show_node_labels=sample_words,
)
plotly.offline.iplot(prefecutre_map)