In [70]:
import os
import logging
import numpy as np
import pandas as pd
from gensim.models.poincare import PoincareModel, PoincareKeyedVectors, PoincareRelations

from utils.data_loader import get_data_list

In [None]:
"""
Training arguments to expose (most should be accepted via command-line arguments)

model specs:
size (int, optional) – Number of dimensions of the trained model.
alpha (float, optional) – Learning rate for training.
negative (int, optional) – Number of negative samples to use.
workers (int, optional) – Number of threads to use for training the model.
epsilon (float, optional) – Constant used for clipping embeddings below a norm of one.
regularization_coeff (float, optional) – Coefficient used for l2-regularization while training (0 effectively disables regularization).
burn_in (int, optional) – Number of epochs to use for burn-in initialization (0 means no burn-in).
burn_in_alpha (float, optional) – Learning rate for burn-in initialization, ignored if burn_in is 0.
init_range (2-tuple (float, float)) – Range within which the vectors are randomly initialized.
dtype (numpy.dtype) – The numpy dtype to use for the vectors in the model (numpy.float64, numpy.float32 etc). Using lower precision floats may be useful in increasing training speed and reducing memory usage.
seed (int, optional) – Seed for random to ensure reproducibility.


model train:
epochs (int) – Number of iterations (epochs) over the corpus.
batch_size (int, optional) – Number of examples to train on in a single batch.
print_every (int, optional) – Prints progress and average loss after every print_every batches.


other misc things:
- train file path
- prob threshold
- experiment name (can be deduced from hyper-params)
- logging dir
- model save dir

TODO: log the model loss (and other training metrics) to check whether the model is actually learning anything.

"""

In [43]:
# Configure the root logger at INFO level so that gensim's progress messages
# (the "INFO:gensim.models.poincare:..." lines in the cell outputs below) are shown.
logging.basicConfig(level=logging.INFO)

# Training file (path is relative to the notebook's working directory) and the
# probability cut-off; both are handed to the project loader below.
trn_file = "./data/book_small/book_train.txt"
prob_threshold = 0.5
# get_data_list is imported from utils.data_loader. Presumably it returns the
# relations that survive the prob_threshold filter -- TODO confirm in that module.
trn_list = get_data_list(trn_file, prob_threshold)

In [58]:
# Number of dimensions of the trained embedding (passed as PoincareModel's `size`).
embed_dim = 10
# Build the model from the loaded training data. Only `size` is set explicitly;
# every other hyper-parameter keeps the library default. In the recorded run this
# step logged 1790 relations over 69 nodes.
model = PoincareModel(train_data=trn_list, size=embed_dim)

INFO:gensim.models.poincare:loading relations from train data..
INFO:gensim.models.poincare:loaded 1790 relations from train data, 69 nodes


In [63]:
# Train for 10 epochs over the relations, reporting progress and average loss
# every 50 batches (every 500 examples in the recorded output).
# NOTE(review): re-running this cell presumably continues training the same
# `model` object rather than starting over -- re-create the model first when a
# clean run is wanted; confirm against the gensim docs.
model.train(epochs=10, print_every=50)

INFO:gensim.models.poincare:training model of size 10 with 1 workers on 1790 relations for 10 epochs and 10 burn-in epochs, using lr=0.10000 burn-in lr=0.01000 negative=10
INFO:gensim.models.poincare:starting training (10 epochs)----------------------------------------
INFO:gensim.models.poincare:training on epoch 1, examples #490-#500, loss: 15.27
INFO:gensim.models.poincare:time taken for 500 examples: 0.16 s, 3102.91 examples / s
INFO:gensim.models.poincare:training on epoch 1, examples #990-#1000, loss: 14.91
INFO:gensim.models.poincare:time taken for 500 examples: 0.14 s, 3568.83 examples / s
INFO:gensim.models.poincare:training on epoch 1, examples #1490-#1500, loss: 14.92
INFO:gensim.models.poincare:time taken for 500 examples: 0.15 s, 3379.68 examples / s
INFO:gensim.models.poincare:training on epoch 2, examples #490-#500, loss: 14.66
INFO:gensim.models.poincare:time taken for 500 examples: 0.13 s, 3759.70 examples / s
INFO:gensim.models.poincare:training on epoch 2, examples #

In [51]:
# Persist the model to ./model1.save (relative to the working directory).
# NOTE(review): this cell's recorded execution count (51) is lower than those of
# the model-construction (58) and training (63) cells, so the file on disk may
# hold an earlier model object. Restart the kernel and run all cells in order
# before relying on the saved file.
model.save("./model1.save")

In [56]:
# Distance between the embeddings for 8 and 35 (recorded result: ~0.264).
# Presumably this is the Poincare distance and 8 / 35 are node keys as they
# appear in the training relations -- TODO confirm (depending on the gensim
# version, integer arguments may be treated as vector indices instead).
model.kv.distance(8, 35)

0.26403962330840747