# Creation of Embeddings

- This notebook produces KGloVe embeddings for the WN18 and FB15k datasets.

In [6]:
# Training graphs (.nt files), one per dataset. Each path is handed to the
# RDFConverter executable through its `graph` argument further below.
training_file_fb = "/work/jportisc/kbc_rdf2vec/strategy_grid_2/evaluation_2/nt_files/FB15k.nt"
training_file_wn = "/work/jportisc/kbc_rdf2vec/strategy_grid_2/evaluation_2/nt_files/WN18.nt"

## KGloVe Training

In [46]:
# Executables: the KGloVe RDFConverter (builds the frequency files) and the
# GloVe trainer (turns those files into vectors).
kglove_executable = "/work-ceph/jportisc/code/kglove/kglove_jan/RDFConverter"
glove_executable = "/work-ceph/jportisc/code/kglove/glove/GloVe-1.2/build/glove"

# GloVe training parameters (passed as -threads / -vector-size).
threads = 10
vector_dimension = 200

# RDFConverter modes <0...7>, keyed by the integer given after `mode` on the
# command line.
kglove_mode_names = {
    0: "Unif",
    1: "Pred",
    2: "InvPred",
    3: "Obj",
    4: "InvObj",
    5: "InvObjSD",
    6: "PredObj",
    7: "InvPredObj",
}
kglove_mode = 2

# Names of the two files RDFConverter writes into the working directory.
# They are derived from the selected mode so that they cannot drift apart from
# `kglove_mode`.
# NOTE(review): only the "InvPred" file names (mode 2) were observed in the
# converter's log; the names for the other modes are assumed to follow the same
# pattern - confirm before switching modes.
kglove_result_suffix = (
    f"{kglove_mode_names[kglove_mode]}"
    "_alpha_0.10000000000000001_eps_1.0000000000000001e-05.bin"
)
kglove_vocab_result_path = f"glove_vocab_file_out_{kglove_result_suffix}"
kglove_input_file_result_path = f"glove_input_file_out_{kglove_result_suffix}"

# RDFConverter call for FB15k, written as an argument list and joined with
# single spaces into the command line that is printed and later executed.
fb_converter_args = [kglove_executable, "graph", training_file_fb, "mode", kglove_mode]
fb_cmd_1 = " ".join(map(str, fb_converter_args))
print(fb_cmd_1)

/work-ceph/jportisc/code/kglove/kglove_jan/RDFConverter graph /work/jportisc/kbc_rdf2vec/strategy_grid_2/evaluation_2/nt_files/FB15k.nt mode 2


Now let's create the frequency files for FB15k:

In [8]:
# Run the RDFConverter command stored in `fb_cmd_1` through the shell.
# NOTE(review): the timestamps in the recorded log span more than 45 minutes,
# so expect this cell to be long-running.
!$fb_cmd_1

Using graph file: '/work/jportisc/kbc_rdf2vec/strategy_grid_2/evaluation_2/nt_files/FB15k.nt'
Using mode: 2
2021-08-23 06:51:12 copying graph
2021-08-23 06:51:12 copying graph - nodes copied
2021-08-23 06:51:12 graph copied
2021-08-23 06:51:12 After first fast phase, 131/14951 nodes are done, starting iterative phase
2021-08-23 06:51:12 10000/14951 done
2021-08-23 06:51:12 All done14951/14951 done
writing to glove_input_file_out_InvPred_alpha_0.10000000000000001_eps_1.0000000000000001e-05.bin
	and glove_vocab_file_out_InvPred_alpha_0.10000000000000001_eps_1.0000000000000001e-05.bin
2021-08-23 07:38:59 Processed 10000/14951 BCV computations


Move frequency files to separate directory:

In [18]:
# Directory that receives the RDFConverter result files for FB15k; it is
# created on demand by move_to_dir.
kglove_fb = "./kglove_fb"

from pathlib import Path

def move_to_dir(directory_path: str, file_path: str):
    """Move a file into a directory, creating the directory if necessary.

    Parameters
    ----------
    directory_path : str
        Target directory. Missing parent directories are created as well.
    file_path : str
        File to move. It may be a bare file name, a relative path or an
        absolute path; only its base name is used inside the target directory.

    Notes
    -----
    If ``file_path`` does not point to an existing regular file, nothing is
    moved (the directory is still created).
    """
    directory = Path(directory_path)
    # parents/exist_ok: nested targets work and re-running the cell is harmless.
    directory.mkdir(parents=True, exist_ok=True)
    file = Path(file_path)
    if file.is_file():
        # Join only the base name. Joining the full path would point at a
        # non-existent subdirectory for "a/b.bin", and for an absolute path it
        # would resolve to the source itself, so the file would never move.
        file.rename(directory / file.name)
    
# Relocate both RDFConverter result files (vocabulary first, then the input
# file) into the FB15k directory.
for result_file in (kglove_vocab_result_path, kglove_input_file_result_path):
    move_to_dir(directory_path=kglove_fb, file_path=result_file)

Now let's create the frequency files for WN18:

In [21]:
# RDFConverter call for WN18, written as an argument list and joined with
# single spaces into the command line that is printed and later executed.
wn_converter_args = [kglove_executable, "graph", training_file_wn, "mode", kglove_mode]
wn_cmd_1 = " ".join(map(str, wn_converter_args))
print(wn_cmd_1)

/work-ceph/jportisc/code/kglove/kglove_jan/RDFConverter graph /work/jportisc/kbc_rdf2vec/strategy_grid_2/evaluation_2/nt_files/WN18.nt mode 2


In [23]:
# Run the RDFConverter command stored in `wn_cmd_1` through the shell.
# NOTE(review): the timestamps in the recorded log span roughly nine minutes.
!$wn_cmd_1

Using graph file: '/work/jportisc/kbc_rdf2vec/strategy_grid_2/evaluation_2/nt_files/WN18.nt'
Using mode: 2
2021-08-24 08:48:08 copying graph
2021-08-24 08:48:08 copying graph - nodes copied
2021-08-24 08:48:08 graph copied
2021-08-24 08:48:08 After first fast phase, 214/40943 nodes are done, starting iterative phase
2021-08-24 08:48:08 10000/40943 done
2021-08-24 08:48:09 20000/40943 done
2021-08-24 08:48:09 30000/40943 done
2021-08-24 08:48:09 40000/40943 done
2021-08-24 08:48:09 All done40943/40943 done
writing to glove_input_file_out_InvPred_alpha_0.10000000000000001_eps_1.0000000000000001e-05.bin
	and glove_vocab_file_out_InvPred_alpha_0.10000000000000001_eps_1.0000000000000001e-05.bin
2021-08-24 08:54:06 Processed 10000/40943 BCV computations
2021-08-24 08:56:00 Processed 20000/40943 BCV computations
2021-08-24 08:56:38 Processed 30000/40943 BCV computations
2021-08-24 08:57:15 Processed 40000/40943 BCV computations


In [26]:
# Directory that collects the RDFConverter result files for WN18.
kglove_wn = "./kglove_wn"

# Relocate both result files (vocabulary first, then the input file).
for result_file in (kglove_vocab_result_path, kglove_input_file_result_path):
    move_to_dir(directory_path=kglove_wn, file_path=result_file)

Now let's actually train embeddings using GloVe:

In [40]:
from pathlib import Path

def get_input_file_path(result_path: str) -> str:
    """Return the absolute path of the GloVe input file inside ``result_path``.

    Parameters
    ----------
    result_path : str
        Directory to search (not recursive).

    Returns
    -------
    str
        Resolved path of the first entry, in sorted order, whose name contains
        ``"glove_input_file_out"``.

    Raises
    ------
    FileNotFoundError
        If the directory does not exist or holds no matching entry. Without
        this, a missing file would yield ``None`` and end up as the literal
        text "None" in the assembled GloVe command.
    """
    marker = "glove_input_file_out"
    # iterdir() yields entries in arbitrary order; sorting makes the choice
    # reproducible when several entries match.
    matches = [entry for entry in sorted(Path(result_path).iterdir()) if marker in entry.name]
    if not matches:
        raise FileNotFoundError(f"No file containing '{marker}' found in '{result_path}'.")
    return str(matches[0].resolve())
            
def get_vocab_file_path(result_path: str) -> str:
    """Return the absolute path of the GloVe vocabulary file inside ``result_path``.

    Parameters
    ----------
    result_path : str
        Directory to search (not recursive).

    Returns
    -------
    str
        Resolved path of the first entry, in sorted order, whose name contains
        ``"glove_vocab_file_out"``.

    Raises
    ------
    FileNotFoundError
        If the directory does not exist or holds no matching entry. Without
        this, a missing file would yield ``None`` and end up as the literal
        text "None" in the assembled GloVe command.
    """
    marker = "glove_vocab_file_out"
    # iterdir() yields entries in arbitrary order; sorting makes the choice
    # reproducible when several entries match.
    matches = [entry for entry in sorted(Path(result_path).iterdir()) if marker in entry.name]
    if not matches:
        raise FileNotFoundError(f"No file containing '{marker}' found in '{result_path}'.")
    return str(matches[0].resolve())

In [72]:
# Locate the FB15k frequency files and choose where GloVe stores its vectors.
fb_input_file = get_input_file_path(result_path=kglove_fb)
fb_vocab_file = get_vocab_file_path(result_path=kglove_fb)
fb_save_file = str((Path(kglove_fb) / "vectors").resolve())

# GloVe call as an argument list (flag followed by its value), joined with
# single spaces into the command line that is printed and later executed.
fb_glove_args = [
    glove_executable,
    "-input-file", fb_input_file,
    "-vocab-file", fb_vocab_file,
    "-save-file", fb_save_file,
    "-iter", 20,
    "-threads", threads,
    "-vector-size", vector_dimension,
    "-binary", 2,
]
fb_cmd_2 = " ".join(map(str, fb_glove_args))
print(fb_cmd_2)

/work-ceph/jportisc/code/kglove/glove/GloVe-1.2/build/glove -input-file /home/jportisc/jupyter_notebooks/KBC_Journal/kglove_fb/glove_input_file_out_InvPred_alpha_0.10000000000000001_eps_1.0000000000000001e-05.bin -vocab-file /home/jportisc/jupyter_notebooks/KBC_Journal/kglove_fb/glove_vocab_file_out_InvPred_alpha_0.10000000000000001_eps_1.0000000000000001e-05.bin -save-file /home/jportisc/jupyter_notebooks/KBC_Journal/kglove_fb/vectors -iter 20 -threads 10 -vector-size 200 -binary 2


In [68]:
# Run the GloVe training command stored in `fb_cmd_2` through the shell.
!$fb_cmd_2

TRAINING MODEL
Read 126017904 lines.
Initializing parameters...done.
vector size: 200
vocab size: 16296
x_max: 100.000000
alpha: 0.750000
iter: 001, cost: 0.000493
iter: 002, cost: 0.000432
iter: 003, cost: 0.000383
iter: 004, cost: 0.000345
iter: 005, cost: 0.000314
iter: 006, cost: 0.000288
iter: 007, cost: 0.000267
iter: 008, cost: 0.000249
iter: 009, cost: 0.000235
iter: 010, cost: 0.000222
iter: 011, cost: 0.000211
iter: 012, cost: 0.000202
iter: 013, cost: 0.000194
iter: 014, cost: 0.000187
iter: 015, cost: 0.000181
iter: 016, cost: 0.000176
iter: 017, cost: 0.000171
iter: 018, cost: 0.000167
iter: 019, cost: 0.000163
iter: 020, cost: 0.000159


In [73]:
# Locate the WN18 frequency files and choose where GloVe stores its vectors.
wn_input_file = get_input_file_path(result_path=kglove_wn)
wn_vocab_file = get_vocab_file_path(result_path=kglove_wn)
wn_save_file = str((Path(kglove_wn) / "vectors").resolve())

# GloVe call as an argument list (flag followed by its value), joined with
# single spaces into the command line that is printed and later executed.
wn_glove_args = [
    glove_executable,
    "-input-file", wn_input_file,
    "-vocab-file", wn_vocab_file,
    "-save-file", wn_save_file,
    "-iter", 20,
    "-threads", threads,
    "-vector-size", vector_dimension,
    "-binary", 2,
]
wn_cmd_2 = " ".join(map(str, wn_glove_args))
print(wn_cmd_2)

/work-ceph/jportisc/code/kglove/glove/GloVe-1.2/build/glove -input-file /home/jportisc/jupyter_notebooks/KBC_Journal/kglove_wn/glove_input_file_out_InvPred_alpha_0.10000000000000001_eps_1.0000000000000001e-05.bin -vocab-file /home/jportisc/jupyter_notebooks/KBC_Journal/kglove_wn/glove_vocab_file_out_InvPred_alpha_0.10000000000000001_eps_1.0000000000000001e-05.bin -save-file /home/jportisc/jupyter_notebooks/KBC_Journal/kglove_wn/vectors -iter 20 -threads 10 -vector-size 200 -binary 2


In [74]:
# Run the GloVe training command stored in `wn_cmd_2` through the shell.
!$wn_cmd_2

TRAINING MODEL
Read 256884810 lines.
Initializing parameters...done.
vector size: 200
vocab size: 40961
x_max: 100.000000
alpha: 0.750000
iter: 001, cost: 0.000174
iter: 002, cost: 0.000170
iter: 003, cost: 0.000169
iter: 004, cost: 0.000168
iter: 005, cost: 0.000167
iter: 006, cost: 0.000166
iter: 007, cost: 0.000165
iter: 008, cost: 0.000164
iter: 009, cost: 0.000163
iter: 010, cost: 0.000162
iter: 011, cost: 0.000161
iter: 012, cost: 0.000160
iter: 013, cost: 0.000159
iter: 014, cost: 0.000158
iter: 015, cost: 0.000157
iter: 016, cost: 0.000156
iter: 017, cost: 0.000155
iter: 018, cost: 0.000154
iter: 019, cost: 0.000153
iter: 020, cost: 0.000153
