In [1]:
import json
import os

import numpy as np
import tensorflow as tf

import config
import models


In [2]:
# Baseline TransE run: benchmark folder read via set_in_path, exported model
# file, and exported embedding JSON (later reused as the freeze run's
# embedding initializer).
train_data_path, train_file_path, train_embedding_path = (
    "./benchmarks/FB15K/",
    "./res/model.vec.tf",
    "./res/embedding.vec.json",
)

# TransE_freeze run: the same three roles for the second run.
test_data_path, test_file_path, test_embedding_path = (
    "./benchmarks/FB15K_OOV/",
    "./res/model_new.vec.tf",
    "./res/embedding_new.vec.json",
)

# Run TransE to create the initial embeddings

In [None]:
"""
Method:

Run the normal TransE example (example_train_transe.py).
Write the embeddings to a file that can be read back.
Use those embeddings to initialize the embedding layer in TransE_freeze.py.

Append random embeddings for the new entities and relations.
Set the config option freeze_train_embeddings = True.
TODO: figure out how to update only the embeddings for a certain set of indices.
TODO: figure out how to make sure that we only see examples using new items, to speed convergence.
Compare the new and old embeddings.
"""
# Restrict CUDA to device index 7 for this run.
os.environ['CUDA_VISIBLE_DEVICES'] = '7'

con = config.Config()

# Each (setter name, argument) pair is applied to `con` in the order listed.
baseline_settings = (
    # Data location and evaluation switches. Per the original note, True means
    # the test files are input from the same folder.
    ("set_in_path", train_data_path),
    ("set_test_link_prediction", True),
    ("set_test_triple_classification", True),
    # Run size.
    ("set_work_threads", 8),
    ("set_train_times", 10),
    ("set_nbatches", 20),
    # Model / optimiser hyper-parameters (the freeze run uses the same values).
    ("set_alpha", 0.001),
    ("set_margin", 1.0),
    ("set_bern", 0),
    ("set_dimension", 100),
    ("set_ent_neg_rate", 1),
    ("set_rel_neg_rate", 0),
    ("set_opt_method", "SGD"),
)
for setter_name, setting_value in baseline_settings:
    getattr(con, setter_name)(setting_value)

In [None]:
# Register the output locations for the baseline run, then initialise it.
# Models will be exported via tf.Saver() automatically, to train_file_path.
# NOTE(review): the second argument (0) is presumably an export interval in
# steps — confirm against config.Config.set_export_files.
con.set_export_files(train_file_path, 0)
# Model parameters will be exported to json files automatically. This file
# (train_embedding_path) is the one the freeze run later passes to
# set_embedding_initializer_path.
con.set_out_files(train_embedding_path)
# Initialize experimental settings.
con.init()

In [None]:

# Set the knowledge embedding model: plain TransE for the baseline run.
con.set_model(models.TransE)


In [None]:
# Train the baseline model with the settings configured above.
con.run()

# Run TransE_freeze using the embeddings produced in the step above

In [3]:
# Restrict CUDA to device index 7 for this run.
os.environ['CUDA_VISIBLE_DEVICES'] = '7'

con = config.Config()

# Each (setter name, argument) pair is applied to `con` in the order listed.
freeze_settings = (
    # Training files come from the test_data_path folder; both evaluation
    # switches are off for this run.
    ("set_in_path", test_data_path),
    ("set_test_link_prediction", False),
    ("set_test_triple_classification", False),
    # Run size. Total obs = 18 and we need ~3 per batch
    # (batch size = total / n_batches), hence 6 batches.
    ("set_work_threads", 8),
    ("set_train_times", 10),
    ("set_nbatches", 6),
    # Model / optimiser hyper-parameters (same values as the baseline run).
    ("set_alpha", 0.001),
    ("set_margin", 1.0),
    ("set_bern", 0),
    ("set_dimension", 100),
    ("set_ent_neg_rate", 1),
    ("set_rel_neg_rate", 0),
    ("set_opt_method", "SGD"),
    # Freeze the embeddings from the baseline run and initialise from its
    # exported JSON file.
    ("set_freeze_train_embeddings", True),
    ("set_embedding_initializer_path", train_embedding_path),
)
for setter_name, setting_value in freeze_settings:
    getattr(con, setter_name)(setting_value)


In [4]:
# Register the output locations for the freeze run, initialise it, and pick
# the model.
# Models will be exported via tf.Saver() automatically, to test_file_path.
# NOTE(review): the second argument (0) is presumably an export interval in
# steps — confirm against config.Config.set_export_files.
con.set_export_files(test_file_path, 0)
# Model parameters will be exported to json files automatically; the
# comparison cells further down read this file (test_embedding_path).
con.set_out_files(test_embedding_path)
# Initialize experimental settings.
con.init()
# Set the knowledge embedding model: the TransE_freeze variant.
con.set_model(models.TransE_freeze)



New entities found:
-- Total Entities in embedding file: 14951
-- Total Entities in data: 14970 




New relationships found:
-- Total Relationships in embedding file: 1345
-- Total Relationships in data: 1354 

For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
If you depend on functionality not listed there, please file an issue.

Instructions for updating:
Colocations handled automatically by placer.


Instructions for updating:
Colocations handled automatically by placer.


Instructions for updating:
keep_dims is deprecated, use keepdims instead


Instructions for updating:
keep_dims is deprecated, use keepdims instead


Instructions for updating:
Use tf.cast instead.


Instructions for updating:
Use tf.cast instead.


Instructions for updating:
Use `tf.global_variables_initializer` instead.


Instructions for updating:
Use `tf.global_variables_initializer` instead.


In [7]:
# Train the TransE_freeze model with the settings configured above.
con.run()

Epoch: 0, loss: 3.031768798828125, time: 0.0
Epoch: 1, loss: 2.1548867225646973, time: 1.1920928955078125e-06
Epoch: 2, loss: 3.203535556793213, time: 0.0
Epoch: 3, loss: 2.2079691886901855, time: 0.0
Epoch: 4, loss: 1.8855860233306885, time: 0.0
Epoch: 5, loss: 2.1341300010681152, time: 9.5367431640625e-07
Epoch: 6, loss: 0.8497586250305176, time: 0.0
Epoch: 7, loss: 2.463907241821289, time: 0.0
Epoch: 8, loss: 2.1021907329559326, time: 0.0
Epoch: 9, loss: 2.0507194995880127, time: 9.5367431640625e-07


# Compare new and old embeddings

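- Any entity or relationship that already existed in the original training data should have exactly the same embedding in the old and new files.
- Any entity or relationship that already existed in the original training data and also appears in the new (test) data should still have exactly the same embedding.
- Any entity or relationship that did not exist in the original training data should have an embedding only in the new file (it has no counterpart in the old one).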


In [None]:
# Load the embeddings exported by the baseline TransE run. The path comes from
# the configuration cell, so this cell always reads the file that run was
# configured to write.
with open(train_embedding_path, "r") as f:
    old_embeddings = json.load(f)

# Entity and relationship embedding tables, indexed by integer id below.
old_ent_embeddings = old_embeddings["ent_embeddings"]
old_rel_embeddings = old_embeddings["rel_embeddings"]


In [None]:
# Load the embeddings exported by the TransE_freeze run. The path comes from
# the configuration cell, so this cell always reads the file that run was
# configured to write.
with open(test_embedding_path, "r") as f:
    new_embeddings = json.load(f)

# Entity and relationship embedding tables, indexed by integer id below.
new_ent_embeddings = new_embeddings["ent_embeddings"]
new_rel_embeddings = new_embeddings["rel_embeddings"]

### Compare Entities

In [None]:
# Every entity id up to and including 14950 should have an unchanged embedding.
# The original note lists 1234, 1018 and 4169 as example ids "in train2id".
# NOTE(review): presumably these are ids that also occur in the new data's
# train2id file — confirm. Id 1234 and its two neighbours are checked here;
# each line prints True when the old and new vectors are identical.
for entity_id in (1233, 1234, 1235):
    print(old_ent_embeddings[entity_id] == new_ent_embeddings[entity_id])

In [None]:
# Entities that are new should have embeddings in the new data, but not in the old
try:
    old_entity_vector = old_ent_embeddings[14951]
except IndexError:
    # Expected path: the old table has no entry at this index.
    print("No embedding for entity 14951 in train embeddings")
else:
    print(old_entity_vector)

# The new table should have an entry; show its first ten components.
print("Embedding for entity 14951 in new embeddings:")
print(new_ent_embeddings[14951][:10])

### Compare Relationships

In [None]:
# Every relationship id up to and including 1344 should have an unchanged
# embedding. The original note lists 58, 135 and 38 as example ids
# "in train2id".
# NOTE(review): presumably these are ids that also occur in the new data's
# train2id file — confirm. Id 58 and its two neighbours are checked here;
# each line prints True when the old and new vectors are identical.
for relation_id in (57, 58, 59):
    print(old_rel_embeddings[relation_id] == new_rel_embeddings[relation_id])

In [None]:
# A new relationship should have an embedding in the new data but not in the
# old: index 1345 is looked up in both tables.
try:
    old_relation_vector = old_rel_embeddings[1345]
except IndexError:
    # Expected path: the old table has no entry at this index.
    print("No relationship embedding for 1345 in train embeddings")
else:
    print(old_relation_vector)

# The new table should have an entry; show its first ten components.
print("Embedding for relationship 1345 in new embeddings:")
print(new_rel_embeddings[1345][:10])

### Lengths should be as expected

In [None]:
# Table sizes, printed in this order: old entities, new entities,
# old relationships, new relationships.
for embedding_table in (
    old_ent_embeddings,
    new_ent_embeddings,
    old_rel_embeddings,
    new_rel_embeddings,
):
    print(len(embedding_table))

In [None]:
# Show the first five components of the relationship embedding at index 1353
# (presumably the last one, given the 1354 relationships reported for the new data).
new_rel_embeddings[1353][:5]