# Creating Appropriate Dictionary Files from ECAD Data

In this document I will convert Joe's ECAD files into the appropriate `*2id.txt` files.

These are:

1. entity2id.txt
2. relation2id.txt
3. train2id.txt

Each of these is required by the OpenKE library to function properly

Furthermore, we must convert the embeddings from the initial training phase into an embedding file consumable by OpenKE



In [None]:
import json

In [1]:
# ---- Training inputs ----
# Raw training graph (one tab-separated triple per line).
ecad_train_graph_path = "/Users/joelcarl/Downloads/pybiggraph-20190516-230034.tsv"
# Embeddings produced by the initial training phase (where they are now).
ecad_train_embeddings_path = "/Users/joelcarl/Downloads/ecad_gensim_embeddings.txt"

# ---- Training outputs ----
train2id_path = "/Users/joelcarl/Desktop/train2id.txt"
entity2id_path = "/Users/joelcarl/Desktop/entity2id.txt"
relation2id_path = "/Users/joelcarl/Desktop/relation2id.txt"
# Where the OpenKE compatible embeddings should go.
ecad_openKE_embeddings_path = "/Users/joelcarl/Desktop/ecad_gensim_embeddings.json"

# ---- Test inputs / outputs ----
ecad_test_graph_path = "/Users/joelcarl/Downloads/pybiggraph-20190517-080024.tsv"
test_data_path = "/Users/joelcarl/Desktop/test/"

# The test-side dictionary files reuse the training file names, but live
# under test_data_path.
train2id_test_path, entity2id_test_path, relation2id_test_path = (
    test_data_path + file_name
    for file_name in ("train2id.txt", "entity2id.txt", "relation2id.txt")
)




# Step 1. Convert train graph items into ids

The files are organized as text files, where each line is a tab-separated triple: `entity\trelationship\tentity`.

We will make one pass over the file to create two dictionaries:

1. An index of entity to monotonically increasing id
2. An index of relationship to monotonically increasing id

At the same time, we will write a new file, train2id.txt, on the fly. Note that each line written to train2id.txt is ordered `entity_1_id\tentity_2_id\trelationship_id`, i.e. the relationship moves to the last column.

### Write training graph with indices instead of names

In [None]:
# Build the entity and relation vocabularies from the training graph while
# streaming the re-indexed triples out to train2id.txt.
entity2id_dict = {}    # entity name -> integer id, assigned in first-seen order
relation2id_dict = {}  # relation name -> integer id, assigned in first-seen order
ent_id = 0  # next unused entity id
rel_id = 0  # next unused relation id

# The triple count has to be the first line of the output, so the input is
# counted up front. The context manager closes the handle once counting is
# done instead of leaving it to the garbage collector.
with open(ecad_train_graph_path) as f:
    num_train_graph_lines = sum(1 for _ in f)

with open(ecad_train_graph_path) as f, open(train2id_path, "w") as train2id_f:
    # By convention, the count of the num lines must be at the top of the file
    train2id_f.write(str(num_train_graph_lines) + "\n")
    for line in f:
        # Input column order is entity, relation, entity.
        ent_1, rel, ent_2 = line.strip("\n").split("\t")

        # Give every unseen entity / relation the next free id.
        for ent in (ent_1, ent_2):
            if ent not in entity2id_dict:
                entity2id_dict[ent] = ent_id
                ent_id += 1
        if rel not in relation2id_dict:
            relation2id_dict[rel] = rel_id
            rel_id += 1

        # Output column order is entity_1 id, entity_2 id, relation id.
        triple_ids = (entity2id_dict[ent_1], entity2id_dict[ent_2], relation2id_dict[rel])
        train2id_f.write("\t".join(str(item) for item in triple_ids) + "\n")
    

### Write relation2id and entity2id

In [None]:
# Write relation2id: a header line holding the number of relations, followed
# by one "relation<TAB>id" row per relation in ascending id order.
with open(relation2id_path, "w") as relation2id_f:
    sorted_rel = sorted(relation2id_dict.items(), key=lambda pair: pair[1])
    relation2id_f.write(str(len(sorted_rel)) + "\n")
    relation2id_f.writelines(
        name + "\t" + str(rel_idx) + "\n" for name, rel_idx in sorted_rel
    )


In [None]:
# Write entity2id: a header line holding the number of entities, followed by
# one "entity<TAB>id" row per entity in ascending id order.
with open(entity2id_path, "w") as entity2id_f:
    sorted_ent = sorted(entity2id_dict.items(), key=lambda pair: pair[1])
    entity2id_f.write(str(len(sorted_ent)) + "\n")
    entity2id_f.writelines(
        name + "\t" + str(ent_idx) + "\n" for name, ent_idx in sorted_ent
    )


### Convert Embeddings into OpenKE Format

We must convert the embeddings file into the format:

`{"ent_embeddings": [[..., ..., ..., ], ], "rel_embeddings": [[..., ..., ..., ], ]}`

The rows of each list must follow the same order as the entity and relation dictionaries, i.e. ascending id.

In [None]:
# Parse the space-separated embeddings file into {name: [float, ...]}.
train_embedding_dict = {}
with open(ecad_train_embeddings_path, "r") as f:
    next(f)  # the first line is a header, not an embedding row
    for raw_row in f:
        fields = raw_row.strip("\n").split(" ")
        # Column 0 names the entity or relation; the rest are vector components.
        train_embedding_dict[fields[0]] = list(map(float, fields[1:]))


In [None]:
# Assemble the OpenKE-style payload. Each list follows the id-sorted order of
# sorted_ent / sorted_rel, so a row's position equals its id in the
# corresponding *2id.txt file.
embedding_dict = {
    "ent_embeddings": [train_embedding_dict[name] for name, _ in sorted_ent],
    "rel_embeddings": [train_embedding_dict[name] for name, _ in sorted_rel],
}


In [None]:
# Serialize the assembled embeddings to the OpenKE-compatible JSON file.
with open(ecad_openKE_embeddings_path, 'w') as json_out:
    json.dump(embedding_dict, json_out)

# Step 2. Convert Test Graph items into Appropriate Files

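For the test graph, we only want to write triples that are **NOT** covered by the train graph, that is, triples containing at least one entity or relationship that is missing from the training dictionaries. Those new entities and relationships are added to the dictionaries as well. However, we must be careful to maintain the appropriate indices: the new entity and relationship ids must continue from the training indices.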


# Note: You can restart the kernel here; just make sure to re-run the top cell to get the paths

In [None]:
# Rebuild the entity -> id mapping from the entity2id.txt written in Step 1.
entity2id_dict = {}
with open(entity2id_path, "r") as entity2id_f:
    next(entity2id_f)  # first line is the entry count, not a mapping row
    entity2id_dict.update(
        (name, int(raw_id))
        for name, raw_id in (row.strip("\n").split("\t") for row in entity2id_f)
    )
     

In [None]:
# Rebuild the relation -> id mapping from the relation2id.txt written in Step 1.
relation2id_dict = {}
with open(relation2id_path, "r") as relation2id_f:
    next(relation2id_f)  # first line is the entry count, not a mapping row
    relation2id_dict.update(
        (name, int(raw_id))
        for name, raw_id in (row.strip("\n").split("\t") for row in relation2id_f)
    )

In [None]:
# Get starting indices
#
# The header line of each training dictionary file holds its entry count.
# That count is used as the first id handed out to anything new in the test
# graph. The train2id.txt count is not needed for this step.

with open(relation2id_path, "r") as relation2id_f:
    rel_id = int(relation2id_f.readline().rstrip("\n"))

with open(entity2id_path, "r") as entity2id_f:
    ent_id = int(entity2id_f.readline().rstrip("\n"))

print("relation2id start index:\t" + str(rel_id))
print("entity2id start index:\t\t" + str(ent_id))

In [None]:
# Re-index the test graph, keeping only the triples that introduce at least
# one entity or relation missing from the training dictionaries. New names
# receive ids continuing from ent_id / rel_id.
num_test_graph_lines = 0  # total number of lines read from the test graph
new_triple_rows = []      # formatted rows that will actually be written

with open(ecad_test_graph_path) as f:
    for line in f:
        num_test_graph_lines += 1
        write_to_file = False
        # Input column order is entity, relation, entity.
        ent_1, rel, ent_2 = line.strip("\n").split("\t")

        for ent in (ent_1, ent_2):
            if ent not in entity2id_dict:
                entity2id_dict[ent] = ent_id
                ent_id += 1
                write_to_file = True
        if rel not in relation2id_dict:
            relation2id_dict[rel] = rel_id
            rel_id += 1
            write_to_file = True

        # Keep the triple **only** if one of the entities or rel are not in
        # the training file. Output column order is entity_1 id, entity_2 id,
        # relation id.
        if write_to_file:
            triple_ids = (entity2id_dict[ent_1], entity2id_dict[ent_2], relation2id_dict[rel])
            new_triple_rows.append("\t".join(str(item) for item in triple_ids) + "\n")

with open(train2id_test_path, "w") as train2id_f:
    # By convention the first line is the number of triples in the file. It
    # must be the number of rows kept, not the size of the unfiltered test
    # graph, otherwise the header disagrees with the rows that follow.
    train2id_f.write(str(len(new_triple_rows)) + "\n")
    train2id_f.writelines(new_triple_rows)
    

In [None]:
# Write the combined (train + test) relation2id file: a header line holding
# the number of relations, then one "relation<TAB>id" row per relation in
# ascending id order.
with open(relation2id_test_path, "w") as relation2id_f:
    sorted_rel = sorted(relation2id_dict.items(), key=lambda pair: pair[1])
    relation2id_f.write(str(len(sorted_rel)) + "\n")
    relation2id_f.writelines(
        name + "\t" + str(rel_idx) + "\n" for name, rel_idx in sorted_rel
    )

In [None]:
# Write the combined (train + test) entity2id file: a header line holding the
# number of entities, then one "entity<TAB>id" row per entity in ascending id
# order.
with open(entity2id_test_path, "w") as entity2id_f:
    sorted_ent = sorted(entity2id_dict.items(), key=lambda pair: pair[1])
    entity2id_f.write(str(len(sorted_ent)) + "\n")
    entity2id_f.writelines(
        name + "\t" + str(ent_idx) + "\n" for name, ent_idx in sorted_ent
    )

# run training?

In [5]:
# Output locations for the run on the test graph.
# The earlier directory-only assignment to test_file_path was overwritten
# before it was ever read, so only the effective value is kept.
test_file_path = "/Users/joelcarl/Desktop/test/model_new.vec.tf"  # given to con.set_export_files
test_embedding_path = "/Users/joelcarl/Desktop/test/embedding_new.vec.json"  # given to con.set_out_files

In [6]:
import config
import models
import tensorflow as tf
import numpy as np
import os


In [9]:
# Set CUDA_VISIBLE_DEVICES to '7' for this process.
# NOTE(review): presumably this pins the run to GPU index 7 -- confirm that
# device exists on the machine this is run on.
os.environ['CUDA_VISIBLE_DEVICES']='7'
# Create the configuration object that the rest of this cell populates.
con = config.Config()
# Use test_data_path as the input directory, i.e. the folder that received
# the test-graph train2id.txt, entity2id.txt and relation2id.txt in Step 2.
con.set_in_path(test_data_path)
# Both evaluation modes are switched off for this run.
con.set_test_link_prediction(False)
con.set_test_triple_classification(False)
con.set_work_threads(8)
con.set_train_times(10)  # matches the 10 epochs (0-9) logged by con.run()
con.set_nbatches(6) # Total obs = 18. Need ~3 per batch, so batch size = total / n_batches
con.set_alpha(0.001)  # NOTE(review): presumably the learning rate -- confirm
con.set_margin(1.0)  # NOTE(review): presumably the margin of the ranking loss -- confirm
# NOTE(review): the next lines look like negative-sampling settings (sampling
# strategy, negatives per entity / per relation) -- confirm against Config.
con.set_bern(0)
con.set_dimension(400) # dimension of embedding
con.set_ent_neg_rate(1)
con.set_rel_neg_rate(0)
con.set_opt_method("SGD")


# Hand the config the embeddings JSON written in Step 1 and turn on the
# freeze flag.
# NOTE(review): judging by the setter names, the training-graph embeddings are
# loaded from that file and held fixed while only new entries are trained --
# confirm against the Config implementation.
con.set_freeze_train_embeddings(True)
con.set_embedding_initializer_path(ecad_openKE_embeddings_path)


In [10]:
# Register test_file_path as the model export location (per the original
# note, models are exported via tf.Saver() automatically).
# NOTE(review): the meaning of the second argument (0) is not visible in this
# file -- confirm against Config.set_export_files.
con.set_export_files(test_file_path, 0)
# Register test_embedding_path as the JSON file that receives the model
# parameters.
con.set_out_files(test_embedding_path)
# Initialize experimental settings. This runs after all setters above and
# before the model is chosen.
con.init()
# Set the knowledge embedding model.
# NOTE(review): TransE_freeze comes from the project-local `models` module;
# presumably a TransE variant that honours the freeze flag -- confirm.
con.set_model(models.TransE_freeze)



New entities found:
-- Total Entities in embedding file: 20
-- Total Entities in data: 5000 
Instructions for updating:
Colocations handled automatically by placer.


Instructions for updating:
Colocations handled automatically by placer.


Instructions for updating:
keep_dims is deprecated, use keepdims instead


Instructions for updating:
keep_dims is deprecated, use keepdims instead


Instructions for updating:
Use tf.cast instead.


Instructions for updating:
Use tf.cast instead.


Instructions for updating:
Use `tf.global_variables_initializer` instead.


Instructions for updating:
Use `tf.global_variables_initializer` instead.


In [11]:
# Train the model with the configuration assembled in the cells above; the
# captured output shows one loss line per epoch.
con.run()

Epoch: 0, loss: 811.2923736572266, time: 0.0
Epoch: 1, loss: 691.4817504882812, time: 0.0
Epoch: 2, loss: 691.9060363769531, time: 0.0
Epoch: 3, loss: 769.8413848876953, time: 0.0
Epoch: 4, loss: 721.5551605224609, time: 7.152557373046875e-07
Epoch: 5, loss: 781.9990234375, time: 9.5367431640625e-07
Epoch: 6, loss: 744.8305206298828, time: 0.0
Epoch: 7, loss: 764.2288513183594, time: 0.0
Epoch: 8, loss: 606.1986999511719, time: 0.0
Epoch: 9, loss: 435.43772888183594, time: 0.0
