# Creating Appropriate Dictionary Files from ECAD Data

In this document I will convert Joe's ECAD files into the appropriate `*2id.txt` files.

These are:

1. entity2id.txt
2. relation2id.txt
3. train2id.txt

Each of these is required by the OpenKE library to function properly

Furthermore, we must convert the embeddings from the initial training phase into an embedding file consumable by OpenKE



In [21]:
import json

In [22]:
# ---------------------------------------------------------------------------
# File locations used by every cell below, grouped by role.
# ---------------------------------------------------------------------------

# Inputs: raw graph exports (one per split) and the embeddings as they exist now.
ecad_train_graph_path = "/Users/joelcarl/Downloads/pybiggraph-20190516-230034.tsv"
ecad_test_graph_path = "/Users/joelcarl/Downloads/pybiggraph-20190517-080024.tsv"
ecad_train_embeddings_path = "/Users/joelcarl/Downloads/ecad_gensim_embeddings.txt"

# Outputs for the TRAIN split.
entity2id_path = "/Users/joelcarl/Desktop/entity2id.txt"
relation2id_path = "/Users/joelcarl/Desktop/relation2id.txt"
train2id_path = "/Users/joelcarl/Desktop/train2id.txt"

# Destination of the OpenKE compatible embeddings.
ecad_openKE_embeddings_path = "/Users/joelcarl/Desktop/ecad_gensim_embeddings.json"

# Outputs for the TEST split (same file names, written under test/).
entity2id_test_path = "/Users/joelcarl/Desktop/test/entity2id.txt"
relation2id_test_path = "/Users/joelcarl/Desktop/test/relation2id.txt"
train2id_test_path = "/Users/joelcarl/Desktop/test/train2id.txt"

# Step 1. Convert train graph items into ids

The files are organized as text files, where each line is a tab separated `entity\trelationship\tentity`

What we will do is go over the file to create two dictionaries:

1. An index of entity to monotonically increasing id
2. An index of relationship to monotonically increasing id

At the same time, we will write a new file, train2id.txt, on the fly.

### Write training graph with indices instead of names

In [18]:
# Build the entity / relation id dictionaries from the training graph and
# write the graph out again with ids in place of names.
entity2id_dict = {}
relation2id_dict = {}
ent_id = 0
rel_id = 0

# First pass: count the triples. Blank lines are ignored so the header count
# matches the number of triples actually written below. The file is opened in
# a `with` block so the handle is closed instead of being leaked.
with open(ecad_train_graph_path) as f:
    num_train_graph_lines = sum(1 for line in f if line.strip())

# Second pass: assign ids and write the id triples.
with open(ecad_train_graph_path) as f, open(train2id_path, "w") as train2id_f:
    # By convention, the count of the num lines must be at the top of the file
    train2id_f.write(str(num_train_graph_lines) + "\n")
    for line in f:
        # A blank (e.g. trailing) line would otherwise fail the 3-way unpack.
        if not line.strip():
            continue

        # Input column order is entity, relationship, entity.
        ent_1, rel, ent_2 = line.strip("\n").split("\t")

        # Ids are handed out in first-seen order, ent_1 before ent_2.
        for ent in (ent_1, ent_2):
            if ent not in entity2id_dict:
                entity2id_dict[ent] = ent_id
                ent_id += 1
        if rel not in relation2id_dict:
            relation2id_dict[rel] = rel_id
            rel_id += 1

        # Output column order is entity, entity, relation (relation goes last).
        id_triple = (entity2id_dict[ent_1], entity2id_dict[ent_2], relation2id_dict[rel])
        train2id_f.write("\t".join(str(item) for item in id_triple) + "\n")
    

### Write relation2id and entity2id

In [19]:
# Dump the relation dictionary: a count header, then one "name<TAB>id" row per
# relation in ascending id order.
with open(relation2id_path, "w") as relation2id_f:
    sorted_rel = sorted(relation2id_dict.items(), key=lambda pair: pair[1])
    header = str(len(sorted_rel)) + "\n"
    rows = (name + "\t" + str(num) + "\n" for name, num in sorted_rel)
    relation2id_f.write(header)
    relation2id_f.writelines(rows)


In [20]:
# Dump the entity dictionary: a count header, then one "name<TAB>id" row per
# entity in ascending id order.
with open(entity2id_path, "w") as entity2id_f:
    sorted_ent = sorted(entity2id_dict.items(), key=lambda pair: pair[1])
    header = str(len(sorted_ent)) + "\n"
    rows = (name + "\t" + str(num) + "\n" for name, num in sorted_ent)
    entity2id_f.write(header)
    entity2id_f.writelines(rows)


### Convert Embeddings into OpenKE Format

We must convert the embeddings file into the format:

`{"ent_embeddings": [[..., ..., ..., ], ], "rel_embeddings": [[]]}`

The embedding rows must follow the id order of the relation and entity dictionaries.

In [None]:
# Read an OpenKE-format embeddings JSON file and record how many entity
# vectors it contains.
#
# NOTE(review): the original cell used an undefined `embedding_path` and
# assigned to `self.ent_embedding_length`, which cannot work outside a class.
# It is assumed here that the file to read is the OpenKE compatible output
# declared in the paths cell - confirm.
embedding_path = ecad_openKE_embeddings_path

# Keep the try body to the one call that can raise FileNotFoundError.
try:
    embs = open(embedding_path, 'r')
except FileNotFoundError as err:
    raise Exception('Entity embedding file not found: {}'.format(embedding_path)) from err

# Parse inside a `with` so the handle is closed once the JSON is loaded.
with embs:
    embedding_dict = json.load(embs)

ent_embedding = embedding_dict["ent_embeddings"]
# Length of the outer "ent_embeddings" list.
ent_embedding_length = len(ent_embedding)

In [None]:
# Parse the text embeddings file into {name: [float, ...]}.
# NOTE(review): presumably one "name v1 v2 ..." row per line after a single
# header line (the file name suggests a gensim export) - confirm.
train_embedding_dict = {}
with open(ecad_train_embeddings_path, "r") as f:
    next(f)  # Skip header
    for line in f:
        # str.split() with no argument splits on runs of whitespace and drops
        # the trailing newline. The previous split("\s") is not a regex: it
        # only split on a literal backslash followed by "s", so every vector
        # came out empty.
        l_splt = line.split()
        if not l_splt:
            continue  # ignore blank lines
        # Store numbers rather than strings: the target JSON holds lists of
        # floats.
        train_embedding_dict[l_splt[0]] = [float(value) for value in l_splt[1:]]


In [None]:
{"ent_embeddings": [[0.042046695947647095, 0.00392%

# Step 2. Convert Test Graph items into Appropriate Files

For the test graph, we only want to write instances that are **NOT** in the train graph; the same goes for entities and relationships. However, we must be careful to maintain the appropriate indices: the new entity and relationship ids must continue from the training indices.


# Note: You can restart the kernel here, just make sure to run the top cell to get paths

In [11]:
# Rebuild the entity -> id mapping from the file written for the train split.
entity2id_dict = {}
with open(entity2id_path, "r") as entity2id_f:
    next(entity2id_f)  # first line is the count header, not a mapping row
    parsed_rows = (raw.strip("\n").split("\t") for raw in entity2id_f)
    entity2id_dict.update((name, int(num)) for name, num in parsed_rows)
     

In [12]:
# Rebuild the relation -> id mapping from the file written for the train split.
relation2id_dict = {}
with open(relation2id_path, "r") as relation2id_f:
    next(relation2id_f)  # first line is the count header, not a mapping row
    parsed_rows = (raw.strip("\n").split("\t") for raw in relation2id_f)
    relation2id_dict.update((name, int(num)) for name, num in parsed_rows)

In [13]:
# Starting ids for the test pass.
# The first line of each *2id file is its entry count; that count is used as
# the first id to hand out to anything new in the test graph.

# with open(train2id_path, "r") as train2id_f:  
#     train2id_idx = int(train2id_f.readline().strip("\n"))

with open(relation2id_path, "r") as relation2id_f:
    rel_header = relation2id_f.readline()
rel_id = int(rel_header.strip("\n"))

with open(entity2id_path, "r") as entity2id_f:
    ent_header = entity2id_f.readline()
ent_id = int(ent_header.strip("\n"))

# print("train2id start index:\t\t"+str(train2id_idx))
for label, start in (
    ("relation2id start index:\t", rel_id),
    ("entity2id start index:\t\t", ent_id),
):
    print(label + str(start))

relation2id start index:	3
entity2id start index:		4580


In [14]:
# Extend the training dictionaries with the test graph and write the test
# triples that introduce something new.
#
# The rows are buffered first so the header can state the number of triples
# that are really written. Previously the header carried the total number of
# input lines even though only a subset of them was written.
new_triple_rows = []
num_test_graph_lines = 0  # non-blank triples read from the test graph

with open(ecad_test_graph_path) as f:
    for line in f:
        # A blank (e.g. trailing) line would otherwise fail the 3-way unpack.
        if not line.strip():
            continue
        num_test_graph_lines += 1

        write_to_file = False
        # Input column order is entity, relationship, entity.
        ent_1, rel, ent_2 = line.strip("\n").split("\t")

        # New names continue from the current ent_id / rel_id counters.
        for ent in (ent_1, ent_2):
            if ent not in entity2id_dict:
                entity2id_dict[ent] = ent_id
                ent_id += 1
                write_to_file = True
        if rel not in relation2id_dict:
            relation2id_dict[rel] = rel_id
            rel_id += 1
            write_to_file = True

        # Keep the triple **only** if it introduced an entity or relation.
        # NOTE(review): a later triple that reuses an entity first seen earlier
        # in this same file is not kept, and neither is a new combination of
        # already-known entities - confirm this is the intended filter.
        if write_to_file:
            id_triple = (entity2id_dict[ent_1], entity2id_dict[ent_2], relation2id_dict[rel])
            new_triple_rows.append("\t".join(str(item) for item in id_triple) + "\n")

with open(train2id_test_path, "w") as train2id_f:
    # By convention, the count of the num lines must be at the top of the file
    train2id_f.write(str(len(new_triple_rows)) + "\n")
    train2id_f.writelines(new_triple_rows)
    

In [15]:
# Dump the (train + test) relation dictionary for the test split: a count
# header, then one "name<TAB>id" row per relation in ascending id order.
with open(relation2id_test_path, "w") as relation2id_f:
    sorted_rel = sorted(relation2id_dict.items(), key=lambda pair: pair[1])
    header = str(len(sorted_rel)) + "\n"
    rows = (name + "\t" + str(num) + "\n" for name, num in sorted_rel)
    relation2id_f.write(header)
    relation2id_f.writelines(rows)

In [16]:
# Dump the (train + test) entity dictionary for the test split: a count
# header, then one "name<TAB>id" row per entity in ascending id order.
with open(entity2id_test_path, "w") as entity2id_f:
    sorted_ent = sorted(entity2id_dict.items(), key=lambda pair: pair[1])
    header = str(len(sorted_ent)) + "\n"
    rows = (name + "\t" + str(num) + "\n" for name, num in sorted_ent)
    entity2id_f.write(header)
    entity2id_f.writelines(rows)