## Load the embeddings

In [12]:
import array
import collections
import os
import time

import numpy as np

# Lightweight record types used throughout the notebook.
SubgraphMeta = collections.namedtuple('SubgraphMeta', 'typ rel ent siz')
Subgraph = collections.namedtuple('Subgraph', 'emb meta')
Result = collections.namedtuple('Result', 'query posh post')

# Root directory holding the data and the trained model. Set the KGEMB_DIR
# environment variable to run the notebook on another machine; the default
# is the original author's location.
kgemb_dir = os.environ.get('KGEMB_DIR', '/Users/jacopo/Desktop/kgemb')
model_dir = os.path.join(kgemb_dir, 'models', 'fb15k', 'best-model')

inputfile = os.path.join(kgemb_dir, 'fb15k_avgsubgraphs.bin')
logfile = os.path.join(kgemb_dir, 'fb15K_subgraphs_valid.txt')
emb_meta_e = os.path.join(model_dir, 'E-meta')
emb_e_path = os.path.join(model_dir, 'E.0')
emb_meta_r = os.path.join(model_dir, 'R-meta')
emb_r_path = os.path.join(model_dir, 'R.0')

Load the file with the subgraph embeddings produced by Trident (they are not needed for now).

In [13]:
subgraphs_meta = []
embeddings = []
with open(inputfile, 'rb') as fin:
    # The file starts with an 8-byte little-endian count of subgraphs.
    nsubgraphs = int.from_bytes(fin.read(8), byteorder='little', signed=False)

    # One fixed-size 25-byte record per subgraph:
    # byte 0 = type, bytes 1-8 = relation, bytes 9-16 = entity,
    # bytes 17-24 = size (all integers little-endian, unsigned).
    for _ in range(nsubgraphs):
        record = fin.read(25)
        subgraphs_meta.append(SubgraphMeta(
            typ=record[0],
            rel=int.from_bytes(record[1:9], byteorder='little', signed=False),
            ent=int.from_bytes(record[9:17], byteorder='little', signed=False),
            siz=int.from_bytes(record[17:], byteorder='little', signed=False),
        ))

    # A 10-byte header precedes the average embeddings. Unlike the rest of
    # the file it is decoded big-endian: 2 bytes of dimension followed by
    # 8 bytes of minimum cardinality.
    emb_header = fin.read(10)
    dims = int.from_bytes(emb_header[:2], byteorder='big', signed=False)
    mincard = int.from_bytes(emb_header[2:], byteorder='big', signed=False)

    # One vector of `dims` float64 values per subgraph, in record order.
    embeddings.extend(
        np.frombuffer(fin.read(dims * 8), dtype=np.float64)
        for _ in range(nsubgraphs)
    )

# Pair every embedding with its metadata record.
subgraphs = [
    Subgraph(emb=vector, meta=meta)
    for vector, meta in zip(embeddings, subgraphs_meta)
]
# Drop the intermediate lists; `subgraphs` now holds everything.
subgraphs_meta = []
embeddings = []

Load all the embeddings.

In [14]:
def load_embeddings(meta, e_path):
    """Load an embedding matrix from a metadata file and a binary data file.

    The first 10 bytes of ``meta`` are decoded as little-endian unsigned
    integers: 4 bytes of batch size (not needed here), 4 bytes with the
    number of rows ``n`` and 2 bytes with the dimension ``dim``.
    ``e_path`` is then read as ``n`` consecutive records of ``8 + dim * 8``
    bytes; the first 8 bytes of each record are skipped (presumably an
    identifier -- TODO confirm) and the rest is read as ``dim`` float64
    values.

    Args:
        meta: path of the metadata file.
        e_path: path of the file holding the embedding records.

    Returns:
        A float64 ``numpy.ndarray`` of shape ``(n, dim)``.

    Raises:
        ValueError: if the metadata file is shorter than 10 bytes or the
            data file ends before ``n`` complete records were read.
    """
    with open(meta, 'rb') as fmeta:
        raw = fmeta.read(10)
    if len(raw) < 10:
        # A short header would otherwise silently decode to n=0 / dim=0.
        raise ValueError(
            'metadata file {!r} is too short: expected 10 bytes, got {}'.format(meta, len(raw)))
    n = int.from_bytes(raw[4:8], byteorder='little', signed=False)
    dim = int.from_bytes(raw[8:], byteorder='little', signed=False)

    record_size = 8 + dim * 8
    e = np.zeros(shape=(n, dim))
    with open(e_path, 'rb') as fin:
        for i in range(n):
            line = fin.read(record_size)
            if len(line) != record_size:
                raise ValueError(
                    'embedding file {!r} is truncated: record {} of {} has {} bytes, expected {}'.format(
                        e_path, i, n, len(line), record_size))
            # Skip the 8-byte prefix and keep the float64 payload.
            e[i] = np.frombuffer(line[8:], dtype=np.float64)
    return e
# Load the two embedding matrices of the trained model: emb_e from the
# E-meta/E.0 files and emb_r from the R-meta/R.0 files.
emb_e = load_embeddings(emb_meta_e, emb_e_path)
emb_r = load_embeddings(emb_meta_r, emb_r_path)

Now load all the results.

In [27]:
# Parse the log: one header line, then tab-separated rows of
# "<query>\t<position for head>\t<position for tail>".
results = []
with open(logfile, 'rt') as fin:
    header = fin.readline()  # column names; not parsed
    for l in fin:
        if not l.strip():
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            continue
        tkns = l.split('\t')
        query = tkns[0]
        pos_answer_subgraph_head = int(tkns[1])
        pos_answer_subgraph_tail = int(tkns[2])
        results.append(Result(query, pos_answer_subgraph_head, pos_answer_subgraph_tail))

# Create the training data: one row per result holding
# [relation id, entity id, class label].
# np.int64 is used explicitly: the np.int alias was removed from NumPy, and
# the labels are later compared against the int64 output of tf.argmax.
data = np.zeros((len(results), 3), dtype=np.int64)
for i in range(len(results)):
    # The query is space-separated; fields 1 and 2 are the relation and
    # the entity ids.
    tkns = results[i].query.split(' ')
    r = int(tkns[1])
    e = int(tkns[2])
    data[i][0] = r
    data[i][1] = e
    # Bucket the head position into a class label:
    #   0: pos <= 0, 1: pos in [1, 2], 2: pos in [3, 4],
    #   3: pos in [5, 9], 4: pos >= 10.
    # (Finer buckets for pos < 20 / < 50 / < 100 were tried and disabled.)
    pos = results[i].posh
    if pos > 0:
        if pos < 3:
            data[i][2] = 1
        elif pos < 5:
            data[i][2] = 2
        elif pos < 10:
            data[i][2] = 3
        else:
            data[i][2] = 4

# Take away 10% which should be used for the validation.
# NOTE: no random seed is set, so the split differs between runs.
idx_val = np.random.choice(data.shape[0], int(data.shape[0] * 0.10), replace=False)
valid_data = data[idx_val, :]
train_data = np.delete(data, idx_val, axis=0)

# Class histogram of the training split.
classes = np.zeros(5)
for t in train_data:
    pos_h = t[2]
    classes[pos_h] += 1
print(classes)

[13723.  3347.  1375.  1654.  3860.]


### Learning

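Learning a simple logistic regression model using TensorFlow. The network below stacks linear layers without any non-linear activation, so it is still a (multinomial) logistic regression over the concatenated relation and entity embeddings.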

In [28]:
import tensorflow as tf

#### Params

In [29]:
# Optimisation settings.
learning_rate = 0.01
batch_size = 100
training_epochs = 100
display_step = 1

# Network shape: the input is twice the embedding dimension read from the
# subgraph file, followed by two hidden layers of equal width and one
# output unit per class.
n_hidden_1 = n_hidden_2 = 256
n_classes = 5
n_input = dims * 2

#### Input

In [30]:
# Embedding matrices as graph constants, used for the lookups below.
t_emb_e = tf.constant(emb_e)
t_emb_r = tf.constant(emb_r)

# Initialize the training data.
t = tf.constant(train_data)
ds = tf.data.Dataset.from_tensor_slices(t)
# Use a shuffle buffer as large as the training set: a buffer smaller than
# the dataset (it was 100) only shuffles rows within a sliding window, so
# batches would still follow the order of the log file.
ds = ds.shuffle(buffer_size=max(int(train_data.shape[0]), 1))
ds = ds.batch(batch_size)

# Initialize the valid data (no shuffling needed for evaluation).
t_valid = tf.constant(valid_data)
ds_valid = tf.data.Dataset.from_tensor_slices(t_valid)
ds_valid = ds_valid.batch(batch_size)

# A single re-initializable iterator is shared by both datasets, so the
# same graph can be fed with training or validation batches.
# NOTE: the name `iter` shadows the Python builtin; it is kept because the
# training cell refers to it.
#iter = ds.make_initializable_iterator()
iter = tf.data.Iterator.from_structure(ds.output_types, ds.output_shapes)
el = iter.get_next()

# Lookup embeddings for the inputs: each batch row is
# [relation id, entity id, label].
rel, ent, y = tf.split(el, num_or_size_splits=3, axis=1)
rel = tf.reshape(rel, shape=[-1]) # Flatten the tensor
ent = tf.reshape(ent, shape=[-1]) # Flatten the tensor
rel_emb = tf.nn.embedding_lookup(t_emb_r, rel)
ent_emb = tf.nn.embedding_lookup(t_emb_e, ent)
# The model input is the relation embedding followed by the entity embedding.
inp = tf.concat([rel_emb, ent_emb], axis=1)

# Process the labels: flatten, then one-hot encode for the cross-entropy.
y = tf.reshape(y, shape=[-1])
y_hot = tf.one_hot(y, n_classes)

#### Model

In [31]:
def _normal_variable(shape):
    """Create a float64 variable initialised with tf.random_normal (default parameters)."""
    return tf.Variable(tf.random_normal(shape, dtype=tf.float64), dtype=tf.float64)


# Variables are created in the same order as before: weights first, then biases.
weights = dict(
    h1=_normal_variable([n_input, n_hidden_1]),
    h2=_normal_variable([n_hidden_1, n_hidden_2]),
    out=_normal_variable([n_hidden_2, n_classes]),
)
biases = dict(
    b1=_normal_variable([n_hidden_1]),
    b2=_normal_variable([n_hidden_2]),
    out=_normal_variable([n_classes]),
)

# Three affine layers applied back to back; no activation function is
# applied between them.
layer_1 = tf.matmul(inp, weights['h1']) + biases['b1']
layer_2 = tf.matmul(layer_1, weights['h2']) + biases['b2']
out_layer = tf.add(tf.matmul(layer_2, weights['out']), biases['out'])

# Despite its name, `pred` is the per-example softmax cross-entropy between
# the logits and the one-hot labels, not a prediction.
pred = tf.nn.softmax_cross_entropy_with_logits_v2(labels=y_hot, logits=out_layer)

#### Gradient

In [32]:
# Training loss: the mean over the batch of `pred`, which holds the
# per-example softmax cross-entropy computed in the model cell.
loss_op = tf.reduce_mean(pred)
# Adam optimizer with the configured learning rate; running train_op
# performs one update step on the current batch.
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
train_op = optimizer.minimize(loss_op)

#### Testing

In [33]:
# Class probabilities for every example of the current batch.
predictions = tf.nn.softmax(out_layer, name='predictions')
# Most probable class per example.
pred_indices = tf.argmax(predictions, axis=1)
# 1 where the predicted class matches the label y, 0 otherwise.
res = tf.cast(tf.equal(pred_indices, y), tf.int32)
# Number of correct predictions in the batch.
sres = tf.reduce_sum(res)

#### Learning

In [34]:
init = tf.global_variables_initializer()
# The shared iterator is pointed at the training or the validation dataset
# by running the matching initializer.
train_input_initializer = iter.make_initializer(ds)
valid_input_initializer = iter.make_initializer(ds_valid)

with tf.Session() as sess:
    sess.run([init])
    for epoch in range(training_epochs):
        start = time.time()
        sess.run(train_input_initializer)
        # Loop over all batches until the iterator is exhausted.
        avg_loss = 0
        num_batch = 0
        while True:
            try:
                loss = sess.run([loss_op, train_op])
                num_batch += 1
                avg_loss += loss[0]
            except (tf.errors.OutOfRangeError, StopIteration):
                # End of the epoch.
                break
            except Exception as err:
                # Report unexpected failures and stop this epoch. The
                # previous `except e:` was not a catch-all: it evaluated the
                # unrelated global `e` as the exception class to match.
                print(err)
                break
        # max(..., 1) avoids a ZeroDivisionError when no batch completed.
        print('Train epoch', epoch, " Loss={:.6f}".format(avg_loss / max(num_batch, 1)), "Time={:.4f}sec".format(time.time() - start))
        if epoch % 10 == 0:
            # Test the performance on the valid dataset
            sess.run(valid_input_initializer)
            num_batch = 0
            correct = 0
            while True:
                try:
                    p = sess.run([sres])
                    correct += p[0]
                    num_batch += 1
                except (tf.errors.OutOfRangeError, StopIteration):
                    break
                except Exception as err:
                    # Stop instead of retrying forever on a persistent error.
                    print(err)
                    break
            print("Correct predictions=", correct)
print("Optimization Finished!")

Train epoch 0  Loss=574.235329 Time=0.6429sec
Correct predictions= 1164
Train epoch 1  Loss=258.990702 Time=0.4262sec
Train epoch 2  Loss=138.925642 Time=0.4271sec
Train epoch 3  Loss=104.929911 Time=0.4478sec
Train epoch 4  Loss=72.269823 Time=0.4202sec
Train epoch 5  Loss=61.621947 Time=0.4148sec
Train epoch 6  Loss=48.734735 Time=0.4108sec
Train epoch 7  Loss=39.609300 Time=0.4053sec
Train epoch 8  Loss=29.352749 Time=0.4053sec
Train epoch 9  Loss=24.134601 Time=0.4119sec
Train epoch 10  Loss=22.870800 Time=0.4035sec
Correct predictions= 1341
Train epoch 11  Loss=19.161370 Time=0.4067sec
Train epoch 12  Loss=17.644384 Time=0.4063sec
Train epoch 13  Loss=14.584012 Time=0.4058sec
Train epoch 14  Loss=11.013591 Time=0.4113sec
Train epoch 15  Loss=10.569966 Time=0.4032sec
Train epoch 16  Loss=9.071127 Time=0.4027sec
Train epoch 17  Loss=8.246739 Time=0.4031sec
Train epoch 18  Loss=7.545294 Time=0.4212sec
Train epoch 19  Loss=6.690223 Time=0.4279sec
Train epoch 20  Loss=6.734526 Time=0.4

#### Testing