## Load the embeddings

In [150]:
import numpy as np
import collections
import array
import time

# Record types used throughout the notebook.
# SubgraphMeta: the four values parsed from each 25-byte metadata record.
SubgraphMeta = collections.namedtuple('SubgraphMeta', ['typ', 'rel', 'ent', 'siz'])
# Subgraph: an embedding vector paired with its SubgraphMeta.
Subgraph = collections.namedtuple('Subgraph', ['emb', 'meta'])
# Result: one line of the log file (query string plus the two integer columns).
Result = collections.namedtuple('Result', ['query', 'posh', 'post'])

# Input locations (absolute paths on the original machine).
inputfile = '/var/scratch/uji300/kgemb/fb15k_avgsubgraphs.bin'
logfile = '/var/scratch/uji300/kgemb/fb15K_subgraphs_valid.txt'
# Entity embeddings: metadata file and data file.
emb_meta_e = '/var/scratch/uji300/kgemb/models/fb15k/best-model/E-meta'
emb_e_path = '/var/scratch/uji300/kgemb/models/fb15k/best-model/E.0'
# Relation embeddings: metadata file and data file.
emb_meta_r = '/var/scratch/uji300/kgemb/models/fb15k/best-model/R-meta'
emb_r_path = '/var/scratch/uji300/kgemb/models/fb15k/best-model/R.0'

Load the file with the subgraph embeddings produced by Trident (the embeddings themselves are not needed for now).

In [151]:
# Parse the binary subgraph file: an 8-byte count, then one 25-byte metadata
# record per subgraph, then a 10-byte header followed by one float64 vector
# of length `dims` per subgraph.
subgraphs_meta = []
embeddings = []
with open(inputfile, 'rb') as fin:
    nsubgraphs = int.from_bytes(fin.read(8), byteorder='little', signed=False)
    for _ in range(nsubgraphs):
        record = fin.read(25)
        # Byte 0 is the type; the remaining bytes hold three 8-byte
        # little-endian unsigned integers.
        subgraphs_meta.append(SubgraphMeta(
            typ=record[0],
            rel=int.from_bytes(record[1:9], byteorder='little', signed=False),
            ent=int.from_bytes(record[9:17], byteorder='little', signed=False),
            siz=int.from_bytes(record[17:], byteorder='little', signed=False),
        ))
    # Header of the average embeddings: 2 bytes for `dims`, 8 bytes for `mincard`.
    # NOTE(review): these two fields are decoded big-endian while every other
    # integer in this file is little-endian -- confirm against the writer.
    emb_header = fin.read(10)
    dims = int.from_bytes(emb_header[:2], byteorder='big', signed=False)
    mincard = int.from_bytes(emb_header[2:], byteorder='big', signed=False)
    embeddings = [
        np.frombuffer(fin.read(dims * 8), dtype=np.float64)
        for _ in range(nsubgraphs)
    ]

# Pair every embedding with its metadata record, then drop the temporary lists.
subgraphs = [
    Subgraph(emb=vector, meta=meta)
    for vector, meta in zip(embeddings, subgraphs_meta)
]
subgraphs_meta = []
embeddings = []

Load the entity and relation embeddings of the trained model.

In [152]:
def load_embeddings(meta, e_path):
    """Load an embedding matrix from a metadata file and a binary data file.

    The metadata file starts with 10 bytes: a 4-byte batch size (ignored
    here), a 4-byte row count ``n`` and a 2-byte dimension ``dim``, all
    little-endian unsigned integers. The data file holds ``n`` records, each
    made of an 8-byte prefix (skipped) followed by ``dim`` float64 values.

    Args:
        meta: path of the metadata file.
        e_path: path of the embedding data file.

    Returns:
        A writable float64 array of shape ``(n, dim)``.

    Raises:
        ValueError: if the metadata file is shorter than 10 bytes or the data
            file holds fewer than ``n`` complete records.
    """
    with open(meta, 'rb') as fmeta:
        raw = fmeta.read(10)
    if len(raw) < 10:
        # A short header would otherwise decode to bogus (often zero) sizes
        # and silently produce an empty matrix.
        raise ValueError(
            'metadata file {!r} is truncated: expected 10 bytes, got {}'.format(meta, len(raw)))
    n = int.from_bytes(raw[4:8], byteorder='little', signed=False)
    dim = int.from_bytes(raw[8:], byteorder='little', signed=False)
    if n == 0:
        return np.zeros(shape=(0, dim))

    # Every record is (dim + 1) 8-byte slots; read them all at once and view
    # the 8-byte prefix as one extra leading column that is sliced off below.
    slots = n * (dim + 1)
    flat = np.fromfile(e_path, dtype=np.float64, count=slots)
    if flat.size != slots:
        raise ValueError(
            'embedding file {!r} is truncated: expected {} records of {} bytes'.format(
                e_path, n, 8 + dim * 8))
    # Copy so the result is contiguous and independent of the read buffer.
    return np.array(flat.reshape(n, dim + 1)[:, 1:], dtype=np.float64)
# Entity and relation embedding matrices, loaded in that order.
emb_e, emb_r = (
    load_embeddings(emb_meta_e, emb_e_path),
    load_embeddings(emb_meta_r, emb_r_path),
)

Now load all the results.

In [153]:
def _rank_to_class(rank):
    """Bucket a position into a class label: <=0 -> 0, 1-2 -> 1, 3-4 -> 2, 5-9 -> 3, >=10 -> 4."""
    if rank <= 0:
        return 0
    if rank < 3:
        return 1
    if rank < 5:
        return 2
    if rank < 10:
        return 3
    return 4


def get_train_valid_data(logfile, graph_type='POS', valid_fraction=0.10, seed=None):
    """Read the log file and build train/validation arrays for both tasks.

    The file has one header line, then tab-separated lines whose first column
    is a query ``"h r t"`` (space-separated integers) and whose second and
    third columns are integer positions (``posh`` and ``post``).

    Two integer arrays with three columns are built, row-aligned:

    * ``pos`` rows are ``(r, t, class(posh))``
    * ``spo`` rows are ``(r, h, class(post))``

    where ``class`` is the bucketing done by ``_rank_to_class``. The same
    randomly chosen rows are held out from both arrays for validation. The
    per-class counts of the ``pos`` training labels are printed.

    Args:
        logfile: path of the tab-separated log file.
        graph_type: currently unused; kept so existing callers keep working.
        valid_fraction: fraction of rows held out for validation.
        seed: optional seed for a reproducible split; when ``None`` the
            global NumPy random state is used.

    Returns:
        ``(train_data_pos, valid_data_pos, train_data_spo, valid_data_spo)``.
    """
    records = []
    with open(logfile, 'rt') as fin:
        fin.readline()  # skip the header line
        for l in fin:
            if not l.strip():
                continue  # tolerate blank lines (e.g. a trailing newline)
            tkns = l.split('\t')
            query = tkns[0].split(' ')
            records.append((int(query[0]), int(query[1]), int(query[2]),
                            int(tkns[1]), int(tkns[2])))

    # np.int was removed from NumPy (1.24); int64 is what it resolved to on
    # 64-bit Linux.
    n_rows = len(records)
    data_pos = np.zeros((n_rows, 3), dtype=np.int64)
    data_spo = np.zeros((n_rows, 3), dtype=np.int64)
    for i, (h, r, t, posh, post) in enumerate(records):
        data_pos[i] = (r, t, _rank_to_class(posh))
        data_spo[i] = (r, h, _rank_to_class(post))

    # Hold out the same rows from both arrays so they stay aligned.
    rng = np.random if seed is None else np.random.RandomState(seed)
    idx_val = rng.choice(n_rows, int(n_rows * valid_fraction), replace=False)

    valid_data_pos = data_pos[idx_val, :]
    train_data_pos = np.delete(data_pos, idx_val, axis=0)

    valid_data_spo = data_spo[idx_val, :]
    train_data_spo = np.delete(data_spo, idx_val, axis=0)

    # Class distribution of the `pos` training labels (kept as floats so the
    # printed output looks the same as before).
    classes = np.bincount(train_data_pos[:, 2], minlength=5).astype(np.float64)
    print(classes)

    return train_data_pos, valid_data_pos, train_data_spo, valid_data_spo


In [163]:
(train_data_pos, valid_data_pos,
 train_data_spo, valid_data_spo) = get_train_valid_data(logfile)

# The first model is trained on the POS arrays; the SPO arrays
# (train_data_spo / valid_data_spo) are used in the SPO section.
train_data, valid_data = train_data_pos, valid_data_pos

# Sanity check: the two validation arrays must be row-aligned, i.e. share the
# same first column (the relation id) row by row.
testCount = 0
for testCount, (row_pos, row_spo) in enumerate(zip(valid_data_pos, valid_data_spo), start=1):
    if row_pos[0] != row_spo[0]:
        print("FATAL")
print ("Test count = ", testCount)

[13774.  3330.  1374.  1650.  3831.]
Test count =  2662


### Learning

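Learn a simple multi-class classifier with TensorFlow. The layers have no activation functions, so the model is a softmax over stacked linear layers, which is effectively multinomial logistic regression.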

In [164]:
import tensorflow as tf

#### Params

In [165]:
# Optimisation settings
learning_rate = 0.01
training_epochs = 50  # previously 100
batch_size = 100
display_step = 1

# Network sizes. The network input is the relation embedding concatenated with
# the entity embedding, so its width is taken from those two matrices rather
# than from `dims` of the (otherwise unused) subgraph file.
n_input = emb_r.shape[1] + emb_e.shape[1]
n_classes = 5
n_hidden_1 = 256 # n neurons first layer
n_hidden_2 = 256 # n neurons second layer

#### Input

In [166]:
# Set up the input queues
t_emb_e = tf.constant(emb_e)
t_emb_r = tf.constant(emb_r)

# Initialize the training data
t = tf.constant(train_data)
ds = tf.data.Dataset.from_tensor_slices(t)
ds = ds.shuffle(buffer_size=100)
ds = ds.batch(batch_size)

# Initialize the valid data
t_valid = tf.constant(valid_data)
ds_valid = tf.data.Dataset.from_tensor_slices(t_valid)
ds_valid = ds_valid.batch(batch_size)


#with tf.Session() as default_session:
#    test = default_session.run([t_valid])
    #print(test[0][55])
#    print(len(test[0]))

#iter = ds.make_initializable_iterator()
iter = tf.data.Iterator.from_structure(ds.output_types, ds.output_shapes)
el = iter.get_next()

# Lookup embeddings for the inputs
rel, ent, y = tf.split(el, num_or_size_splits=3, axis=1)
rel = tf.reshape(rel, shape=[-1]) # Flatten the tensor
ent = tf.reshape(ent, shape=[-1]) # Flatten the tensor
rel_emb = tf.nn.embedding_lookup(t_emb_r, rel)
ent_emb = tf.nn.embedding_lookup(t_emb_e, ent)
inp = tf.concat([rel_emb, ent_emb], axis=1)

#Process the labels
y = tf.reshape(y, shape=[-1])
y_hot = tf.one_hot(y, n_classes)

#### Model

In [167]:
def _normal_variable(shape):
    """Create a float64 variable initialised by tf.random_normal with its default parameters."""
    return tf.Variable(tf.random_normal(shape, dtype=tf.float64), dtype=tf.float64)


# Trainable parameters of the three stacked layers
weights = {
    'h1': _normal_variable([n_input, n_hidden_1]),
    'h2': _normal_variable([n_hidden_1, n_hidden_2]),
    'out': _normal_variable([n_hidden_2, n_classes]),
}
biases = {
    'b1': _normal_variable([n_hidden_1]),
    'b2': _normal_variable([n_hidden_2]),
    'out': _normal_variable([n_classes]),
}

# Forward pass: three affine layers with no activation in between
layer_1 = tf.matmul(inp, weights['h1']) + biases['b1']
layer_2 = tf.matmul(layer_1, weights['h2']) + biases['b2']
out_layer = tf.add(tf.matmul(layer_2, weights['out']), biases['out'])

# Per-example cross-entropy between the logits and the one-hot labels
# (despite its name, `pred` holds loss values, not predictions).
pred = tf.nn.softmax_cross_entropy_with_logits_v2(logits=out_layer, labels=y_hot)


#### Gradient

In [168]:
# Scalar training loss: the mean of `pred` over the batch.
loss_op = tf.reduce_mean(pred)
# Adam optimizer configured with the notebook-level `learning_rate`.
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
# Running `train_op` applies one optimisation step that minimises `loss_op`.
train_op = optimizer.minimize(loss_op)

#### Testing

In [169]:
# New operator to perform the predictions
predictions = tf.nn.softmax(out_layer, name='predictions')
# Compare the softmax with the actual values
pred_indices = tf.argmax(predictions, axis=1)
res = tf.equal(pred_indices, y)
res = tf.cast(res, tf.int32)
sres = tf.reduce_sum(res)


#### Learning

In [170]:
init = tf.global_variables_initializer()
# The shared iterator is pointed at the training or the validation dataset by
# running the matching initializer.
train_input_initializer = iter.make_initializer(ds)
valid_input_initializer = iter.make_initializer(ds_valid)

with tf.Session() as sess:
    sess.run([init])
    # The validation rows are a graph constant: fetch them once instead of on
    # every validation batch.
    valid_rows = sess.run(t_valid)
    for epoch in range(training_epochs):
        start = time.time()
        sess.run(train_input_initializer)
        # Loop over all training batches until the iterator is exhausted
        avg_loss = 0
        num_batch = 0
        while True:
            try:
                loss = sess.run([loss_op, train_op])
            except (tf.errors.OutOfRangeError, StopIteration):
                break
            except Exception as err:
                # Report unexpected failures and stop this epoch.
                print(err)
                break
            num_batch += 1
            avg_loss += loss[0]
        # Guard against an epoch that produced no batch at all.
        mean_loss = avg_loss / num_batch if num_batch else float('nan')
        print('Train epoch', epoch, " Loss={:.6f}".format(mean_loss), "Time={:.4f}sec".format(time.time() - start))
        if epoch % 10 == 0:
            # Test the performance on the valid dataset
            sess.run(valid_input_initializer)

            num_batch = 0
            correct = 0    # correct predictions as counted inside the graph
            myCorrect = 0  # cross-check: recount against the raw validation rows
            # The log is rewritten at every evaluation; the context manager
            # closes the file even when an error occurs.
            with open('fb15k-predictions.pos.log', 'w') as predLog:
                while True:
                    try:
                        p = sess.run([sres, pred_indices, rel, ent])
                    except (tf.errors.OutOfRangeError, StopIteration):
                        break
                    except Exception as err:
                        # Stop instead of retrying forever on a persistent error.
                        print(err)
                        break
                    correct += p[0]

                    # Validation batches are not shuffled, so the k-th row of
                    # this batch is row `offset + k` of the validation data.
                    offset = batch_size * num_batch
                    rows = []
                    for count, (r_id, e_id, k) in enumerate(zip(p[2], p[3], p[1])):
                        rows.append(str(r_id) + " " + str(e_id) + " " + str(k) + "\n")
                        if int(k) == valid_rows[offset + count][2]:
                            myCorrect += 1
                    predLog.write("".join(rows))

                    num_batch += 1
            print("Correct predictions= ", correct)
            print("*** MY correct Pred= ", myCorrect)
print("Optimization Finished!")

Train epoch 0  Loss=538.971007 Time=0.8379sec
Correct predictions=  1146
*** MY correct Pred=  1146
Train epoch 1  Loss=231.894607 Time=0.5200sec
Train epoch 2  Loss=198.348340 Time=0.5075sec
Train epoch 3  Loss=133.655735 Time=0.5134sec
Train epoch 4  Loss=81.048480 Time=0.5071sec
Train epoch 5  Loss=58.802704 Time=0.5102sec
Train epoch 6  Loss=48.161137 Time=0.5105sec
Train epoch 7  Loss=35.214795 Time=0.5042sec
Train epoch 8  Loss=27.850070 Time=0.5067sec
Train epoch 9  Loss=23.328911 Time=0.5171sec
Train epoch 10  Loss=17.664958 Time=0.5004sec
Correct predictions=  1344
*** MY correct Pred=  1344
Train epoch 11  Loss=13.979755 Time=0.4998sec
Train epoch 12  Loss=11.956160 Time=0.5044sec
Train epoch 13  Loss=14.336175 Time=0.4982sec
Train epoch 14  Loss=12.539554 Time=0.5173sec
Train epoch 15  Loss=10.326099 Time=0.5025sec
Train epoch 16  Loss=8.199671 Time=0.5038sec
Train epoch 17  Loss=7.205559 Time=0.5187sec
Train epoch 18  Loss=7.666417 Time=0.6668sec
Train epoch 19  Loss=6.5397

#### Working directory (the prediction logs are written here)

In [44]:
!pwd

/home/uji300/karma/trident/scripts


## SPO

In [172]:
# Set up the input queues
t_emb_e = tf.constant(emb_e)
t_emb_r = tf.constant(emb_r)

# Initialize the training data
t = tf.constant(train_data_spo)
ds = tf.data.Dataset.from_tensor_slices(t)
ds = ds.shuffle(buffer_size=100)
ds = ds.batch(batch_size)

# Initialize the valid data
t_valid = tf.constant(valid_data_spo)
ds_valid = tf.data.Dataset.from_tensor_slices(t_valid)
ds_valid = ds_valid.batch(batch_size)


#with tf.Session() as default_session:
#    test = default_session.run([t_valid])
    #print(test[0][55])
#    print(len(test[0]))

#iter = ds.make_initializable_iterator()
iter = tf.data.Iterator.from_structure(ds.output_types, ds.output_shapes)
el = iter.get_next()

# Lookup embeddings for the inputs
rel, ent, y = tf.split(el, num_or_size_splits=3, axis=1)
rel = tf.reshape(rel, shape=[-1]) # Flatten the tensor
ent = tf.reshape(ent, shape=[-1]) # Flatten the tensor
rel_emb = tf.nn.embedding_lookup(t_emb_r, rel)
ent_emb = tf.nn.embedding_lookup(t_emb_e, ent)
inp = tf.concat([rel_emb, ent_emb], axis=1)

#Process the labels
y = tf.reshape(y, shape=[-1])
y_hot = tf.one_hot(y, n_classes)

In [173]:
def _normal_variable(shape):
    """Create a float64 variable initialised by tf.random_normal with its default parameters."""
    return tf.Variable(tf.random_normal(shape, dtype=tf.float64), dtype=tf.float64)


# Fresh trainable parameters for the SPO model
weights = {
    'h1': _normal_variable([n_input, n_hidden_1]),
    'h2': _normal_variable([n_hidden_1, n_hidden_2]),
    'out': _normal_variable([n_hidden_2, n_classes]),
}
biases = {
    'b1': _normal_variable([n_hidden_1]),
    'b2': _normal_variable([n_hidden_2]),
    'out': _normal_variable([n_classes]),
}

# Forward pass: three affine layers with no activation in between
layer_1 = tf.matmul(inp, weights['h1']) + biases['b1']
layer_2 = tf.matmul(layer_1, weights['h2']) + biases['b2']
out_layer = tf.add(tf.matmul(layer_2, weights['out']), biases['out'])

# Per-example cross-entropy between the logits and the one-hot labels
# (despite its name, `pred` holds loss values, not predictions).
pred = tf.nn.softmax_cross_entropy_with_logits_v2(logits=out_layer, labels=y_hot)


#### Gradient

In [174]:
# Scalar training loss: the mean of `pred` over the batch.
loss_op = tf.reduce_mean(pred)
# Adam optimizer configured with the notebook-level `learning_rate`.
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
# Running `train_op` applies one optimisation step that minimises `loss_op`.
train_op = optimizer.minimize(loss_op)

#### Testing

In [175]:
# New operator to perform the predictions
predictions = tf.nn.softmax(out_layer, name='predictions')
# Compare the softmax with the actual values
pred_indices = tf.argmax(predictions, axis=1)
res = tf.equal(pred_indices, y)
res = tf.cast(res, tf.int32)
sres = tf.reduce_sum(res)


#### Learning

In [176]:
init = tf.global_variables_initializer()
# The shared iterator is pointed at the training or the validation dataset by
# running the matching initializer.
train_input_initializer = iter.make_initializer(ds)
valid_input_initializer = iter.make_initializer(ds_valid)

with tf.Session() as sess:
    sess.run([init])
    # The validation rows are a graph constant: fetch them once instead of on
    # every validation batch.
    valid_rows = sess.run(t_valid)
    for epoch in range(training_epochs):
        start = time.time()
        sess.run(train_input_initializer)
        # Loop over all training batches until the iterator is exhausted
        avg_loss = 0
        num_batch = 0
        while True:
            try:
                loss = sess.run([loss_op, train_op])
            except (tf.errors.OutOfRangeError, StopIteration):
                break
            except Exception as err:
                # Report unexpected failures and stop this epoch.
                print(err)
                break
            num_batch += 1
            avg_loss += loss[0]
        # Guard against an epoch that produced no batch at all.
        mean_loss = avg_loss / num_batch if num_batch else float('nan')
        print('Train epoch', epoch, " Loss={:.6f}".format(mean_loss), "Time={:.4f}sec".format(time.time() - start))
        if epoch % 10 == 0:
            # Test the performance on the valid dataset
            sess.run(valid_input_initializer)

            num_batch = 0
            correct = 0    # correct predictions as counted inside the graph
            myCorrect = 0  # cross-check: recount against the raw validation rows
            # The log is rewritten at every evaluation; the context manager
            # closes the file even when an error occurs.
            with open('fb15k-predictions.spo.log', 'w') as predLog:
                while True:
                    try:
                        p = sess.run([sres, pred_indices, rel, ent])
                    except (tf.errors.OutOfRangeError, StopIteration):
                        break
                    except Exception as err:
                        # Stop instead of retrying forever on a persistent error.
                        print(err)
                        break
                    correct += p[0]

                    # Validation batches are not shuffled, so the k-th row of
                    # this batch is row `offset + k` of the validation data.
                    offset = batch_size * num_batch
                    rows = []
                    for count, (r_id, e_id, k) in enumerate(zip(p[2], p[3], p[1])):
                        rows.append(str(r_id) + " " + str(e_id) + " " + str(k) + "\n")
                        if int(k) == valid_rows[offset + count][2]:
                            myCorrect += 1
                    predLog.write("".join(rows))

                    num_batch += 1
            print("Correct predictions= ", correct)
            print("*** MY correct Pred= ", myCorrect)
print("Optimization Finished!")

Train epoch 0  Loss=376.057455 Time=0.8333sec
Correct predictions=  1269
*** MY correct Pred=  1269
Train epoch 1  Loss=311.153680 Time=0.5860sec
Train epoch 2  Loss=202.498197 Time=0.6498sec
Train epoch 3  Loss=130.024789 Time=0.6707sec
Train epoch 4  Loss=84.181611 Time=0.6818sec
Train epoch 5  Loss=55.407388 Time=0.6971sec
Train epoch 6  Loss=41.994379 Time=0.6820sec
Train epoch 7  Loss=37.773584 Time=0.6692sec
Train epoch 8  Loss=24.142956 Time=0.6871sec
Train epoch 9  Loss=22.431698 Time=0.5913sec
Train epoch 10  Loss=17.931837 Time=0.5079sec
Correct predictions=  1179
*** MY correct Pred=  1179
Train epoch 11  Loss=15.196494 Time=0.5050sec
Train epoch 12  Loss=14.133160 Time=0.5085sec
Train epoch 13  Loss=12.538621 Time=0.5018sec
Train epoch 14  Loss=12.906458 Time=0.5071sec
Train epoch 15  Loss=9.693669 Time=0.5057sec
Train epoch 16  Loss=9.497685 Time=0.5058sec
Train epoch 17  Loss=8.202697 Time=0.5020sec
Train epoch 18  Loss=6.579006 Time=0.4991sec
Train epoch 19  Loss=7.65730