In [1]:
import tensorflow as tf
config = tf.ConfigProto()
config.gpu_options.allow_growth = True

In [2]:
f = open('链路预测数据集/FB15k/freebase_mtr100_mte100-train.txt')
data = f.read()
triples = data.split('\n')
triples = triples[:483142]

totals = list([entity_and_relation for triple in triples for entity_and_relation in triple.split('\t')])
print('三元组数量',len(triples))
print('实体和关系数量',len(set(totals)))

三元组数量 483142
实体和关系数量 16296


In [3]:
head_entities = list([triple.split('\t')[0] for triple in triples])
relations =  list([triple.split('\t')[1] for triple in triples])
tail_entities =  list([triple.split('\t')[2] for triple in triples])

# 全部实体
total_entities = head_entities + tail_entities

print('实体数',len(set(head_entities + tail_entities)))
print('关系数',len(set(relations)))

实体数 14951
关系数 1345


### 负采样三元组

In [4]:
import random

# 生成负采样的三元组
invalid_head_entities = []
invalid_relations = []
invalid_tail_entities = []

for i in range(len(relations)):
    random_int = random.randint(0,len(head_entities)-1)
    random_int1 = random.randint(0,len(head_entities)-1)
    
    # 不同时替换头尾实体
    if i%2 ==0:
        invalid_head_entities.append(head_entities[i])
        invalid_relations.append(relations[i])
        invalid_tail_entities.append(tail_entities[random_int1])
    else:
        invalid_head_entities.append(head_entities[random_int])
        invalid_relations.append(relations[i])
        invalid_tail_entities.append(tail_entities[i])

### 实体和关系编码

In [5]:
# 为每个实体/关系分配倒排索引
# 根据 名称 查找 id
reverse_index_entities = dict([key,index] for index,key in enumerate(set(total_entities)))
print('实体索引长度',len(reverse_index_entities))

reverse_index_relation = dict([key,index] for index,key in enumerate(set(relations)))
print('关系索引长度',len(reverse_index_relation))

# 黄金三元组索引
head_entities_encode = list([reverse_index_entities[entry] for entry in head_entities])
relations_encode = list([reverse_index_relation[relation] for relation in relations])
tail_entities_encode = list([reverse_index_entities[entry] for entry in tail_entities])

#无效三元组索引
invalid_head_entities_encode = list([reverse_index_entities[entry] for entry in invalid_head_entities])
invalid_relations_encode = list([reverse_index_relation[relation] for relation in invalid_relations])
invalid_tail_entities_encode = list([reverse_index_entities[entry] for entry in invalid_tail_entities])

print(head_entities_encode[123456],relations_encode[123456],tail_entities_encode[123456])
print(head_entities_encode[12345],relations_encode[12345],tail_entities_encode[12345])
print(head_entities_encode[12234],relations_encode[12234],tail_entities_encode[12234])

print(invalid_head_entities_encode[123456],invalid_relations_encode[123456],invalid_tail_entities_encode[123456])
print(invalid_head_entities_encode[12345],invalid_relations_encode[12345],invalid_tail_entities_encode[12345])
print(invalid_head_entities_encode[12234],invalid_relations_encode[12234],invalid_tail_entities_encode[12234])

实体索引长度 14951
关系索引长度 1345
934 91 7744
2872 120 10893
3642 861 8034
934 91 2159
12175 120 10893
3642 861 8948


### 组合数据

In [6]:
# 编码后的数据
golden_data = list(zip(head_entities_encode,relations_encode,tail_entities_encode))
invalid_data = list(zip(invalid_head_entities_encode,invalid_relations_encode,invalid_tail_entities_encode))
    
print(golden_data[123456])
print(invalid_data[123456])

print('三元组数量',len(golden_data))
print('三元组数量',len(invalid_data))

(934, 91, 7744)
(934, 91, 2159)
三元组数量 483142
三元组数量 483142


In [7]:
keep = 0 
def nextbatch(batchsize=400):
    global keep
    start = keep
    end = start + batchsize
    
    if end > len(golden_data):
        end = end % len(golden_data)
        keep = end
        return golden_data[start:]+golden_data[:end],invalid_data[start:]+invalid_data[:end]
    else:
        keep = end
        return golden_data[start:end],invalid_data[start:end]

### 定义模型

In [8]:
import tensorflow as tf
import math

# 实体数量
entities_num = len(reverse_index_entities)
# 关系数量
relations_num = len(reverse_index_relation)
dim = 100
margin = 1.50
learning_rate = 1e-2

# embedding层
ent_embeds =  tf.Variable(tf.truncated_normal([entities_num, dim], stddev=1.0 / math.sqrt(dim)))

rel_embeds =  tf.Variable(tf.truncated_normal([relations_num, dim], stddev=1.0 / math.sqrt(dim)))

rel_matrix =  tf.Variable(tf.truncated_normal([relations_num, dim * dim], stddev=1.0 / math.sqrt(dim)))

#定义输入
pos_hs = tf.placeholder(tf.int32, shape=[None])
pos_rs = tf.placeholder(tf.int32, shape=[None])
pos_ts = tf.placeholder(tf.int32, shape=[None])
neg_hs = tf.placeholder(tf.int32, shape=[None])
neg_rs = tf.placeholder(tf.int32, shape=[None])
neg_ts = tf.placeholder(tf.int32, shape=[None])

#从embedding层取值
# 实体向量需要转置，这里用reshape
phs_origin = tf.reshape(tf.nn.embedding_lookup(ent_embeds, pos_hs), [-1, dim, 1])
prs = tf.nn.embedding_lookup(rel_embeds, pos_rs)
pts_origin = tf.reshape(tf.nn.embedding_lookup(ent_embeds, pos_ts), [-1, dim, 1])

nhs_origin = tf.reshape(tf.nn.embedding_lookup(ent_embeds, neg_hs), [-1, dim, 1])
nrs = tf.nn.embedding_lookup(rel_embeds, neg_rs)
nts_origin = tf.reshape(tf.nn.embedding_lookup(ent_embeds, neg_ts), [-1, dim, 1])

p_matrix = tf.reshape(tf.nn.embedding_lookup(rel_matrix, pos_rs),
                                  [-1, dim, dim])
n_matrix = tf.reshape(tf.nn.embedding_lookup(rel_matrix, neg_rs),
                                  [-1, dim, dim])

phs = tf.reshape(tf.matmul(p_matrix, phs_origin), [-1, dim])
pts = tf.reshape(tf.matmul(p_matrix, pts_origin), [-1, dim])
phs = tf.nn.l2_normalize(phs, 1)
pts = tf.nn.l2_normalize(pts, 1)

nhs = tf.reshape(tf.matmul(n_matrix, nhs_origin), [-1, dim])
nts = tf.reshape(tf.matmul(n_matrix, nts_origin), [-1, dim])
nhs = tf.nn.l2_normalize(nhs, 1)
nts = tf.nn.l2_normalize(nts, 1)

# margin loss
pos_triple_loss = tf.reduce_sum(tf.abs(phs + prs - pts),axis = 1)
neg_triple_loss = tf.reduce_sum(tf.abs(nhs + nrs - nts),axis = 1)
triple_loss = tf.reduce_sum(tf.nn.relu(tf.constant(margin) + pos_triple_loss - neg_triple_loss))

# limited_loss
# pos_margin = 0.50
# neg_margin = 5.50
# balance =  0.2
# pos_score = tf.reduce_sum(tf.square(phs + prs - pts),axis = 1)
# neg_score = tf.reduce_sum(tf.square(nhs + nrs - nts),axis = 1)
# pos_loss = tf.reduce_sum(tf.nn.relu(pos_score - tf.constant(pos_margin)))
# neg_loss = tf.reduce_sum(tf.nn.relu(tf.constant(neg_margin) - neg_score))
# triple_loss = tf.add(pos_loss, balance * neg_loss, name='limited_loss')
        
triple_optimizer = tf.train.AdagradOptimizer(learning_rate).minimize(triple_loss)

In [None]:
import time
epochs_num = 200
batch_size = 2000

sess = tf.Session(config=config)
sess.run(tf.initialize_all_variables())

Instructions for updating:
Use `tf.global_variables_initializer` instead.


In [None]:
for step in range(epochs_num):
    time_start = time.time() #开始计时
    loss_total = 0
    print('start:',keep)
    for i in range(len(golden_data)//batch_size + 1):
#         print('batch_size',batch_size)
#         print('range',range(len(golden_data)//batch_size + 1))
        batch_pos,batch_neg = nextbatch(batchsize=batch_size)   
        feed_dict = {pos_hs: [x[0] for x in batch_pos], pos_rs: [x[1] for x in batch_pos], pos_ts: [x[2] for x in batch_pos],
                     neg_hs: [x[0] for x in batch_neg], neg_rs: [x[1] for x in batch_neg], neg_ts: [x[2] for x in batch_neg]}
        loss_val, _ = sess.run([triple_loss, triple_optimizer], feed_dict=feed_dict)
        # 损失统计
        loss_total += loss_val
        data_pass = int(i*60/(len(golden_data)//batch_size + 1)) + 1
        data_left = 60 - data_pass
        
        print("\r epoch %d, %s%s,loss_val %f" % (step,data_pass*'>',data_left*' ',loss_val),end='')
    print("\repoch %d, %s%s,loss_val %f" % (step,data_pass*'>',data_left*' ',loss_val),end='\n')
    print("\rstep %d, loss_val %f" % (step, loss_total/(len(batch_pos)//batch_size + 1)))
    time_end = time.time()    #结束计时
    sum_t= time_end - time_start   #运行所花时间
    print('time cost', sum_t, 's')

start: 0
epoch 0, >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>,loss_val 869.51178000
step 0, loss_val 238394.457062
time cost 30.717217922210693 s
start: 858
epoch 1, >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>,loss_val 118.2953644
step 1, loss_val 66248.383385
time cost 33.66998028755188 s
start: 1716
epoch 2, >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>,loss_val 42.50317882
step 2, loss_val 31592.714468
time cost 32.818262338638306 s
start: 2574
epoch 3, >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>,loss_val 26.55599664
step 3, loss_val 18066.700434
time cost 31.113813161849976 s
start: 3432
 epoch 4, >>>>>>>>>>>>>>>>                                            ,loss_val 93.8384021

In [None]:
saver = tf.train.Saver()    # 生成saver
saver.save(sess, "./tensorflow_transr_model.h5")     # 当路径不存在时，会自动创建路径

In [None]:
batch_pos,batch_neg = nextbatch(batchsize=batch_size)   
feed_dict = {pos_hs: [x[0] for x in batch_pos], pos_rs: [x[1] for x in batch_pos], pos_ts: [x[2] for x in batch_pos],
                     neg_hs: [x[0] for x in batch_neg], neg_rs: [x[1] for x in batch_neg], neg_ts: [x[2] for x in batch_neg]}

print(sess.run(pos_triple_loss, feed_dict=feed_dict))
print(sess.run(neg_triple_loss, feed_dict=feed_dict))

In [None]:
entity_embedding_index = []
rel_embedding_index = []
rel_mapping_index = []

#为关系 属性构建索引

entity_ids = [reverse_index_entities[i] for i in list(reverse_index_entities)]
relation_ids = [reverse_index_relation[i] for i in list(reverse_index_relation)]

entities_embedding = sess.run(phs_origin,feed_dict={pos_hs:entity_ids})
relation_embedding = sess.run(prs,feed_dict={pos_rs:relation_ids})
relation_matrix = sess.run(p_matrix,feed_dict={pos_rs:relation_ids})

# url到embedding的三个字典
entities_embedding_dict = dict(zip(list(reverse_index_entities),entities_embedding))
relation_embedding_dict = dict(zip(list(reverse_index_relation),relation_embedding))
relation_matrix_dict = dict(zip(list(reverse_index_relation),relation_matrix))

In [None]:
# 读入训练数据
# 先用hit@10标准测试一下
valid_file = open('链路预测数据集/FB15k/freebase_mtr100_mte100-valid.txt')
valid_data = valid_file.read()
valid_triples = valid_data.split('\n')[:50000]

#生成验证集实体和关系集合
valid_head_entities = list([triple.split('\t')[0] for triple in valid_triples])
valid_relations =  list([triple.split('\t')[1] for triple in valid_triples])
valid_tail_entities =  list([triple.split('\t')[2] for triple in valid_triples])

链接预测

In [None]:
print('开始预测')
import numpy as np
import time

hit10 = 0
hit50 = 0
hit100 = 0
num = 0

def takeFirst(elem):
    return elem[0]

for head,relation,tail in zip(valid_head_entities[:1000],valid_relations[:1000],valid_tail_entities[:1000]):
    num = num+1
    time_start = time.time() #开始计时

    print(head,relation,tail)
    #　头实体嵌入
    valid_head_embedding = entities_embedding_dict[head]
    #　关系的嵌入
    valid_relation_embedding = relation_embedding_dict[relation]
    # 关系的投影
    valid_relation_matrix = relation_matrix_dict[relation]
    
    
    mapped_head = np.dot(valid_relation_matrix,valid_head_embedding).T
    mapped_head_and_relation = mapped_head + valid_relation_embedding

    #　与所有的实体计算距离
    distance_list = []
    for index,entity in enumerate(list(reverse_index_entities)):
        tail_embedding = entities_embedding_dict[entity]
        mapped_tail_embedding = np.dot(valid_relation_matrix,tail_embedding).T
        # 二阶距离
        distance = np.linalg.norm(mapped_head_and_relation - mapped_tail_embedding)
        distance_list.append([distance,entity])

    # 对所有距离进行排序
    distance_list.sort(key=takeFirst)
    
    predict_100_list = list([entity[1] for entity in distance_list])[:100]
    predict_50_list = list([entity[1] for entity in distance_list])[:50]
    predict_10_list = list([entity[1] for entity in distance_list])[:10]

    if tail in predict_100_list:
        hit100 = hit100+1
        
    if tail in predict_50_list:
        hit50 = hit50+1
        
    if tail in predict_10_list:
        hit10 = hit10 + 1

    print('hit',hit10/num,hit50/num,hit100/num,num)