In [1]:
import keras
keras.__version__

import tensorflow as tf
import keras.backend.tensorflow_backend as KTF

# Let TensorFlow allocate GPU memory on demand instead of reserving all of it
# up front, then register the resulting session with the Keras backend.
config = tf.ConfigProto(gpu_options=tf.GPUOptions(allow_growth=True))
session = tf.Session(config=config)
KTF.set_session(session)

Using TensorFlow backend.


知识嵌入的数据集

五个数据集：FB15K、FB15K-237、WN18、WN18RR、YAGO3-10

FB15K数据集统计
* 包含14951 个实体 和1345种关系
* 训练集包含483142个三元组，验证集包含50000个三元组，测试集包含59071个三元组
* 使用此数据集时，请引用其原始论文


### 通过Keras实现TransE模型
步骤：
1. 统计一共有多少个实体和关系
2. 将关系和属性一起训练
3. 设定嵌入维度
4. 生成等量的负采样三元组
5. 通过Embedding层进行训练
6. 保存Embedding层参数用于判断
7. 与结果对照

读入数据，统计实体和关系的总个数

In [2]:
# Load the FB15k training split. Every non-empty line is one triple written as
# "head<TAB>relation<TAB>tail". The context manager closes the file handle,
# which the previous bare open() call never did.
with open('链路预测数据集/FB15k/freebase_mtr100_mte100-train.txt', encoding='utf-8') as f:
    data = f.read()

# Drop blank lines (such as the one produced by a trailing newline) rather than
# truncating the list to a hard-coded triple count.
triples = [line for line in data.split('\n') if line.strip()]

# Flat list of every head, relation and tail string, in file order.
totals = [entity_and_relation for triple in triples for entity_and_relation in triple.split('\t')]

# Number of distinct symbols (entities + relations), i.e. the number of rows the
# embedding table needs. For FB15k this is 14951 + 1345 = 16296.
print(len(set(totals)))

16296


In [3]:
# Split each training triple once, then read off the head / relation / tail columns.
split_triples = [triple.split('\t') for triple in triples]
head_entities = [fields[0] for fields in split_triples]
relations = [fields[1] for fields in split_triples]
tail_entities = [fields[2] for fields in split_triples]

# An entity is anything that occurs as a head or as a tail.
print('实体数', len(set(head_entities) | set(tail_entities)))
print('关系数', len(set(relations)))

实体数 14951
关系数 1345


生成负采样三元组

In [4]:
import random

# Build one corrupted (negative) triple for every training triple.
def _draw_replacement(pool, original, max_tries=10):
    """Pick a random element of ``pool`` that differs from ``original``.

    Drawing uniformly from ``pool`` can return ``original`` itself, which would
    make the "negative" triple identical to the positive one. The draw is
    therefore repeated, at most ``max_tries`` times in total, so that a pool
    holding a single repeated value cannot cause an endless loop; in that case
    the last candidate is returned as-is.
    """
    candidate = random.choice(pool)
    for _ in range(max_tries - 1):
        if candidate != original:
            break
        candidate = random.choice(pool)
    return candidate


invalid_head_entities = []
invalid_relations = []
invalid_tail_entities = []

for i, relation in enumerate(relations):
    # Corrupt exactly one side per triple: the tail on even rows, the head on
    # odd rows. Replacement heads are drawn from the observed heads and
    # replacement tails from the observed tails.
    # NOTE(review): a corrupted triple may still be a true triple that occurs
    # elsewhere in the data; no filtering against known triples is done here.
    if i % 2 == 0:
        invalid_head_entities.append(head_entities[i])
        invalid_tail_entities.append(_draw_replacement(tail_entities, tail_entities[i]))
    else:
        invalid_head_entities.append(_draw_replacement(head_entities, head_entities[i]))
        invalid_tail_entities.append(tail_entities[i])
    # The relation is never corrupted.
    invalid_relations.append(relation)

实体编码

In [5]:
# 为每个实体和关系生成数字表示
import numpy as np

# Map every entity/relation string to an integer id (its row in the embedding table).
# The symbols are sorted first so the assignment is deterministic: iterating a
# set of strings directly depends on per-process hash randomisation, so the ids
# would change between kernel sessions and no longer line up with embedding
# weights that were saved in an earlier session.
reverse_index = {key: index for index, key in enumerate(sorted(set(totals)))}

# Encode the golden (observed) triples column by column.
head_entities_encode = [reverse_index[entry] for entry in head_entities]
relations_encode = [reverse_index[relation] for relation in relations]
tail_entities_encode = [reverse_index[entry] for entry in tail_entities]

# Encode the corrupted (negative) triples with the same lookup table.
invalid_head_entities_encode = [reverse_index[entry] for entry in invalid_head_entities]
invalid_relations_encode = [reverse_index[relation] for relation in invalid_relations]
invalid_tail_entities_encode = [reverse_index[entry] for entry in invalid_tail_entities]

模型部分

In [6]:
from keras import models
from keras import layers
from keras import optimizers
from keras import Input
from keras import Model
from keras.layers import Flatten

# Hyper-parameters.
max_word = len(reverse_index)  # one embedding row per distinct entity/relation (16296 for FB15k)
embedding_dim = 100
maxlen = 6                     # 3 ids of the golden triple followed by 3 ids of the corrupted triple
margin = 1                     # ranking margin read by keras_custom_loss_function

# Dummy targets. The custom loss never reads y_true, but Keras still expects a
# target array shaped like the model output. A single float32 array replaces
# the former Python list of one float64 array per training triple, which needed
# more than twice the memory.
y_hat = np.zeros((len(head_entities), maxlen, embedding_dim), dtype='float32')

# Each sample feeds two triples (golden + corrupted) as six scalar id inputs.
golden_head_input = Input(shape=(1,))
golden_relation_input = Input(shape=(1,))
golden_tail_input = Input(shape=(1,))

invalid_head_input = Input(shape=(1,))
invalid_relation_input = Input(shape=(1,))
invalid_tail_input = Input(shape=(1,))

model_inputs = [golden_head_input, golden_relation_input, golden_tail_input,
                invalid_head_input, invalid_relation_input, invalid_tail_input]

# Concatenate the six ids into one sequence of length `maxlen` per sample.
input_concatenate = layers.concatenate(model_inputs, axis=1)

# A single shared Embedding layer looks up all six ids, so entities and
# relations live in the same table.
layer_embedding = layers.Embedding(max_word, embedding_dim, input_length=maxlen)(input_concatenate)
# The embeddings are deliberately left unflattened; the loss slices them per slot.
layer_flatten = layer_embedding

# The model output is the stack of six embeddings per sample.
keras_model = Model(model_inputs, layer_flatten)
keras_model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
input_3 (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
input_4 (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
input_5 (I

In [7]:
import keras.backend.tensorflow_backend as K
import keras

def keras_custom_loss_function(y_true, y_pred):
    """Margin-based TransE ranking loss.

    Args:
        y_true: Unused placeholder target; Keras requires it in the signature.
        y_pred: Model output holding six embeddings per sample along axis 1:
            golden head, golden relation, golden tail, corrupted head,
            corrupted relation, corrupted tail.

    Returns:
        Scalar tensor: the sum over the batch of
        ``relu(margin + E(golden) - E(corrupted))`` where
        ``E(h, r, t) = sum(|h + r - t|)`` (L1 energy). ``margin`` is read from
        the enclosing notebook namespace.
    """
    # Slice along axis 1 (the triple slot). Indexing with y_pred[k] would pick
    # the k-th *sample* of the batch instead of the k-th embedding of each sample.
    g_head_em = y_pred[:, 0]
    g_rel_em = y_pred[:, 1]
    g_tail_em = y_pred[:, 2]
    i_head_em = y_pred[:, 3]
    i_rel_em = y_pred[:, 4]
    i_tail_em = y_pred[:, 5]

    # Both energies use h + r - t; the corrupted one previously added the tail.
    golden_energy = K.sum(K.abs(g_head_em + g_rel_em - g_tail_em), axis=-1)
    invalid_energy = K.sum(K.abs(i_head_em + i_rel_em - i_tail_em), axis=-1)

    # Hinge: penalise only when the golden triple is not at least `margin`
    # lower in energy than its corrupted counterpart.
    return K.sum(K.relu(margin + golden_energy - invalid_energy))


# An explicit RMSprop instance (first positional argument 0.1) is created here,
# but the model is compiled with the string alias 'rmsprop', so this instance
# is not the optimizer that gets used.
optimizer = keras.optimizers.RMSprop(0.1)

keras_model.compile(optimizer='rmsprop', loss=keras_custom_loss_function)

In [None]:
# Sanity check: print the lengths of the encoded head and tail id lists.
print(len(head_entities_encode), len(tail_entities_encode), sep='\n')

483142
483142


In [None]:
# The six id columns are passed in the same order as the model's inputs:
# golden head / relation / tail, then corrupted head / relation / tail.
training_inputs = [
    head_entities_encode,
    relations_encode,
    tail_entities_encode,
    invalid_head_entities_encode,
    invalid_relations_encode,
    invalid_tail_entities_encode,
]

keras_model.fit(training_inputs, [y_hat], epochs=200, batch_size=399)

# Persist the trained model (architecture + weights) next to the notebook.
keras_model.save("TransE.h5")

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200

In [None]:
from keras.models import load_model

# Integer id of every symbol occurrence in `totals` (duplicates included).
total_encode = [reverse_index[symbol] for symbol in totals]

# A second, independent list holding the same ids.
entity_code = list(total_encode)

# Reload the saved model; the custom loss has to be registered by name so that
# Keras can resolve it while deserialising.
custom_objects = {'keras_custom_loss_function': keras_custom_loss_function}
keras_model = load_model("TransE.h5", custom_objects=custom_objects)

In [None]:
# Number of distinct entity/relation symbols.
print(len(set(totals)))

In [None]:
# Distinct symbols and their integer ids, in matching order.
entity_set = list(set(totals))
entity_set_code = [reverse_index[entity] for entity in entity_set]

# The model has six inputs; the same ids are fed to each of them and only the
# embedding in slot 0 of the output is kept.
repeated_inputs = [entity_set_code] * 6
entity_embedding = keras_model.predict(repeated_inputs)[:, 0]

# Lookup table: symbol string -> embedding vector.
TransE_index = dict(zip(entity_set, entity_embedding))

In [None]:
# Serialize the symbol -> embedding lookup table (TransE_index) to disk.
import pickle

# Writes the pickled dictionary to the relative path 'transe.data'.
with open('transe.data', mode="wb+") as f:
    pickle.dump(TransE_index,f)

In [None]:
# Deserialize the pickled TransE_index dictionary.

# NOTE(review): pickle.load can execute arbitrary code, so only load files that
# were produced by this notebook. The result is bound to `data`, replacing
# whatever that name referred to before.
with open('transe.data','rb') as f:
    data = pickle.load(f)

In [None]:
# Embedding of every symbol occurrence in `entity_code`: the same ids are fed
# to all six model inputs and only slot 0 of the output is kept.
TransE_embeddings = keras_model.predict([entity_code] * 6)[:, 0]

In [None]:
# Show the first key of the lookup table, the first row of the per-occurrence
# embeddings, and the vector stored under that first key.
first_symbol = list(TransE_index)[0]
print(first_symbol)
print(TransE_embeddings[0])
print(TransE_index[first_symbol])

链接预测评测代码

In [None]:
# Load the FB15k validation split (used below for a quick hit@k check).
# The context manager closes the file handle once the text has been read.
with open('链路预测数据集/FB15k/freebase_mtr100_mte100-valid.txt', encoding='utf-8') as valid_file:
    valid_data = valid_file.read()

# Keep every non-empty line instead of truncating to a hard-coded triple count.
valid_triples = [line for line in valid_data.split('\n') if line.strip()]

# Head / relation / tail columns of the validation triples.
# The former chained assignment `valid_head_entities = head_entities = ...`
# also overwrote the training `head_entities`; only the validation name is
# bound here.
valid_fields = [triple.split('\t') for triple in valid_triples]
valid_head_entities = [fields[0] for fields in valid_fields]
valid_relations = [fields[1] for fields in valid_fields]
valid_tail_entities = [fields[2] for fields in valid_fields]

In [None]:
# 寻找最近的值
import numpy as np

def find_nearest(array, value):
    """Return the element of ``array`` that is closest to ``value``.

    Args:
        array: Array-like of numbers, of any dimensionality.
        value: Scalar to compare against.

    Returns:
        The element with the smallest absolute difference to ``value``; on
        ties, the first such element in flattened (row-major) order.

    Raises:
        ValueError: If ``array`` is empty (raised by ``argmin``).
    """
    array = np.asarray(array)

    # argmin() without an axis yields an index into the *flattened* array, so
    # the lookup must go through the flat view as well; plain array[idx] would
    # index axis 0 and return the wrong item (or fail) for multi-dimensional input.
    idx = np.abs(array - value).argmin()

    return array.flat[idx]

In [None]:
import operator

def takeFirst(elem):
    """Sort-key helper: return the item stored at index 0 of ``elem``."""
    return operator.itemgetter(0)(elem)

# Link-prediction smoke test on the first ten validation triples: predict the
# tail as head + relation and rank every stored embedding by its distance to
# that prediction.
# NOTE(review): ranking uses the L2 norm although the training loss uses an L1
# energy, and the candidate set contains relation vectors as well as entities.
for head, relation, tail in zip(valid_head_entities[:10], valid_relations[:10], valid_tail_entities[:10]):
    # Embeddings of the known head entity and relation.
    valid_head_embedding = TransE_index[head]
    valid_relation_embedding = TransE_index[relation]

    # TransE models a true triple as head + relation ~= tail.
    predict_tail_embedding = valid_head_embedding + valid_relation_embedding

    # Distance from the predicted tail to every stored embedding.
    distance_list = [
        [np.linalg.norm(predict_tail_embedding - embedding), entity]
        for entity, embedding in TransE_index.items()
    ]

    # Sort once, after all distances are known (previously the list was
    # re-sorted after every single append).
    distance_list.sort(key=takeFirst)

    # Candidate names in rank order. Hits must be tested against these names:
    # `tail in distance_list[:k]` compared a string with [distance, entity]
    # pairs and therefore could never be true.
    ranked_entities = [entity for _, entity in distance_list]

    print(ranked_entities[0])

    # Show the ten best-ranked candidates next to the expected tail.
    print('预测', distance_list[:10])
    print('标签', tail)
    if tail in ranked_entities[:5000]:
        print('hit@5000yes')
    else:
        print('hit@5000no')

    if tail in ranked_entities[:10]:
        print('hit@10yes')
    else:
        print('hit@10no')

    # Blank separator after every evaluated triple, hit or miss.
    print('\n')


In [None]:
 print(list(TransE_index)[:10])
for key in list(TransE_index)[:10]:
    print(TransE_index[key])