In [1]:
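# Reading and persisting triples: one of rdflib's main features is parsing a file written in a concrete syntax (e.g. XML, N3, N-Triples, TriX, JSON) into an RDF graph.
# Formats supported for parsing and serialization: RDF/XML, N3, NTriples, N-Quads, Turtle, TriX, RDFa, Microdata, JSON-LD.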
from rdflib import Graph 
import random
import numpy as np
import tensorflow as tf
import math
import datetime as dt
# cPickle可以对任意一种类型的python对象进行序列化操作
import cPickle
# kitchen includes functions to make gettext easier to use, handling unicode text easier 
# (conversion with bytes, outputting xml, and calculating how many columns a string takes)
from kitchen.text.converters import getwriter, to_bytes, to_unicode
# i18n => 国际化：https://pythonhosted.org/kitchen/api-i18n.html
from kitchen.i18n import get_translation_object
# Load the gettext catalogue for the 'example' domain through kitchen.i18n.
translations = get_translation_object('example')
# Short aliases for the catalogue's ugettext / lgettext lookup methods.
# NOTE(review): nothing in the visible cells calls `_` or `b_` - confirm they are still needed.
_ = translations.ugettext
b_ = translations.lgettext

In [2]:
import platform
# Report the interpreter and TensorFlow versions this notebook was executed with.
runtime_versions = (platform.python_version(), tf.__version__)
print('Python version: %s' % runtime_versions[0])
print('Tensorflow version: %s' % runtime_versions[1])

Python version: 2.7.16
Tensorflow version: 1.9.0


In [3]:
### Combine two KGs
## Load the dataset
# DWY-NB consists of two datasets, DY-NB and DW-NB; each dataset is a pair of KGs that can be used to evaluate EA techniques.
# The two KGs of DY-NB are subsets of DBpedia [Auer et al., 2007] and Yago [Hoffart et al., 2013], respectively.
# The two KGs of DW-NB are subsets of DBpedia and Wikidata [Vrandecic and Krotzsch, 2014].

# "ttl" is short for Turtle, one of the serialization formats for RDF data.
# RDF was originally written as RDF/XML, which is verbose; Turtle is much more direct and readable.
# Example of the data in Turtle notation:
# @prefix entity: <http://www.wikidata.org/entity#>
# @prefix rdf-schema: <http://www.w3.org/2000/01/rdf-schema#>
# @prefix XMLSchema: <http://www.w3.org/2001/XMLSchema#>
# entity:Q1376298 rdf-schema:label "Europe"
# entity:Q5312467 entity:P569 "1821-09-26" ^^XMLSchema:date
# Despite the .ttl extension, the files are parsed as N-Triples (format='nt') below.
lgd_filename = 'DWY-NB/DW-NB/wd.ttl'  # (The subset of Wikidata KG)
dbp_filename = 'DWY-NB/DW-NB/dbp_wd.ttl' # (The subset of DBpedia KG)
map_file = 'DWY-NB/DW-NB/mapping_wd.ttl' # (The known entity alignment as testing data)

# Merge both knowledge graphs into a single rdflib graph.
graph = Graph()
for kg_filename in (lgd_filename, dbp_filename):
    graph.parse(location=kg_filename, format='nt')

# The known alignments are parsed into a graph of their own.
map_graph = Graph()
map_graph.parse(location=map_file, format='nt')

<Graph identifier=N5abe871c1af34c318384399cf826d73c (<class 'rdflib.graph.Graph'>)>

In [4]:
# Map every subject that has an rdfs:label triple to that label (as unicode text).
entity_label_dict = {}

for s, p, o in graph:
    # Skip everything that is not an rdfs:label statement.
    if unicode(p) != u'http://www.w3.org/2000/01/rdf-schema#label':
        continue
    # A later label for the same subject overwrites an earlier one.
    entity_label_dict[s] = unicode(o)

In [5]:
# For every subject, count how many triples of the merged graph it is the subject of.
num_subj_triple = {}
for s, p, o in graph:
    num_subj_triple[s] = num_subj_triple.get(s, 0) + 1

In [6]:
### Automatically extracted intersection predicates ###
# Predicate URIs listed as the intersection of the two KGs; the triple loop
# routes a triple to data_uri / data_literal when its predicate is in this
# list and to data_uri_0 / data_literal_0 otherwise.
intersection_predicates = [
    'http://www.wikidata.org/entity/P36',
    'http://www.wikidata.org/entity/P185',
    'http://www.wikidata.org/entity/P345',
    'http://www.wikidata.org/entity/P214',
    'http://www.wikidata.org/entity/P40',
    'http://www.wikidata.org/entity/P569',
    'http://www.wikidata.org/entity/P102',
    'http://www.wikidata.org/entity/P175',
    'http://www.wikidata.org/entity/P131',
    'http://www.wikidata.org/entity/P577',
    'http://www.wikidata.org/entity/P140',
    'http://www.wikidata.org/entity/P400',
    'http://www.wikidata.org/entity/P736',
    'http://www.wikidata.org/entity/P1432',
    'http://www.wikidata.org/entity/P159',
    'http://www.wikidata.org/entity/P136',
    'http://www.wikidata.org/entity/P1477',
    'http://www.wikidata.org/entity/P227',
    'http://www.wikidata.org/entity/P6',
    'http://www.wikidata.org/entity/P108',
    'http://www.wikidata.org/entity/P585',
    'http://www.wikidata.org/entity/P239',
    'http://www.wikidata.org/entity/P98',
    'http://www.wikidata.org/entity/P54',
    'http://www.wikidata.org/entity/P17',
    'http://www.wikidata.org/entity/P244',
    'http://www.wikidata.org/entity/P238',
    'http://www.wikidata.org/entity/P287',
    'http://www.wikidata.org/entity/P570',
    'http://www.wikidata.org/entity/P176',
    'http://www.wikidata.org/entity/P119',
    'http://www.wikidata.org/entity/P230',
    'http://www.wikidata.org/entity/P50',
    'http://www.wikidata.org/entity/P57',
    'http://www.wikidata.org/entity/P969',
    'http://www.wikidata.org/entity/P20',
    'http://www.wikidata.org/entity/P374',
    'http://www.wikidata.org/entity/P19',
    'http://www.wikidata.org/entity/P84',
    'http://www.wikidata.org/entity/P166',
    'http://www.wikidata.org/entity/P571',
    'http://www.wikidata.org/entity/P184',
    'http://www.wikidata.org/entity/P473',
    'http://www.wikidata.org/entity/P219',
    'http://www.wikidata.org/entity/P170',
    'http://www.wikidata.org/entity/P26',
    'http://www.wikidata.org/entity/P580',
    'http://www.wikidata.org/entity/P1015',
    'http://www.wikidata.org/entity/P408',
    'http://www.wikidata.org/entity/P172',
    'http://www.wikidata.org/entity/P220',
    'http://www.wikidata.org/entity/P177',
    'http://www.wikidata.org/entity/P178',
    'http://www.wikidata.org/entity/P161',
    'http://www.wikidata.org/entity/P27',
    'http://www.wikidata.org/entity/P742',
    'http://www.wikidata.org/entity/P607',
    'http://www.wikidata.org/entity/P286',
    'http://www.wikidata.org/entity/P361',
    'http://www.wikidata.org/entity/P1082',
    'http://www.wikidata.org/entity/P344',
    'http://www.wikidata.org/entity/P106',
    'http://www.wikidata.org/entity/P112',
    'http://www.wikidata.org/entity/P1036',
    'http://www.wikidata.org/entity/P229',
    'http://www.w3.org/2000/01/rdf-schema#label',
    'http://www.wikidata.org/entity/P126',
    'http://www.wikidata.org/entity/P750',
    'http://www.wikidata.org/entity/P144',
    'http://www.wikidata.org/entity/P69',
    'http://www.wikidata.org/entity/P264',
    'http://www.wikidata.org/entity/P218',
    'http://www.wikidata.org/entity/P110',
    'http://www.wikidata.org/entity/P86',
    'http://www.wikidata.org/entity/P957',
    'http://www.wikidata.org/entity/P1040',
    'http://www.wikidata.org/entity/P200',
    'http://www.wikidata.org/entity/P605',
    'http://www.wikidata.org/entity/P118',
    'http://www.wikidata.org/entity/P127',
]

# Second name bound to the very same list object (an alias, not a copy).
intersection_predicates_uri = intersection_predicates

In [7]:
import rdflib
import re # 正则表达式
import collections

# Fixed length of the character-index array that getLiteralArray builds for each literal / label
literal_len = 10

def dataType(string):
    """Guess a coarse data type for an untyped literal from its leading characters.

    Args:
        string: the text to classify (``str`` / ``unicode`` or a subclass).

    Returns:
        ``"float"`` if the text starts with ``<digits>.<digits>``, otherwise
        ``"integer"`` if it starts with a digit, otherwise ``"string"``.

    Note:
        ``re.match`` anchors only at the start of the text, so this is a
        prefix test: ``"1821-09-26"`` and ``"12abc"`` are both reported as
        ``"integer"``. That behaviour is kept as-is because the returned
        label is used as a vocabulary key by the calling code, so changing
        the classification would change the generated data.
    """
    # Most specific pattern first. Raw strings avoid the invalid '\.' escape
    # of a plain string literal; the compiled regex is the same.
    if re.match(r'[0-9]+\.[0-9]+', string):
        return "float"
    if re.match(r'[0-9]+', string):
        return "integer"
    # Alphanumeric text and everything else (including '') is a string.
    return "string"

def getRDFData(o):
    """Pair an RDF term with a normalised, lower-case data-type label.

    Args:
        o: an rdflib term. Anything that is not a ``URIRef`` is read through
           its ``datatype`` attribute, i.e. is expected to behave like an
           rdflib ``Literal``.

    Returns:
        A tuple ``(o, data_type)``. ``data_type`` is ``"uri"`` for a
        ``URIRef``. For other terms it is the fragment after ``#`` of the
        declared datatype URI (lower-cased), or the guess made by
        ``dataType`` when no datatype is declared or the datatype URI has no
        ``#``. ``gmonthday`` / ``gyear`` are folded into ``date`` and
        ``positiveinteger`` / ``int`` / ``nonnegativeinteger`` into
        ``integer``.
    """
    # URIs need no further inspection.
    if isinstance(o, rdflib.term.URIRef):
        return o, "uri"

    declared = o.datatype
    if declared is not None and "#" in declared:
        # e.g. 'http://www.w3.org/2001/XMLSchema#date' -> 'date'
        data_type = declared.split('#')[1].lower()
    else:
        # No usable declared type: infer one from the lexical form.
        data_type = dataType(o)

    # Collapse partial-date types into 'date'.
    if data_type in ('gmonthday', 'gyear'):
        data_type = 'date'
    # Collapse the integer variants into 'integer'.
    if data_type in ('positiveinteger', 'int', 'nonnegativeinteger'):
        data_type = 'integer'
    return o, data_type

def invert_dict(d):
    """Return a new dict that maps every value of ``d`` back to its key.

    Args:
        d: a mapping whose values are hashable.

    Returns:
        A new ``dict`` ``{value: key}``; ``d`` itself is not modified. If
        several keys share one value, only the pair iterated last is kept.
    """
    # items() exists on Python 2 and 3 alike, whereas iteritems() is Python 2 only.
    return {value: key for key, value in d.items()}

def getLiteralArray(o, literal_len, char_vocab):
    """Encode a term as a fixed-length list of character ids.

    Args:
        o: a ``(term, data_type)`` pair as returned by ``getRDFData``.
        literal_len: length of the returned list.
        char_vocab: dict mapping character -> id. It is updated in place:
            an unseen character receives ``len(char_vocab)`` as its id.

    Returns:
        A list of ``literal_len`` ints. For a non-URI term the characters of
        the term itself are encoded; for a URI the characters of its label
        in the module-level ``entity_label_dict`` are encoded. Text longer
        than ``literal_len`` is truncated, shorter text is right-padded with
        0, and a URI without a label yields all zeros.
    """
    term, kind = o[0], o[1]
    encoded = [0] * literal_len

    # Choose the text to encode: the literal itself, or the URI's label.
    if kind != 'uri':
        text = term
    else:
        text = entity_label_dict.get(term)
        if text is None:
            # URI without a known label: nothing to encode.
            return encoded

    for pos in range(min(literal_len, len(text))):
        ch = text[pos]
        if char_vocab.get(ch) is None:
            # First occurrence of this character: give it the next free id.
            char_vocab[ch] = len(char_vocab)
        encoded[pos] = char_vocab[ch]

    return encoded

# ---- Vocabularies: every dict maps a term to the integer id it was given ----
# Entity vocabulary, filled by the triple loop (term -> id).
entity_vocab = {}
# Ids (not terms) of the subjects whose URI starts with the DBpedia resource prefix.
entity_dbp_vocab = []

# Per-KG term lists collected for negative sampling.
entity_dbp_vocab_neg = []
entity_lgd_vocab_neg = []

# Predicate vocabulary; id 0 is reserved for the '<NONE>' placeholder.
predicate_vocab = {'<NONE>': 0}

# Vocabulary over every subject and object term, URIs and literals alike.
entity_literal_vocab = {}
# Per-KG lists of those terms, again for negative sampling.
entity_literal_dbp_vocab_neg = []
entity_literal_lgd_vocab_neg = []

# ---- Encoded triples ----
# Record layout: [ [[s, p, o, p_trans], [chars], predicate_weight], ... ]
# Triples whose object is a URI: data_uri holds those with a predicate from
# intersection_predicates, data_uri_0 holds the rest.
data_uri = []
data_uri_0 = []

# Same split for triples whose object is a literal.
data_literal_0 = []
data_literal = []

# Records produced by the one-hop predicate transformation in the triple loop.
data_uri_trans = []
data_literal_trans = []

# Character vocabulary; id 0 is reserved for padding.
char_vocab = {'<pad>': 0}

# Number of occurrences of each predicate (converted into a weight after the loop).
pred_weight = {}

# Running count of triples seen.
num_triples = 0

# Single pass over the merged graph. For every triple it
#   * counts the triple and its predicate (num_triples, pred_weight),
#   * gives integer ids to unseen subjects / predicates / objects,
#   * records each newly seen term in the per-KG negative-sampling lists,
#   * appends an encoded record [[s_id, p_id, o_id, p_trans_id], char_ids]
#     to one of the data_* lists.
# Ids are handed out in first-seen order and getLiteralArray extends
# char_vocab as a side effect, so the order of the statements below matters.
for s, p, o in graph:

    num_triples += 1

    # From here on s, p and o are (term, data_type) pairs.
    s = getRDFData(s)
    p = getRDFData(p)
    o = getRDFData(o)

    # Count predicate occurrences.
    pred_weight[p[0]] = pred_weight.get(p[0], 0) + 1

    # KG membership of everything in this triple is decided by the SUBJECT's
    # namespace: the object (which may be a literal) is filed under the same
    # KG as its subject.
    subject_in_dbp = unicode(s[0]).startswith(u'http://dbpedia.org/resource/')

    ## Vocabulary over all terms (URIs and literals), used to find negative samples.
    if s[0] not in entity_literal_vocab:
        entity_literal_vocab[s[0]] = len(entity_literal_vocab)
        if subject_in_dbp:
            entity_literal_dbp_vocab_neg.append(s[0])
        else:
            entity_literal_lgd_vocab_neg.append(s[0])

    if o[0] not in entity_literal_vocab:
        entity_literal_vocab[o[0]] = len(entity_literal_vocab)
        if subject_in_dbp:
            entity_literal_dbp_vocab_neg.append(o[0])
        else:
            entity_literal_lgd_vocab_neg.append(o[0])

    # Entity id for the subject; DBpedia subjects additionally have their id
    # stored in entity_dbp_vocab.
    if s[0] not in entity_vocab:
        idx = len(entity_vocab)
        entity_vocab[s[0]] = idx
        if subject_in_dbp:
            entity_dbp_vocab.append(idx)
            entity_dbp_vocab_neg.append(s[0])
        else:
            entity_lgd_vocab_neg.append(s[0])

    # Predicate id.
    if p[0] not in predicate_vocab:
        predicate_vocab[p[0]] = len(predicate_vocab)

    if o[1] == 'uri':
        # The object is itself an entity: give it an entity id as well.
        if o[0] not in entity_vocab:
            entity_vocab[o[0]] = len(entity_vocab)
            if subject_in_dbp:
                entity_dbp_vocab_neg.append(o[0])
            else:
                entity_lgd_vocab_neg.append(o[0])
        # Character ids of the object's label (all zeros when it has none).
        literal_object = getLiteralArray(o, literal_len, char_vocab)
        if unicode(p[0]) not in intersection_predicates_uri:
            # Predicate is not shared between the KGs.
            data_uri_0.append([[entity_vocab[s[0]], predicate_vocab[p[0]], entity_vocab[o[0]], 0], literal_object])
        else:
            # Predicate is shared between the KGs.
            data_uri.append([[entity_vocab[s[0]], predicate_vocab[p[0]], entity_vocab[o[0]], 0], literal_object])
            ### DATA TRANS
            # One-hop expansion: for every triple (o, p1, o1) that has the
            # current object as its subject, a composed record (s, p, o1)
            # with p1 in the fourth slot may be emitted.
            for g1 in graph.triples((o[0], None, None)):
                s1, p1, o1 = g1

                s1 = getRDFData(s1)
                p1 = getRDFData(p1)
                o1 = getRDFData(o1)

                # o1 gets an entity id here even when it is a literal.
                if o1[0] not in entity_vocab:
                    entity_vocab[o1[0]] = len(entity_vocab)
                # NOTE(review): unlike the vocabulary blocks above, this
                # append is NOT nested under the "new entry" test, so o1 is
                # appended once per visit and may appear repeatedly in the
                # negative lists - confirm this is intended.
                if unicode(s1[0]).startswith(u'http://dbpedia.org/resource/'):
                    entity_dbp_vocab_neg.append(o1[0])
                else:
                    entity_lgd_vocab_neg.append(o1[0])

                # The data-type label of o1 (e.g. 'uri', 'string') is also
                # registered as an entity; data_literal_trans uses that id.
                if o1[1] not in entity_vocab:
                    entity_vocab[o1[1]] = len(entity_vocab)
                if p1[0] not in predicate_vocab:
                    predicate_vocab[p1[0]] = len(predicate_vocab)

                # Only compose when the two predicates differ and the subject
                # has at least one shared predicate.
                # NOTE(review): p is itself a predicate of s and is in the
                # shared list in this branch, so the second test looks
                # always true - confirm before simplifying.
                if p[0] != p1[0] and len(
                        set(unicode(x) for x in graph.predicates(s[0]))
                        .intersection(set(intersection_predicates_uri))) != 0:
                    if isinstance(o1[0], rdflib.term.URIRef) and unicode(p1[0]) in intersection_predicates_uri:
                        # URI object reached through a shared predicate.
                        data_uri_trans.append([[entity_vocab[s[0]], predicate_vocab[p[0]], entity_vocab[o1[0]], predicate_vocab[p1[0]]], getLiteralArray(o1, literal_len, char_vocab)])
                    elif isinstance(o1[0], rdflib.term.Literal) and unicode(p1[0]) == u'http://www.w3.org/2000/01/rdf-schema#label':
                        # Literal reached through rdfs:label; the object slot
                        # holds the id of the literal's data type.
                        data_literal_trans.append([[entity_vocab[s[0]], predicate_vocab[p[0]], entity_vocab[o1[1]], predicate_vocab[p1[0]]], getLiteralArray(o1, literal_len, char_vocab)])
    else:
        # Literal object: the entity slot holds the id of its DATA TYPE; the
        # literal's text is carried by the character-id array.
        if o[1] not in entity_vocab:
            entity_vocab[o[1]] = len(entity_vocab)
        literal_object = getLiteralArray(o, literal_len, char_vocab)
        if unicode(p[0]) not in intersection_predicates:
            # Predicate is not shared between the KGs.
            data_literal_0.append([[entity_vocab[s[0]], predicate_vocab[p[0]], entity_vocab[o[1]], 0], literal_object])
        else:
            # Predicate is shared between the KGs.
            data_literal.append([[entity_vocab[s[0]], predicate_vocab[p[0]], entity_vocab[o[1]], 0], literal_object])

# Reverse lookups: integer id -> original term / predicate / character.
reverse_entity_vocab = invert_dict(entity_vocab)
reverse_predicate_vocab = invert_dict(predicate_vocab)
reverse_char_vocab = invert_dict(char_vocab)
reverse_entity_literal_vocab = invert_dict(entity_literal_vocab)


def _append_predicate_weight(records):
    """Append the predicate weight to every record of ``records`` in place.

    Each record is ``[[s_id, p_id, o_id, p_trans_id], char_ids]``. The
    weight appended is the one-element list
    ``[occurrences of predicate p_id / total number of triples]``, giving
    the final layout ``[[s, p, o, p_trans], [chars], [predicate_weight]]``.

    Args:
        records: one of the ``data_*`` lists built by the triple loop.
    """
    total = float(num_triples)
    for record in records:
        predicate = reverse_predicate_vocab.get(record[0][1])
        record.append([pred_weight.get(predicate) / total])


# Add the predicate weight to all six datasets.
for dataset in (data_uri, data_uri_0, data_uri_trans,
                data_literal, data_literal_0, data_literal_trans):
    _append_predicate_weight(dataset)

# With fewer than 100 transformed URI records the list is simply doubled.
if len(data_uri_trans) < 100:
    data_uri_trans = data_uri_trans+data_uri_trans

print (len(entity_vocab), len(predicate_vocab), len(char_vocab), len(entity_dbp_vocab))

(273161, 1172, 786, 61502)


In [20]:
# Persist the vocabularies and encoded datasets with pickle so that later
# steps can reload them from disk instead of re-parsing the graphs.
# NOTE(review): data_literal_trans is built above but not written here -
# confirm whether that is intentional.
pickle_targets = [
    (entity_literal_vocab, "data/vocab_all.pickle"),
    (char_vocab, "data/vocab_char.pickle"),
    (entity_vocab, "data/vocab_entity.pickle"),
    (predicate_vocab, "data/vocab_predicate.pickle"),
    (entity_dbp_vocab, "data/vocab_kb1.pickle"),
    (entity_dbp_vocab_neg, "data/vocab_kb1_neg.pickle"),
    (entity_lgd_vocab_neg, "data/vocab_kb2_neg.pickle"),
    (entity_label_dict, "data/entity_label.pickle"),
    (entity_literal_dbp_vocab_neg, "data/vocab_kb1_all_neg.pickle"),
    (entity_literal_lgd_vocab_neg, "data/vocab_kb2_all_neg.pickle"),
    (data_uri, "data/data_uri.pickle"),
    (data_uri_0, "data/data_uri_n.pickle"),
    (data_literal, "data/data_literal.pickle"),
    (data_literal_0, "data/data_literal_n.pickle"),
    (data_uri_trans, "data/data_trans.pickle"),
]

for dump_obj, dump_path in pickle_targets:
    # The with-block closes (and thereby flushes) each file as soon as it is
    # written; the bare open() calls used before never closed their handles.
    with open(dump_path, "wb") as dump_file:
        cPickle.dump(dump_obj, dump_file)