Generates dataset (entities and relation dictionaries; train, test, validation files) in n-triples format for training KGE models

In [70]:
DATASETS = ['GALEN']
FILE_PATH = 'data\{}\{}_norm_mod.owl'
ENTITY_PATH = 'data\{}\proscutes\entities.dict'
RELATION_PATH = "data\{}\proscutes\\relations.dict"
SAVE_PATH = 'data\{}\proscutes\{}.txt'
DATA_PATH = 'data\{}\{}_{}.txt'

In [71]:
def getEntityRelations(filename, all_subcls):
    classes = {}
    relations = {}
    triples = {}
    with open(filename) as f:
        for line in f:
            # Ignore SubObjectPropertyOf
            if line.startswith('SubObjectPropertyOf'):
                line = line.strip()[20:-1]
                if line.startswith('ObjectPropertyChain'):
                    line_chain = line.strip()[20:-1]
                    line1 = line.split(")")
                    line10 = line1[0].split()
                    r1 = line10[0].strip()
                    r2 = line10[1].strip()
                    r3 = line1[1].strip()
                    if r1 not in relations:
                        relations[r1] = len(relations)
                    if r2 not in relations:
                        relations[r2] = len(relations)
                    if r3 not in relations:
                        relations[r3] = len(relations)
                else:
                    it = line.split(' ')
                    r1 = it[0].strip()
                    r2 = it[1].strip()
                    if r1 not in relations:
                        relations[r1] = len(relations)
                    if r2 not in relations:
                        relations[r2] = len(relations)
            # Ignore SubClassOf()
            line = line.strip()[11:-1]
            if not line:
                continue
            if line.startswith('ObjectIntersectionOf('):
                # C and D SubClassOf E
                # triple????
                it = line.split(' ')
                c = it[0][21:]
                d = it[1][:-1]
                e = it[2]
                if c not in classes:
                    classes[c] = len(classes)
                if d not in classes:
                    classes[d] = len(classes)
                if e not in classes:
                    classes[e] = len(classes)
                form = 'nf2'
                if e == 'owl:Nothing':
                    form = 'disjoint'
            elif line.startswith('ObjectSomeValuesFrom('):
                # R some C SubClassOf D
                # (d, r, c)
                it = line.split(' ')
                r = it[0][21:].strip()
                c = it[1][:-1]
                d = it[2]
                if c not in classes:
                    classes[c] = len(classes)
                if d not in classes:
                    classes[d] = len(classes)
                if r not in relations:
                    relations[r] = len(relations)
                if d not in triples:
                    triples[d] = {}
                if c not in triples[d]:
                    triples[d][c] = []
                triples[d][c].append(r)
            elif line.find('ObjectSomeValuesFrom') != -1:
                # C SubClassOf R some D
                # (c, r, d)
                it = line.split(' ')
                c = it[0]
                r = it[1][21:].strip()
                d = it[2][:-1]
                if c not in classes:
                    classes[c] = len(classes)
                if d not in classes:
                    classes[d] = len(classes)
                if r not in relations:
                    relations[r] = len(relations)
                if c not in triples:
                    triples[c] = {}
                if d not in triples[c]:
                    triples[c][d] = []
                triples[c][d].append(r)
            else:
                # C SubClassOf D
                # (c, subclass, d)
                it = line.split(' ')
                c = it[0]
                d = it[1]
                r = 'SubClassOf'
                if r not in relations:
                    relations[r] = len(relations)
                if c not in classes:
                    classes[c] = len(classes)
                if d not in classes:
                    classes[d] = len(classes)
                if c not in triples:
                    triples[c] = {}
                if d not in triples[c]:
                    triples[c][d] = []
                triples[c][d].append(r)
    if 'owl:Thing' not in classes:
        classes['owl:Thing'] = len(classes)
    prot_ids = []
    class_keys = list(classes.keys())
    for val in all_subcls:
        if val not in class_keys:
            cid = len(classes)
            classes[val] = cid
            prot_ids.append(cid)
        else:
            prot_ids.append(classes[val])
    return classes, relations, triples

In [72]:
def load_cls(data_file):
    subs=list()
    counter=0
    with open(data_file,'r') as f:
        for line in f:
            counter+=1
            it = line.strip().split()
            cls1 = it[0]
            cls2 = it[1]
            subs.append(cls1)
            subs.append(cls2)
    train_cls = list(set(subs))
    return train_cls,counter

In [73]:
def getTriples(path, triples):
  tuples = []
  with open(path) as f:
    for line in f:
      it = line.strip().split()
      cls1 = it[0]
      cls2 = it[1]
      if cls1 in triples:
        if cls2 in triples[cls1]:
          for r in triples[cls1][cls2]:
            tuples.append(cls1 + '\t' + r + '\t' + cls2)
        else:
          print(line)
      else:
        print(line)
  return tuples

In [None]:
for dataset in DATASETS:
  print("DATASET ", dataset)
  file_path = FILE_PATH.format(dataset, dataset)
  train_cls,_ = load_cls(DATA_PATH.format(dataset, dataset, "train"))
  valid_cls,_ = load_cls(DATA_PATH.format(dataset, dataset, "valid"))
  classes, relations, triples = getEntityRelations(file_path, train_cls+valid_cls)
  for type in ["train", "valid", "test"]:
    tups = getTriples(DATA_PATH.format(dataset, dataset, type), triples)
    with open(SAVE_PATH.format(dataset, type), 'w') as f:
      f.write('\n'.join(tups))
  with open(ENTITY_PATH.format(dataset), 'w') as f:
    for class_, id in classes.items():
      f.write(str(id) + '\t' + class_ + '\n')
  with open(RELATION_PATH.format(dataset), 'w') as f:
    for relation_, id in relations.items():
      f.write(str(id) + '\t' + relation_ + '\n')

In [69]:
with open(path, 'w') as f:
  f.write('\n'.join(l))

# Data generation for KGE training

In [15]:
from rdflib import Graph
import rdflib
from tqdm import tqdm

## GO

In [16]:
g = Graph()
g.parse('data\KGE_data\go\go_turtle.owl', format='turtle')

<Graph identifier=N1acd16a1f3e0473dbb207cdb5536ad05 (<class 'rdflib.graph.Graph'>)>

In [68]:
entities = {}
relations = {}
nt = {}
train = []
valid = []
test = []
for s, p, o in g:
  # print(1, s)
  # print(2, p)
  # print(3, o)
  # print(s, '\t', p, '\t', o)
  if type(s) == rdflib.term.Literal or type(o) == rdflib.term.Literal:
    continue
  if str(s) not in entities:
    entities[str(s)] = len(entities)
    # print(s)
    # break
  if str(o) not in entities:
    entities[str(o)] = len(entities)
  if str(p) not in relations:
    relations[str(p)] = len(relations)
  if type(o) == rdflib.term.BNode or type(s) == rdflib.term.BNode:
    train.append(
      str(s) + '\t' + str(p) +'\t' + str(o)
    )
    continue
  if s not in nt:
    nt[str(s)] = {}
  if o not in nt[str(s)]:
    nt[str(s)][str(o)] = []
  nt[str(s)][str(o)].append(str(p))
  # break
print(len(entities))
print(len(relations))
print(len(nt))

247733
22
50780


In [69]:
tups = [
  (
    "data\GO\GO_train.txt",
    train,
    "data\GO\KG\\train.txt",
    False
  ),
  (
    "data\GO\GO_test.txt",
    test,
    "data\GO\KG\\test.txt",
    True
  ),
  (
    "data\GO\GO_valid.txt",
    valid,
    "data\GO\KG\\valid.txt",
    True
  )
]

In [70]:
for path, l, savepath, subclass_only in tups:
  with open(path, 'r') as f:
    c=0
    d=0
    for line in f:
      c+=1
      a, b = line.split()
      a = a.strip('<').strip('>')
      b = b.strip('<').strip('>')
      if a in nt:
        if b in nt[a]:
          if subclass_only:
            r = "http://www.w3.org/2000/01/rdf-schema#subClassOf"
            if r in nt[a][b]:
              l.append(a + '\t' + r +'\t' + b)
              d+=1
            # else:
            #   print(nt[a][b], a, b)
          else:
            for r in nt[a][b]:
              l.append(a + '\t' + r +'\t' + b)
              d+=1
  print(path, c)
  print(savepath, d, len(l))
  with open(savepath, 'w') as f:
    f.write(
      '\n'.join(l)
    )

data\GO\GO_train.txt 59829
data\GO\KG\train.txt 16779 602881
data\GO\GO_test.txt 8547
data\GO\KG\test.txt 2414 2414
data\GO\GO_valid.txt 17093
data\GO\KG\valid.txt 4898 4898


In [49]:
with open("data\GO\KG\\relations.dict", 'w') as f:
  for k,v in relations.items():
    f.write(str(v) + '\t' + k + '\n')


## GALEN

In [71]:
g = Graph()
g.parse('data\KGE_data\galen\galen_turtle.owl', format='turtle')

<Graph identifier=N55591a507b05470d9dd343e7ecde4125 (<class 'rdflib.graph.Graph'>)>

In [72]:
entities = {}
relations = {}
nt = {}
train = []
valid = []
test = []
for s, p, o in g:
  # print(1, s)
  # print(2, p)
  # print(3, o)
  # print(s, '\t', p, '\t', o)
  if type(s) == rdflib.term.Literal or type(o) == rdflib.term.Literal:
    continue
  if str(s) not in entities:
    entities[str(s)] = len(entities)
    # print(s)
    # break
  if str(o) not in entities:
    entities[str(o)] = len(entities)
  if str(p) not in relations:
    relations[str(p)] = len(relations)
  if type(o) == rdflib.term.BNode or type(s) == rdflib.term.BNode:
    train.append(
      str(s) + '\t' + str(p) +'\t' + str(o)
    )
    continue
  if s not in nt:
    nt[str(s)] = {}
  if o not in nt[str(s)]:
    nt[str(s)][str(o)] = []
  nt[str(s)][str(o)].append(str(p))
  # break
print(len(entities))
print(len(relations))
print(len(nt))
tups = [
  (
    "data\GALEN\GALEN_train.txt",
    train,
    "data\GALEN\KG\\train.txt",
    False
  ),
  (
    "data\GALEN\GALEN_test.txt",
    test,
    "data\GALEN\KG\\test.txt",
    True
  ),
  (
    "data\GALEN\GALEN_valid.txt",
    valid,
    "data\GALEN\KG\\valid.txt",
    True
  )
]
for path, l, savepath, subclass_only in tups:
  with open(path, 'r') as f:
    c=0
    d=0
    for line in f:
      c+=1
      a, b = line.split()
      a = a.strip('<').strip('>')
      b = b.strip('<').strip('>')
      if a in nt:
        if b in nt[a]:
          if subclass_only:
            r = "http://www.w3.org/2000/01/rdf-schema#subClassOf"
            if r in nt[a][b]:
              l.append(a + '\t' + r +'\t' + b)
              d+=1
            # else:
            #   print(nt[a][b], a, b)
          else:
            for r in nt[a][b]:
              l.append(a + '\t' + r +'\t' + b)
              d+=1
  print(path, c)
  print(savepath, d, len(l))
  with open(savepath, 'w') as f:
    f.write(
      '\n'.join(l)
    )
with open("data\GALEN\KG\\entities.dict", 'w') as f:
  for k,v in entities.items():
    f.write(str(v) + '\t' + k + '\n')
with open("data\GALEN\KG\\relations.dict", 'w') as f:
  for k,v in relations.items():
    f.write(str(v) + '\t' + k + '\n')


101805
10
24092
data\GALEN\GALEN_train.txt 19511
data\GALEN\KG\train.txt 4596 211817
data\GALEN\GALEN_test.txt 2788
data\GALEN\KG\test.txt 672 672
data\GALEN\GALEN_valid.txt 5573
data\GALEN\KG\valid.txt 1290 1290


## SNOMED

In [74]:
g = Graph()
g.parse('data\KGE_data\snomed\snomed_turtle.owl', format='turtle')

<Graph identifier=Nd1da9555b5844119a25fda00555ed360 (<class 'rdflib.graph.Graph'>)>

In [76]:
entities = {}
relations = {}
nt = {}
train = []
valid = []
test = []
for s, p, o in g:
  # print(1, s)
  # print(2, p)
  # print(3, o)
  # print(s, '\t', p, '\t', o)
  if type(s) == rdflib.term.Literal or type(o) == rdflib.term.Literal:
    continue
  if str(s) not in entities:
    entities[str(s)] = len(entities)
    # print(s)
    # break
  if str(o) not in entities:
    entities[str(o)] = len(entities)
  if str(p) not in relations:
    relations[str(p)] = len(relations)
  if type(o) == rdflib.term.BNode or type(s) == rdflib.term.BNode:
    train.append(
      str(s) + '\t' + str(p) +'\t' + str(o)
    )
    continue
  if s not in nt:
    nt[str(s)] = {}
  if o not in nt[str(s)]:
    nt[str(s)][str(o)] = []
  nt[str(s)][str(o)].append(str(p))
  # break
print(len(entities))
print(len(relations))
print(len(nt))
tups = [
  (
    "data\SNOMED\SNOMED_train.txt",
    train,
    "data\SNOMED\KG\\train.txt",
    False
  ),
  (
    "data\SNOMED\SNOMED_test.txt",
    test,
    "data\SNOMED\KG\\test.txt",
    True
  ),
  (
    "data\SNOMED\SNOMED_valid.txt",
    valid,
    "data\SNOMED\KG\\valid.txt",
    True
  )
]
for path, l, savepath, subclass_only in tups:
  with open(path, 'r') as f:
    c=0
    d=0
    for line in f:
      c+=1
      a, b = line.split()
      a = a.strip('<').strip('>')
      b = b.strip('<').strip('>')
      if a in nt:
        if b in nt[a]:
          if subclass_only:
            r = "http://www.w3.org/2000/01/rdf-schema#subClassOf"
            if r in nt[a][b]:
              l.append(a + '\t' + r +'\t' + b)
              d+=1
            # else:
            #   print(nt[a][b], a, b)
          else:
            for r in nt[a][b]:
              l.append(a + '\t' + r +'\t' + b)
              d+=1
  print(path, c)
  print(savepath, d, len(l))
  with open(savepath, 'w') as f:
    f.write(
      '\n'.join(l)
    )
with open("data\SNOMED\KG\\entities.dict", 'w') as f:
  for k,v in entities.items():
    f.write(str(v) + '\t' + k + '\n')
with open("data\SNOMED\KG\\relations.dict", 'w') as f:
  for k,v in relations.items():
    f.write(str(v) + '\t' + k + '\n')


1621878
10
307756
data\SNOMED\SNOMED_train.txt 312631
data\SNOMED\KG\train.txt 93323 3647565
data\SNOMED\SNOMED_test.txt 14700
data\SNOMED\KG\test.txt 4455 4455
data\SNOMED\SNOMED_valid.txt 89322
data\SNOMED\KG\valid.txt 26410 26410
