# Create dataset for each relation type (for EmEL and variants)
* Create 3 dataset variants, one per relation type (`1_1`, `1_n`, `n_n`)
* Create train, test, validation splits optimally

In [8]:
from collections import Counter

## Create 3 dataset variants, one per relation type (1_1, 1_n, n_n)

In [2]:
# Datasets whose ontology file is split into one variant per relation type.
DATASETS = ['GALEN', 'GO', 'SNOMED']
# Path templates use forward slashes: they work on every platform, whereas
# a backslash before "{" in a normal string literal is an invalid escape
# sequence and only names a directory separator on Windows.
# FILE_PATH is formatted with (dataset, dataset).
FILE_PATH = 'data/{}/{}_norm_mod.owl'
# SAVE_PATH is formatted with (dataset, dataset, relation_type).
SAVE_PATH = 'data/{}/{}_norm_{}.owl'

In [3]:
def getRelation(line, ObjectIntersectionOf=False):
  """Parse the body of a normalized ``SubClassOf`` axiom.

  ``line`` is the axiom text with the leading ``SubClassOf(`` already
  removed (the callers in this file pass ``line[11:]``), with or without a
  trailing newline, in one of these shapes::

      ObjectSomeValuesFrom(<r> <c>) <d>)     # R some C SubClassOf D
      <c> ObjectSomeValuesFrom(<r> <d>))     # C SubClassOf R some D
      ObjectIntersectionOf(<c> <d>) <e>)     # C and D SubClassOf E
      <c> <d>)                               # C SubClassOf D

  Args:
    line: axiom body, tokens separated by single spaces.
    ObjectIntersectionOf: when true, intersection axioms are parsed as
      their own shape; otherwise they fall through to the plain case.

  Returns:
    A 3-tuple:
      * ``(r, d, c)`` for ``R some C SubClassOf D``
      * ``(r, c, d)`` for ``C SubClassOf R some D``
      * ``(e, c, d)`` for an intersection axiom when the flag is set
      * ``(None, c, d)`` for everything else (first two tokens)
    Every element is returned without trailing newline or closing
    parentheses, so the same class yields the same string in every shape.

  Raises:
    IndexError: if the line has fewer tokens than its shape requires.
  """
  def clean(token):
    # Slicing off one character removed the newline rather than ")" and left
    # the closing parentheses attached; strip both explicitly instead.
    return token.strip().rstrip(')')

  it = line.split(' ')
  if line.startswith('ObjectSomeValuesFrom('):
    # R some C SubClassOf D; 21 == len('ObjectSomeValuesFrom(')
    r = clean(it[0][21:])
    c = clean(it[1])
    d = clean(it[2])
    return r, d, c
  if 'ObjectSomeValuesFrom' in line:
    # C SubClassOf R some D
    c = clean(it[0])
    r = clean(it[1][21:])
    d = clean(it[2])
    return r, c, d
  if ObjectIntersectionOf and line.startswith('ObjectIntersectionOf'):
    # C and D SubClassOf E; 21 == len('ObjectIntersectionOf(')
    c = clean(it[0][21:])
    d = clean(it[1])
    e = clean(it[2])
    return e, c, d
  # Plain SubClassOf (and ObjectIntersectionOf when the flag is off): no
  # relation is reported, only the first two tokens.
  c = clean(it[0])
  d = clean(it[1])
  return None, c, d

In [4]:
def getRelations(file):
  """Group the class pairs of all relation axioms in ``file`` by relation.

  Only lines starting with ``SubClassOf`` are considered; the first 11
  characters (``SubClassOf(``) are dropped and the rest is handed to
  ``getRelation``. Axioms for which ``getRelation`` reports no relation are
  skipped.

  Example input lines:
    SubClassOf(ObjectSomeValuesFrom(<...#isActedOnSpecificallyBy> <...#Haematopinus>) <...#NonActiveImplantableDevice>)
    SubClassOf(<...#Anonymous-457> ObjectSomeValuesFrom(<...#isSpecificSurfaceDivisionOf> <...#CervicalRegionOfBack>))

  Returns:
    dict mapping each relation to the list of ``(c1, c2)`` tuples, in file
    order, using the element order returned by ``getRelation``.
  """
  pairs_by_relation = {}
  with open(file) as handle:
    for raw in handle:
      if not raw.startswith('SubClassOf'):
        continue
      name, first, second = getRelation(raw[11:])
      if name:
        pairs_by_relation.setdefault(name, []).append((first, second))
  return pairs_by_relation

In [5]:
def getRelationsByType(relations):
  """Bucket relation names by how often their pair elements repeat.

  Args:
    relations: dict mapping a relation name to a non-empty list of
      2-element tuples.

  Returns:
    dict with the keys ``'1_1'``, ``'1_n'`` and ``'n_n'``, each holding a
    list of relation names:
      * ``'1_1'``: no first element and no second element occurs twice
      * ``'1_n'``: no second element occurs twice, some first element does
      * ``'n_n'``: some second element occurs more than once
  """
  buckets = {'1_1': [], '1_n': [], 'n_n': []}
  for name, pairs in relations.items():
    # Highest multiplicity among first / second tuple elements.
    max_first = Counter(pair[0] for pair in pairs).most_common(1)[0][1]
    max_second = Counter(pair[1] for pair in pairs).most_common(1)[0][1]
    if max_second != 1:
      kind = 'n_n'
    elif max_first == 1:
      kind = '1_1'
    else:
      kind = '1_n'
    buckets[kind].append(name)
  return buckets

In [6]:
def writeAxioms(dataset, file_path, save_path, relations_by_type):
  """Split the axioms of ``file_path`` into one file per relation type.

  One output file is opened per key of ``relations_by_type`` at
  ``save_path.format(dataset, dataset, key)``. Each input line is routed as:
    * ``EquivalentClasses...`` lines are dropped;
    * ``SubClassOf`` lines for which ``getRelation`` reports a relation go
      to the ``'1_1'`` or ``'1_n'`` file if the relation is listed there,
      otherwise to the ``'n_n'`` file;
    * every other line (including ``SubClassOf`` lines without a relation)
      is copied to all output files.
  The number of lines written per type, plus ``'other'`` for lines copied
  to all files, is printed at the end.

  Args:
    dataset: dataset name used to fill ``save_path``.
    file_path: ontology file to read.
    save_path: format string with three placeholders.
    relations_by_type: dict with the keys ``'1_1'``, ``'1_n'`` and
      ``'n_n'``, each mapping to an iterable of relation names.

  Raises:
    KeyError: if ``relations_by_type`` lacks ``'1_1'`` or ``'1_n'``.
  """
  # Sets give O(1) membership tests; the lists were scanned once per line.
  one_to_one = set(relations_by_type['1_1'])
  one_to_many = set(relations_by_type['1_n'])
  file = {}
  count = {}
  try:
    for k in relations_by_type:
      file[k] = open(save_path.format(dataset, dataset, k), 'w')
      count[k] = 0
      print("Writing to", file[k])
    count['other'] = 0
    with open(file_path) as f:
      for line in f:
        if line.startswith('EquivalentClasses'):
          continue
        r = None
        if line.startswith('SubClassOf'):
          s_line = line[11:]
          if not s_line:
            # Nothing follows the 11-character prefix: stop reading.
            break
          # SubClassOf(ObjectSomeValuesFrom(<...#isActedOnSpecificallyBy> <...#Haematopinus>) <...#NonActiveImplantableDevice>)
          # SubClassOf(<...#Anonymous-457> ObjectSomeValuesFrom(<...#isSpecificSurfaceDivisionOf> <...#CervicalRegionOfBack>))
          r, _, _ = getRelation(s_line)
        if r:
          if r in one_to_one:
            key = '1_1'
          elif r in one_to_many:
            key = '1_n'
          else:
            key = 'n_n'
          file[key].write(line)
          count[key] += 1
        else:
          # Plain SubClassOf axioms and all non-SubClassOf lines are shared
          # by every variant.
          for out in file.values():
            out.write(line)
          count['other'] += 1
  finally:
    # Close (and flush) every output file even if parsing fails part-way.
    for out in file.values():
      out.close()
  for k, v in count.items():
    print(k, v)

In [14]:
# For every dataset: group its relation axioms, classify the relations by
# type, report the counts and save the relation names of each type.
for dataset in DATASETS:
  print(dataset)
  file_path = FILE_PATH.format(dataset, dataset)
  relations = getRelations(file_path)
  print(len(relations))
  relations_by_type = getRelationsByType(relations)
  for k, v in relations_by_type.items():
    # type, number of relations of that type, number of axioms using them
    naxioms = sum(len(relations[vv]) for vv in v)
    print(k, len(v), naxioms)
    # One relation name per line.
    relations_file = "data/"+dataset+"/"+k+"/"+dataset+"_"+k+"_relations.txt"
    with open(relations_file, 'w') as f:
      f.write('\n'.join(v))
  # Writing the per-type ontology files is disabled in this cell:
  # writeAxioms(dataset, file_path, SAVE_PATH, relations_by_type)

GALEN
396
1_1 180 2199
1_n 17 1331
n_n 199 38185
GO
8
1_1 1 1
1_n 0 0
n_n 7 32452
SNOMED
57
1_1 0 0
1_n 1 200641
n_n 56 314138


In [None]:
# Settings for the OWL2EL_5 dataset; these rebind the globals defined for
# GALEN/GO/SNOMED earlier in the notebook.
DATASETS = ['OWL2EL_5']
# Input template with a single placeholder (the dataset name).
FILE_PATH = 'data/{}/normalized.owl'
# Output template with three placeholders; writeAxioms fills it with
# (dataset, dataset, relation_type).
SAVE_PATH = 'data/{}/{}_norm_{}.owl'

In [None]:
# For every configured dataset: classify its relations by type, save the
# relation names of each type and write the per-type ontology files.
# Iterating DATASETS[1:] skipped the only entry of a single-element list,
# so the whole list is processed here.
for dataset in DATASETS:
  print(dataset)
  file_path = FILE_PATH.format(dataset)
  relations = getRelations(file_path)
  print(len(relations))
  relations_by_type = getRelationsByType(relations)
  for k, v in relations_by_type.items():
    print(v)
    # Derive the output path from the dataset instead of hard-coding it;
    # for 'OWL2EL_5' this is data/OWL2EL_5/OWL2EL_5_<type>_relations.txt.
    with open("data/{}/{}_{}_relations.txt".format(dataset, dataset, k), "w") as f:
      f.write("\n".join(v))
    # type, number of relations of that type, number of axioms using them
    naxioms = sum(len(relations[vv]) for vv in v)
    print(k, len(v), naxioms)
  writeAxioms(dataset, file_path, SAVE_PATH, relations_by_type)

OWL2EL_5
60
['<http://benchmark/OWL2Bench#hasResearchProject>', '<http://benchmark/OWL2Bench#isTaughtBy>', '<http://benchmark/OWL2Bench#isHeadOf>', '<http://benchmark/OWL2Bench#hasDean>', '<http://benchmark/OWL2Bench#hasWork>', '<http://benchmark/OWL2Bench#worksFor>', '<http://benchmark/OWL2Bench#publicationResearch>', '<http://benchmark/OWL2Bench#isStudentOf>', '<http://benchmark/OWL2Bench#isResearchGroupOf>', '<http://benchmark/OWL2Bench#hasPhDProgram>', '<http://benchmark/OWL2Bench#hasUGProgram>', '<http://benchmark/OWL2Bench#isDepartmentOf>', '<http://benchmark/OWL2Bench#hasPGProgram>', '<http://benchmark/OWL2Bench#isDeanOf>', '<http://benchmark/OWL2Bench#isMemberOf>', '<http://benchmark/OWL2Bench#hasAdvisor>', '<http://benchmark/OWL2Bench#advises>', '<http://benchmark/OWL2Bench#isAdvisorOf>', '<http://benchmark/OWL2Bench#hasDegreeFrom>', '<http://benchmark/OWL2Bench#tenured>', '<http://benchmark/OWL2Bench#isFacultyOf>', '<http://benchmark/OWL2Bench#isCollegeOf>', '<http://benchmar

## Train, Test, Validation splitting

In [20]:
# Dataset whose per-type ontology files are split into train/valid/test.
DATASET = "SNOMED"
# Relation-type variants to process, one split run per entry.
DATASET_SPLIT = [
  "1_1", "1_n", 
  "n_n"]
# Input template, filled with (DATASET, DATASET, SPLIT).
FILE_PATH = "data/{}/{}_norm_{}.owl"
# Output template, filled with (DATASET, SPLIT, file name, extension).
SAVE_PATH = "data/{}/{}/{}.{}"

In [21]:
# Split the axioms of each relation-type variant of DATASET into train,
# valid and test sets.
#
# Outputs per SPLIT (paths built from SAVE_PATH):
#   train.txt / valid.txt / test.txt : one "token token" pair per axiom
#   train_norm.owl                   : the original lines of the train axioms
#
# Steps:
#   1. Count the lines and collect every class that occurs in a plain
#      "C SubClassOf D" axiom (subclass_classes).
#   2. Derive the 70% / 20% / remainder budgets from the line count.
#   3. Put all SubObjectPropertyOf axioms into train; put intersection and
#      existential axioms into train while budget remains and they mention a
#      class still in subclass_classes (existential axioms also qualify when
#      their relation has not been seen yet). Classes used this way are
#      removed from subclass_classes. Everything else is deferred.
#   4. Put plain subclass axioms into train while they mention a class that
#      is still in subclass_classes.
#   5. Fill test, then valid, from the remaining plain subclass axioms whose
#      two classes are both no longer in subclass_classes.
#   6. Use the deferred axioms to fill the rest of the train budget.
for SPLIT in DATASET_SPLIT:
  file_path = FILE_PATH.format(DATASET, DATASET, SPLIT)
  print(SPLIT)
  file = {}
  for k in ['train', 'valid', 'test']:
    file[k] = open(SAVE_PATH.format(DATASET, SPLIT, k, "txt"), 'w')
  file['train_norm'] = open(SAVE_PATH.format(DATASET, SPLIT, "train_norm", "owl"), 'w')
  subclass_axioms = []  # plain C SubClassOf D axioms: (original line, c, d)
  other_axioms = []     # deferred non-plain axioms: (original line, x, y, z)
  co=0
  train = 0
  valid = 0
  test = 0
  classes = {}           # classes written to train so far -> running index
  relations = {}         # relations written to train so far -> running index
  subclass_classes = {}  # classes of plain subclass axioms not yet in train
  # First pass: Get all classes in subclass axioms, number of axioms
  with open(file_path) as f:
    for line in f:
      co+=1
      og_line = line
      if line.startswith("SubClassOf"):
        # Drop "SubClassOf(" (11 characters) and the final ")".
        line = line.strip()[11:-1]
        if not line:
          # Empty axiom body: do not count it.
          co-=1
          continue
        if not line.startswith("ObjectIntersectionOf(") and not line.startswith("ObjectSomeValuesFrom(") and line.find("ObjectSomeValuesFrom(") == -1:
          # SubClassOf C D
          it = line.split(' ')
          c = it[0]
          d = it[1]
          if c not in subclass_classes:
            subclass_classes[c] = len(subclass_classes)
          if d not in subclass_classes:
            subclass_classes[d] = len(subclass_classes)

  # Budgets: 70% train, 20% valid, remainder test.
  train = int(co*0.7)
  valid = int(co*0.2)
  test = co - train- valid
  ex_train = train      # budget snapshot, used to report how many were added
  total_train = train   # initial train budget
  count=0               # total number of axioms written to any output
  print("total", co)
  print("Expected train, valid, test:", train, valid, test)

  # Second pass: add all relation (rbox) axioms and only add non subclass axioms containing classes in subclass axioms
  with open(file_path) as f:
    for line in f:
      og_line = line
      if line.startswith('SubObjectPropertyOf'):
        # Drop "SubObjectPropertyOf(" (20 characters) and the final ")".
        line = line.strip()[20:-1]
        if line.startswith('ObjectPropertyChain'):
          line1 = line.split(")")
          line10 = line1[0].split()
          if len(line10) < 2:
            continue
          # NOTE(review): `line` still starts with "ObjectPropertyChain(",
          # so r1 carries that prefix when written to train.txt - confirm
          # whether this is intended.
          r1 = line10[0]
          r2 = line10[1]
          r3 = line1[1]
          # Property axioms always go to train; the budget is decremented
          # without checking that it is still positive.
          file['train_norm'].write(og_line)
          file['train'].write(r1 + ' ' + r2 + '\n')
          count+=1
          train-=1
        else:
          it = line.split(' ')
          r1 = it[0]
          r2 = it[1]
          file['train_norm'].write(og_line)
          file['train'].write(r1 + ' ' + r2 + '\n')
          count+=1
          train-=1
        continue
      # Every remaining line is treated as a SubClassOf axiom: drop the first
      # 11 characters and the final one, whatever the line starts with.
      line = line.strip()[11:-1]
      if not line:
        print(og_line)
        continue
      if line.startswith('ObjectIntersectionOf('):
        # C and D SubClassOf E
        it = line.split(' ')
        c = it[0][21:]
        d = it[1][:-1]
        e = it[2]
        if train and (c in subclass_classes or d in subclass_classes or e in subclass_classes):
          file['train_norm'].write(og_line)
          file['train'].write(c + ' ' + d + '\n')
          count+=1
          train-=1
          # These classes are now in train.
          if c in subclass_classes:
            subclass_classes.pop(c)
          if d in subclass_classes:
            subclass_classes.pop(d)
          if e in subclass_classes:
            subclass_classes.pop(e)
          if c not in classes:
            classes[c] = len(classes)
          if d not in classes:
            classes[d] = len(classes)
          if e not in classes:
            classes[e] = len(classes)
        else:
          other_axioms.append((og_line, c, d, e))
      elif line.startswith('ObjectSomeValuesFrom('):
        # R some C SubClassOf D
        it = line.split(' ')
        r = it[0][21:]
        c = it[1][:-1]
        d = it[2]
        if train and (c in subclass_classes or d in subclass_classes or r not in relations):
          file['train_norm'].write(og_line)
          file['train'].write(c + ' ' + d + '\n')
          count+=1
          train-=1
          if c in subclass_classes:
            subclass_classes.pop(c)
          if d in subclass_classes:
            subclass_classes.pop(d)
          if c not in classes:
            classes[c] = len(classes)
          if d not in classes:
            classes[d] = len(classes)
          if r not in relations:
            relations[r] = len(relations)
        else:
          other_axioms.append((og_line, r, c, d))
      elif line.find('ObjectSomeValuesFrom') != -1:
        # C SubClassOf R some D
        it = line.split(' ')
        c = it[0]
        r = it[1][21:]
        d = it[2][:-1]
        if train and (c in subclass_classes or d in subclass_classes or r not in relations):
          file['train_norm'].write(og_line)
          file['train'].write(c + ' ' + d + '\n')
          count+=1
          train-=1
          if c in subclass_classes:
            subclass_classes.pop(c)
          if d in subclass_classes:
            subclass_classes.pop(d)
          if c not in classes:
            classes[c] = len(classes)
          if d not in classes:
            classes[d] = len(classes)
          if r not in relations:
            relations[r] = len(relations)
        else:
          other_axioms.append((og_line, r, c, d))
      else:
        # C SubClassOf D
        it = line.split(' ')
        c = it[0]
        d = it[1]
        subclass_axioms.append((og_line, c, d))
  print("Added to train from other axioms", ex_train-train, count)
  print(count+len(subclass_axioms)+len(other_axioms), len(subclass_axioms), len(other_axioms))
  print("Classes not added", len([k for k in subclass_classes if k not in classes]), len(subclass_classes))
  ex_train = train
  # Third pass (in memory): plain subclass axioms that mention a class not
  # yet in train go to train; the others are kept for valid/test.
  temp = []
  for tup in subclass_axioms:
    line, c, d = tup
    if train and (c in subclass_classes or d in subclass_classes):
      file['train_norm'].write(line)
      file['train'].write(c + ' ' + d + '\n')
      count+=1
      train-=1
      if c in subclass_classes:
        subclass_classes.pop(c)
      if d in subclass_classes:
        subclass_classes.pop(d)
      if c not in classes:
        classes[c] = len(classes)
      if d not in classes:
        classes[d] = len(classes)
    else:
      temp.append((line, c, d))
  subclass_axioms = temp
  print(count+len(subclass_axioms)+len(other_axioms), len(subclass_axioms), len(other_axioms))
  print("Added to train from subclass axioms", ex_train-train)
  ex_train = train

  print("remaining subclass axioms and other axioms", len(subclass_axioms), len(other_axioms))
  print("train, valid, test", train, valid, test)

  if len(subclass_axioms) < test+valid:
    # recalculate test, validation, training sample counts
    # Too few plain subclass axioms are left for the planned valid and test
    # sizes: give 66% of them to valid, the rest to test, and cap the
    # remaining train budget accordingly.
    c = len(subclass_axioms)
    valid = int(c*0.66)
    test = c - valid
    train = max(0, min(train, int(c*0.7/0.3) - (total_train-train)))
  ex_train = train

  print("Train, valid, test left", train, valid, test)

  # Fill test, then valid, with axioms whose classes are both no longer in
  # subclass_classes; axioms that fail the check are set aside in temp and
  # become train candidates below.
  temp = []
  while test and subclass_axioms != []:
    line, c, d = subclass_axioms.pop()
    if c not in subclass_classes and d not in subclass_classes:
      file['test'].write(c + ' ' + d + '\n')
      test-=1
      count+=1
    else:
      temp.append((line, c, d))
  while valid and subclass_axioms != []:
    line, c, d =  subclass_axioms.pop()
    if c not in subclass_classes and d not in subclass_classes:
      file['valid'].write(c + ' ' + d + '\n')
      valid-=1
      count+=1
    else:
      temp.append((line, c, d))
  subclass_axioms.extend(temp)
  # Spend the remaining train budget on the deferred axioms first, then on
  # the leftover plain subclass axioms.
  while train and other_axioms != []:
    line, r, c, d = other_axioms.pop()
    file['train'].write(c + ' ' + d + '\n')
    file['train_norm'].write(line)
    count+=1
    train-=1
  print("added to train from other axioms", ex_train-train)
  ex_train = train
  while train and subclass_axioms != []:
    line, c, d =  subclass_axioms.pop()
    file['train'].write(c + ' ' + d + '\n')
    file['train_norm'].write(line)
    count+=1
    train -=1
  print("added to train from subclass axioms", ex_train-train)
  # Close every output, including train_norm, so buffered lines are flushed
  # to disk.
  for k in ['train', 'valid', 'test', 'train_norm']:
    file[k].close()
  print("Total:", count)
  print("left", len(subclass_axioms), len(other_axioms))
  print(train, test, valid)


1_1
total 474407
Expected train, valid, test: 332084 94881 47442
Added to train from other axioms 27001 27001
474407 446616 790
Classes not added 250120 250120
474407 217755 790
Added to train from subclass axioms 228861
remaining subclass axioms and other axioms 217755 790
train, valid, test 76222 94881 47442
Train, valid, test left 76222 94881 47442
added to train from other axioms 790
added to train from subclass axioms 75432
Total: 474407
left 0 0
0 0 0
1_n
total 675048
Expected train, valid, test: 472533 135009 67506
Added to train from other axioms 186673 186673
675048 446616 41759
Classes not added 33831 33831
675048 413018 41759
Added to train from subclass axioms 33598
remaining subclass axioms and other axioms 413018 41759
train, valid, test 252262 135009 67506
Train, valid, test left 252262 135009 67506
added to train from other axioms 41759
added to train from subclass axioms 210503
Total: 675048
left 0 0
0 0 0
n_n
total 788545
Expected train, valid, test: 551981 157709 788