# Create Phony Split

Create a split.Dataset without actually running the splitter.
Used to transform other datasets into a format for use by ryn.kgc and ryn.text.

In [1]:
from ryn.common import helper
helper.notebook()
%load_ext autoreload
%autoreload 2

changing directory


In [2]:
from ryn.graphs import loader

g_train, g_valid, g_test  = loader.load_graphs_from_uri(
    'oke.fb15k237-train',
    'oke.fb15k237-valid',
    'oke.fb15k237-test', )

for g in (g_train, g_valid, g_test):
    print(g.str_stats)


ryn graph: oke.fb15k237-train
  nodes: 14505
  edges: 272115 (237 types)
  degree:
    mean 37.52
    median 22





ryn graph: oke.fb15k237-valid
  nodes: 9809
  edges: 17535 (237 types)
  degree:
    mean 3.58
    median 2





ryn graph: oke.fb15k237-test
  nodes: 10348
  edges: 20466 (237 types)
  degree:
    mean 3.96
    median 2





In [3]:
g_all = g_train | g_valid | g_test
print(g_all.str_stats)


ryn graph: oke.fb15k237-train|oke.fb15k237-valid|oke.fb15k237-test
  nodes: 14541
  edges: 310116 (237 types)
  degree:
    mean 42.65
    median 26





In [33]:
# there are triples with unseen entities in the official FB15k237
# benchmark... removing them here:

def _ents_from_triples(source):
    return set(e for h, t, _ in source for e in (h, t))

def _remove_invalid(triples, invalid):
    return set(
        (h, t, r) for h, t, r in triples
        if (h not in invalid) and (t not in invalid))

e_train = _ents_from_triples(g_train.source.triples)
e_valid = _ents_from_triples(g_valid.source.triples)
e_test = _ents_from_triples(g_test.source.triples)

# removing invalid validation triples
invalid = e_valid - e_train
print(f'found {len(invalid)=} entities in validation')
train_triples = _remove_invalid(g_train.source.triples, invalid)

print(f'removed {len(g_train.source.triples - train_triples)} triples')


invalid = e_test - (e_train | e_valid)
print(f'found {len(invalid)=} entities in test')

found len(invalid)=8 entities in validation




found len(invalid)=28 entities in test




In [4]:
# create a phony split dataset with empty Parts

import ryn
from ryn.graphs import split

path = ryn.ENV.SPLIT_DIR / 'oke.fb15k237'

splitter = split.Splitter(
    g=g_all,
    name=path.name,
    cfg=split.Config(
        seed=30061990,
        ow_split=0,
        train_split=0,
        threshold=0,
    ),
).write(
    concepts=set(),
    cw=split.Split(
        train=train_triples,
        valid=valid_triples)),
    ow=split.Split(
        valid=test_triples),
        test=set()),
)

In [10]:
split_dataset = split.Dataset.load(path=path)

assert len(split_dataset.cw_valid.owe) == 0
assert len(split_dataset.ow_valid.owe) == 0
assert len(split_dataset.cw_valid.triples) == 0

print(split_dataset)

{14505, 14506, 14507, 14508, 14509, 14510, 14511, 14512}




AssertionError: cw.valid contains owe entities