In [1]:
%load_ext autoreload
%autoreload 2

import os
import pathlib
# Run the notebook from the directory named 'ryn': when the kernel was
# started somewhere else, step up exactly one level.
# NOTE(review): this assumes the notebook sits directly below 'ryn' - confirm.
cwd = pathlib.Path.cwd()
if not cwd.name == 'ryn':
    print('changing directory')
    os.chdir(cwd.parent)

import logging
# grab the root logger and lower its threshold to INFO
logger = logging.getLogger()
logger.setLevel(level=logging.INFO)

changing directory


In [2]:
from ryn.graphs import loader
# the loader result is indexed; only its first graph is used in this notebook
graphs = loader.load_graphs_from_uri('vll.fb15k237-trainvalid')
g = graphs[0]

print(f'loaded {g.str_stats}')

loaded ryn graph: vll.fb15k237-trainvalid
  nodes: 24032
  edges: 289050 (238 types)
  degree:
    mean 24.06
    median 9



In [3]:
from dataclasses import dataclass
from typing import Set
from typing import Tuple


@dataclass
class Relation:
    """Head/tail statistics of a single relation type of the graph.

    Instances are built once per relation from the edges of that relation:
    the distinct entities in head and in tail position are collected and
    compared by set size.
    """

    # relation id and its name
    r: int
    name: str
    # distinct entity ids found in head / tail position of this relation
    hs: Set[int]
    ts: Set[int]
    # min(len(hs), len(ts)) / max(len(hs), len(ts)); lies in (0, 1]
    ratio: float


# Build one Relation record per relation type known to the graph source.
# The list is re-created on every run, so the cell is idempotent.
rels = []
for r, relname in g.source.rels.items():
    # (head, tail) pairs of all edges of type r; the third triple component
    # is ignored. Comprehensions are used so that no loop variables (in
    # particular `_`, which IPython uses for the last output) leak into the
    # notebook namespace.
    pairs = [(h, t) for h, t, _ in g.find(edges={r})]

    if not pairs:
        # A relation listed in g.source.rels may have no edges in this graph
        # (NOTE(review): confirm whether that can happen for this dataset).
        # Without edges there are no head/tail sets to compare, and the
        # ratio below would divide by zero, so the relation is skipped.
        print(f'skipping relation without edges: {r} {relname}')
        continue

    hs = {h for h, _ in pairs}
    ts = {t for _, t in pairs}

    # ratio of the smaller to the larger side: close to 0 means that few
    # entities on one side face many entities on the other side
    lens = len(hs), len(ts)
    ratio = min(lens) / max(lens)

    rels.append(Relation(
        r=r, name=relname,
        hs=hs, ts=ts,
        ratio=ratio
    ))

In [4]:
from tabulate import tabulate

# ascending by head/tail ratio; note that this reorders `rels` in place
rels.sort(key=lambda relation: relation.ratio)

# table columns: relation id, ratio, #heads, #tails, relation name
rows = [
    (rel.r, rel.ratio, len(rel.hs), len(rel.ts), rel.name)
    for rel in rels
]

N = 5

print(f'first {N}')
head_rows = rows[:N]
print(tabulate(head_rows))

print(f'mid {N}')
# float midpoint; the window bounds are truncated towards zero by int()
m = len(rows) / 2
lower, upper = int(m-N/2), int(m+N/2)
print(tabulate(rows[lower:upper]))

print(f'last {N}')
tail_rows = rows[-N:]
print(tabulate(tail_rows))

first 5
---  -----------  ----  -  ---------------------------------------------------------------------------------------------------
 25  0.000256082  3905  1  /common/topic/webpage./common/webpage/category
 43  0.000977995  4090  4  /people/person/gender
 85  0.00196143   3059  6  /people/person/spouse_s./people/marriage/type_of_union
120  0.00247219    809  2  /location/hud_foreclosure_area/estimated_number_of_mortgages./measurement_unit/dated_integer/source
118  0.00451467    443  2  /user/tsegaran/random/taxonomy_subject/entry./user/tsegaran/random/taxonomy_entry/taxonomy
---  -----------  ----  -  ---------------------------------------------------------------------------------------------------
mid 5
---  --------  ---  --  -----------------------------------------------------------------------------------
134  0.252366  317  80  /music/group_member/membership./music/group_membership/role
221  0.252632  190  48  /location/country/official_language
117  0.253731   67  17  /sport

In [5]:
import ryn
import random
import pathlib


SEEDS = [30061990, 8051991, 25031990, 2041992]
SPLIT = .7  # training data ratio (1 - SPLIT -> validation data ratio)


def _write_counted_lines(file_path, lines):
    """Write `lines` to `file_path`, preceded by a line holding their count.

    This is the "oke-like" layout used for all exported files: the first
    line is the number of entries, followed by one entry per line. No
    trailing newline is written after the last entry.

    Args:
        file_path: pathlib.Path of the file to (over)write.
        lines: list of already formatted entries (str).
    """
    with file_path.open(mode='w', encoding='utf-8') as fd:
        fd.write(f'{len(lines)}\n')
        fd.write('\n'.join(lines))
        print(f'wrote {fd.name}')


# Create one train/valid split of the graph's triples per seed and export
# each split to its own directory below the cache directory.
for seed in SEEDS:
    print('-' * 60)
    print(f'using {seed=}')

    random.seed(seed)

    # triple sets
    train, valid = set(), set()

    # The order in which the random stream is consumed determines the split.
    # Sort explicitly (stable, by ratio) so that the result does not depend
    # on whether another cell has already sorted `rels` in place.
    for i, r in enumerate(sorted(rels, key=lambda rel: rel.ratio), 1):
        # the smaller side of the relation is treated as "concepts",
        # the larger side as "entities"
        reverse = len(r.hs) <= len(r.ts)
        concepts, entities = (r.hs, r.ts) if reverse else (r.ts, r.hs)
        # print(f'{len(concepts):4d} - {len(entities):4d} {r.name}')

        # split entities
        # concepts always go into train // TODO discuss!
        # NOTE: int() floors the sample size below, so a concept with a
        # single triple contributes nothing to train (k == 0) and its
        # triple ends up in valid only.
        for concept in sorted(concepts):  # fixed order -> reproducible
            heads, tails = ({concept}, entities) if reverse else (entities, {concept})
            triples = g.select(edges={r.r}, heads=heads, tails=tails)
            # print(f'{" " * 6} {len(triples)=} {len(heads)=} {len(tails)=}')

            assert len(heads) == 1 or len(tails) == 1

            # random.sample() requires a sequence (sets raise TypeError from
            # Python 3.11 on); sorting also makes the seeded sample
            # independent of set iteration order.
            # assumes the triples are orderable (tuples of ints) - TODO confirm
            selection = set(random.sample(sorted(triples), k=int(SPLIT * len(triples))))
            complement = triples - selection

            train.update(selection)
            valid.update(complement)

            assert not len(selection & complement)

        # progress indicator: one dot per relation, line break every 40
        print('.', end='' if (i % 40) else '\n')

    print()
    print(f'created {len(train)=} {len(valid)=}')

    path = ryn.ENV.CACHE_DIR / 'notes.graph.split' / f'{SPLIT:0.2f}_{seed}'
    path.mkdir(exist_ok=True, parents=True)

    # save oke-like
    # triples are written in sorted order so that repeated runs produce
    # byte-identical files
    _write_counted_lines(
        path / 'train2id.txt',
        [' '.join(map(str, triple)) for triple in sorted(train)])

    _write_counted_lines(
        path / 'valid2id.txt',
        [' '.join(map(str, triple)) for triple in sorted(valid)])

    _write_counted_lines(
        path / 'entity2id.txt',
        [f'{name} {eid}' for eid, name in g.source.ents.items()])

    _write_counted_lines(
        path / 'relation2id.txt',
        [f'{name} {rid}' for rid, name in g.source.rels.items()])


------------------------------------------------------------
using seed=30061990
.

.

.....

....

.......

......

.....

...

..

......


.

..

...

....

..

..

.

.

.

..

...

..

.

....

..

.......

.

.
...

...

..

.

..

..

..

.

..

.

..

..

..

..

.

..

..

.

...

....


....

.

..

.

.

.

....

..

..

.

.

.

.

.

...

.

.

.

..

....

.

...

.


.

..

.

.

..

.

..

..

..

.

.

.

.

..

..

..

.

.

.

.

.

..

.

...

...

..


..

.

.

.

..

..

..

.

.

.

..

.

.

..

.

..

.

.

.

.

.

.

.

.

.

..

..

..
created len(train)=182700 len(valid)=106350


wrote /mnt/hdd/felix/Complex/deepkg/ryn/data/cache/notes.graph.split/0.70_30061990/train2id.txt
wrote /mnt/hdd/felix/Complex/deepkg/ryn/data/cache/notes.graph.split/0.70_30061990/valid2id.txt
wrote /mnt/hdd/felix/Complex/deepkg/ryn/data/cache/notes.graph.split/0.70_30061990/entity2id.txt
wrote /mnt/hdd/felix/Complex/deepkg/ryn/data/cache/notes.graph.split/0.70_30061990/relation2id.txt
------------------------------------------------------------
using seed=8051991


..

.....

....

........

.....

.......

.

..

......
.

..

...

....

..

..

.

.

.

..

...

..

.

....

..

.......

.

.
...

...

..

.

..

..

..

.

..

...

..

..

..

.

..

..

.

...

....


....

.

..

.

.

.

....

..

..

.

.

.

.

.

...

.

.

.

..

....

.

...

.


.

..

.

.

..

.

..

..

..

.

.

.

.

..

..

..

.

.

.

.

.

..

.

...

...

..


..

.

.

.

..

..

..

.

.

.

..

.

.

..

.

..

.

.

.

.

.

.

.

.

.

..

..

..
created len(train)=182700 len(valid)=106350


wrote /mnt/hdd/felix/Complex/deepkg/ryn/data/cache/notes.graph.split/0.70_8051991/train2id.txt
wrote /mnt/hdd/felix/Complex/deepkg/ryn/data/cache/notes.graph.split/0.70_8051991/valid2id.txt
wrote /mnt/hdd/felix/Complex/deepkg/ryn/data/cache/notes.graph.split/0.70_8051991/entity2id.txt
wrote /mnt/hdd/felix/Complex/deepkg/ryn/data/cache/notes.graph.split/0.70_8051991/relation2id.txt
------------------------------------------------------------
using seed=25031990


..

.....

....

.......

......

.......

.

..

......
.

..

...

....

..

..

.

.

.

..

...

..

.

....

..

.......

.

.
...

...

..

.

..

..

..

.

..

.

..

..

..

..

.

..

..

.

...

....


....

.

..

.

.

.

....

..

..

.

.

.

.

.

...

.

.

.

..

....

.

...

.


.

..

.

.

..

.

..

..

..

.

.

.

.

..

..

..

.

.

.

.

.

..

.

...

...

..


..

.

.

.

..

..

..

.

.

.

..

.

.

..

.

..

.

.

.

.

.

.

.

.

.

..

..

..
created len(train)=182700 len(valid)=106350


wrote /mnt/hdd/felix/Complex/deepkg/ryn/data/cache/notes.graph.split/0.70_25031990/train2id.txt
wrote /mnt/hdd/felix/Complex/deepkg/ryn/data/cache/notes.graph.split/0.70_25031990/valid2id.txt
wrote /mnt/hdd/felix/Complex/deepkg/ryn/data/cache/notes.graph.split/0.70_25031990/entity2id.txt
wrote /mnt/hdd/felix/Complex/deepkg/ryn/data/cache/notes.graph.split/0.70_25031990/relation2id.txt
------------------------------------------------------------
using seed=2041992


..

.....

....

.......

......

......

..

..

......
.

..

...

....

..

..

.

.

.

..

...

..

.

....

..

.......

.

.
...

...

..

.

..

..

..

.

..

...

..

..

..

.

..

..

.

...

....


....

.

..

.

.

.

....

..

..

.

.

.

.

.

...

.

.

.

..

....

.

...

.


.

..

.

.

..

.

..

..

..

.

.

.

.

..

..

..

.

.

.

.

.

..

.

...

...

..


..

.

.

.

..

..

..

.

.

.

..

.

.

..

.

..

.

.

.

.

.

.

.

.

.

..

..

..
created len(train)=182700 len(valid)=106350


wrote /mnt/hdd/felix/Complex/deepkg/ryn/data/cache/notes.graph.split/0.70_2041992/train2id.txt
wrote /mnt/hdd/felix/Complex/deepkg/ryn/data/cache/notes.graph.split/0.70_2041992/valid2id.txt
wrote /mnt/hdd/felix/Complex/deepkg/ryn/data/cache/notes.graph.split/0.70_2041992/entity2id.txt
wrote /mnt/hdd/felix/Complex/deepkg/ryn/data/cache/notes.graph.split/0.70_2041992/relation2id.txt
