In [None]:
%load_ext autoreload
%autoreload 2

import os
import pathlib
# Ensure the working directory is the one named 'ryn': if the kernel was
# started somewhere else, move up exactly one level.
# NOTE(review): this steps to the direct parent without checking that the
# parent is actually named 'ryn' — confirm the notebook always lives one
# directory below it.
cwd = pathlib.Path.cwd()
if cwd.name != 'ryn':
    print('changing directory')
    os.chdir(cwd.parent)

import logging
# getLogger() without a name returns the root logger; set its threshold to INFO.
logger = logging.getLogger()
logger.setLevel(logging.INFO)

In [None]:
from ryn.graphs import split
from ryn.graphs import loader

# load_graphs_from_uri returns an indexable collection; only its first
# element is used in this notebook.
g = loader.load_graphs_from_uri('oke.fb15k237-trainvalidtest')[0]
print(f'loaded {g.str_stats}')

# One split.Relation per relation found in the graph.
rels = split.Relation.from_graph(g)
print(f'retrieved {len(rels)} relations')

# prob_a / prob_o / prob_s are later forwarded to split.prob as a / o / s.
# NOTE(review): prob_o is tied to the number of relations (half of it) —
# presumably an offset on the relation index; confirm against split.prob.
cfg = split.Config(split=.7, prob_a=3, prob_o=len(rels) / 2, prob_s=1/20)

# round() instead of int(): binary floating point makes products such as
# 0.29 * 100 evaluate to 28.999999999999996, which int() would truncate to 28
# and report a wrong train/test percentage.
train_pct = round(cfg.split * 100)
print(f'creating a split of {train_pct}% train and {100 - train_pct}% test data')

In [None]:
from tabulate import tabulate

# Sort the relations by ascending ratio *in place*: later cells read `rels`
# and `rows` in this order.
rels.sort(key=lambda relation: relation.ratio)
rows = [
    (rel.r, rel.ratio, len(rel.hs), len(rel.ts), rel.name)
    for rel in rels
]

# Number of rows shown per excerpt.
N = 5

# Window of (about) N rows centred on the middle of the table; int()
# truncates the fractional bounds toward zero.
m = len(rows) / 2
lo, hi = int(m - N / 2), int(m + N / 2)

sections = (
    (f'first {N}', rows[:N]),
    (f'mid {N}', rows[lo:hi]),
    (f'last {N}', rows[-N:]),
)
for heading, excerpt in sections:
    print(heading)
    print(tabulate(excerpt))

In [None]:
import ryn
from ryn.common import plotter

from functools import partial

# Bind the configured a / o / s parameters into split.prob so that only the
# positional argument (here: the relation's index) remains.
prob = partial(split.prob, a=cfg.prob_a, o=cfg.prob_o, s=cfg.prob_s)

# Handed to the Plotter as its `fname`.
path = ryn.ENV.CACHE_DIR / 'notes.graph.split' / f'{g.name}.ratio'

# NOTE: `plt` is a ryn Plotter instance here, not matplotlib.pyplot.
plt = plotter.Plotter(
    title=f'Ratio Distribution {g.name}',
    xlabel='Relation',
    ylabel='Ratio',
    fname=str(path),
)

# Two point clouds over the relation index: the relation ratios and the
# value of `prob` at the same index.
positions = range(len(rels))
plt.ax.scatter(positions, [rel.ratio for rel in rels], color=plotter.CLR[0], s=1)
plt.ax.scatter(positions, [prob(pos) for pos in positions], color=plotter.CLR[1], s=1)

print()
plt.plot()
print()

# Spot checks: a few table rows (`rows` and `tabulate` come from the previous
# cell) and `prob` evaluated on indices 80..89.
print(tabulate(rows[100:105]))
print([f'{prob(x):.2f}' for x in range(80,90)])

In [None]:
# Candidate seeds; only the fourth one is passed on for this run.
seeds = [
    30061990,
    8051991,
    25031990,
    2041992,
]
selected_seeds = seeds[3:4]  # one-element list holding seeds[3]
split.create(g, cfg, rels, selected_seeds)