# KGC Control Experiments

We run two control experiments to check the correctness of the metric calculation
and to obtain an upper performance bound for chat-based LLMs that propose mentions.

In [2]:
import irt2

# Root data directory: dataset paths and the evaluation output folder are built from it.
p_data = irt2.ENV.DIR.DATA

In [3]:
from irt2.types import Split, Task, Sample, MID, RID, VID
from irt2.dataset import IRT2
from irt2.evaluation import Predictions

import random
from typing import Iterable, Literal


# A task is keyed by (mention id, relation id) and maps to its set of
# ground-truth vertex ids.
Tasks = dict[tuple[MID, RID], set[VID]]


def true_vids(tasks: Tasks, ds: IRT2, **_) -> Predictions:
    """Oracle baseline: answer every task with exactly its ground-truth vertices.

    Each known vertex id is emitted with a constant score of 1. The dataset
    and any extra keyword arguments are accepted so that all models share
    one call signature, but they are not used here.
    """
    for task in tasks:
        mid, rid = task
        answers = tasks[task]
        yield (mid, rid), ((vid, 1) for vid in answers)

def true_mentions(
    tasks: Tasks,
    ds: IRT2,
    split: Literal['validation', 'test'],
    **_,
) -> Predictions:
    """Oracle baseline that is handed the mention strings of the true answers.

    For every task, the mention strings of all ground-truth vertices are
    collected and resolved back to vertices via ``ds.find_by_mention``;
    each vertex found this way is emitted with a constant score of 1.

    The lookup covers the train and validation splits, plus the test split
    when ``split == 'test'``.
    """
    if split == 'test':
        splits = (Split.train, Split.valid, Split.test)
    else:
        splits = (Split.train, Split.valid)

    ids = ds.idmap
    for task, gt_vids in tasks.items():
        mid, rid = task

        # distinct mention strings of every ground-truth vertex; separate
        # names keep the task's own `mid` from being shadowed
        mentions = set()
        for gt_vid in gt_vids:
            for gt_mid in ids.vid2mids.get(gt_vid):
                mentions.add(ids.mid2str[gt_mid])

        pr_vids = ds.find_by_mention(*mentions, splits=splits)
        yield (mid, rid), ((vid, 1) for vid in pr_vids)


def random_guessing(
    tasks: Tasks,
    ds: IRT2,
    split: Literal['validation', 'test'],
    seed: int,
    k: int = 100,
    **_,
) -> Predictions:
    """Baseline that proposes randomly chosen vertices with random scores.

    Candidates are the vertices of the train and validation splits, plus
    those of the test split when ``split == 'test'``. Every task receives
    up to ``k`` distinct candidates, each with a score drawn uniformly
    from [0, 1).

    Args:
        tasks: The tasks to answer; only the keys are used.
        ds: Dataset providing ``idmap.split2vids``.
        split: Which evaluation split is being predicted.
        seed: Seed for the private random number generator.
        k: Number of vertices proposed per task (default 100). Clamped to
            the number of candidates, because ``random.sample`` raises
            ``ValueError`` when asked for more items than are available.
    """
    rng = random.Random(seed)

    ids = ds.idmap
    candidates = ids.split2vids[Split.train] | ids.split2vids[Split.valid]
    if split == 'test':
        candidates |= ids.split2vids[Split.test]

    perm = list(candidates)
    n_guesses = min(k, len(perm))

    for task in tasks:
        mid, rid = task
        # Scores are drawn eagerly so the random stream does not depend on
        # when (or in which order) the consumer reads each task's guesses.
        guesses = [(vid, rng.random()) for vid in rng.sample(perm, k=n_guesses)]
        yield (mid, rid), guesses



# Registry of the control models, keyed by the name used to select them.
MODELS = {
    'true-vertices': true_vids,
    'true-mentions': true_mentions,
    'random-guessing': random_guessing,
}

In [4]:
from irt2 import evaluation
from ktz.collections import dflat

import yaml
from functools import partial
from typing import Callable


def flatten(report: dict):
    """Turn a nested evaluation report into a single flat row.

    The row starts with the identifying fields (dataset, model, date,
    split) and continues with the metrics, which are passed through
    ``dflat`` with a space as separator and then ordered by key.
    """
    row = {key: report[key] for key in ('dataset', 'model', 'date', 'split')}

    flat_metrics = dflat(report['metrics'], sep=' ')
    for key in sorted(flat_metrics):
        row[key] = flat_metrics[key]

    return row


def evaluate(
    ds: IRT2,
    name: str,
    split: str,
    head_predictions: Predictions,
    tail_predictions: Predictions,
):
    """Score head and tail predictions and wrap the metrics into a report.

    Metrics are computed by ``evaluation.evaluate`` for the 'kgc' task and
    handed to ``evaluation.create_report``, which is told the model name
    and this notebook's path as the originating file.
    """
    task = 'kgc'
    predictions = {
        'head_predictions': head_predictions,
        'tail_predictions': tail_predictions,
    }

    metrics = evaluation.evaluate(ds=ds, task=task, split=split, **predictions)

    origin = {'notebook': 'ipynb/control-experiments.ipynb'}
    return evaluation.create_report(
        metrics,
        ds,
        task=task,
        split=split,
        model=name,
        filenames=origin,
    )



def run(
    ds: IRT2,
    name: str,
    model: Callable,
    split: str,
    seed: int,
):
    """Run one control model on one split of ``ds`` and return its report.

    Args:
        ds: Dataset whose open-world KGC head and tail tasks are predicted.
        name: Model name recorded in the report.
        model: Callable invoked as ``model(tasks, ds=ds, split=split,
            seed=seed)`` once for the head tasks and once for the tail tasks.
        split: Either 'validation' or 'test'.
        seed: Seed forwarded to the model.

    Returns:
        The report produced by ``evaluate``.

    Raises:
        ValueError: If ``split`` is neither 'validation' nor 'test'.
    """
    # Validate before doing any work. An explicit exception is used instead
    # of `assert`, which is stripped when Python runs with -O.
    if split not in ('validation', 'test'):
        raise ValueError(f"split must be 'validation' or 'test', got {split!r}")

    predictor = partial(model, ds=ds, split=split, seed=seed)

    if split == 'validation':
        head_tasks, tail_tasks = ds.open_kgc_val_heads, ds.open_kgc_val_tails
    else:
        head_tasks, tail_tasks = ds.open_kgc_test_heads, ds.open_kgc_test_tails

    return evaluate(
        ds=ds,
        name=name,
        split=split,
        head_predictions=predictor(head_tasks),
        tail_predictions=predictor(tail_tasks),
    )


In [7]:
import csv
from pathlib import Path
from ktz.collections import dconv
from irt2.loader import LOADER


def _run_all(datasets, models, splits, seed: int):
    for dataset_config in datasets:
        ds = LOADER[dataset_config['loader']](dataset_config['path'])
        print(str(ds))

        for split in splits:
            percentage = None
            # percentage = dataset_config['percentage'][split]
            sub_ds = ds.tasks_subsample_kgc(percentage, seed=seed)

            if split == 'validation':
                n_heads = len(sub_ds.open_kgc_val_heads)
                n_tails = len(sub_ds.open_kgc_val_tails)

            if split == 'test':
                n_heads = len(sub_ds.open_kgc_test_heads)
                n_tails = len(sub_ds.open_kgc_test_tails)

            print(
                '  ' + split,
                f'{seed=} {percentage=}'
                f' {n_heads} head and {n_tails} tail tasks'
                f' = {n_heads + n_tails}',
                sep='\n    - ',
            )

            meta = {
                'percentage': percentage,
                'total tasks': n_heads + n_tails,
                'head tasks': n_heads,
                'tail tasks': n_tails,
            }

            # print(', '.join(map(str, sub_ds.table_row)))
            for model in models:
                print('    - model: ', model)
                report = run(sub_ds, model, MODELS[model], split, seed)
                yield meta | flatten(report)


def run_all(out, datasets, models, splits, seed: int):
    """Run all configured experiments and write the result rows to a CSV file.

    Args:
        out: Path of the CSV file; missing parent directories are created.
        datasets: Dataset configs, forwarded to ``_run_all``.
        models: Model names, forwarded to ``_run_all``.
        splits: Split names, forwarded to ``_run_all``.
        seed: Seed forwarded to ``_run_all`` and stored in every row.

    The header is taken from the first row that is produced, with a
    leading 'seed' column.
    """
    out.parent.mkdir(exist_ok=True, parents=True)

    print(f'write results to {out}')
    # newline='' leaves line-ending handling to the csv module, as its
    # documentation requires for files passed to a writer
    with out.open(mode='w', newline='') as fd:
        writer = None

        for flat in _run_all(datasets, models, splits, seed):
            if writer is None:
                header = ['seed'] + list(flat.keys())

                writer = csv.DictWriter(fd, fieldnames=header)
                writer.writeheader()

            writer.writerow(flat | {'seed': seed})



# Configuration passed to `main`: the datasets, control models and splits to
# evaluate, plus the seed. Each dataset entry names its loader and path;
# 'percentage' holds a per-split fraction for subsampling the KGC tasks.
all_config = {
    'datasets': [
        # {
        #     'path': p_data / 'irt2' / 'irt2-cde-tiny',
        #     'loader': 'irt2',
        #     'percentage': {
        #         'validation': 0.17,
        #         'test': 0.02,
        #     },
        # },
        # {
        #     'path': p_data / 'irt2' / 'irt2-cde-small',
        #     'loader': 'irt2',
        #     'percentage': {
        #         'validation': 0.08,
        #         'test': 0.02,
        #     },
        # },
        # {
        #     'path': p_data / 'irt2' / 'irt2-cde-medium',
        #     'loader': 'irt2',
        #     'percentage': {
        #         'validation': 0.04,
        #         'test': 0.01,
        #     },
        # },
        # {
        #     'path': p_data / 'irt2' / 'irt2-cde-large',
        #     'loader': 'irt2',
        #     'percentage': {
        #         'validation': 0.05,
        #         'test': 0.02,
        #     },
        # },
        # {
        #     'path': p_data/ 'blp' / 'WN18RR',
        #     'loader': 'blp/wn18rr',
        #     'percentage': {
        #         'validation': 0.06,
        #         'test': 0.06,
        #     },
        # },
        # {
        #     'path': p_data/ 'blp' / 'FB15k-237',
        #     'loader': 'blp/fb15k237',
        #     'percentage': {
        #         'validation': 0.03,
        #         'test': 0.03,
        #     },
        # },
        {
            'path': p_data/ 'blp' / 'Wikidata5M',
            'loader': 'blp/wikidata5m',
            'percentage': {
                'validation': 0.09,
                'test': 0.08,
            },
        },
    ],
    'models': [
        # 'true-vertices',
        'true-mentions',
        # 'random-guessing',
    ],
    'splits': [
        'validation',
        'test',
    ],
    'seed': 31189,
}


def main(config):
    """Run the control experiments described by ``config`` and store a CSV.

    The results go to ``control-experiments-<seed>.csv`` inside the
    ``evaluation`` folder of the data directory; every entry of ``config``
    is forwarded to ``run_all`` as a keyword argument.
    """
    target_dir = p_data / "evaluation"
    filename = "control-experiments-{seed}.{suffix}".format(suffix='csv', **config)
    run_all(out=target_dir / filename, **config)


# Execute the control experiments configured in `all_config`.
main(all_config)
print('done')

write results to /home/felix/Complex/dkg/irt2/data/evaluation/control-experiments-31189.csv


BLP/WIKIDATA5M: 4818582 vertices | 822 relations | 11804166 mentions
  validation
    - seed=31189 percentage=0.09 520 head and 579 tail tasks = 1099
    - model:  true-mentions


  test
    - seed=31189 percentage=0.08 471 head and 529 tail tasks = 1000
    - model:  true-mentions


done


In [None]:
from typing import Iterable


# Configuration passed to `subsample_experiments`: the datasets to subsample
# (loader name and path each) and the seed.
subsample_config = {
    'datasets': [
        {
            'path': p_data / 'irt2' / 'irt2-cde-tiny',
            'loader': 'irt2',
        },
        {
            'path': p_data / 'irt2' / 'irt2-cde-small',
            'loader': 'irt2',
        },
        {
            'path': p_data / 'irt2' / 'irt2-cde-medium',
            'loader': 'irt2',
        },
        {
            'path': p_data / 'irt2' / 'irt2-cde-large',
            'loader': 'irt2',
        },
        {
            'path': p_data/ 'blp' / 'WN18RR',
            'loader': 'blp/wn18rr',
        },
        {
            'path': p_data/ 'blp' / 'FB15k-237',
            'loader': 'blp/fb15k237',
        },
        # {
        #     'path': p_data/ 'blp' / 'Wikidata5M',
        #     'loader': 'blp/wikidata5m',
        # },
    ],
    'seed': 31189,
}


def run_subsampling(out, datasets, seed, percentages: Iterable[float]):
    """Evaluate the 'true-mentions' model on validation at several sample sizes.

    For every dataset and every fraction in ``percentages``, the KGC tasks
    are subsampled, the 'true-mentions' model is run on the validation
    split, and one row is written to the CSV file ``out``.

    Args:
        out: Path of the CSV file; missing parent directories are created.
        datasets: Dataset configs with the keys 'loader' and 'path'.
        seed: Seed used for subsampling and forwarded to the model.
        percentages: Fractions passed to ``tasks_subsample_kgc``.
    """
    # Materialise once: the fractions are iterated again for every dataset,
    # which a one-shot iterator would not survive.
    percentages = list(percentages)

    out.parent.mkdir(exist_ok=True, parents=True)

    print(f'write results to {out}')
    # newline='' leaves line-ending handling to the csv module, as its
    # documentation requires for files passed to a writer
    with out.open(mode='w', newline='') as fd:
        writer = None

        for dataset_config in datasets:
            ds = LOADER[dataset_config['loader']](dataset_config['path'])
            print(str(ds))

            for percentage in percentages:
                # percent formatting instead of int(): truncation reported
                # 2.5% as 2%
                print(f'  - {percentage:6.1%}', f'{seed=}')
                sub_ds = ds.tasks_subsample_kgc(percentage=percentage, seed=seed)

                report = run(
                    sub_ds,
                    name='true-mentions',
                    model=MODELS['true-mentions'],
                    split='validation',
                    seed=seed,
                )

                flat = flatten(report)

                if writer is None:
                    header = ['percentage', 'head tasks', 'tail tasks', 'seed'] + list(flat.keys())
                    writer = csv.DictWriter(fd, fieldnames=header)
                    writer.writeheader()

                writer.writerow(flat | {
                    'percentage': percentage,
                    'head tasks': len(sub_ds.open_kgc_val_heads),
                    'tail tasks': len(sub_ds.open_kgc_val_tails),
                    'seed': seed,
                })


def subsample_experiments(config, ks: Iterable[int]):
    """Run the subsampling study and store it as CSV in the evaluation folder.

    The file is named ``subsample-experiments-<seed>.csv``. The sampled
    fractions are fixed: 1%, 2.5%, and then 5% to 100% in steps of 5%.
    Every entry of ``config`` is forwarded to ``run_subsampling``.

    ``ks`` is accepted but not read by this function.
    """
    target_dir = p_data / "evaluation"
    filename = "subsample-experiments-{seed}.{suffix}".format(
        seed=config['seed'],
        suffix='csv',
    )

    coarse_steps = [step / 100 for step in range(5, 101, 5)]

    run_subsampling(
        out=target_dir / filename,
        percentages=[0.01, 0.025, *coarse_steps],
        **config,
    )


# Run the subsampling study; note that `ks` is not read by `subsample_experiments`.
subsample_experiments(subsample_config, ks=[50, 100])