# KGC Control Experiments

We run control experiments to check the correctness of the metric calculation
and to get an approximate performance bound for chat-based LLMs which propose mentions.

In [1]:
%load_ext autoreload
%autoreload 2

import irt2

# Base data directory; the result CSVs of this notebook are written
# below p_data / "evaluation".
p_data = irt2.ENV.DIR.DATA

In [2]:
from irt2.types import Split, Task, Sample, MID, RID, VID
from irt2.dataset import IRT2
from irt2.evaluation import Predictions

import random
from typing import Iterable, Literal


# A collection of KGC tasks: each (MID, RID) key maps to the set of VIDs
# that count as correct answers for that task.
Tasks = dict[tuple[MID, RID], set[VID]]


def true_vids(tasks: Tasks, ds: IRT2, **_) -> Predictions:
    """Oracle baseline that always answers correctly.

    For every task the ground-truth vertex ids themselves are returned
    as predictions, each with a constant score of 1. ``ds`` and any
    further keyword arguments are accepted but ignored so that the
    signature matches the other control models.
    """
    for (mid, rid), answers in tasks.items():
        # lazily pair every correct vertex with the same score
        scored = ((vid, 1) for vid in answers)
        yield (mid, rid), scored

def true_mentions(
    tasks: Tasks,
    ds: IRT2,
    split: Literal['validation', 'test'],
    **_,
) -> Predictions:
    """Oracle baseline that knows the correct mentions.

    For every task, the mention strings of all ground-truth vertices are
    collected and handed to ``ds.find_by_mention``; every vertex returned
    by that lookup is predicted with a constant score of 1.

    The lookup is restricted to the train and validation splits, plus
    the test split when ``split == 'test'``.
    """
    if split == 'test':
        lookup_splits = (Split.train, Split.valid, Split.test)
    else:
        lookup_splits = (Split.train, Split.valid)

    idmap = ds.idmap
    for (mid, rid), gt_vids in tasks.items():
        # gather the surface forms of every mention of every true vertex
        mentions = set()
        for gt_vid in gt_vids:
            for gt_mid in idmap.vid2mids.get(gt_vid):
                mentions.add(idmap.mid2str[gt_mid])

        found = ds.find_by_mention(*mentions, splits=lookup_splits)

        yield (mid, rid), ((vid, 1) for vid in found)


def random_guessing(
    tasks: Tasks,
    ds: IRT2,
    split: Literal['validation', 'test'],
    seed: int,
    k: int = 100,
    **_,
) -> Predictions:
    """Random baseline: predict up to ``k`` randomly drawn, randomly scored vertices.

    Candidates are the vertices of the train and validation splits, plus
    those of the test split when ``split == 'test'``.

    Args:
        tasks: The tasks to answer; only the keys are used.
        ds: Dataset providing ``idmap.split2vids``.
        split: Evaluated split, decides whether test vertices are candidates.
        seed: Seed for the private random number generator.
        k: Number of vertices to guess per task (default 100). Clamped to
            the number of candidates so that small datasets do not raise.

    Yields:
        ``(task, [(vid, score), ...])`` pairs with scores from ``rng.random()``.
    """
    rng = random.Random(seed)

    ids = ds.idmap
    # the union creates a fresh set, so the in-place update below does not
    # touch the sets stored in the id map
    candidates = ids.split2vids[Split.train] | ids.split2vids[Split.valid]
    if split == 'test':
        candidates |= ids.split2vids[Split.test]

    perm = list(candidates)
    n_guesses = min(k, len(perm))

    for task in tasks:
        # Draw the scores right away: a lazy generator would pull from the
        # shared rng only when consumed, which makes the result depend on
        # the order in which the caller iterates the predictions.
        yield task, [
            (vid, rng.random())
            for vid in rng.sample(perm, k=n_guesses)
        ]



# Registry of control models: maps the model name used in configurations
# and reports to the function which produces its predictions.
MODELS = {
    'true-vertices': true_vids,
    'true-mentions': true_mentions,
    'random-guessing': random_guessing,
}

In [3]:
from irt2 import evaluation
from ktz.collections import dflat

import yaml
from functools import partial
from typing import Callable


def flatten(report: dict):
    """Turn an evaluation report into a single flat dict (one CSV row).

    The ``dataset``, ``model``, ``date`` and ``split`` entries come first,
    followed by the metrics flattened with ``dflat`` (using ``' '`` as
    separator) and sorted by key.
    """
    leading = {
        key: report[key]
        for key in ('dataset', 'model', 'date', 'split')
    }

    flat_metrics = dflat(report['metrics'], sep=' ')
    ordered_metrics = dict(sorted(flat_metrics.items()))

    return leading | ordered_metrics


def evaluate(
    ds: IRT2,
    name: str,
    split: str,
    head_predictions: Predictions,
    tail_predictions: Predictions,
):
    """Score KGC predictions and wrap the metrics in a report.

    Args:
        ds: Dataset the predictions were made for.
        name: Model name recorded in the report.
        split: Split the predictions belong to.
        head_predictions: Predictions for the head tasks.
        tail_predictions: Predictions for the tail tasks.

    Returns:
        The report created by ``evaluation.create_report``.
    """
    task = 'kgc'
    source_files = {'notebook': 'ipynb/control-experiments.ipynb'}

    metrics = evaluation.evaluate(
        ds=ds,
        task=task,
        split=split,
        head_predictions=head_predictions,
        tail_predictions=tail_predictions,
    )

    report = evaluation.create_report(
        metrics,
        ds,
        task=task,
        split=split,
        model=name,
        filenames=source_files,
    )
    return report



def run(
    ds: IRT2,
    name: str,
    model: Callable,
    split: str,
    seed: int,
):
    """Run one control model on one split of a dataset and evaluate it.

    Args:
        ds: Dataset providing the open-world KGC tasks.
        name: Model name recorded in the report.
        model: Callable which is invoked with the tasks as positional
            argument and ``ds``, ``split`` and ``seed`` as keyword arguments.
        split: Either ``'validation'`` or ``'test'``.
        seed: Seed forwarded to the model.

    Returns:
        The evaluation report for the model's head and tail predictions.
    """
    predict = partial(model, ds=ds, split=split, seed=seed)

    assert split in ('validation', 'test')

    # pick the task collections which belong to the requested split
    if split == 'validation':
        head_tasks = ds.open_kgc_val_heads
        tail_tasks = ds.open_kgc_val_tails

    if split == 'test':
        head_tasks = ds.open_kgc_test_heads
        tail_tasks = ds.open_kgc_test_tails

    return evaluate(
        ds=ds,
        name=name,
        split=split,
        head_predictions=predict(head_tasks),
        tail_predictions=predict(tail_tasks),
    )


In [4]:
import csv
from pathlib import Path
from ktz.collections import dconv, dflat
from irt2.loader import from_config_file


def _run_all(datasets_config, models, splits, seed: int):
    """Evaluate every model on every split of every configured dataset.

    Datasets are loaded with ``from_config_file`` using ``datasets_config``.
    Progress is printed along the way; for each (dataset, split, model)
    combination one flat result row is yielded, consisting of the task
    statistics of the split merged with the flattened report.
    """
    loaded = from_config_file(
        root_path=irt2.ENV.DIR.ROOT,
        **datasets_config,
    )

    for _, ds in loaded:
        print('\n', str(ds))

        for split in splits:
            if split == 'validation':
                n_heads = len(ds.open_kgc_val_heads)
                n_tails = len(ds.open_kgc_val_tails)

            if split == 'test':
                n_heads = len(ds.open_kgc_test_heads)
                n_tails = len(ds.open_kgc_test_tails)

            # report the subsampling percentage if the loader applied one
            loader_options = ds.meta['loader']
            percentage = (
                loader_options["subsample"].get(split, None)
                if "subsample" in loader_options
                else None
            )

            n_total = n_heads + n_tails
            print(
                '  ' + split,
                f'percentage={percentage}',
                f'{n_heads} head and {n_tails} tail tasks = {n_total}',
                sep='\n    - ',
            )

            stats = {
                'percentage': percentage,
                'total tasks': n_total,
                'head tasks': n_heads,
                'tail tasks': n_tails,
            }

            for model_name in models:
                print('    - model: ', model_name)
                report = run(ds, model_name, MODELS[model_name], split, seed)

                hits_at_10 = report['metrics']['all']['micro']['hits_at_10']
                print(f'    - result: {hits_at_10:2.3f}')

                yield stats | flatten(report)


def run_all(out, datasets_config, models, splits, seed: int):
    """Run all control experiments and write the results to a CSV file.

    Args:
        out: Path of the CSV file; missing parent directories are created
            and an existing file is overwritten.
        datasets_config: Keyword arguments selecting the datasets to load.
        models: Names of the models (keys of ``MODELS``) to evaluate.
        splits: Splits to evaluate on.
        seed: Seed forwarded to the models and recorded in every row.

    The CSV header is derived from the first result row. If no row is
    produced at all, the file is left empty.
    """
    out.parent.mkdir(exist_ok=True, parents=True)

    print(f'write results to {out}')
    # The csv module requires files opened with newline='' - otherwise
    # newlines are translated a second time (e.g. blank lines on Windows).
    with out.open(mode='w', newline='') as fd:
        writer = None

        for flat in _run_all(datasets_config, models, splits, seed):
            if writer is None:
                # rows become available lazily, so the header can only be
                # determined once the first one arrives
                header = ['seed'] + list(flat.keys())

                writer = csv.DictWriter(fd, fieldnames=header)
                writer.writeheader()

            writer.writerow(flat | {'seed': seed})



# Experiment configuration: main() forwards these entries as keyword
# arguments to run_all(). The commented-out lines are alternative
# dataset selections.
all_config = {
    'datasets_config': {
        'config_file': irt2.ENV.DIR.CONF / 'datasets' / 'original.yaml',
        'without': ['blp/*'],
        # 'config_file': irt2.ENV.DIR.CONF / 'datasets' / 'original-subsampled.yaml',
        # 'config_file': irt2.ENV.DIR.CONF / 'datasets' / 'full.yaml',
        # 'config_file': irt2.ENV.DIR.CONF / 'datasets' / 'full-subsampled.yaml',
        # 'without': ('blp/wikidata5m', )
    },
    'models': ['true-mentions'],
    'splits': [
        'validation',
        'test',
    ],
    'seed': 31189,
}


def main(config):
    """Run the control experiments described by ``config``.

    The results are written to
    ``<p_data>/evaluation/control-experiments-<config file stem>.csv``.
    """
    stem = config['datasets_config']['config_file'].stem
    target = p_data / "evaluation" / f"control-experiments-{stem}.csv"
    run_all(out=target, **config)


# Execute the control experiments for the configuration defined above.
main(all_config)
print('done')

write results to /home/felix/Complex/dkg/irt2/data/evaluation/control-experiments-original.csv



 IRT2/CDE-L: 15020 vertices | 45 relations | 32666 mentions
  validation
    - percentage=None
    - 1184 head and 19654 tail tasks = 20838
    - model:  true-mentions


    - result: 0.625
  test
    - percentage=None
    - 2697 head and 46104 tail tasks = 48801
    - model:  true-mentions


    - result: 0.619



 IRT2/CDE-M: 15020 vertices | 45 relations | 32666 mentions
  validation
    - percentage=None
    - 782 head and 23656 tail tasks = 24438
    - model:  true-mentions


    - result: 0.636
  test
    - percentage=None
    - 3174 head and 94747 tail tasks = 97921
    - model:  true-mentions


    - result: 0.644



 IRT2/CDE-S: 14207 vertices | 12 relations | 28582 mentions
  validation
    - percentage=None
    - 131 head and 13084 tail tasks = 13215
    - model:  true-mentions


    - result: 0.628
  test
    - percentage=None
    - 513 head and 52141 tail tasks = 52654
    - model:  true-mentions


    - result: 0.624



 IRT2/CDE-T: 12389 vertices | 5 relations | 23894 mentions
  validation
    - percentage=None
    - 68 head and 5299 tail tasks = 5367
    - model:  true-mentions


    - result: 0.827
  test
    - percentage=None
    - 423 head and 47434 tail tasks = 47857
    - model:  true-mentions


    - result: 0.824
done


In [8]:
from typing import Iterable
from irt2.loader import from_config_file

def run_subsampling(out, datasets_config, percentages: Iterable[float], seed: int):
    """Evaluate the true-mentions model on subsampled validation tasks.

    For every configured dataset and every percentage, the KGC validation
    tasks are subsampled via ``dataset.tasks_subsample_kgc`` and the
    ``'true-mentions'`` model is evaluated on the result. One CSV row is
    written per (dataset, percentage) combination.

    Args:
        out: Path of the CSV file; missing parent directories are created
            and an existing file is overwritten.
        datasets_config: Keyword arguments selecting the datasets to load.
            The datasets must not already be subsampled by the loader.
        percentages: Fractions handed to the subsampling as
            ``percentage_val``. Any iterable is accepted, including
            one-shot iterators.
        seed: Seed used for subsampling and recorded in every row.
    """
    # The percentages are re-used for every dataset: materialize them once
    # so that a generator is not exhausted after the first dataset.
    percentages = tuple(percentages)

    out.parent.mkdir(exist_ok=True, parents=True)

    print(f'write results to {out}')
    # The csv module requires files opened with newline='' - otherwise
    # newlines are translated a second time (e.g. blank lines on Windows).
    with out.open(mode='w', newline='') as fd:
        writer = None

        datasets = from_config_file(
            root_path=irt2.ENV.DIR.ROOT,
            **datasets_config,
        )

        for _, dataset in datasets:
            print(str(dataset))
            print(dataset.meta['loader'])
            # subsampling an already subsampled dataset would make the
            # reported percentages meaningless
            assert "subsample" not in dataset.meta['loader']

            for percentage in percentages:
                print(f'  - {int(percentage * 100):3d}%', f'{seed=}')
                sub_ds = dataset.tasks_subsample_kgc(
                    seed=seed,
                    percentage_val=percentage,
                )

                report = run(
                    sub_ds,
                    name='true-mentions',
                    model=MODELS['true-mentions'],
                    split='validation',
                    seed=seed,
                )

                flat = flatten(report)

                if writer is None:
                    # the header is derived from the first result row
                    header = ['percentage', 'head tasks', 'tail tasks', 'seed'] + list(flat.keys())
                    writer = csv.DictWriter(fd, fieldnames=header)
                    writer.writeheader()

                writer.writerow(flat | {
                    'percentage': percentage,
                    'head tasks': len(sub_ds.open_kgc_val_heads),
                    'tail tasks': len(sub_ds.open_kgc_val_tails),
                    'seed': seed
                })


def subsample_experiments(datasets_config, percentages, seed):
    """Run the subsampling experiments for the given dataset selection.

    The results are written to
    ``<p_data>/evaluation/subsample-experiments-<config file stem>.csv``.
    """
    stem = datasets_config['config_file'].stem
    target = p_data / "evaluation" / f"subsample-experiments-{stem}.csv"

    run_subsampling(
        out=target,
        datasets_config=datasets_config,
        percentages=percentages,
        seed=seed,
    )


# Subsampling study restricted to the 'irt2/tiny' entry of original.yaml.
# Percentages: 1%-9% in steps of 1%, 10%-35% in steps of 5% and
# 40%-100% in steps of 20%.
subsample_experiments(
    datasets_config={
        'config_file': irt2.ENV.DIR.CONF / 'datasets' / 'original.yaml',
        'only': ['irt2/tiny'],
    },
    # irt2.ENV.DIR.CONF / 'datasets' / 'full.yaml',
    percentages=(
        [x/100 for x in range(1, 10)] +
        [x/100 for x in range(10, 40, 5)] +
        [x/100 for x in range(40, 101, 20)]
    ),
    seed=31189,
)

print('done')

write results to /home/felix/Complex/dkg/irt2/data/evaluation/subsample-experiments-original.csv
done
