# Text Subsampling

We draw up to `n` texts for training/evaluation.
This does not concern the BLP datasets.

In [3]:
import irt2
from irt2.dataset import IRT2
from irt2.loader import LOADER

import gzip
import pickle
import random
from itertools import islice
from collections import defaultdict


# root directory holding all datasets
p_data = irt2.ENV.DIR.DATA


# IRT2

# Subsampling configuration: 'at_most' and 'seed' are handed to draw()
# as n and seed; every entry of 'datasets' names a dataset directory,
# the loader key used to open it, and per-split percentages.
config = dict(
    at_most=30,
    seed=31189,
    datasets=[
        dict(
            path=p_data / "irt2" / name,
            loader="irt2",
            percentage=dict(validation=validation, test=test),
        )
        # (directory name, validation percentage, test percentage)
        for name, validation, test in (
            ("irt2-cde-tiny", 0.17, 0.02),
            ("irt2-cde-small", 0.08, 0.02),
            ("irt2-cde-medium", 0.04, 0.01),
            ("irt2-cde-large", 0.05, 0.02),
        )
    ],
)



def draw(ds: "IRT2", ds_conf: dict, seed: int, n: int) -> None:
    """Draw up to ``n`` texts per ``mid`` for each split and write them to disk.

    For each of the three context sources (closed train, open
    validation, open test) the contexts are grouped by their ``mid``
    attribute, shuffled with a random generator seeded by ``seed`` and
    cut down to at most ``n`` entries per ``mid``. Contexts whose text
    contains a newline are skipped (the text file stores one context
    per line).

    Two files are written to ``ds_conf['path']`` per source:

    - ``{name}-{seed}-{n}.txt.gz``: one whitespace-stripped text per line
    - ``{name}-{seed}-{n}.pkl``: pickled ``dict`` mapping each ``mid``
      to the list of selected (unstripped) texts

    Args:
        ds: Dataset offering the ``closed_contexts``,
            ``open_contexts_val`` and ``open_contexts_test`` context
            managers.
        ds_conf: Dataset configuration; only ``ds_conf['path']`` is read.
        seed: Seed of the random generator that is shared by all three
            sources (they are processed in a fixed order).
        n: Maximum number of texts kept per ``mid``.
    """
    managers = {
        'closed.train-contexts': ds.closed_contexts,
        'open.validation-contexts': ds.open_contexts_val,
        'open.test-contexts': ds.open_contexts_test,
    }

    # a single generator for all sources: the drawn samples depend on
    # the processing order above, which must therefore stay fixed
    rng = random.Random(seed)

    for name, mgr in managers.items():
        grouped = defaultdict(list)
        sampled = {}

        with mgr() as contexts:
            for context in contexts:
                if '\n' in context.data:
                    continue

                # reproducibility given as order of insertion stays the same
                grouped[context.mid].append(context)

            for mid, candidates in grouped.items():
                # always shuffle (even if nothing is cut off) to keep
                # the generator state identical between runs
                rng.shuffle(candidates)
                sampled[mid] = [ctx.data for ctx in candidates[:n]]

        stem = f'{name}-{seed}-{n}'

        fpath = ds_conf['path'] / f'{stem}.txt.gz'
        print(f'writing {fpath}')

        total = 0
        with gzip.open(fpath, mode='wb') as fd:
            for texts in sampled.values():
                for text in texts:
                    fd.write(text.strip().encode() + b"\n")
                    total += 1

        print(f'wrote {total} contexts, writing pickled dict')
        with (ds_conf['path'] / f'{stem}.pkl').open(mode='wb') as fd:
            pickle.dump(sampled, fd)


# sampling parameters shared by all datasets
kwargs = {'seed': config['seed'], 'n': config['at_most']}

for ds_conf in config['datasets']:
    # look up the loader by its configured name, open the dataset
    # at the configured path and subsample its texts
    draw(LOADER[ds_conf['loader']](ds_conf['path']), ds_conf, **kwargs)



# draw(ds=IRT2.from_dir(irt2.ENV.DIR.DATA / 'irt2' / 'irt2-cde-tiny'), **kwargs)

writing /home/felix/Complex/dkg/irt2/data/irt2/irt2-cde-tiny/closed.train-contexts-31189-30.txt.gz


wrote 90810 contexts, writing pickled dict


writing /home/felix/Complex/dkg/irt2/data/irt2/irt2-cde-tiny/open.validation-contexts-31189-30.txt.gz


wrote 39286 contexts, writing pickled dict


writing /home/felix/Complex/dkg/irt2/data/irt2/irt2-cde-tiny/open.test-contexts-31189-30.txt.gz


wrote 348717 contexts, writing pickled dict


writing /home/felix/Complex/dkg/irt2/data/irt2/irt2-cde-small/closed.train-contexts-31189-30.txt.gz


wrote 175854 contexts, writing pickled dict


writing /home/felix/Complex/dkg/irt2/data/irt2/irt2-cde-small/open.validation-contexts-31189-30.txt.gz


wrote 78277 contexts, writing pickled dict


writing /home/felix/Complex/dkg/irt2/data/irt2/irt2-cde-small/open.test-contexts-31189-30.txt.gz


wrote 311113 contexts, writing pickled dict


writing /home/felix/Complex/dkg/irt2/data/irt2/irt2-cde-medium/closed.train-contexts-31189-30.txt.gz


wrote 268684 contexts, writing pickled dict


writing /home/felix/Complex/dkg/irt2/data/irt2/irt2-cde-medium/open.validation-contexts-31189-30.txt.gz


wrote 75047 contexts, writing pickled dict


writing /home/felix/Complex/dkg/irt2/data/irt2/irt2-cde-medium/open.test-contexts-31189-30.txt.gz


wrote 294529 contexts, writing pickled dict


writing /home/felix/Complex/dkg/irt2/data/irt2/irt2-cde-large/closed.train-contexts-31189-30.txt.gz


wrote 447929 contexts, writing pickled dict


writing /home/felix/Complex/dkg/irt2/data/irt2/irt2-cde-large/open.validation-contexts-31189-30.txt.gz


wrote 58456 contexts, writing pickled dict


writing /home/felix/Complex/dkg/irt2/data/irt2/irt2-cde-large/open.test-contexts-31189-30.txt.gz


wrote 135206 contexts, writing pickled dict


In [None]:
# BLP

# TODO