In [76]:
import pandas as pd
import json
from itertools import chain
from math import ceil

# Maximum number of items (rows) placed in a single batch.
MAX_BATCH_SIZE = 100
# Number of consecutive items in a batch that share one `_block` id;
# also written to each batch's task metadata as `batchSize`.
BLOCK_SIZE = 10

In [77]:
def read_sents(filepath: str) -> list:
    """Read a UTF-8 text file and return its lines without line terminators.

    Lines are split on newline characters only (CRLF and CR are first
    normalised to LF by Python's universal-newline mode). Unlike
    ``str.splitlines()``, characters such as U+2028, U+2029, U+0085 or form
    feed inside a sentence do NOT start a new entry, so files that are meant
    to be read in parallel stay aligned line-by-line.

    Args:
        filepath: Path of the text file to read.

    Returns:
        A list with one string per line, in file order. An empty file yields
        an empty list; a trailing newline does not produce an extra entry.
    """
    with open(filepath, mode="r", encoding="utf-8") as f:
        # Every line except possibly the last ends with exactly one "\n".
        return [line[:-1] if line.endswith("\n") else line for line in f]

In [78]:
# Load the parallel data: source sentences, reference translations and the
# two hypothesis files. The language codes are kept next to the data they describe.
SRC_LANG_CODE, TGT_LANG_CODE = "mlt", "eng"

# All input files live in one directory; change it here rather than in every path.
DATA_DIR = "/home/falcao/experiments/mt-en"

srcs = read_sents(DATA_DIR + "/mt-en.mt")
refs = read_sents(DATA_DIR + "/mt-en.en")
hyps_gtrans = read_sents(DATA_DIR + "/hyp-gtrans.en")
hyps_mstrans = read_sents(DATA_DIR + "/hyp-mstrans.en")

# The files are combined row-by-row downstream, so every target file must have
# exactly as many lines as the source file. Fail early with a clear message.
for _name, _sents in (
    ("refs", refs),
    ("hyps_gtrans", hyps_gtrans),
    ("hyps_mstrans", hyps_mstrans),
):
    if len(_sents) != len(srcs):
        raise ValueError(
            "{} has {} lines but the source file has {}".format(
                _name, len(_sents), len(srcs)
            )
        )

In [79]:
# Create one DataFrame per target set (two hypothesis sets plus the reference)
# and concatenate them into the final DataFrame.

def _build_items_df(item_type: str, target_id: str, target_texts: list) -> pd.DataFrame:
    """Pair every source sentence with the matching line of one target file.

    Args:
        item_type: Value stored in the `itemType` column for every row.
        target_id: Value stored in the `targetID` column for every row.
        target_texts: Target sentences, one per source sentence in `srcs`.

    Returns:
        A DataFrame with the columns itemID, itemType, sourceID, sourceText,
        targetID and targetText (in that order), one row per source sentence.
        `itemID` is the 0-based line number of the source sentence.
    """
    return pd.DataFrame({
        "itemID": range(len(srcs)),
        "itemType": item_type,
        "sourceID": "mt-en.mt",
        "sourceText": srcs,
        "targetID": target_id,
        "targetText": target_texts,
    })


df_gtrans = _build_items_df("TGT", "hyp-gtrans.en", hyps_gtrans)
df_mstrans = _build_items_df("TGT", "hyp-mstrans.en", hyps_mstrans)
df_ref = _build_items_df("REF", "mt-en.en", refs)

df = pd.concat([df_gtrans, df_mstrans, df_ref])

In [80]:
# Shuffle all rows; the fixed seed makes the resulting order reproducible.
# NOTE: this rebinds `df`, so re-running only this cell shuffles the
# already-shuffled frame again and yields a different order.
df = df.sample(frac=1, random_state=1)

In [81]:
# Split the shuffled items into consecutive batches of at most MAX_BATCH_SIZE
# rows and build one {"items": [...], "task": {...}} dict per batch.
n_batches = ceil(len(df) / MAX_BATCH_SIZE)
batches_json = []

for batch_id in range(n_batches):
    # rows belonging to this batch (the last batch may be shorter)
    first = batch_id * MAX_BATCH_SIZE
    last = first + MAX_BATCH_SIZE
    batch_df = df[first:last].copy()

    # `_item` numbers the rows within the batch (0-based); `_block` groups every
    # BLOCK_SIZE consecutive rows under the same 0-based block id
    batch_df["_item"] = range(len(batch_df))
    batch_df["_block"] = batch_df["_item"] // BLOCK_SIZE

    # put the two bookkeeping columns first, keeping the other columns in order
    leading = ["_block", "_item"]
    batch_df = batch_df[leading + [col for col in batch_df.columns if col not in leading]]

    # generate dict for the batch
    batch_json = {
        "items": batch_df.to_dict(orient="records"),
        "task": {
            "batchNo": batch_id + 1,  # 1-based
            # NOTE(review): this stores the block size, not the number of items
            # in the batch - confirm this is what the consumer expects.
            "batchSize": BLOCK_SIZE,
            "randomSeed": 1,
            "requiredAnnotations": 1,
            # language codes come from the constants defined alongside the data;
            # no `src_lang` / `tgt_lang` variables exist in this notebook
            "sourceLanguage": SRC_LANG_CODE,
            "targetLanguage": TGT_LANG_CODE,
        }
    }

    # add to json list of batches
    batches_json.append(batch_json)

In [None]:
# Serialise all batches into a single JSON file.
# Write-only mode is sufficient (nothing is read back), and the encoding is
# stated explicitly, matching how the input files are read.
with open("/home/falcao/Appraise/MyCampaign/batches-3x100.json", mode="w", encoding="utf-8") as fout:
    json.dump(batches_json, fout, indent=2)