In [17]:
import random
from pathlib import Path
from typing import List

from src.zipper.core import TestSet, InOutPair


def source_keys_with_max_samples_left(sources_dict: dict) -> List[str]:
    """Return the keys of ``sources_dict`` whose value equals the largest value.

    Args:
        sources_dict: Mapping from a source name to the number of samples
            that source still has left.

    Returns:
        Every key tied for the maximum value, in the mapping's iteration
        order. An empty mapping yields an empty list.
    """
    if not sources_dict:
        # max() raises ValueError on an empty sequence; with no sources there
        # are simply no keys to report.
        return []
    max_value = max(sources_dict.values())
    return [key for key, value in sources_dict.items() if value == max_value]

In [18]:
# Survey sizing: how many utterances are sampled from each dataset (the same
# number also seeds every source's "samples left" counter), and how many
# items one hit contains, ground truth included.
utterances_per_source = 36
samples_per_hit = 5

# Output locations; the survey directory is created when it is missing.
out_dir = Path("/media/arnas/SSD Disk/uni/semester_4/thesis-files/mos/lt_synthesizer_survey")
out_dir.mkdir(parents=True, exist_ok=True)
data_filepath = out_dir / "mos_data.txt"
utterances_filepath = out_dir / "mos_utterances.txt"

# Dataset name -> filelist the dataset's entries are read from.
dataset_filelists = {
    'aurimas': Path("/media/arnas/SSD Disk/uni/semester_4/thesis-files/mos/lt_synthesizer_survey/aurimas/filelist.txt"),
}

# First load every filelist, then draw a random subset from each dataset.
dataset_entries = {
    name: TestSet.from_in_out_pair(
        InOutPair(in_filepath=filelist_path, out_directory=Path("."))
    ).entries
    for name, filelist_path in dataset_filelists.items()
}
dataset_entries = {
    name: random.sample(loaded_entries, utterances_per_source)
    for name, loaded_entries in dataset_entries.items()
}

datasets = list(dataset_filelists)
sources = ["aurimas", "microsoft_leonas", "liepa_edvardas", "vdu"]

# Per-source bookkeeping: remaining quota and the entries assigned so far.
sources_samples_left = dict.fromkeys(sources, utterances_per_source)
source_filelists = {source: [] for source in sources}

# Integer ids, assigned in listing order.
dataset_ids = dict(zip(datasets, range(len(datasets))))
source_ids = dict(zip(sources, range(len(sources))))
utterance_ids = {
    str(sampled.path): idx
    for idx, sampled in enumerate(
        entry for entries in dataset_entries.values() for entry in entries
    )
}
# NOTE(review): the original author's note here read "text: List[source]" —
# presumably the intended shape of the assignment built afterwards; confirm.

In [19]:
# Build one hit per sampled utterance: the ground-truth dataset plus
# (samples_per_hit - 1) synthesizer sources, in shuffled order.

# Reset the per-source bookkeeping so that re-running this cell on its own
# does not keep decrementing counters / appending duplicate entries.
sources_samples_left = {source: utterances_per_source for source in sources}
source_filelists = {source: [] for source in sources}

sources_per_hit = samples_per_hit - 1  # one slot is reserved for the ground truth
if not 0 <= sources_per_hit <= len(sources):
    raise ValueError(
        f"samples_per_hit={samples_per_hit} requires {sources_per_hit} distinct sources, "
        f"but {len(sources)} are configured"
    )

hits = {}
for dataset in datasets:
    # Hit keys are "<filelist stem>/<entry path>"; wrapping the stem in Path
    # makes the join work whether entry.path is a Path or a plain string.
    key_prefix = Path(dataset_filelists[dataset].stem)
    for entry in dataset_entries[dataset]:
        # Pick sources one at a time, always from those with the most samples
        # left. Sampling all of them at once from the top tier fails as soon as
        # fewer sources are tied for the maximum than are needed.
        candidates = dict(sources_samples_left)
        curr_sources = []
        while len(curr_sources) < sources_per_hit:
            picked = random.choice(source_keys_with_max_samples_left(candidates))
            curr_sources.append(picked)
            del candidates[picked]

        hit = [dataset, *curr_sources]
        random.shuffle(hit)
        hits[str(key_prefix / entry.path)] = (hit, entry.text)

        for source in curr_sources:
            sources_samples_left[source] -= 1
            source_filelists[source].append(entry)


In [20]:
# Show the hits, randomise their order, then show them again.
print("Before shuffle:", hits, sep="\n")

# Dicts keep insertion order, so rebuilding from shuffled keys reorders the hits.
keys = [*hits]
random.shuffle(keys)
hits = dict((key, hits[key]) for key in keys)

print("\n\n\nAfter shuffle:", hits, sep="\n")

Before shuffle:
{'filelist/si_geguze_yra_neiprasta_vesi_ir_net_musa_vesumo_rekordus.wav': (['vdu', 'aurimas', 'microsoft_leonas', 'aurimas', 'liepa_edvardas'], 'si_geguze_yra_neiprastai_vesi_ir_net_musa'), 'filelist/konfiskavus_gyvuna_draudimas_ji_isigyti.wav': (['aurimas', 'microsoft_leonas', 'vdu', 'aurimas', 'liepa_edvardas'], 'konfiskavus_gyvuna_draudimas_ji_isigyti_ir_laikyti_butu'), 'filelist/taigi_sodininkai_gali_dziugauti.wav': (['aurimas', 'liepa_edvardas', 'aurimas', 'microsoft_leonas', 'vdu'], '_taigi_sodininkai_gali_dziugauti_ir_trinti_rankomis'), 'filelist/nors_kognityviniu_gebejimu_skirtumas_buvo_nedidelis.wav': (['microsoft_leonas', 'aurimas', 'aurimas', 'vdu', 'liepa_edvardas'], 'nors_kognityviniu_gebejimu_skirtumas_buvo_nedidelis_ir_nepakankamas'), 'filelist/pasak_istatymo_iniciatores.wav': (['vdu', 'aurimas', 'aurimas', 'liepa_edvardas', 'microsoft_leonas'], 'pasak_istatymo_iniciatores_seimo_laisves_frakcijos_nares_ievos'), 'filelist/pavyzdziui_ore_ne_dirvos_pavirsiuj

In [21]:
# Print the dataset, source and utterance id tables, one table per line.
print(dataset_ids, source_ids, utterance_ids, sep="\n")

{'aurimas': 0}
{'aurimas': 0, 'microsoft_leonas': 1, 'liepa_edvardas': 2, 'vdu': 3}
{'si_geguze_yra_neiprasta_vesi_ir_net_musa_vesumo_rekordus.wav': 0, 'konfiskavus_gyvuna_draudimas_ji_isigyti.wav': 1, 'taigi_sodininkai_gali_dziugauti.wav': 2, 'nors_kognityviniu_gebejimu_skirtumas_buvo_nedidelis.wav': 3, 'pasak_istatymo_iniciatores.wav': 4, 'pavyzdziui_ore_ne_dirvos_pavirsiuje.wav': 5, 'pranesama_kad_nuo_invazijos_pradzios.wav': 6, 'seimas_emesi_gyvunu_geroves_ir_apsaugos_istatymo.wav': 7, 'siuo_projektu_siekiama_apsisaugoti_nuo_tu.wav': 8, 'is_viso_rajone_yra_penki_sovietiniai.wav': 9, 'draudimas_butu_taikomas.wav': 10, 'tyrimas_turetu_buti_naudingas.wav': 11, 'bet_ar_pavyks_ekspertai_labai_abejoja.wav': 12, 'mariupolyje_slopstant_aktyviems_karo_vieksmams.wav': 13, 'sprendimas_kuris_mano_manymu_turejo_buti_piimtas.wav': 14, 'velu_ketvirtadienio_vakara_paaiskejo.wav': 15, 'nezinia_ar_juoktis_ar_verkti.wav': 16, 'pasak_kremliaus_atstovo_naujo.wav': 17, 'vis_delto_panasu_kad_dziugauti_ji

In [22]:
# Render each id table as "name: idx" rows joined by newline + tab.
dataset_ids_text = '\n\t'.join(f"{name}: {idx}" for name, idx in dataset_ids.items())
source_ids_text = '\n\t'.join(f"{name}: {idx}" for name, idx in source_ids.items())
utterance_ids_text = '\n\t'.join(f"{name}: {idx}" for name, idx in utterance_ids.items())

# One "<hit key>: <hit value>" line per hit, a blank gap, then the id tables.
hit_lines = [f"{utterance}: {hit_info}" for utterance, hit_info in hits.items()]
text = "".join([
    '\n'.join(hit_lines),
    "\n\n\n",
    f"Dataset ids:\n\t{dataset_ids_text}\n\n",
    f"Source ids:\n\t{source_ids_text}\n\n",
    f"Utterance ids:\n\t{utterance_ids_text}\n\n",
])

with data_filepath.open(mode='w', encoding='utf-8') as f:
    f.write(text)

In [23]:
# One line per hit: "<utterance id>: <first element of the hit value>".
# NOTE(review): the id is looked up by the part of the hit key after the last
# '/', so this presumably assumes entry paths are bare file names — confirm.
text = '\n'.join(
    f"{utterance_ids[utterance.rsplit('/', 1)[-1]]}: {hit_info[0]}"
    for utterance, hit_info in hits.items()
)

with utterances_filepath.open(mode='w', encoding='utf-8') as f:
    f.write(text)


In [24]:
# Write one "<source>.txt" per source, holding a "path|text" row for every
# entry assigned to that source.
filelist_out_dir = out_dir / 'filelists'
filelist_out_dir.mkdir(parents=True, exist_ok=True)
for source, filelist in source_filelists.items():
    rows = (f"{e.path}|{e.text}" for e in filelist)
    with (filelist_out_dir / f"{source}.txt").open(mode='w', encoding='utf-8') as f:
        f.write('\n'.join(rows))
