### Reduce Sample
This notebook reduces the sample generated by `sample_data.ipynb` by randomly choosing at most `max_usages` word usages per headword.

### Usage
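In the second code cell, the `sample_file` variable sets the path to the sample file generated by `sample_data.ipynb`, and `max_usages` sets the maximum number of usages to keep per headword. The reduced sample is written to `../data/outputs/`, with the `SAMPLED_PROCESSED` prefix of the file name replaced by `REDUCED_SAMPLE`.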

In [1]:
import random
import json
from Levenshtein import distance as lev

In [2]:
# Upper bound on the number of usages kept for each headword.
max_usages = 5

# Sample file to reduce. To switch corpus, assign one of these paths instead:
#   "../data/outputs/SAMPLED_PROCESSED_ccoha1.json"
#   "../data/outputs/SAMPLED_PROCESSED_eng_news_2020_1M-sentences.json"
#   "../data/outputs/SAMPLED_PROCESSED_swe_news_2022_1M-sentences.json"
sample_file = "../data/outputs/SAMPLED_PROCESSED_kubhist2a.json"

# Output path: the text following "SAMPLED_PROCESSED" in the input path is
# appended to the "REDUCED_SAMPLE" prefix in the outputs directory.
reduced_file = "../data/outputs/REDUCED_SAMPLE" + sample_file.split("SAMPLED_PROCESSED")[1]

In [3]:
# Load the sample. The encoding is given explicitly so non-ASCII text is
# decoded the same way on every platform; the reduced file is written as
# UTF-8 too.
with open(sample_file, "r", encoding="utf-8") as f:
    sample = json.load(f)

# Reuse the seed stored in the sample file so the reduction is reproducible.
seed = sample["seed"]
random.seed(seed)

# Adopt the metadata values from the sample file; "sample" collects the
# usages that are kept.
reduced_sample = {
    "seed": seed,
    "sample_size": sample["headwords_found"],
    "max_senses": sample["max_senses"],
    "sample": [],
}

for entry in sample["entries"].values():  # one list of usages per headword
    if len(entry) > max_usages:
        # random.sample draws max_usages distinct usages uniformly at random
        # and leaves `entry` untouched. The previous pop-based loop shrank
        # `entry` while also subtracting len(choice) from the index bound, so
        # usages near the end of the list were under-sampled or unreachable.
        reduced_sample["sample"].extend(random.sample(entry, max_usages))
    else:
        # Few enough usages: keep all of them.
        reduced_sample["sample"].extend(entry)

In [4]:
# Write the reduced sample as indented JSON. ensure_ascii=False keeps
# non-ASCII characters unescaped, so the file is opened as UTF-8.
with open(reduced_file, mode="w", encoding="utf-8") as out_file:
    json.dump(reduced_sample, out_file, ensure_ascii=False, indent=4)

In [5]:
# Sanity check: total number of usages kept in the reduced sample.
reduced_usage_count = len(reduced_sample["sample"])
print(reduced_usage_count)

356
