# Description
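**Functionality**: Removes homographs for which any one pronunciation class has fewer than three examples, since a class with fewer than three examples cannot be represented in each of the train, dev, and test splits. Homographs that have only a single pronunciation class in the data are removed as well.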

In [1]:
import os
import shutil
import glob
import pandas as pd
from tqdm import tqdm

In [2]:
# Input folders: the train/eval .tsv files read by this notebook.
WHD_DATA = "C:/Users/jseal/Dev/dissertation/Data/WikipediaHomographData/data/"
TRAIN = f"{WHD_DATA}train_cpy/"
EVAL = f"{WHD_DATA}eval_cpy/"

# Output folders: receive the files of the homographs that survive filtering.
WHD_DATA_VARIANT = f"{WHD_DATA}variant_limit_data/"
TRAIN_VARIANT = f"{WHD_DATA_VARIANT}train/"
EVAL_VARIANT = f"{WHD_DATA_VARIANT}eval/"

# Lift the row limit so displayed frames are never truncated.
pd.options.display.max_rows = None

### Script

In [3]:
# Minimum number of examples each pronunciation class (wordid) must have for
# its homograph to be kept.
MIN_EXAMPLES_PER_CLASS = 3

#Combine train and eval data into a single frame (one .tsv file per glob match)
dfs = []
for f in tqdm(glob.iglob(TRAIN +'*.tsv')):
    df = pd.read_table(f)
    dfs.append(df)
for f in tqdm(glob.iglob(EVAL +'*.tsv')):
    df = pd.read_table(f)
    dfs.append(df)

train_eval_df = pd.concat(dfs)

#Remove invariant homographs, i.e. homographs with exactly one distinct wordid
unique = train_eval_df.groupby('homograph')['wordid'].nunique()
one_count = unique[unique == 1]
is_invariant = train_eval_df['homograph'].isin(one_count.index.tolist())
dataset = train_eval_df[~is_invariant]

#Remove low resource classes (< MIN_EXAMPLES_PER_CLASS instances of a wordid)
counts = dataset.groupby('wordid').count()
low_resource = counts[counts['homograph'] < MIN_EXAMPLES_PER_CLASS].index.tolist()
dataset_final = dataset[~dataset['wordid'].isin(low_resource)]

#Get list of homographs to remove
# The dropped rows are selected with explicit masks. The previous
# concat + drop_duplicates(keep=False) approach also discarded any dropped row
# that was duplicated in the raw data, so a homograph whose dropped rows were
# all duplicates was never marked for removal.
is_low_resource = train_eval_df['wordid'].isin(low_resource)
to_remove_df = train_eval_df[is_invariant | is_low_resource]
# A homograph is removed as soon as any one of its rows was dropped.
to_remove = to_remove_df.homograph.unique()

162it [00:00, 726.43it/s]
162it [00:00, 782.16it/s]


In [4]:
# Copy every train .tsv whose homograph was not marked for removal into the
# variant-limited train folder.
# shutil.copy does not create the destination folder, so make sure it exists;
# otherwise a fresh run fails on the first copy.
os.makedirs(TRAIN_VARIANT, exist_ok=True)
# Build the lookup once; a set gives plain hash-based membership tests.
train_excluded = set(to_remove)
for f in tqdm(glob.iglob(TRAIN +'*.tsv')):
    f_name = os.path.basename(f)
    # Assumes each file is named "<homograph>.tsv" -- TODO confirm against the dataset layout.
    if os.path.splitext(f_name)[0] not in train_excluded:
        shutil.copy(f, TRAIN_VARIANT)

162it [00:00, 1203.84it/s]


In [5]:
# Copy every eval .tsv whose homograph was not marked for removal into the
# variant-limited eval folder.
# shutil.copy does not create the destination folder, so make sure it exists;
# otherwise a fresh run fails on the first copy.
os.makedirs(EVAL_VARIANT, exist_ok=True)
# Build the lookup once; a set gives plain hash-based membership tests.
eval_excluded = set(to_remove)
for f in tqdm(glob.iglob(EVAL +'*.tsv')):
    f_name = os.path.basename(f)
    # Assumes each file is named "<homograph>.tsv" -- TODO confirm against the dataset layout.
    if os.path.splitext(f_name)[0] not in eval_excluded:
        shutil.copy(f, EVAL_VARIANT)

162it [00:00, 1290.94it/s]
