# Description
**Functionality**: The Wikipedia Homograph Data provided by Google is a two-way split dataset: train and eval. For the purposes of this study, we split the data into train, valid, and test and ensure the presence of each pronunciation class in each split. 

This module splits the Wikipedia homograph train split into 90% train, 10% valid. For the test split, the original Wikipedia homograph eval split is copied over.

**Use**: The train and valid splits are used to train neural net models for homograph disambiguation, while the test set is held out for final evaluation.

### Imports

In [1]:
import os
import shutil
from glob import iglob
import pandas as pd
from tqdm import tqdm

### Variables

In [2]:
#Paths 

WHD_DATA_VARIANT = "C:/Users/jseal/Dev/dissertation/Data/WikipediaHomographData/data/variant_limit_data/"
TRAIN = WHD_DATA_VARIANT + "train/"
EVAL = WHD_DATA_VARIANT + "eval/"
SPLIT_DIRS = [TRAIN, EVAL]

WHD_DATA = "C:/Users/jseal/Dev/dissertation/Data/WikipediaHomographData/data/"
THREE_SPLITS = WHD_DATA + "three_split_stratified_variant_data_corrected/"
NEW_TRAIN = THREE_SPLITS + "train/"
NEW_DEV = THREE_SPLITS + "dev/"
NEW_TEST = THREE_SPLITS + "test/"

pd.set_option('display.max_rows', None)

### Functions

In [3]:
def make_df(split_dir): 
    dfs = []
    for f in tqdm(iglob(split_dir +'*.tsv')):
        f_name = os.path.basename(f)
        dfs.append(pd.read_table(f))
    return dfs

def combine_train_eval():
    dfs = []
    for split_dir in SPLIT_DIRS:
        print(split_dir)
        for df in make_df(split_dir): 
            dfs.append(df) 
    all_data = pd.concat(dfs, ignore_index=True)
    all_data['orig_idx'] = all_data.index
    return all_data     

def stratify_sample(data_df):
    data_df = data_df.sample(frac=1)
    #Append stratified samples per wordid, with enforcement of at least one sample per split
    train_samples = []
    dev_samples = []
    test_samples = []
    for i, gp in data_df.groupby("wordid"):
        print(i)
        print(len(gp))
            
        #Randomize samples
        gp = gp.sample(frac=1)
        
        #If only three samples, assign one to each split
        if len(gp) == 3: 
            train = gp.head(1) 
            test = gp.tail(1)
            dev = gp[~gp.orig_idx.isin(train.orig_idx)]
            dev = dev[~dev.orig_idx.isin(test.orig_idx)]
            print('train')
            print(train.shape[0])
            print('dev')
            print(dev.shape[0])
            assert len(dev) > 0, f"dev has no samples"
            assert (set(train.orig_idx) & set(dev.orig_idx)) == set(), f"train and dev not mutex. Mutual idx: " + str(set(train.orig_idx) & set(dev.orig_idx))
            print('test')
            print(test.shape[0])
            assert len(test) > 0, f"test has no samples"
            assert (set(train.orig_idx) & set(test.orig_idx)) == set(), f"train and test not mutex. Mutual idx: " + str(set(train.orig_idx) & set(test.orig_idx))
            assert (set(dev.orig_idx) & set(test.orig_idx)) == set(), f"dev and test not mutex. Mutual idx: : " + str(set(dev.orig_idx) & set(test.orig_idx))
            
            train_samples.append(train)
            dev_samples.append(dev)
            test_samples.append(test)
        
        else:
            n = gp.shape[0]
            #Take 80% for train
            train = gp.sample(frac=.8)
            print('train')
            print(train.shape[0])
            #Used to check if the data left after train is 
            #created has at least 2 samples - one for dev, and one for test
            check_rest = gp[~gp.index.isin(train.index)]
        
            if (len(check_rest) < 2): 

                #If less than 2, take the last sample from train, and make a new train
                train_new = train[:-1]
                train_samples.append(train_new)
                # Make new remaining data by removing smaller, new train set
                dev_test = gp[~gp.index.isin(train_new.index)]
                dev = dev_test.head(1)
                print('dev')
                print(dev.shape[0])
                assert len(dev) > 0, f"dev has no samples"
                assert (set(train_new.orig_idx) & set(dev.orig_idx)) == set(), f"train and dev not mutex. Mutual idx: " + str(set(train_new.orig_idx) & set(dev.orig_idx))
                dev_samples.append(dev)
                
                test = dev_test.tail(1)
                print('test')
                print(test.shape[0])
                assert len(test) > 0, f"test has no samples"
                assert (set(train_new.orig_idx) & set(test.orig_idx)) == set(), f"train and test not mutex. Mutual idx: " + str(set(train.orig_idx) & set(test.orig_idx))
                assert (set(dev.orig_idx) & set(test.orig_idx)) == set(), f"dev and test not mutex. Mutual idx: : " + str(set(dev.orig_idx) & set(test.orig_idx))
                test_samples.append(test)
            else: 
                train_samples.append(train)
                dev_test = gp[~gp.index.isin(train.index)]
                
                dev = dev_test.sample(frac=.5)
                print('dev')
                print(dev.shape[0])
                assert len(dev) > 0, f"dev has no samples"
                assert (set(train.orig_idx) & set(dev.orig_idx)) == set(), f"train and dev not mutex. Mutual idx: " + str(set(train.orig_idx) & set(dev.orig_idx))
                dev_samples.append(dev)
                
                test = dev_test[~dev_test.index.isin(dev.index)]
                print('test')
                print(test.shape[0])
                assert len(test) > 0, f"test has no samples"
                assert (set(train.orig_idx) & set(test.orig_idx)) == set(), f"train and test not mutex. Mutual idx: " + str(set(train.orig_idx) & set(test.orig_idx))
                assert (set(dev.orig_idx) & set(test.orig_idx)) == set(), f"dev and test not mutex. Mutual idx: : " + str(set(dev.orig_idx) & set(test.orig_idx))
                test_samples.append(test)
    train_df = pd.concat(train_samples)
    dev_df = pd.concat(dev_samples)
    test_df = pd.concat(test_samples)

    return train_df, dev_df, test_df
    

# Script

In [4]:
all_data = combine_train_eval()

128it [00:00, 687.62it/s]

C:/Users/jseal/Dev/dissertation/Data/WikipediaHomographData/data/variant_limit_data/train/



128it [00:00, 785.87it/s]


C:/Users/jseal/Dev/dissertation/Data/WikipediaHomographData/data/variant_limit_data/eval/


In [5]:
all_data.head(10)

Unnamed: 0,homograph,wordid,sentence,start,end,orig_idx
0,abuse,abuse_nou,Some such suits accuse the Vatican of complici...,59,64,0
1,abuse,abuse_nou,Similarly the novel deals with the misuse and ...,46,51,1
2,abuse,abuse_nou,"In December 1988, he announced a pilot project...",61,66,2
3,abuse,abuse_vrb,Marty can already tell how much Judd is going ...,49,54,3
4,abuse,abuse_nou,Advocacy services that encourage victims to es...,54,59,4
5,abuse,abuse_nou,Barros has been accused of complicity in an ab...,44,49,5
6,abuse,abuse_nou,Some conditions they face include no time off;...,79,84,6
7,abuse,abuse_nou,This valve is weakened by drug abuse.,31,36,7
8,abuse,abuse_nou,Tanenbaum notes that MacKinnon misrepresents t...,76,81,8
9,abuse,abuse_nou,"Prevention of Mental Disorders, Substance Abus...",42,47,9


In [6]:
for i, gp in all_data.groupby('homograph'): 
    print(gp['wordid'].value_counts())

abuse_nou    90
abuse_vrb    10
Name: wordid, dtype: int64
abuses_nou    83
abuses_vrb    17
Name: wordid, dtype: int64
advocate_nou    83
advocate_vrb    17
Name: wordid, dtype: int64
affect            95
affect_nou-psy     5
Name: wordid, dtype: int64
affiliate_nou    96
affiliate_vrb     3
Name: wordid, dtype: int64
aged        96
aged_adj     5
Name: wordid, dtype: int64
aggregate_adj-nou    88
aggregate_vrb        10
Name: wordid, dtype: int64
alternate_adj-nou    97
alternate_vrb         5
Name: wordid, dtype: int64
analyses_nou    89
analyses_vrb    11
Name: wordid, dtype: int64
animate_vrb        58
animate_adj-nou    37
Name: wordid, dtype: int64
approximate_adj-nou    88
approximate_vrb        11
Name: wordid, dtype: int64
articulate_vrb    54
articulate_adj    46
Name: wordid, dtype: int64
associate_adj-nou    88
associate_vrb        12
Name: wordid, dtype: int64
attribute_nou    54
attribute_vrb    46
Name: wordid, dtype: int64
axes_nou        51
axes_nou-vrb    49
Name: wo

## Serialize splits

In [7]:
for i, gp in all_data.groupby('homograph'): 
    train, dev, test = stratify_sample(gp)

    #serialize
    train.to_csv(NEW_TRAIN + '{}.tsv'.format(i), sep='\t', index=False)
    dev.to_csv(NEW_DEV + '{}.tsv'.format(i), sep='\t', index=False)
    test.to_csv(NEW_TEST + '{}.tsv'.format(i), sep='\t', index=False)     

abuse_nou
90
train
72
dev
9
test
9
abuse_vrb
10
train
8
dev
1
test
1
abuses_nou
83
train
66
dev
8
test
9
abuses_vrb
17
train
14
dev
2
test
1
advocate_nou
83
train
66
dev
8
test
9
advocate_vrb
17
train
14
dev
2
test
1
affect
95
train
76
dev
10
test
9
affect_nou-psy
5
train
4
dev
1
test
1
affiliate_nou
96
train
77
dev
10
test
9
affiliate_vrb
3
train
1
dev
1
test
1
aged
96
train
77
dev
10
test
9
aged_adj
5
train
4
dev
1
test
1
aggregate_adj-nou
88
train
70
dev
9
test
9
aggregate_vrb
10
train
8
dev
1
test
1
alternate_adj-nou
97
train
78
dev
10
test
9
alternate_vrb
5
train
4
dev
1
test
1
analyses_nou
89
train
71
dev
9
test
9
analyses_vrb
11
train
9
dev
1
test
1
animate_adj-nou
37
train
30
dev
4
test
3
animate_vrb
58
train
46
dev
6
test
6
approximate_adj-nou
88
train
70
dev
9
test
9
approximate_vrb
11
train
9
dev
1
test
1
articulate_adj
46
train
37
dev
4
test
5
articulate_vrb
54
train
43
dev
6
test
5
associate_adj-nou
88
train
70
dev
9
test
9
associate_vrb
12
train
10
dev
1
test
1
attribute_

tear_nou
33
train
26
dev
4
test
3
tear_vrb
66
train
53
dev
6
test
7
transform
68
train
54
dev
7
test
7
transform_nou
32
train
26
dev
3
test
3
transplant_nou
94
train
75
dev
10
test
9
transplant_vrb
6
train
5
dev
1
test
1
transport_nou
93
train
74
dev
10
test
9
transport_vrb
6
train
5
dev
1
test
1
upset_nou
72
train
58
dev
7
test
7
upset_vrb
27
train
22
dev
2
test
3
use_nou
58
train
46
dev
6
test
6
use_vrb
43
train
34
dev
4
test
5
uses_nou
13
train
10
dev
2
test
1
uses_vrb
87
train
70
dev
8
test
9
wind_nou
94
train
75
dev
10
test
9
wind_vrb
6
train
5
dev
1
test
1
winds_nou
91
train
73
dev
9
test
9
winds_vrb
6
train
5
dev
1
test
1
wound_nou-vrb
66
train
53
dev
6
test
7
wound_vrb
33
train
26
dev
4
test
3
