# Description
**Functionality**: This module creates a new train, dev and test set with WHD and SWBD data.

**Use**: The augmented data is used to train non-baseline models.

### Imports

In [1]:
import os
import shutil
from glob import iglob
import pandas as pd
from tqdm import tqdm
import spacy
nlp = spacy.load('en_core_web_sm')

### Variables

In [2]:
#Paths 

SWBD = "C:/Users/jseal/Dev/dissertation/Data/SWBD/SWBD_Homographs/swbd_homs.tsv"
NEW_SAMPLES_FORMATTED = "C:/Users/jseal/Dev/dissertation/Data/SWBD/SWBD_Homographs/swbd_new_samples_formatted.tsv"
SWBD_HOMS_META = "C:/Users/jseal/Dev/dissertation/Data/SWBD/SWBD_Homographs/swbd_homs_meta.tsv"
LOW_ACC_PATH = 'C:/Users/jseal/Dev/dissertation/Data/WHD_ALBERT/classes_w_accuracy_below_1-albert.tsv' #albert is lowest performing model
LABELS_PATH = 'C:/Users/jseal/Dev/dissertation/Data/WHD_Bert/variant_stratified/labels.txt'
WORDIDS_PATH = 'C:/Users/jseal/Dev/dissertation/Data/WikipediaHomographData/data/wordids.tsv'

WHD_DATA = "C:/Users/jseal/Dev/dissertation/Data/WikipediaHomographData/data/"
THREE_SPLITS = WHD_DATA + "three_split_stratified_variant_data_corrected/"
TRAIN = THREE_SPLITS + "train/"
DEV = THREE_SPLITS + "dev/"
TEST = THREE_SPLITS + "test/"
WHD_PATHS = [TRAIN, DEV, TEST]
TRAIN_AUG = THREE_SPLITS + "train_swbd_aug/"
DEV_AUG = THREE_SPLITS + "dev_swbd_aug/"
TEST_AUG = THREE_SPLITS + "test_swbd_aug/"
PATHS = [TRAIN_AUG, DEV_AUG, TEST_AUG]

pd.set_option('display.max_rows', None)

### Functions

In [3]:
def make_df(split_dir): 
    dfs = []
    for f in tqdm(iglob(split_dir +'*.tsv')):
        f_name = os.path.basename(f)
        dfs.append(pd.read_table(f))
    return dfs


def stratify_sample(data_df):
    data_df = data_df.sample(frac=1)
    #Append stratified samples per wordid, with enforcement of at least one sample per split
    train_samples = []
    dev_samples = []
    test_samples = []
    for i, gp in data_df.groupby("wordid"):         
        #Randomize samples
        gp = gp.sample(frac=1)
        if len(gp) == 1:
            train = gp
        elif len(gp) == 2:     
            train = gp.head(1) 
            test = gp.tail(1)
        #If only three samples, assign one to each split
        elif len(gp) == 3: 
            train = gp.head(1) 
            test = gp.tail(1)
            dev = gp[~gp.orig_idx.isin(train.orig_idx)]
            dev = dev[~dev.orig_idx.isin(test.orig_idx)]
            print('train')
            print(train.shape[0])
            print('dev')
            print(dev.shape[0])
            assert len(dev) > 0, f"dev has no samples"
            assert (set(train.orig_idx) & set(dev.orig_idx)) == set(), f"train and dev not mutex. Mutual idx: " + str(set(train.orig_idx) & set(dev.orig_idx))
            print('test')
            print(test.shape[0])
            assert len(test) > 0, f"test has no samples"
            assert (set(train.orig_idx) & set(test.orig_idx)) == set(), f"train and test not mutex. Mutual idx: " + str(set(train.orig_idx) & set(test.orig_idx))
            assert (set(dev.orig_idx) & set(test.orig_idx)) == set(), f"dev and test not mutex. Mutual idx: : " + str(set(dev.orig_idx) & set(test.orig_idx))
            
            train_samples.append(train)
            dev_samples.append(dev)
            test_samples.append(test)
        
        else:
            n = gp.shape[0]
            #Take 80% for train
            train = gp.sample(frac=.8)
            print('train')
            print(train.shape[0])
            #Used to check if the data left after train is 
            #created has at least 2 samples - one for dev, and one for test
            check_rest = gp[~gp.index.isin(train.index)]
        
            if (len(check_rest) < 2): 

                #If less than 2, take the last sample from train, and make a new train
                train_new = train[:-1]
                train_samples.append(train_new)
                # Make new remaining data by removing smaller, new train set
                dev_test = gp[~gp.index.isin(train_new.index)]
                dev = dev_test.head(1)
                print('dev')
                print(dev.shape[0])
                assert len(dev) > 0, f"dev has no samples"
                assert (set(train_new.orig_idx) & set(dev.orig_idx)) == set(), f"train and dev not mutex. Mutual idx: " + str(set(train_new.orig_idx) & set(dev.orig_idx))
                dev_samples.append(dev)
                
                test = dev_test.tail(1)
                print('test')
                print(test.shape[0])
                assert len(test) > 0, f"test has no samples"
                assert (set(train_new.orig_idx) & set(test.orig_idx)) == set(), f"train and test not mutex. Mutual idx: " + str(set(train.orig_idx) & set(test.orig_idx))
                assert (set(dev.orig_idx) & set(test.orig_idx)) == set(), f"dev and test not mutex. Mutual idx: : " + str(set(dev.orig_idx) & set(test.orig_idx))
                test_samples.append(test)
            else: 
                train_samples.append(train)
                dev_test = gp[~gp.index.isin(train.index)]
                
                dev = dev_test.sample(frac=.5)
                print('dev')
                print(dev.shape[0])
                assert len(dev) > 0, f"dev has no samples"
                assert (set(train.orig_idx) & set(dev.orig_idx)) == set(), f"train and dev not mutex. Mutual idx: " + str(set(train.orig_idx) & set(dev.orig_idx))
                dev_samples.append(dev)
                
                test = dev_test[~dev_test.index.isin(dev.index)]
                print('test')
                print(test.shape[0])
                assert len(test) > 0, f"test has no samples"
                assert (set(train.orig_idx) & set(test.orig_idx)) == set(), f"train and test not mutex. Mutual idx: " + str(set(train.orig_idx) & set(test.orig_idx))
                assert (set(dev.orig_idx) & set(test.orig_idx)) == set(), f"dev and test not mutex. Mutual idx: : " + str(set(dev.orig_idx) & set(test.orig_idx))
                test_samples.append(test)
    if train_samples: 
        train_df = pd.concat(train_samples)
    else: 
        train_df = pd.DataFrame()
    if dev_samples:     
        dev_df = pd.concat(dev_samples)
    else: 
        dev_df = pd.DataFrame()
    if test_samples:
        test_df = pd.concat(test_samples)
    else: 
        test_df = pd.DataFrame()

    return train_df, dev_df, test_df
    

# Script

In [4]:
swbd_df = pd.read_csv(SWBD, sep='\t')
wordids_df = pd.read_csv(WORDIDS_PATH, sep='\t')
new_samples_df = pd.read_csv(NEW_SAMPLES_FORMATTED, sep='\t')

In [5]:
def correct_me(x):
    x = x.capitalize()
    x = x.replace(" i ", " I ")
    x = x.replace('<unk>','')
    nlp_x = nlp(x)
    ents = [tok.text for tok in nlp_x.ents if ('<' not in tok.text) 
                                                        and ((tok.label_ == 'GPE') 
                                                             or (tok.label_ == 'NORP')
                                                             or (tok.label_ == 'PERSON')
                                                             or (tok.label_ == 'LOC')
                                                             or (tok.label_ == 'ORG'))]
    x = [tok.capitalize() if tok in ents else tok for tok in x.split(' ')]
    return ' '.join(x)

In [6]:
new_samples_df['sentence_new'] = new_samples_df.sentence.apply(lambda x: correct_me(x))

In [7]:
new_samples_df.sentence_new

0      Well I guess I would first identify myself as ...
1                                   And I aged ten years
2                             We we  very similarly aged
3      Once once I was declared an alternate and I wa...
4      Even when I was in elementary school years ago...
5      The neighborhood that we live they are switchi...
6                         I was an alternate on the case
7      And one of the ones that had an impact was uh ...
8      There are so many people with with their petty...
9      So I have not done uh much freshwater fishing ...
10     And they  say well there s a big bass tourname...
11                                    Catching tree bass
12                                      Do you have bass
13               Now they have a lot of bass tournaments
14     We have a lot of uh uh couple of lakes in our ...
15                 And I do a lot of fishing for uh bass
16     I think that uh probably around here people fi...
17     The last time I went uh 

In [8]:
#Labels/wordids after variance and low-resource data culling
with open(LABELS_PATH) as f:
    labels = f.readlines()
labels = [l.strip() for l in labels] 
final_wordids_df = wordids_df[wordids_df.wordid.isin(labels)]

#SWBD homograph pronunciations
swbd_new = set(swbd_df.new_label.tolist())

#Handling for stress mark interpretation as single quote
swbd_for_merge = []
for p in swbd_new:
    if type(p) == str:
        p = "\"" + p + "\""
        swbd_for_merge.append(p)
        
swbd_pronunciations = final_wordids_df.pronunciation.tolist()
prons_check = [pron[1:] if pron.startswith("'") else pron for pron in swbd_pronunciations] 

def get_pron(label):
    check_real = zip(prons_check, swbd_pronunciations)
    for c, r in check_real: 
        if label == c:
            return r
        
swbd_df['pronunciation'] = swbd_df['new_label'].apply(lambda x: get_pron(x))
final = pd.merge(final_wordids_df, swbd_df, on='pronunciation')

In [9]:
swbd_homs_meta = pd.read_csv(SWBD_HOMS_META, sep='\t')

In [10]:
swbd_homs_meta = swbd_homs_meta[['wordid', 'allowed_new_samples']]

In [11]:
new_samples = []
for i, grp in final.groupby('wordid'):
    for row in swbd_homs_meta.iterrows():
        if row[1][0] == i:
            if len(grp) >= row[1][1]:
                samples = grp.sample(n=row[1][1])
                new_samples.append(samples)
            else:
                new_samples.append(grp)

new_samples_df = pd.concat(new_samples)
new_samples_df.groupby('homograph_x')['wordid'].value_counts().sum()

326

In [12]:
import re
new_samples_df['sentence'] = new_samples_df['sentence'].apply(lambda x : correct_me(x))
new_samples_df['sentence']

26      Well I guess I would first identify myself as ...
27                                   And I aged ten years
28                             We we  very similarly aged
29      Once once I was declared an alternate and I wa...
30      Even when I was in elementary school years ago...
31      The neighborhood that we live they are switchi...
32                         I was an alternate on the case
33      And one of the ones that had an impact was uh ...
37      There are so many people with with their petty...
38      So I have not done uh much freshwater fishing ...
39      And they  say well there s a big bass tourname...
40                                     Catching tree bass
41                                       Do you have bass
42                Now they have a lot of bass tournaments
43      We have a lot of uh uh couple of lakes in our ...
44                  And I do a lot of fishing for uh bass
45      I think that uh probably around here people fi...
46      The la

In [13]:
new_samples_df.head()

Unnamed: 0,homograph_x,wordid,label,pronunciation,homograph_type,fine_homograph_type,file_name,homograph_y,WHD_IPA_representations,MFA-Arpabet-2-IPA,Human-mapped_column_D_to_C_w_notes,sentence,new_label
26,aged,aged_adj,old,'eɪʤəd,Lexical,PSI,sw02145_s2_1-s2_13.TextGrid,aged,['eɪʤəd' 'eɪʤd'],edʒd,eɪʤd,Well I guess I would first identify myself as ...,eɪʤəd
27,aged,aged_adj,old,'eɪʤəd,Lexical,PSI,sw02305_s181_1-s181_5.TextGrid,aged,['eɪʤəd' 'eɪʤd'],edʒd,eɪʤd,And I aged ten years,eɪʤəd
28,aged,aged_adj,old,'eɪʤəd,Lexical,PSI,sw02366_s21_2-s21_10.TextGrid,aged,['eɪʤəd' 'eɪʤd'],edʒd,eɪʤd,We we very similarly aged,eɪʤəd
29,alternate,alternate_adj-nou,adjective-noun,'ɑːltɚnət,Morphosyntactic,PoS,sw03503_s25_2-s25_23.TextGrid,alternate,['ɑːltɚnət' 'ɑːltɚˌneɪt'],ˈɔltɝnʌt,ɑːltɚnət,Once once I was declared an alternate and I wa...,ɑːltɚnət
30,alternate,alternate_adj-nou,adjective-noun,'ɑːltɚnət,Morphosyntactic,PoS,sw02324_s76_1-s76_50.TextGrid,alternate,['ɑːltɚnət' 'ɑːltɚˌneɪt'],ˈɔltɝnʌt,ɑːltɚnət,Even when I was in elementary school years ago...,ɑːltɚnət


In [14]:
samples_for_paper = new_samples_df[['homograph_x', 'WHD_IPA_representations', 'MFA-Arpabet-2-IPA', 'pronunciation', 'sentence']].rename(columns = {'homograph_x' : 'homograph', 'MFA-Arpabet-2-IPA' : 'auto-IPA', 'pronunciation' : 'WHD IPA'})

In [15]:
samples_for_paper.head()

Unnamed: 0,homograph,WHD_IPA_representations,auto-IPA,WHD IPA,sentence
26,aged,['eɪʤəd' 'eɪʤd'],edʒd,'eɪʤəd,Well I guess I would first identify myself as ...
27,aged,['eɪʤəd' 'eɪʤd'],edʒd,'eɪʤəd,And I aged ten years
28,aged,['eɪʤəd' 'eɪʤd'],edʒd,'eɪʤəd,We we very similarly aged
29,alternate,['ɑːltɚnət' 'ɑːltɚˌneɪt'],ˈɔltɝnʌt,'ɑːltɚnət,Once once I was declared an alternate and I wa...
30,alternate,['ɑːltɚnət' 'ɑːltɚˌneɪt'],ˈɔltɝnʌt,'ɑːltɚnət,Even when I was in elementary school years ago...


In [16]:
samples_for_paper.to_csv('./SWBD_WHD_samples.tsv', sep='\t', index=False)

In [17]:
samples_for_paper.to_latex(index=False,longtable=True, buf='SWBDhomprons', caption='SWBD homograph pronunciation data', label='SWBDhomprons')

In [18]:
new_samples = new_samples_df[['homograph_x', 'wordid', 'sentence']].rename(columns = {'homograph_x' : 'homograph'})

new_samples['orig_idx'] = new_samples.index

In [19]:
#new_samples.to_csv("C:/Users/jseal/Dev/dissertation/Data/SWBD/SWBD_Homographs/swbd_new_samples_formatted.tsv", sep='\t')


In [20]:
#all_data = combine_train_eval_swbd()

In [21]:
#all_data.head(10)

In [22]:
for i, gp in new_samples.groupby('homograph'): 
    print(gp['wordid'].value_counts())

aged_adj    3
Name: wordid, dtype: int64
alternate_adj-nou    5
Name: wordid, dtype: int64
axes_nou-vrb    1
Name: wordid, dtype: int64
bass_corp    9
Name: wordid, dtype: int64
bow_nou-ship    2
bow_nou-knot    2
Name: wordid, dtype: int64
close_adj-nou    31
close_vrb         7
Name: wordid, dtype: int64
conflict_nou    8
Name: wordid, dtype: int64
content_adj-nou-vrb    3
Name: wordid, dtype: int64
duplicate_adj-nou    1
Name: wordid, dtype: int64
frequent_adj    4
Name: wordid, dtype: int64
initiate_vrb    2
Name: wordid, dtype: int64
invite_vrb    3
Name: wordid, dtype: int64
lead_nou-vrb    8
Name: wordid, dtype: int64
lives_vrb    29
lives_nou    29
Name: wordid, dtype: int64
misuse_vrb    1
Name: wordid, dtype: int64
mobile    5
Name: wordid, dtype: int64
present_adj-nou    18
present_vrb         4
Name: wordid, dtype: int64
produce_vrb    7
Name: wordid, dtype: int64
read_past       8
read_present    8
Name: wordid, dtype: int64
reading_en    30
Name: wordid, dtype: int64
sepa

In [26]:
new_samples.wordid.nunique()

33

## Serialize splits

In [23]:
for i, gp in new_samples.groupby('homograph'): 
    train, dev, test = stratify_sample(gp)
    if not train.empty: 
        train = train[['homograph', 'wordid', 'sentence']]
        train_whd = pd.read_csv(TRAIN + "/" + i + ".tsv", sep="\t")
        train_final = pd.concat([train, train_whd], ignore_index=True)
        train_final = train_final.sample(frac=1)
    else: 
        train_final = train_whd
    if not dev.empty:
        dev = dev[['homograph', 'wordid', 'sentence']]
        dev_whd = pd.read_csv(DEV + "/" + i + ".tsv", sep="\t")
        dev_final = pd.concat([dev, dev_whd], ignore_index=True)
        dev_final = dev_final.sample(frac=1)
    else: 
        dev_final = dev_whd
    if not test.empty:
        test = test[['homograph', 'wordid', 'sentence']]
        test_whd = pd.read_csv(TEST + "/" + i + ".tsv", sep="\t")
        test_final = pd.concat([test, test_whd], ignore_index=True)
        test_final = test_final.sample(frac=1)
    else: 
        test = test_whd

    for path in PATHS:
        if not os.path.exists(path):
            os.mkdir(path)
    #serialize
    train_final.to_csv(TRAIN_AUG + '{}.tsv'.format(i), sep='\t', index=False)
    dev_final.to_csv(DEV_AUG + '{}.tsv'.format(i), sep='\t', index=False)
    test_final.to_csv(TEST_AUG + '{}.tsv'.format(i), sep='\t', index=False)



train
1
dev
1
test
1
train
4
dev
1
test
1
train
7
dev
1
test
1
train
25
dev
3
test
3
train
6
dev
1
test
1
train
6
dev
1
test
1
train
1
dev
1
test
1
train
3
dev
1
test
1
train
1
dev
1
test
1
train
6
dev
1
test
1
train
23
dev
3
test
3
train
23
dev
3
test
3
train
4
dev
1
test
1
train
14
dev
2
test
2
train
3
dev
1
test
1
train
6
dev
1
test
1
train
6
dev
1
test
1
train
6
dev
1
test
1
train
24
dev
3
test
3
train
13
dev
2
test
1
train
12
dev
2
test
1
train
23
dev
3
test
3
train
17
dev
2
test
2
train
10
dev
2
test
1


In [29]:
augmented_homographs = []
for i, gp in new_samples.groupby('homograph'): 
    augmented_homographs.append(i) 

In [35]:
for f in iglob(TRAIN +"/*"): 
    homograph = os.path.basename(f)[:-4]
    if homograph not in augmented_homographs:
        shutil.copyfile(f, TRAIN_AUG + os.path.basename(f))
        
for f in iglob(DEV +"/*"): 
    homograph = os.path.basename(f)[:-4]
    if homograph not in augmented_homographs:
        shutil.copyfile(f, DEV_AUG + os.path.basename(f))
        
for f in iglob(TEST +"/*"): 
    homograph = os.path.basename(f)[:-4]
    if homograph not in augmented_homographs:
        shutil.copyfile(f, TEST_AUG + os.path.basename(f))