# Description
**Functionality**: Confirm lack of variance in certain WHD homograph data classes

**Use**: Use results to create only variant dataset 

### Import

In [1]:
import os
import shutil
from glob import glob
import pandas as pd
from tqdm import tqdm

### Variables

In [2]:
#Paths 
WHD_DATA = "C:/Users/jseal/Dev/dissertation/Data/WikipediaHomographData/data/"
#WHD_DATA = "/Users/jen/Dev/dissertation/dissertation/Data/WikipediaHomographData/data/"
TRAIN = WHD_DATA + "train/"
EVAL = WHD_DATA + "eval/"

pd.set_option('display.max_rows', None)


# Train: View numbers of unique pronunciation classes per homograph

In [3]:
dfs = []
for f in tqdm(glob(TRAIN +'*.tsv')):
    df = pd.read_table(f)
    dfs.append(df)

train_df = pd.concat(dfs)

100%|███████████████████████████████████████████████████████████████████████████████| 162/162 [00:00<00:00, 617.81it/s]


In [4]:
train_df.head()

Unnamed: 0,homograph,wordid,sentence,start,end
0,abstract,abstract_adj-nou,Smith uses his name as a base for building abs...,43,51
1,abstract,abstract_adj-nou,The Group intended to promote Welsh radical an...,48,56
2,abstract,abstract_adj-nou,Both stances are verbalized with the same abst...,42,50
3,abstract,abstract_adj-nou,"In later years he became less prolific, and un...",96,104
4,abstract,abstract_adj-nou,"As a sculptor, Ruud Kuijer became famous for h...",49,57


In [5]:
train_df.groupby('homograph')['wordid'].nunique().sort_values()

homograph
pigment         1
moped           1
elaborate       1
entrance        1
mouth           1
row             1
desert          1
conglomerate    1
job             1
subject         1
object          1
intimate        1
interchange     1
instrument      1
addict          1
incense         1
consort         1
compact         1
ornament        2
mobile          2
nestle          2
overthrow       2
pasty           2
perfect         2
perfume         2
moderate        2
abstract        2
minute          2
initiate        2
insert          2
insult          2
intrigue        2
invalid         2
invert          2
invite          2
isolate         2
jesus           2
laminate        2
lead            2
learned         2
permit          2
lives           2
mate            2
misuse          2
live            2
precipitate     2
polish          2
sake            2
separate        2
sow             2
subordinate     2
supplement      2
suspect         2
syndicate       2
tear            2


# View numbers of pronunciation classes per homograph

In [6]:
hids = []
for idx, group in train_df.groupby('homograph'):
    #print(idx)
    ttl = 0
    grp = []
    for wordid in group['wordid'].unique():
        homograph_wordid = {}
        homograph_wordid['homograph'] = idx
        homograph_wordid["wordid"] = wordid
        count = len(group[group['wordid'] == wordid])
        homograph_wordid["wordid_count"] = count
        #print("{}:{}".format(wordid, count))
        grp.append(homograph_wordid)
        ttl += count
    hid_df = pd.DataFrame(grp)    
    hid_df["ttl"] = ttl
    hids.append(hid_df)
    #print("\n")
hids_df = pd.concat(hids)
hids_df["percent_ttl"] = hids_df['wordid_count']/hids_df["ttl"]
print(hids_df)


      homograph                 wordid  wordid_count  ttl  percent_ttl
0      abstract       abstract_adj-nou            89   90     0.988889
1      abstract           abstract_vrb             1   90     0.011111
0         abuse              abuse_nou            81   90     0.900000
1         abuse              abuse_vrb             9   90     0.100000
0        abuses             abuses_nou            74   90     0.822222
1        abuses             abuses_vrb            16   90     0.177778
0        addict             addict_nou            89   89     1.000000
0      advocate           advocate_nou            76   90     0.844444
1      advocate           advocate_vrb            14   90     0.155556
0        affect                 affect            85   90     0.944444
1        affect         affect_nou-psy             5   90     0.055556
0     affiliate          affiliate_vrb             3   89     0.033708
1     affiliate          affiliate_nou            86   89     0.966292
0     

In [7]:
hids_df.describe()

Unnamed: 0,wordid_count,ttl,percent_ttl
count,307.0,307.0,307.0
mean,47.188925,89.413681,0.527687
std,33.389807,3.668655,0.372981
min,1.0,58.0,0.011111
25%,11.5,89.0,0.128464
50%,49.0,90.0,0.535714
75%,81.0,90.0,0.910612
max,90.0,112.0,1.0


In [8]:
hids_df.to_csv("hids.csv")

In [9]:
# Train: View numbers of unique pronunciation classes per homograph

dfs = []
for f in tqdm(glob(TRAIN +'*.tsv')):
    df = pd.read_table(f)
    dfs.append(df)

train_df = pd.concat(dfs)

train_df.head()

train_df.groupby('homograph')['wordid'].nunique().sort_values()

100%|███████████████████████████████████████████████████████████████████████████████| 162/162 [00:00<00:00, 866.22it/s]


homograph
pigment         1
moped           1
elaborate       1
entrance        1
mouth           1
row             1
desert          1
conglomerate    1
job             1
subject         1
object          1
intimate        1
interchange     1
instrument      1
addict          1
incense         1
consort         1
compact         1
ornament        2
mobile          2
nestle          2
overthrow       2
pasty           2
perfect         2
perfume         2
moderate        2
abstract        2
minute          2
initiate        2
insert          2
insult          2
intrigue        2
invalid         2
invert          2
invite          2
isolate         2
jesus           2
laminate        2
lead            2
learned         2
permit          2
lives           2
mate            2
misuse          2
live            2
precipitate     2
polish          2
sake            2
separate        2
sow             2
subordinate     2
supplement      2
suspect         2
syndicate       2
tear            2


In [10]:
unique = train_df.groupby('homograph')['wordid'].nunique()
one_count = unique[unique == 1]
len(one_count)

18

# Eval: View numbers of unique pronunciation classes per homograph

In [11]:
dfs = []
for f in tqdm(glob(EVAL +'*.tsv')):
    df = pd.read_table(f)
    dfs.append(df)

eval_df = pd.concat(dfs)

100%|███████████████████████████████████████████████████████████████████████████████| 162/162 [00:00<00:00, 725.77it/s]


In [12]:
eval_df.head()

Unnamed: 0,homograph,wordid,sentence,start,end
0,abstract,abstract_adj-nou,Andrzej Tarlecki: Quasi-varieties in abstract ...,37,45
1,abstract,abstract_adj-nou,"Over time, the Egyptians developed more abstra...",40,48
2,abstract,abstract_adj-nou,He sought to express his sense of heritage in ...,46,54
3,abstract,abstract_adj-nou,There are parts that are almost completely abs...,43,51
4,abstract,abstract_adj-nou,In the late 1970s she began working in more ab...,44,52


In [13]:
eval_df.groupby('homograph')['wordid'].nunique().sort_values()

homograph
abstract        1
entrance        1
elaborate       1
pigment         1
polish          1
discount        1
predicate       1
deviate         1
desert          1
delegate        1
project         1
defect          1
protest         1
job             1
record          1
contrast        1
estimate        1
expatriate      1
perfect         1
fragment        1
learned         1
intimate        1
interchange     1
instrument      1
mate            1
misuse          1
mobile          1
contract        1
moderate        1
moped           1
winds           1
mouth           1
nestle          1
house           1
object          1
ornament        1
incense         1
reject          1
jesus           1
august          1
subject         1
analyses        1
alternate       1
bologna         1
affiliate       1
syndicate       1
compact         1
compound        1
affect          1
appropriate     1
buffet          1
row             1
conflict        1
consort         1
addict          1


In [14]:
unique = eval_df.groupby('homograph')['wordid'].nunique()
one_count = unique[unique == 1]
len(one_count)

62

# View Eval numbers of pronunciation classes per homograph

In [15]:
hids = []
for idx, group in eval_df.groupby('homograph'):
    #print(idx)
    ttl = 0
    grp = []
    for wordid in group['wordid'].unique():
        homograph_wordid = {}
        homograph_wordid['homograph'] = idx
        homograph_wordid["wordid"] = wordid
        count = len(group[group['wordid'] == wordid])
        homograph_wordid["wordid_count"] = count
        #print("{}:{}".format(wordid, count))
        grp.append(homograph_wordid)
        ttl += count
    hid_df = pd.DataFrame(grp)    
    hid_df["ttl"] = ttl
    hids.append(hid_df)
    #print("\n")
hids_df = pd.concat(hids)
hids_df["percent_ttl"] = hids_df['wordid_count']/hids_df["ttl"]
print(hids_df)


      homograph                 wordid  wordid_count  ttl  percent_ttl
0      abstract       abstract_adj-nou            10   10     1.000000
0         abuse              abuse_nou             9   10     0.900000
1         abuse              abuse_vrb             1   10     0.100000
0        abuses             abuses_nou             9   10     0.900000
1        abuses             abuses_vrb             1   10     0.100000
0        addict             addict_nou            10   10     1.000000
0      advocate           advocate_nou             7   10     0.700000
1      advocate           advocate_vrb             3   10     0.300000
0        affect                 affect            10   10     1.000000
0     affiliate          affiliate_nou            10   10     1.000000
0          aged                   aged             9   10     0.900000
1          aged               aged_adj             1   10     0.100000
0     aggregate      aggregate_adj-nou             9   10     0.900000
1     

In [16]:
dfs = []
for f in tqdm(glob(EVAL +'*.tsv')):
    df = pd.read_table(f)
    dfs.append(df)

eval_df = pd.concat(dfs)

eval_df.head()
eval_df.groupby('homograph')['wordid'].nunique().sort_values()

100%|███████████████████████████████████████████████████████████████████████████████| 162/162 [00:00<00:00, 931.23it/s]


homograph
abstract        1
entrance        1
elaborate       1
pigment         1
polish          1
discount        1
predicate       1
deviate         1
desert          1
delegate        1
project         1
defect          1
protest         1
job             1
record          1
contrast        1
estimate        1
expatriate      1
perfect         1
fragment        1
learned         1
intimate        1
interchange     1
instrument      1
mate            1
misuse          1
mobile          1
contract        1
moderate        1
moped           1
winds           1
mouth           1
nestle          1
house           1
object          1
ornament        1
incense         1
reject          1
jesus           1
august          1
subject         1
analyses        1
alternate       1
bologna         1
affiliate       1
syndicate       1
compact         1
compound        1
affect          1
appropriate     1
buffet          1
row             1
conflict        1
consort         1
addict          1


# Train + Eval: View numbers of unique pronunciation classes per homograph

In [17]:
dfs = []
for f in tqdm(glob(TRAIN +'*.tsv')):
    df = pd.read_table(f)
    dfs.append(df)
for f in tqdm(glob(EVAL +'*.tsv')):
    df = pd.read_table(f)
    dfs.append(df)

train_eval_df = pd.concat(dfs)

100%|███████████████████████████████████████████████████████████████████████████████| 162/162 [00:00<00:00, 877.01it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 162/162 [00:00<00:00, 955.57it/s]


In [18]:
unique = train_eval_df.groupby('homograph')['wordid'].nunique()
one_count = unique[unique == 1]
len(one_count)

17

In [19]:
one_count.index.tolist()

['addict',
 'compact',
 'consort',
 'desert',
 'elaborate',
 'entrance',
 'incense',
 'instrument',
 'interchange',
 'intimate',
 'job',
 'moped',
 'mouth',
 'object',
 'pigment',
 'row',
 'subject']

In [20]:
dataset = train_eval_df[~train_eval_df['homograph'].isin(one_count.index.tolist())]

In [48]:
hids = []
for idx, group in dataset.groupby('homograph'):
    #print(idx)
    ttl = 0
    grp = []
    for wordid in group['wordid'].unique():
        homograph_wordid = {}
        homograph_wordid['homograph'] = idx
        homograph_wordid["wordid"] = wordid
        count = len(group[group['wordid'] == wordid])
        homograph_wordid["wordid_count"] = count
        #print("{}:{}".format(wordid, count))
        grp.append(homograph_wordid)
        ttl += count
    hid_df = pd.DataFrame(grp)    
    hid_df["ttl"] = ttl
    hids.append(hid_df)
    #print("\n")
hids_df = pd.concat(hids)
hids_df["percent_ttl"] = hids_df['wordid_count']/hids_df["ttl"]
hids_df["percent_ttl"] = hids_df["percent_ttl"].apply(lambda x: float(str(x*100)[:4]))
print(hids_df)

      homograph                 wordid  wordid_count  ttl  percent_ttl
0      abstract       abstract_adj-nou            99  100        99.00
1      abstract           abstract_vrb             1  100         1.00
0         abuse              abuse_nou            90  100        90.00
1         abuse              abuse_vrb            10  100        10.00
0        abuses             abuses_nou            83  100        83.00
1        abuses             abuses_vrb            17  100        17.00
0      advocate           advocate_nou            83  100        83.00
1      advocate           advocate_vrb            17  100        17.00
0        affect                 affect            95  100        95.00
1        affect         affect_nou-psy             5  100         5.00
0     affiliate          affiliate_vrb             3   99         3.03
1     affiliate          affiliate_nou            96   99        96.90
0          aged                   aged            96  101        95.00
1     

In [47]:
hids_df.to_latex(index=False,longtable=True, buf='whd_counts', caption='WHD counts and percentages', label='whd_counts_percents')

In [32]:
hids_df.homograph.nunique()

145

In [23]:
hids_df.to_csv("train_eval_hids.csv")

In [52]:
low_resource = hids_df[hids_df["wordid_count"] < 3]['homograph'].tolist()
low_resource

['abstract',
 'appropriate',
 'august',
 'bologna',
 'conglomerate',
 'console',
 'contract',
 'contrast',
 'deviate',
 'expatriate',
 'house',
 'jesus',
 'mate',
 'moderate',
 'predicate',
 'project',
 'ravel']

In [53]:
dataset_final = dataset[~dataset['homograph'].isin(low_resource)]

In [55]:
dataset_final.homograph.nunique()

128

In [58]:
dataset_final.wordid.nunique()

256

In [76]:
hids = []
for idx, group in dataset_final.groupby('homograph'):
    #print(idx)
    ttl = 0
    grp = []
    for wordid in group['wordid'].unique():
        homograph_wordid = {}
        homograph_wordid['homograph'] = idx
        homograph_wordid["wordid"] = wordid
        count = len(group[group['wordid'] == wordid])
        homograph_wordid["wordid_count"] = count
        #print("{}:{}".format(wordid, count))
        grp.append(homograph_wordid)
        ttl += count
    hid_df = pd.DataFrame(grp)    
    hid_df["ttl"] = ttl
    hids.append(hid_df)
    #print("\n")
hids_df = pd.concat(hids)
hids_df["percent_ttl"] = hids_df['wordid_count']/hids_df["ttl"]
hids_df["percent_ttl"] = hids_df["percent_ttl"].apply(lambda x: float(str(x*100)[:4]))
print(hids_df)

     homograph                 wordid  wordid_count  ttl  percent_ttl
0        abuse              abuse_nou            90  100        90.00
1        abuse              abuse_vrb            10  100        10.00
0       abuses             abuses_nou            83  100        83.00
1       abuses             abuses_vrb            17  100        17.00
0     advocate           advocate_nou            83  100        83.00
1     advocate           advocate_vrb            17  100        17.00
0       affect                 affect            95  100        95.00
1       affect         affect_nou-psy             5  100         5.00
0    affiliate          affiliate_vrb             3   99         3.03
1    affiliate          affiliate_nou            96   99        96.90
0         aged                   aged            96  101        95.00
1         aged               aged_adj             5  101         4.95
0    aggregate      aggregate_adj-nou            88   98        89.70
1    aggregate      

In [63]:
hids_df.to_latex(index=False,longtable=True, buf='whd_counts_final', caption='WHD counts and percentages', label='whd_counts_percents')

In [None]:
hids_df.percent_ttl.plot('hist')

In [101]:
import math

In [157]:
ratios = []
for i, grp in hids_df.groupby('homograph'): 
    ratio = {}
    counts = []
    ratio['homograph'] = i
    for wdid in grp['wordid']:
        counts.append(grp[grp['wordid'] == wdid]['wordid_count'].item())
    counts.sort()
    print(counts)
    gcd = math.gcd(counts[0], counts[1])
    print(gcd)
    ratio["ratio"] = str(counts[0]) + ":" + str(counts[1]) 
    #ratio["ratio"] = (counts[0]) / (counts[1])
    ratio["class size difference"] = counts[1] - counts[0]
    #ratio["percent_difference"] = ((counts[1] - counts[0]) / (counts[1] + counts[0]))*100
    ratios.append(ratio)
ratios

[10, 90]
10
[17, 83]
1
[17, 83]
1
[5, 95]
5
[3, 96]
3
[5, 96]
1
[10, 88]
2
[5, 97]
1
[11, 89]
1
[37, 58]
1
[11, 88]
11
[46, 54]
2
[12, 88]
4
[46, 54]
2
[49, 51]
1
[13, 84]
1
[45, 51]
3
[47, 53]
1
[5, 93]
1
[30, 70]
10
[6, 93]
3
[9, 90]
9
[3, 97]
1
[5, 93]
1
[48, 52]
4
[22, 77]
11
[4, 96]
4
[5, 95]
5
[15, 84]
3
[10, 90]
10
[49, 51]
1
[4, 96]
4
[15, 85]
5
[37, 63]
1
[14, 86]
2
[12, 88]
4
[47, 52]
1
[10, 90]
10
[46, 53]
1
[11, 89]
1
[16, 84]
4
[8, 92]
4
[4, 95]
1
[22, 78]
2
[37, 63]
1
[27, 73]
1
[11, 89]
1
[3, 97]
1
[15, 85]
5
[7, 57]
1
[27, 73]
1
[12, 88]
4
[27, 72]
9
[18, 81]
9
[14, 86]
2
[15, 84]
3
[6, 93]
3
[48, 52]
4
[3, 96]
3
[5, 95]
5
[5, 95]
5
[3, 97]
1
[9, 91]
1
[10, 90]
10
[23, 75]
1
[4, 95]
1
[47, 53]
1
[12, 87]
3
[9, 91]
1
[48, 52]
4
[26, 74]
2
[5, 95]
5
[23, 76]
1
[31, 68]
1
[21, 79]
1
[22, 78]
2
[4, 96]
4
[24, 91]
1
[7, 93]
1
[37, 61]
1
[37, 64]
1
[13, 84]
1
[7, 91]
7
[4, 95]
1
[14, 72]
2
[4, 96]
4
[34, 66]
2
[24, 70]
2
[6, 95]
1
[3, 97]
1
[45, 54]
9
[24, 76]
4
[3, 97]
1
[35

[{'homograph': 'abuse', 'ratio': '10:90', 'class size difference': 80},
 {'homograph': 'abuses', 'ratio': '17:83', 'class size difference': 66},
 {'homograph': 'advocate', 'ratio': '17:83', 'class size difference': 66},
 {'homograph': 'affect', 'ratio': '5:95', 'class size difference': 90},
 {'homograph': 'affiliate', 'ratio': '3:96', 'class size difference': 93},
 {'homograph': 'aged', 'ratio': '5:96', 'class size difference': 91},
 {'homograph': 'aggregate', 'ratio': '10:88', 'class size difference': 78},
 {'homograph': 'alternate', 'ratio': '5:97', 'class size difference': 92},
 {'homograph': 'analyses', 'ratio': '11:89', 'class size difference': 78},
 {'homograph': 'animate', 'ratio': '37:58', 'class size difference': 21},
 {'homograph': 'approximate', 'ratio': '11:88', 'class size difference': 77},
 {'homograph': 'articulate', 'ratio': '46:54', 'class size difference': 8},
 {'homograph': 'associate', 'ratio': '12:88', 'class size difference': 76},
 {'homograph': 'attribute', 'rati

In [160]:
ratios_df = pd.DataFrame(ratios)

In [161]:
ratios_df.sort_values(by='ratio')

Unnamed: 0,homograph,ratio,class size difference
6,aggregate,10:88,78
0,abuse,10:90,80
98,protest,10:90,80
37,correlate,10:90,80
29,construct,10:90,80
63,implement,10:90,80
10,approximate,11:88,77
46,discharge,11:89,78
39,defect,11:89,78
8,analyses,11:89,78


In [163]:
ratios_df.to_latex(index=False,longtable=True, buf='class_ratios', caption='Homograph class ratios and differences', label='class_ratios')

In [136]:
counts = ratios_df.groupby('percent_difference').count()
counts.sort_values(by='percent_difference')

Unnamed: 0_level_0,homograph,ratio_str,ratio,difference
percent_difference,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0.02,2,2,2,2
0.04,3,3,3,3
0.050505,1,1,1,1
0.056,1,1,1,1
0.06,2,2,2,2
0.0625,1,1,1,1
0.070707,1,1,1,1
0.08,2,2,2,2
0.090909,1,1,1,1
0.148515,1,1,1,1


In [141]:
len(ratios_df[ratios_df["difference"] <= 10]) / len(ratios_df)

0.109375

In [143]:
len(ratios_df[(ratios_df["difference"] > 10) & (ratios_df["difference"] <= 20)]) / len(ratios_df)

0.015625

In [147]:
len(ratios_df[(ratios_df["difference"] > 20) & (ratios_df["difference"] <= 30)]) / len(ratios_df)

0.0546875

In [148]:
len(ratios_df[(ratios_df["difference"] > 30) & (ratios_df["difference"] <= 40)]) / len(ratios_df)

0.0546875

In [149]:
len(ratios_df[(ratios_df["difference"] > 40) & (ratios_df["difference"] <= 50)]) / len(ratios_df)

0.078125

In [150]:
len(ratios_df[(ratios_df["difference"] > 50) & (ratios_df["difference"] <= 60)]) / len(ratios_df)

0.0625

In [151]:
len(ratios_df[(ratios_df["difference"] > 60) & (ratios_df["difference"] <= 70)]) / len(ratios_df)

0.109375

In [152]:
len(ratios_df[(ratios_df["difference"] > 70) & (ratios_df["difference"] <= 80)]) / len(ratios_df)

0.1640625

In [153]:
len(ratios_df[(ratios_df["difference"] > 80) & (ratios_df["difference"] <= 90)]) / len(ratios_df)

0.2109375

In [155]:
len(ratios_df[(ratios_df["difference"] > 90) & (ratios_df["difference"] <= 100)]) / len(ratios_df)

0.140625

In [156]:
ratios_df.difference.describe()

count    128.000000
mean      62.281250
std       28.638375
min        2.000000
25%       45.000000
50%       71.500000
75%       87.000000
max       94.000000
Name: difference, dtype: float64