# Description
**Functionality**: Confirm lack of variance in certain WHD homograph data classes

**Use**: Use results to create only variant dataset 

### Import

In [1]:
import os
import math
import shutil
from glob import glob
import pandas as pd
from tqdm import tqdm

### Variables

In [2]:
#Paths 
BASE = "C:/Users/jseal/Dev/dissertation/Data/WikipediaHomographData/data/"
WHD_DATA = BASE + "three_split_stratified_variant_data_corrected/"
TRAIN = WHD_DATA + "train/"
DEV = WHD_DATA + "dev/"
TEST = WHD_DATA + "test/"

pd.set_option('display.max_rows', None)


# Train

In [3]:
dfs = []
for f in tqdm(glob(TRAIN +'*.tsv')):
    df = pd.read_table(f)
    dfs.append(df)

train_df = pd.concat(dfs)

100%|███████████████████████████████████████████████████████████████████████████████| 128/128 [00:00<00:00, 815.04it/s]


In [4]:
train_df.head()

Unnamed: 0,homograph,wordid,sentence,start,end,orig_idx
0,abuse,abuse_nou,Some such suits accuse the Vatican of complici...,59,64,0
1,abuse,abuse_nou,The charges of third-degree sexual abuse were ...,35,40,56
2,abuse,abuse_nou,"However, history has still shown the abuse of ...",37,42,24
3,abuse,abuse_nou,"Ecological Risk Assessment: Use, Abuse, and Al...",33,38,11442
4,abuse,abuse_nou,"She described the abuse as ""mild inappropriate...",18,23,64


In [5]:
train_df.groupby('homograph')['wordid'].nunique().sort_values()

homograph
abuse          2
postulate      2
polish         2
pervert        2
permit         2
perfume        2
perfect        2
pasty          2
overthrow      2
ornament       2
nestle         2
mobile         2
misuse         2
minute         2
precipitate    2
lives          2
learned        2
lead           2
laminate       2
isolate        2
invite         2
invert         2
invalid        2
intrigue       2
insult         2
insert         2
initiate       2
increment      2
increase       2
live           2
incline        2
present        2
progress       2
wind           2
uses           2
use            2
upset          2
transport      2
transplant     2
transform      2
tear           2
syndicate      2
suspect        2
supplement     2
subordinate    2
sow            2
produce        2
separate       2
rodeo          2
retard         2
resume         2
rerelease      2
reject         2
refuse         2
refund         2
recount        2
record         2
rebel          2
read

In [6]:
hids = []
for idx, group in train_df.groupby('homograph'):
    #print(idx)
    ttl = 0
    grp = []
    for wordid in group['wordid'].unique():
        homograph_wordid = {}
        homograph_wordid['homograph'] = idx
        homograph_wordid["wordid"] = wordid
        count = len(group[group['wordid'] == wordid])
        homograph_wordid["wordid_count"] = count
        #print("{}:{}".format(wordid, count))
        grp.append(homograph_wordid)
        ttl += count
    hid_df = pd.DataFrame(grp)    
    hid_df["ttl"] = ttl
    hids.append(hid_df)
    #print("\n")
hids_df = pd.concat(hids)
hids_df["percent_ttl"] = hids_df['wordid_count']/hids_df["ttl"]
print(hids_df)


     homograph                 wordid  wordid_count  ttl  percent_ttl
0        abuse              abuse_nou            72   80     0.900000
1        abuse              abuse_vrb             8   80     0.100000
0       abuses             abuses_nou            66   80     0.825000
1       abuses             abuses_vrb            14   80     0.175000
0     advocate           advocate_nou            66   80     0.825000
1     advocate           advocate_vrb            14   80     0.175000
0       affect                 affect            76   79     0.962025
1       affect         affect_nou-psy             3   79     0.037975
0    affiliate          affiliate_nou            77   78     0.987179
1    affiliate          affiliate_vrb             1   78     0.012821
0         aged                   aged            77   80     0.962500
1         aged               aged_adj             3   80     0.037500
0    aggregate      aggregate_adj-nou            70   78     0.897436
1    aggregate      

In [7]:
hids_df.ttl.mean()

79.1484375

In [8]:
ratios = []
for i, grp in hids_df.groupby('homograph'): 
    ratio = {}
    counts = []
    ratio['homograph'] = i
    for wdid in grp['wordid']:
        counts.append(grp[grp['wordid'] == wdid]['wordid_count'].item())
    counts.sort()
    print(counts)
    gcd = math.gcd(counts[0], counts[1])
    print(gcd)
    ratio["ratio"] = str(counts[0]) + ":" + str(counts[1]) 
    #ratio["ratio"] = (counts[0]) / (counts[1])
    ratio["difference"] = counts[1] - counts[0]
    #ratio["percent_difference"] = ((counts[1] - counts[0]) / (counts[1] + counts[0]))*100
    ratios.append(ratio)
ratios
ratios_df = pd.DataFrame(ratios)
ratios_df.sort_values(by='ratio')

[8, 72]
8
[14, 66]
2
[14, 66]
2
[3, 76]
1
[1, 77]
1
[3, 77]
1
[8, 70]
2
[3, 78]
3
[9, 71]
1
[30, 46]
2
[9, 70]
1
[37, 43]
1
[10, 70]
10
[37, 43]
1
[39, 41]
1
[10, 67]
1
[36, 41]
1
[38, 42]
2
[3, 74]
1
[24, 56]
8
[4, 74]
2
[7, 72]
1
[1, 78]
1
[3, 74]
1
[38, 42]
2
[18, 62]
2
[2, 77]
1
[3, 76]
1
[12, 67]
1
[8, 72]
8
[39, 41]
1
[2, 77]
1
[12, 68]
4
[30, 50]
10
[11, 69]
1
[10, 70]
10
[38, 42]
2
[8, 72]
8
[37, 42]
1
[9, 71]
1
[13, 67]
1
[6, 74]
2
[2, 76]
2
[18, 62]
2
[30, 50]
10
[22, 58]
2
[9, 71]
1
[1, 78]
1
[12, 68]
4
[5, 46]
1
[22, 58]
2
[10, 70]
10
[22, 58]
2
[14, 65]
1
[11, 69]
1
[12, 67]
1
[4, 74]
2
[38, 42]
2
[1, 77]
1
[3, 76]
1
[3, 76]
1
[1, 78]
1
[7, 73]
1
[8, 72]
8
[18, 60]
6
[2, 76]
2
[38, 42]
2
[10, 70]
10
[7, 73]
1
[38, 42]
2
[21, 59]
1
[3, 76]
1
[18, 61]
1
[25, 54]
1
[17, 63]
1
[18, 62]
2
[2, 77]
1
[19, 73]
1
[5, 74]
1
[30, 49]
1
[30, 51]
3
[10, 67]
1
[5, 73]
1
[2, 76]
2
[11, 58]
1
[2, 77]
1
[27, 53]
1
[19, 56]
1
[4, 76]
4
[1, 78]
1
[36, 43]
1
[19, 61]
1
[1, 78]
1
[28, 52]
4
[3

Unnamed: 0,homograph,ratio,difference
81,minute,10:67,57
15,bass,10:67,57
51,escort,10:70,60
67,increment,10:70,60
35,convict,10:70,60
102,record,10:70,60
12,associate,10:70,60
124,uses,10:70,60
84,nestle,11:58,47
34,convert,11:69,58


In [9]:
ratios_df.to_latex(index=False,longtable=True, buf='final_train_class_ratios', caption='Final WHD Train split class ratios and differences', label='whdfinaltrainclassratios')

In [10]:
len(ratios_df[ratios_df["difference"] <= 10]) / len(ratios_df)

0.109375

In [11]:
len(ratios_df[(ratios_df["difference"] > 10) & (ratios_df["difference"] <= 20)]) / len(ratios_df)

0.0546875

In [12]:
len(ratios_df[(ratios_df["difference"] > 20) & (ratios_df["difference"] <= 30)]) / len(ratios_df)

0.0625

In [13]:
len(ratios_df[(ratios_df["difference"] > 30) & (ratios_df["difference"] <= 40)]) / len(ratios_df)

0.078125

In [14]:
len(ratios_df[(ratios_df["difference"] > 40) & (ratios_df["difference"] <= 50)]) / len(ratios_df)

0.0703125

In [15]:
len(ratios_df[(ratios_df["difference"] > 50) & (ratios_df["difference"] <= 60)]) / len(ratios_df)

0.1953125

In [16]:
len(ratios_df[(ratios_df["difference"] >= 60) & (ratios_df["difference"] <= 70)]) / len(ratios_df)

0.25

In [17]:
len(ratios_df[(ratios_df["difference"] > 70) & (ratios_df["difference"] <= 80)]) / len(ratios_df)

0.2265625

In [18]:
len(ratios_df[(ratios_df["difference"] > 70) & (ratios_df["difference"] <= 77)])  / len(ratios_df)

0.2265625

In [19]:
len(ratios_df[(ratios_df["difference"] > 80) & (ratios_df["difference"] <= 90)]) / len(ratios_df)

0.0

In [20]:
len(ratios_df[(ratios_df["difference"] > 90) & (ratios_df["difference"] <= 100)]) / len(ratios_df)

0.0

In [21]:
hids_df.describe()

Unnamed: 0,wordid_count,ttl,percent_ttl
count,256.0,256.0,256.0
mean,39.574219,79.148438,0.5
std,27.722334,3.617148,0.351232
min,1.0,51.0,0.012658
25%,10.75,79.0,0.135593
50%,41.0,80.0,0.5
75%,69.0,80.0,0.864407
max,78.0,100.0,0.987342


In [22]:
#hids_df.to_csv("hids.csv")

In [23]:
unique = train_df.groupby('homograph')['wordid'].nunique()
one_count = unique[unique == 1]
len(one_count)

0

In [24]:
len(train_df)

10131

In [25]:
#Some sentences used multiple times for different instances
train_deduped = train_df.drop_duplicates(subset=['sentence'])
len(train_deduped)

10118

In [26]:
import numpy as np
dups = np.setdiff1d(train_deduped.index,train_df.index)

dups

array([], dtype=int64)

In [27]:
train_deduped.head()

Unnamed: 0,homograph,wordid,sentence,start,end,orig_idx
0,abuse,abuse_nou,Some such suits accuse the Vatican of complici...,59,64,0
1,abuse,abuse_nou,The charges of third-degree sexual abuse were ...,35,40,56
2,abuse,abuse_nou,"However, history has still shown the abuse of ...",37,42,24
3,abuse,abuse_nou,"Ecological Risk Assessment: Use, Abuse, and Al...",33,38,11442
4,abuse,abuse_nou,"She described the abuse as ""mild inappropriate...",18,23,64


In [28]:
dups = train_df[~train_df.index.isin(train_deduped.index)]

In [29]:
dups

Unnamed: 0,homograph,wordid,sentence,start,end,orig_idx


# Dev

In [30]:
dfs = []
for f in tqdm(glob(DEV +'*.tsv')):
    df = pd.read_table(f)
    dfs.append(df)

dev_df = pd.concat(dfs)

100%|███████████████████████████████████████████████████████████████████████████████| 128/128 [00:00<00:00, 743.40it/s]


In [31]:
dev_df.head()

Unnamed: 0,homograph,wordid,sentence,start,end,orig_idx
0,abuse,abuse_nou,"There were incidences of abuse, however, with ...",25,30,81
1,abuse,abuse_nou,"In Ramona v. Isabella, Gary Ramona sued his da...",102,107,65
2,abuse,abuse_nou,They do not include other factors of disadvant...,100,105,61
3,abuse,abuse_nou,"It smashed taboos surrounding mental health, a...",45,50,53
4,abuse,abuse_nou,Members of all three groups have been involved...,63,68,23


In [32]:
dev_df.groupby('homograph')['wordid'].nunique().sort_values()

homograph
abuse          2
postulate      2
polish         2
pervert        2
permit         2
perfume        2
perfect        2
pasty          2
overthrow      2
ornament       2
nestle         2
mobile         2
misuse         2
minute         2
precipitate    2
lives          2
learned        2
lead           2
laminate       2
isolate        2
invite         2
invert         2
invalid        2
intrigue       2
insult         2
insert         2
initiate       2
increment      2
increase       2
live           2
incline        2
present        2
progress       2
wind           2
uses           2
use            2
upset          2
transport      2
transplant     2
transform      2
tear           2
syndicate      2
suspect        2
supplement     2
subordinate    2
sow            2
produce        2
separate       2
rodeo          2
retard         2
resume         2
rerelease      2
reject         2
refuse         2
refund         2
recount        2
record         2
rebel          2
read

In [33]:
unique = dev_df.groupby('homograph')['wordid'].nunique()
one_count = unique[unique == 1]
one_count

Series([], Name: wordid, dtype: int64)

In [34]:
hids = []
for idx, group in dev_df.groupby('homograph'):
    #print(idx)
    ttl = 0
    grp = []
    for wordid in group['wordid'].unique():
        homograph_wordid = {}
        homograph_wordid['homograph'] = idx
        homograph_wordid["wordid"] = wordid
        count = len(group[group['wordid'] == wordid])
        homograph_wordid["wordid_count"] = count
        #print("{}:{}".format(wordid, count))
        grp.append(homograph_wordid)
        ttl += count
    hid_df = pd.DataFrame(grp)    
    hid_df["ttl"] = ttl
    hids.append(hid_df)
    #print("\n")
hids_df = pd.concat(hids)
hids_df["percent_ttl"] = hids_df['wordid_count']/hids_df["ttl"]
print(hids_df)


     homograph                 wordid  wordid_count  ttl  percent_ttl
0        abuse              abuse_nou             9   10     0.900000
1        abuse              abuse_vrb             1   10     0.100000
0       abuses             abuses_nou             8   10     0.800000
1       abuses             abuses_vrb             2   10     0.200000
0     advocate           advocate_nou             8   10     0.800000
1     advocate           advocate_vrb             2   10     0.200000
0       affect                 affect            10   11     0.909091
1       affect         affect_nou-psy             1   11     0.090909
0    affiliate          affiliate_nou            10   11     0.909091
1    affiliate          affiliate_vrb             1   11     0.090909
0         aged                   aged            10   11     0.909091
1         aged               aged_adj             1   11     0.090909
0    aggregate      aggregate_adj-nou             9   10     0.900000
1    aggregate      

In [35]:
hids_df.ttl.mean()

10.21875

In [36]:
ratios = []
for i, grp in hids_df.groupby('homograph'): 
    ratio = {}
    counts = []
    ratio['homograph'] = i
    for wdid in grp['wordid']:
        counts.append(grp[grp['wordid'] == wdid]['wordid_count'].item())
    counts.sort()
    print(counts)
    gcd = math.gcd(counts[0], counts[1])
    print(gcd)
    ratio["ratio"] = str(counts[0]) + ":" + str(counts[1]) 
    #ratio["ratio"] = (counts[0]) / (counts[1])
    ratio["difference"] = counts[1] - counts[0]
    #ratio["percent_difference"] = ((counts[1] - counts[0]) / (counts[1] + counts[0]))*100
    ratios.append(ratio)
ratios
ratios_df = pd.DataFrame(ratios)
ratios_df.sort_values(by='ratio')

[1, 9]
1
[2, 8]
2
[2, 8]
2
[1, 10]
1
[1, 10]
1
[1, 10]
1
[1, 9]
1
[1, 10]
1
[1, 9]
1
[4, 6]
2
[1, 9]
1
[4, 6]
2
[1, 9]
1
[4, 6]
2
[5, 5]
5
[2, 8]
2
[4, 5]
1
[4, 6]
2
[1, 10]
1
[3, 7]
1
[1, 10]
1
[1, 9]
1
[1, 10]
1
[1, 10]
1
[5, 5]
5
[2, 8]
2
[1, 10]
1
[1, 10]
1
[2, 8]
2
[1, 9]
1
[5, 5]
5
[1, 10]
1
[2, 8]
2
[4, 6]
2
[2, 8]
2
[1, 9]
1
[4, 5]
1
[1, 9]
1
[4, 6]
2
[1, 9]
1
[2, 8]
2
[1, 9]
1
[1, 10]
1
[2, 8]
2
[4, 6]
2
[2, 8]
2
[1, 9]
1
[1, 10]
1
[2, 8]
2
[1, 6]
1
[2, 8]
2
[1, 9]
1
[2, 7]
1
[2, 8]
2
[2, 8]
2
[2, 8]
2
[1, 10]
1
[5, 5]
5
[1, 10]
1
[1, 10]
1
[1, 10]
1
[1, 10]
1
[1, 9]
1
[1, 9]
1
[2, 8]
2
[1, 10]
1
[4, 6]
2
[1, 8]
1
[1, 9]
1
[5, 5]
5
[2, 8]
2
[1, 10]
1
[2, 8]
2
[3, 7]
1
[2, 8]
2
[2, 8]
2
[1, 10]
1
[2, 9]
1
[1, 10]
1
[4, 6]
2
[4, 6]
2
[2, 8]
2
[1, 9]
1
[1, 10]
1
[2, 7]
1
[1, 10]
1
[4, 6]
2
[2, 7]
1
[1, 10]
1
[1, 10]
1
[4, 6]
2
[2, 8]
2
[1, 10]
1
[4, 6]
2
[4, 6]
2
[1, 9]
1
[1, 9]
1
[2, 8]
2
[1, 9]
1
[6, 6]
6
[2, 8]
2
[1, 10]
1
[1, 9]
1
[3, 7]
1
[2, 8]
2
[2, 8]
2
[1, 9]
1
[2, 8]
2


Unnamed: 0,homograph,ratio,difference
89,perfume,1:10,9
61,impact,1:10,9
20,close,1:10,9
92,polish,1:10,9
22,compound,1:10,9
23,compress,1:10,9
47,discount,1:10,9
18,buffet,1:10,9
88,perfect,1:10,9
27,conjugate,1:10,9


In [38]:
ratios_df.to_latex(index=False,longtable=True, buf='final_dev_class_ratios', caption='Final WHD Dev split class ratios and differences', label='whdfinaldevclassratios')

In [39]:
len(ratios_df[ratios_df["difference"] <= 1]) / len(ratios_df)

0.0625

In [40]:
len(ratios_df[(ratios_df["difference"] > 1) & (ratios_df["difference"] <= 2)]) / len(ratios_df)

0.1484375

In [41]:
len(ratios_df[(ratios_df["difference"] > 2) & (ratios_df["difference"] <= 3)]) / len(ratios_df)

0.0

In [42]:
len(ratios_df[(ratios_df["difference"] > 3) & (ratios_df["difference"] <= 4)]) / len(ratios_df)

0.03125

In [43]:
len(ratios_df[(ratios_df["difference"] > 4) & (ratios_df["difference"] <= 5)]) / len(ratios_df)

0.0390625

In [44]:
len(ratios_df[(ratios_df["difference"] > 5) & (ratios_df["difference"] <= 6)]) / len(ratios_df)

0.2421875

In [45]:
len(ratios_df[(ratios_df["difference"] > 6) & (ratios_df["difference"] <= 7)]) / len(ratios_df)

0.015625

In [46]:
len(ratios_df[(ratios_df["difference"] > 7) & (ratios_df["difference"] <= 8)]) / len(ratios_df)

0.1875

In [47]:
len(ratios_df[(ratios_df["difference"] > 8) & (ratios_df["difference"] <= 9)]) / len(ratios_df)

0.2734375

In [48]:
len(ratios_df[(ratios_df["difference"] > 9) & (ratios_df["difference"] <= 10)]) / len(ratios_df)

0.0

# TEST

In [49]:
dfs = []
for f in tqdm(glob(TEST +'*.tsv')):
    df = pd.read_table(f)
    dfs.append(df)

test_df = pd.concat(dfs)

100%|███████████████████████████████████████████████████████████████████████████████| 128/128 [00:00<00:00, 875.72it/s]


In [50]:
test_df.head()

Unnamed: 0,homograph,wordid,sentence,start,end,orig_idx
0,abuse,abuse_nou,He has worked to fund public education and to ...,65,70,47
1,abuse,abuse_nou,Despite his history of child sexual abuse agai...,36,41,11439
2,abuse,abuse_nou,"Allegations included rape, sodomy, fellatio an...",94,99,55
3,abuse,abuse_nou,In the hearse with Charlie he discusses the ab...,44,49,11443
4,abuse,abuse_nou,Jersey Police have so far recorded claims of a...,45,50,20


In [51]:
test_df.groupby('homograph')['wordid'].nunique().sort_values()

homograph
abuse          2
postulate      2
polish         2
pervert        2
permit         2
perfume        2
perfect        2
pasty          2
overthrow      2
ornament       2
nestle         2
mobile         2
misuse         2
minute         2
precipitate    2
lives          2
learned        2
lead           2
laminate       2
isolate        2
invite         2
invert         2
invalid        2
intrigue       2
insult         2
insert         2
initiate       2
increment      2
increase       2
live           2
incline        2
present        2
progress       2
wind           2
uses           2
use            2
upset          2
transport      2
transplant     2
transform      2
tear           2
syndicate      2
suspect        2
supplement     2
subordinate    2
sow            2
produce        2
separate       2
rodeo          2
retard         2
resume         2
rerelease      2
reject         2
refuse         2
refund         2
recount        2
record         2
rebel          2
read

In [52]:
hids = []
for idx, group in test_df.groupby('homograph'):
    #print(idx)
    ttl = 0
    grp = []
    for wordid in group['wordid'].unique():
        homograph_wordid = {}
        homograph_wordid['homograph'] = idx
        homograph_wordid["wordid"] = wordid
        count = len(group[group['wordid'] == wordid])
        homograph_wordid["wordid_count"] = count
        #print("{}:{}".format(wordid, count))
        grp.append(homograph_wordid)
        ttl += count
    hid_df = pd.DataFrame(grp)    
    hid_df["ttl"] = ttl
    hids.append(hid_df)
    #print("\n")
hids_df = pd.concat(hids)
hids_df["percent_ttl"] = hids_df['wordid_count']/hids_df["ttl"]
print(hids_df)


     homograph                 wordid  wordid_count  ttl  percent_ttl
0        abuse              abuse_nou             9   10     0.900000
1        abuse              abuse_vrb             1   10     0.100000
0       abuses             abuses_nou             9   10     0.900000
1       abuses             abuses_vrb             1   10     0.100000
0     advocate           advocate_nou             9   10     0.900000
1     advocate           advocate_vrb             1   10     0.100000
0       affect                 affect             9   10     0.900000
1       affect         affect_nou-psy             1   10     0.100000
0    affiliate          affiliate_nou             9   10     0.900000
1    affiliate          affiliate_vrb             1   10     0.100000
0         aged                   aged             9   10     0.900000
1         aged               aged_adj             1   10     0.100000
0    aggregate      aggregate_adj-nou             9   10     0.900000
1    aggregate      

In [53]:
hids_df.ttl.mean()

9.9609375

In [71]:
ratios = []
for i, grp in hids_df.groupby('homograph'): 
    ratio = {}
    counts = []
    ratio['homograph'] = i
    for wdid in grp['wordid']:
        counts.append(grp[grp['wordid'] == wdid]['wordid_count'].item())
    counts.sort()
    print(counts)
    gcd = math.gcd(counts[0], counts[1])
    print(gcd)
    ratio["ratio"] = str(counts[0]) + ":" + str(counts[1]) 
    #ratio["ratio"] = (counts[0]) / (counts[1])
    ratio["difference"] = counts[1] - counts[0]
    #ratio["percent_difference"] = ((counts[1] - counts[0]) / (counts[1] + counts[0]))*100
    ratios.append(ratio)
ratios
ratios_df = pd.DataFrame(ratios)
ratios_df.sort_values(by='ratio')

[1, 9]
1
[1, 9]
1
[1, 9]
1
[1, 9]
1
[1, 9]
1
[1, 9]
1
[1, 9]
1
[1, 9]
1
[1, 9]
1
[3, 6]
3
[1, 9]
1
[5, 5]
5
[1, 9]
1
[5, 5]
5
[5, 5]
5
[1, 9]
1
[5, 5]
5
[5, 5]
5
[1, 9]
1
[3, 7]
1
[1, 9]
1
[1, 9]
1
[1, 9]
1
[1, 9]
1
[5, 5]
5
[2, 7]
1
[1, 9]
1
[1, 9]
1
[1, 9]
1
[1, 9]
1
[5, 5]
5
[1, 9]
1
[1, 9]
1
[3, 7]
1
[1, 9]
1
[1, 9]
1
[5, 5]
5
[1, 9]
1
[5, 5]
5
[1, 9]
1
[1, 9]
1
[1, 9]
1
[1, 9]
1
[2, 8]
2
[3, 7]
1
[3, 7]
1
[1, 9]
1
[1, 9]
1
[1, 9]
1
[1, 5]
1
[3, 7]
1
[1, 9]
1
[3, 7]
1
[2, 8]
2
[1, 9]
1
[1, 9]
1
[1, 9]
1
[5, 5]
5
[1, 9]
1
[1, 9]
1
[1, 9]
1
[1, 9]
1
[1, 9]
1
[1, 9]
1
[3, 7]
1
[1, 9]
1
[5, 5]
5
[1, 9]
1
[1, 9]
1
[5, 5]
5
[3, 7]
1
[1, 9]
1
[3, 7]
1
[3, 7]
1
[2, 8]
2
[2, 8]
2
[1, 9]
1
[3, 9]
3
[1, 9]
1
[3, 6]
3
[3, 7]
1
[1, 9]
1
[1, 9]
1
[1, 9]
1
[1, 7]
1
[1, 9]
1
[3, 7]
1
[3, 7]
1
[1, 9]
1
[1, 9]
1
[5, 5]
5
[3, 7]
1
[1, 9]
1
[3, 7]
1
[4, 6]
2
[1, 9]
1
[1, 9]
1
[2, 8]
2
[1, 9]
1
[6, 7]
1
[2, 8]
2
[1, 9]
1
[1, 9]
1
[3, 7]
1
[3, 7]
1
[1, 9]
1
[1, 9]
1
[1, 9]
1
[1, 9]
1
[1, 8]
1
[1, 9]
1
[

Unnamed: 0,homograph,ratio,difference
49,dove,1:5,4
84,nestle,1:7,6
109,retard,1:8,7
82,misuse,1:9,8
81,minute,1:9,8
78,learned,1:9,8
76,laminate,1:9,8
71,intrigue,1:9,8
68,initiate,1:9,8
67,increment,1:9,8


In [72]:
ratios_df.to_latex(index=False,longtable=True, buf='final_test_class_ratios', caption='Final WHD Test split class ratios and differences', label='whdfinaltestclassratios')

In [56]:
len(ratios_df[ratios_df["difference"] <= 1]) / len(ratios_df)

0.1171875

In [57]:
len(ratios_df[(ratios_df["difference"] > 1) & (ratios_df["difference"] <= 2)]) / len(ratios_df)

0.0078125

In [58]:
len(ratios_df[(ratios_df["difference"] > 2) & (ratios_df["difference"] <= 3)]) / len(ratios_df)

0.0234375

In [59]:
len(ratios_df[(ratios_df["difference"] > 3) & (ratios_df["difference"] <= 4)]) / len(ratios_df)

0.1875

In [60]:
len(ratios_df[(ratios_df["difference"] > 4) & (ratios_df["difference"] <= 5)]) / len(ratios_df)

0.0078125

In [61]:
len(ratios_df[(ratios_df["difference"] > 5) & (ratios_df["difference"] <= 6)]) / len(ratios_df)

0.0625

In [62]:
len(ratios_df[(ratios_df["difference"] > 6) & (ratios_df["difference"] <= 7)]) / len(ratios_df)

0.0078125

In [63]:
len(ratios_df[(ratios_df["difference"] > 7) & (ratios_df["difference"] <= 8)]) / len(ratios_df)

0.5859375

In [64]:
len(ratios_df[ratios_df["difference"] == 8]) / len(ratios_df)

0.5859375

In [65]:
len(ratios_df[(ratios_df["difference"] > 8) & (ratios_df["difference"] <= 9)]) / len(ratios_df)

0.0

In [66]:
len(ratios_df[(ratios_df["difference"] > 9) & (ratios_df["difference"] <= 10)]) / len(ratios_df)

0.0

# Split counts and percents

In [67]:
total = len(train_df) + len(dev_df) + len(test_df)
total

12714

In [68]:
print(len(train_df))
print(train_df.homograph.nunique())
print(train_df.wordid.nunique())
print(len(train_df)/total)

10131
128
256
0.7968381311939594


In [69]:
print(len(dev_df))
print(dev_df.homograph.nunique())
print(dev_df.wordid.nunique())
print(len(dev_df)/total)

1308
128
256
0.1028787163756489


In [70]:
print(len(test_df))
print(test_df.homograph.nunique())
print(test_df.wordid.nunique())
print(len(test_df)/total)

1275
128
256
0.1002831524303917
