# Description
**Functionality**: This module explores the homographs and pronunciation labels obtained from the NXT SWBD data set.
**Use**:  

### Imports

In [1]:
import os
from datetime import datetime
import xml.etree.ElementTree as ET
import glob
from collections import Counter
import pandas as pd
from typing import List

### Variables

In [57]:
DATA_PATH = 'C:/Users/jseal/Dev/dissertation/Data/SWBD/SWBD_Homographs/swbd_homs.tsv'
LABELS_PATH = 'C:/Users/jseal/Dev/dissertation/Data/WHD_Bert/variant_stratified/labels.txt'
WORDIDS_PATH = 'C:/Users/jseal/Dev/dissertation/Data/WikipediaHomographData/data/wordids.tsv'
LOW_ACC_PATH = 'C:/Users/jseal/Dev/dissertation/Data/WHD_ALBERT/classes_w_accuracy_below_1-albert.tsv' #albert is lowest performing model

### Functions

## Script

In [3]:
data_df = pd.read_csv(DATA_PATH, sep='\t')
data_df.head()

Unnamed: 0,file_name,homograph,WHD_IPA_representations,MFA-Arpabet-2-IPA,Human-mapped_column_D_to_C_w_notes,sentence,new_label
0,sw03012_s136_1-s136_4.TextGrid,abuse,"[""ə'bjuːs"" ""ə'bjuːz""]",ʌˈbjuz,ə'bjuːz,and they abuse that,ə'bjuːz
1,sw04184_s102_1-s102_19.TextGrid,abuse,"[""ə'bjuːs"" ""ə'bjuːz""]",ʌˈbjuz,"! - need to listen, col D seems incorrect, cou...",it takes a lot of training and a lot of abuse ...,ə'bjuːs
2,sw02597_s50_1-s50_25.TextGrid,abuse,"[""ə'bjuːs"" ""ə'bjuːz""]",ʌˈbjuz,ə'bjuːz,and they <unk> they do n't you know they do n'...,ə'bjuːz
3,sw04649_s99_1-s99_50.TextGrid,abuse,"[""ə'bjuːs"" ""ə'bjuːz""]",ʌˈbjus,ə'bjuːs,i think the problems are more ones of cost and...,ə'bjuːs
4,sw03049_s138_1-s138_15.TextGrid,abuse,"[""ə'bjuːs"" ""ə'bjuːz""]",ʌˈbjuz,ə'bjuːz,i think it s time we had a lady abuse the guys...,ə'bjuːz


In [4]:
wordids_df = pd.read_csv(WORDIDS_PATH, sep='\t')
wordids_df.head()

Unnamed: 0,homograph,wordid,label,pronunciation,homograph_type,fine_homograph_type
0,abstract,abstract_adj-nou,adjective-noun,'æbˌstɹækt,Morphosyntactic,PoS
1,abstract,abstract_vrb,verb,əb'stɹækt,Morphosyntactic,PoS
2,abuse,abuse_nou,noun,ə'bjuːs,Morphosyntactic,PoS
3,abuse,abuse_vrb,verb,ə'bjuːz,Morphosyntactic,PoS
4,abuses,abuses_nou,noun,ə'bjuːsəz,Morphosyntactic,PoS


In [5]:
with open(LABELS_PATH) as f:
    labels = f.readlines()
labels = [l.strip() for l in labels] 

In [6]:
len(labels)

257

In [7]:
final_wordids_df = wordids_df[wordids_df.wordid.isin(labels)]

In [8]:
len(final_wordids_df)

256

In [9]:
final_wordids_df.homograph.nunique()

128

### Get wordids, counts

In [10]:
data_df.new_label.nunique()

130

In [23]:
swbd_new = set(data_df.new_label.tolist())
swbd_new

{'None',
 'aɪsəˌleɪt',
 'baʊ',
 'blɛst',
 'boʊ',
 'bæs',
 'diːˌkɹiːs',
 'diːˌviːˌeɪt',
 'doʊv',
 'duːpləkət',
 'dɑːkjəˌmɛnt',
 "də'fɛkt",
 "də'kɹiːs",
 "də'lɪbɚət",
 'dɛləˌɡeɪt',
 'dɛzɚt',
 'dɪsˌkaʊnt',
 'eɪʤəd',
 'fɹiːkwənt',
 'haʊs',
 'juːs',
 'juːsəz',
 'juːz',
 'juːzəz',
 'kloʊs',
 'kloʊz',
 'kɑːmˌpækt',
 'kɑːnfləkt',
 'kɑːnˌtɛnt',
 'kɑːnˌtɛst',
 'kɑːnˌtɹækt',
 'kɑːnˌtɹæst',
 "kəm'baɪn",
 "kəm'pɹɛs",
 "kən'stɹʌkt",
 "kən'tɛnt",
 "kən'tɹæst",
 "kən'vɚt",
 "kən'vɪkt",
 'laɪv',
 'laɪvz',
 'liːd',
 'lɚnd',
 'lɛd',
 'lɪv',
 'lɪvz',
 'maʊθ',
 'moʊbəl',
 'moʊˌbiːl',
 "mɑːdɚ'eɪt",
 'mɪnət',
 nan,
 "pɚ'mɪt",
 'pɚfəkt',
 'pɚmət',
 'pɹoʊˌduːs',
 'pɹoʊˌtɛst',
 'pɹɑːˌɡɹɛs',
 'pɹɑːˌʤɛkt',
 "pɹə'duːs",
 "pɹə'tɛst",
 "pɹə'ɡɹɛs",
 'pɹɛzənt',
 'seɪk',
 'soʊ',
 "səb'ʤɛkt",
 "səs'pɛkt",
 'sɛpɚət',
 'sɛpɚˌeɪt',
 'sʌbˌʤɛkt',
 'sʌpləˌmɛnt',
 'sʌsˌpɛkt',
 'tɛɹ',
 'tɹænsˌplænt',
 'tɹænsˌpɔːɹt',
 'waɪnd',
 'waɪndz',
 'waʊnd',
 'wɪnd',
 'ædvəkət',
 'ædvəˌkeɪt',
 'æksəz',
 'ɑːbʤəkt',
 'ɑːltɚnət',
 "ə'bjuːs",


In [34]:
swbd_for_merge = []
for p in swbd_new:
    if type(p) == str:
        p = "\"" + p + "\""
        swbd_for_merge.append(p)

"əb'ʤɛkt"
"pɹɛzənt"
"ɹə'ʤɛkt"
"pɚ'mɪt"
"pɹoʊˌtɛst"
"kɑːnfləkt"
"diːˌkɹiːs"
"liːd"
"kəm'pɹɛs"
"kən'vɪkt"
"sʌsˌpɛkt"
"səs'pɛkt"
"dɪsˌkaʊnt"
"ɹə'fjuːz"
"pɹoʊˌduːs"
"lɪvz"
"ɹə'bɛl"
"juːz"
"baʊ"
"sɛpɚət"
"haʊs"
"ˌpɹiː'zɛnt"
"ə'læbɚət"
"laɪvz"
"ɹoʊ"
"ək'skjuːz"
"ɪntəmət"
"də'lɪbɚət"
"juːzəz"
"sɛpɚˌeɪt"
"ə'fɛkt"
"ˌɪntɚ'ʧeɪnʤ"
"ən'kɹiːs"
"aɪsəˌleɪt"
"ək'skjuːs"
"ɛstəmət"
"ɪmˌpækt"
"sʌbˌʤɛkt"
"ædvəkət"
"mɪnət"
"ɪnˌklaɪn"
"fɹiːkwənt"
"lɛd"
"ʤiːzəs"
"ˌtɹæns'plænt"
"də'fɛkt"
"waɪndz"
"kɑːmˌpækt"
"ɡɹæˌʤuːˌeɪt'"
"doʊv"
"duːpləkət"
"ɹɛbəl"
"ən'vaɪt"
"ɑːbʤəkt"
"kɑːnˌtɹækt"
"ɪnˌvaɪt"
"ɑːltɚnət"
"ɪnˌkɹiːs"
"kən'stɹʌkt"
"kloʊz"
"pɹə'duːs"
"ə'bjuːs"
"dɑːkjəˌmɛnt"
"ək'spoʊz"
"kən'vɚt"
"də'kɹiːs"
"seɪk"
"ˌʌp'sɛt"
"moʊbəl"
"wɪnd"
"kən'tɹæst"
"sʌpləˌmɛnt"
"pɹə'ɡɹɛs"
"juːsəz"
"moʊˌbiːl"
"diːˌviːˌeɪt"
"ə'soʊˌsiːˌeɪt"
"mɑːdɚ'eɪt"
"waɪnd"
"ɹə'kɔːɹd"
"ɹɛd"
"soʊ"
"pɚmət"
"kəm'baɪn"
"ɹɛˌfjuːs"
"ə'bjuːz"
"dɛzɚt"
"tɹænsˌplænt"
"kɑːnˌtɹæst"
"bæs"
"ə'pɹoʊˌpɹiːət"
"ædvəˌkeɪt"
"pɹɑːˌʤɛkt"
"boʊ"
"tɹænsˌpɔːɹt"
"ɹɛzəˌmeɪ"
"ɹ

In [18]:
swbd_pronunciations = final_wordids_df.pronunciation.tolist()
swbd_pronunciations

["ə'bjuːs",
 "ə'bjuːz",
 "ə'bjuːsəz",
 "ə'bjuːzə",
 "'ædvəkət",
 "'ædvəˌkeɪt",
 "'æˌfɛkt",
 "ə'fɛkt",
 "ə'fɪˌliːət",
 "ə0'fɪˌliːˌeɪt",
 "'eɪʤəd",
 "'eɪʤd",
 "'æɡɹəɡət",
 "'æɡɹəˌɡeɪt",
 "'ɑːltɚnət",
 "'ɑːltɚˌneɪt",
 "ə'næləˌsiːz",
 "'ænəˌlaɪzəz",
 "'ænəmət",
 "'ænəˌmeɪt",
 "ə'pɹɑːksəmət",
 "ə'pɹɑːsəˌmeɪt",
 "ɑːɹ'tɪkjələt",
 "ˌɑːɹ'tɪkjəˌleɪt",
 "ə'soʊˌsiːət",
 "ə'soʊˌsiːˌeɪt",
 "'ætɹəˌbjuːt",
 "ə'tɹɪˌbjuːt",
 "'ækˌsiːz",
 "'æksəz",
 "'beɪs",
 "'bæs",
 "'blɛsəd",
 "'blɛst",
 "'boʊ",
 "'baʊ",
 "ˌbʌ'feɪ",
 "'bʌfət",
 "'sɛltək",
 "'kɛltək",
 "'kloʊs",
 "'kloʊz",
 "'kɑːmˌbaɪn",
 "kəm'baɪn",
 "'kɑːmˌpaʊnd",
 "kəm'paʊnd",
 "'kɑːmˌpɹɛs",
 "kəm'pɹɛs",
 "'kɑːnˌdʌkt",
 "kən'dʌkt",
 "'kɑːnˌfaɪnz",
 "kən'faɪnz",
 "'kɑːnfləkt",
 "kən'flɪkt",
 "'kɑːnʤəɡət",
 "'kɑːnʤəˌɡeɪt",
 "'kɑːnˌskɹɪpt",
 "kən'skɹɪpt",
 "'kɑːnˌstɹʌkt",
 "kən'stɹʌkt",
 "'kɑːnsəmət",
 "'kɑːnsəˌmeɪt",
 "kən'tɛnt",
 "'kɑːnˌtɛnt",
 "'kɑːnˌtɛst",
 "kən'tɛst",
 "'kɑːnvɚs",
 "kən'vɚs",
 "'kɑːnvɚt",
 "kən'vɚt",
 "'kɑːnvəkt",
 "kən'vɪkt",
 "ˌ

In [39]:
prons_check = [pron[1:] if pron.startswith("'") else pron for pron in swbd_pronunciations] 
prons_check


["ə'bjuːs",
 "ə'bjuːz",
 "ə'bjuːsəz",
 "ə'bjuːzə",
 'ædvəkət',
 'ædvəˌkeɪt',
 'æˌfɛkt',
 "ə'fɛkt",
 "ə'fɪˌliːət",
 "ə0'fɪˌliːˌeɪt",
 'eɪʤəd',
 'eɪʤd',
 'æɡɹəɡət',
 'æɡɹəˌɡeɪt',
 'ɑːltɚnət',
 'ɑːltɚˌneɪt',
 "ə'næləˌsiːz",
 'ænəˌlaɪzəz',
 'ænəmət',
 'ænəˌmeɪt',
 "ə'pɹɑːksəmət",
 "ə'pɹɑːsəˌmeɪt",
 "ɑːɹ'tɪkjələt",
 "ˌɑːɹ'tɪkjəˌleɪt",
 "ə'soʊˌsiːət",
 "ə'soʊˌsiːˌeɪt",
 'ætɹəˌbjuːt',
 "ə'tɹɪˌbjuːt",
 'ækˌsiːz',
 'æksəz',
 'beɪs',
 'bæs',
 'blɛsəd',
 'blɛst',
 'boʊ',
 'baʊ',
 "ˌbʌ'feɪ",
 'bʌfət',
 'sɛltək',
 'kɛltək',
 'kloʊs',
 'kloʊz',
 'kɑːmˌbaɪn',
 "kəm'baɪn",
 'kɑːmˌpaʊnd',
 "kəm'paʊnd",
 'kɑːmˌpɹɛs',
 "kəm'pɹɛs",
 'kɑːnˌdʌkt',
 "kən'dʌkt",
 'kɑːnˌfaɪnz',
 "kən'faɪnz",
 'kɑːnfləkt',
 "kən'flɪkt",
 'kɑːnʤəɡət',
 'kɑːnʤəˌɡeɪt',
 'kɑːnˌskɹɪpt',
 "kən'skɹɪpt",
 'kɑːnˌstɹʌkt',
 "kən'stɹʌkt",
 'kɑːnsəmət',
 'kɑːnsəˌmeɪt',
 "kən'tɛnt",
 'kɑːnˌtɛnt',
 'kɑːnˌtɛst',
 "kən'tɛst",
 'kɑːnvɚs',
 "kən'vɚs",
 'kɑːnvɚt',
 "kən'vɚt",
 'kɑːnvəkt',
 "kən'vɪkt",
 "ˌkoʊ'ɔːɹdənət",
 "ˌkoʊ'ɔːɹdəˌneɪt",
 'kɔːɹəl

In [42]:
def get_pron(label):
    check_real = zip(prons_check, swbd_pronunciations)
    for c, r in check_real: 
        if label == c:
            return r
            

data_df['pronunciation'] = data_df['new_label'].apply(lambda x: get_pron(x))
data_df

Unnamed: 0,file_name,homograph,WHD_IPA_representations,MFA-Arpabet-2-IPA,Human-mapped_column_D_to_C_w_notes,sentence,new_label,pronunciaton,pronunciation
0,sw03012_s136_1-s136_4.TextGrid,abuse,"[""ə'bjuːs"" ""ə'bjuːz""]",ʌˈbjuz,ə'bjuːz,and they abuse that,ə'bjuːz,ə'bjuːz,ə'bjuːz
1,sw04184_s102_1-s102_19.TextGrid,abuse,"[""ə'bjuːs"" ""ə'bjuːz""]",ʌˈbjuz,"! - need to listen, col D seems incorrect, cou...",it takes a lot of training and a lot of abuse ...,ə'bjuːs,ə'bjuːs,ə'bjuːs
2,sw02597_s50_1-s50_25.TextGrid,abuse,"[""ə'bjuːs"" ""ə'bjuːz""]",ʌˈbjuz,ə'bjuːz,and they <unk> they do n't you know they do n'...,ə'bjuːz,ə'bjuːz,ə'bjuːz
3,sw04649_s99_1-s99_50.TextGrid,abuse,"[""ə'bjuːs"" ""ə'bjuːz""]",ʌˈbjus,ə'bjuːs,i think the problems are more ones of cost and...,ə'bjuːs,ə'bjuːs,ə'bjuːs
4,sw03049_s138_1-s138_15.TextGrid,abuse,"[""ə'bjuːs"" ""ə'bjuːz""]",ʌˈbjuz,ə'bjuːz,i think it s time we had a lady abuse the guys...,ə'bjuːz,ə'bjuːz,ə'bjuːz
...,...,...,...,...,...,...,...,...,...
2930,sw03108_s132_1-s132_27.TextGrid,wound,['wuːnd' 'waʊnd'],waʊnd,exact match,but course it s one of these with a twist wher...,waʊnd,'waʊnd,'waʊnd
2931,sw02365_s174_1-s174_7.TextGrid,wound,['wuːnd' 'waʊnd'],wund,"waʊnd (? - could listen, not sure what's going...",we wound up selling the house,waʊnd,'waʊnd,'waʊnd
2932,sw03073_s215_1-s215_8.TextGrid,wound,['wuːnd' 'waʊnd'],waʊnd,exact match,we <unk> wound up sending in one,waʊnd,'waʊnd,'waʊnd
2933,sw03070_s186_1-s186_7.TextGrid,wound,['wuːnd' 'waʊnd'],waʊnd,exact match,and so it kind of wound down,waʊnd,'waʊnd,'waʊnd


In [44]:
final = pd.merge(final_wordids_df, data_df, on='pronunciation')

In [51]:
final.head()

Unnamed: 0,homograph_x,wordid,label,pronunciation,homograph_type,fine_homograph_type,file_name,homograph_y,WHD_IPA_representations,MFA-Arpabet-2-IPA,Human-mapped_column_D_to_C_w_notes,sentence,new_label,pronunciaton
0,abuse,abuse_nou,noun,ə'bjuːs,Morphosyntactic,PoS,sw04184_s102_1-s102_19.TextGrid,abuse,"[""ə'bjuːs"" ""ə'bjuːz""]",ʌˈbjuz,"! - need to listen, col D seems incorrect, cou...",it takes a lot of training and a lot of abuse ...,ə'bjuːs,ə'bjuːs
1,abuse,abuse_nou,noun,ə'bjuːs,Morphosyntactic,PoS,sw04649_s99_1-s99_50.TextGrid,abuse,"[""ə'bjuːs"" ""ə'bjuːz""]",ʌˈbjus,ə'bjuːs,i think the problems are more ones of cost and...,ə'bjuːs,ə'bjuːs
2,abuse,abuse_nou,noun,ə'bjuːs,Morphosyntactic,PoS,sw02466_s59_1-s59_21.TextGrid,abuse,"[""ə'bjuːs"" ""ə'bjuːz""]",ʌˈbjuz,"! - need to listen, col D seems incorrect, cou...",right now some of the things i m working with ...,ə'bjuːs,ə'bjuːs
3,abuse,abuse_nou,noun,ə'bjuːs,Morphosyntactic,PoS,sw04792_s100_1-s100_9.TextGrid,abuse,"[""ə'bjuːs"" ""ə'bjuːz""]",ʌˈbjus,ə'bjuːs,that sounds like a little abuse of our system,ə'bjuːs,ə'bjuːs
4,abuse,abuse_nou,noun,ə'bjuːs,Morphosyntactic,PoS,sw04649_s34_1-s34_22.TextGrid,abuse,"[""ə'bjuːs"" ""ə'bjuːz""]",ʌˈbjuz,"! - need to listen, col D seems incorrect, cou...",well the thing i really worry about is uh the ...,ə'bjuːs,ə'bjuːs


In [45]:
final.pronunciation.nunique()

105

In [47]:
final.wordid.nunique()

105

In [54]:
pd.set_option('display.max_rows', None)

In [55]:
final.groupby('homograph_x')['wordid'].value_counts()

homograph_x  wordid             
abuse        abuse_nou                8
             abuse_vrb                5
advocate     advocate_nou             4
             advocate_vrb             2
affect       affect                   6
affiliate    affiliate_nou            1
aged         aged_adj                 3
alternate    alternate_adj-nou        5
associate    associate_adj-nou        1
             associate_vrb            1
attribute    attribute_vrb            1
axes         axes_nou-vrb             1
bass         bass_corp                9
blessed      blessed_vrb              2
bow          bow_nou-knot             2
             bow_nou-ship             2
close        close_adj-nou          100
             close_vrb                7
combine      combine_vrb              4
compress     compress                 1
conflict     conflict_nou             8
construct    construct_vrb            1
content      content_nou              4
             content_adj-nou-vrb      3
contest

In [84]:
low_acc_df = pd.read_csv(LOW_ACC_PATH, sep='\t')

In [85]:
final_wordids = final.wordid.unique().tolist()
low_acc_wordids = low_acc_df.wordid.unique().tolist()

In [86]:
len(final_wordids)

105

In [87]:
len(low_acc_wordids)

77

In [88]:
intersect_low_final = [value for value in final_wordids if value in low_acc_wordids] 

In [89]:
len(intersect_low_final)

33

In [90]:
len(intersect_low_final)/len(low_acc_wordids)

0.42857142857142855

In [93]:
low_acc_df

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,homograph,wordid,wordid_count,ttl,percent_ttl,accuracy
0,3,1,abuses,abuses_vrb,16,90,0.177778,0.5
1,8,0,affiliate,affiliate_vrb,3,89,0.033708,0.0
2,11,1,aged,aged_adj,4,91,0.043956,0.0
3,14,0,alternate,alternate_adj-nou,87,92,0.945652,0.9
4,15,1,alternate,alternate_vrb,5,92,0.054348,0.0
5,18,0,animate,animate_vrb,50,86,0.581395,0.833333
6,29,1,axes,axes_nou-vrb,43,90,0.477778,0.2
7,31,1,bass,bass_corp,13,87,0.149425,0.666667
8,34,0,bow,bow_nou-knot,43,90,0.477778,0.5
9,35,1,bow,bow_nou-ship,47,90,0.522222,0.5


In [94]:
low_acc_df.percent_ttl.describe()

count    77.000000
mean      0.370036
std       0.357885
min       0.011111
25%       0.044444
50%       0.177778
75%       0.744444
max       0.966667
Name: percent_ttl, dtype: float64

In [105]:
len(low_acc_df[low_acc_df.percent_ttl <= .4])/77

0.5844155844155844

In [73]:
low_acc_finals = final[final.wordid.isin(low_acc_wordids)]
low_acc_finals.groupby('homograph_x')['wordid'].value_counts()

homograph_x  wordid             
aged         aged_adj                 3
alternate    alternate_adj-nou        5
axes         axes_nou-vrb             1
bass         bass_corp                9
bow          bow_nou-knot             2
             bow_nou-ship             2
close        close_adj-nou          100
             close_vrb                7
conflict     conflict_nou             8
content      content_adj-nou-vrb      3
duplicate    duplicate_adj-nou        1
frequent     frequent_adj             4
initiate     initiate_vrb             2
invite       invite_vrb               3
lead         lead_nou-vrb             8
lives        lives_vrb               51
             lives_nou               34
misuse       misuse_vrb               1
mobile       mobile                   5
present      present_adj-nou         18
             present_vrb              4
produce      produce_vrb              7
read         read_present           229
             read_past              134
reading

In [116]:
swbd_counts_low_acc = low_acc_finals['wordid'].value_counts()
swbd_wordid_cts = swbd_counts_low_acc.reset_index().rename(columns={'index' : 'wordid', 'wordid' : 'swbd_wordid_cts'})

In [127]:
swbd_whd_df = pd.merge(swbd_wordid_cts, low_acc_df, on='wordid')

In [128]:
swbd_whd_df = swbd_whd_df[['wordid', 'swbd_wordid_cts', 'homograph', 'wordid_count', 'ttl', 'percent_ttl', 'accuracy']]
swbd_whd_df

Unnamed: 0,wordid,swbd_wordid_cts,homograph,wordid_count,ttl,percent_ttl,accuracy
0,use_vrb,358,use,37,91,0.406593,0.75
1,read_present,229,read,60,112,0.535714,0.833333
2,read_past,134,read,52,112,0.464286,0.833333
3,reading_en,101,reading,74,90,0.822222,0.777778
4,close_adj-nou,100,close,84,89,0.94382,0.9
5,lives_vrb,51,lives,57,91,0.626374,0.833333
6,lives_nou,34,lives,34,91,0.373626,0.5
7,wind_vrb,21,wind,6,90,0.066667,0.0
8,present_adj-nou,18,present,83,90,0.922222,0.888889
9,separate_adj,16,separate,85,91,0.934066,0.9


In [129]:
swbd_whd_df['ttl_cap'] = 120

In [130]:
swbd_whd_df['allowed_new_samples'] = swbd_whd_df.ttl_cap - swbd_whd_df.ttl

In [131]:
swbd_whd_df

Unnamed: 0,wordid,swbd_wordid_cts,homograph,wordid_count,ttl,percent_ttl,accuracy,ttl_cap,allowed_new_samples
0,use_vrb,358,use,37,91,0.406593,0.75,120,29
1,read_present,229,read,60,112,0.535714,0.833333,120,8
2,read_past,134,read,52,112,0.464286,0.833333,120,8
3,reading_en,101,reading,74,90,0.822222,0.777778,120,30
4,close_adj-nou,100,close,84,89,0.94382,0.9,120,31
5,lives_vrb,51,lives,57,91,0.626374,0.833333,120,29
6,lives_nou,34,lives,34,91,0.373626,0.5,120,29
7,wind_vrb,21,wind,6,90,0.066667,0.0,120,30
8,present_adj-nou,18,present,83,90,0.922222,0.888889,120,30
9,separate_adj,16,separate,85,91,0.934066,0.9,120,29


In [124]:
#swbd_whd_df['new_ttl'] =  swbd_whd_df.swbd_wordid_cts + swbd_whd_df.ttl

In [156]:
def get_sample_size(row): 
    if row.swbd_wordid_cts >= row.allowed_new_samples:
        print("here: {}".format(row.wordid))
        return row.allowed_new_samples + row.wordid_count
    else: 
        return row.swbd_wordid_cts + row.wordid_count

In [157]:
swbd_whd_df['new_wordid_ttl'] =  swbd_whd_df.apply(lambda row: get_sample_size(row), axis=1)

here: use_vrb
here: read_present
here: read_past
here: reading_en
here: close_adj-nou
here: lives_vrb
here: lives_nou


In [158]:
swbd_whd_df

Unnamed: 0,wordid,swbd_wordid_cts,homograph,wordid_count,ttl,percent_ttl,accuracy,ttl_cap,allowed_new_samples,new_wordid_ttl
0,use_vrb,358,use,37,91,0.406593,0.75,120,29,66
1,read_present,229,read,60,112,0.535714,0.833333,120,8,68
2,read_past,134,read,52,112,0.464286,0.833333,120,8,60
3,reading_en,101,reading,74,90,0.822222,0.777778,120,30,104
4,close_adj-nou,100,close,84,89,0.94382,0.9,120,31,115
5,lives_vrb,51,lives,57,91,0.626374,0.833333,120,29,86
6,lives_nou,34,lives,34,91,0.373626,0.5,120,29,63
7,wind_vrb,21,wind,6,90,0.066667,0.0,120,30,27
8,present_adj-nou,18,present,83,90,0.922222,0.888889,120,30,101
9,separate_adj,16,separate,85,91,0.934066,0.9,120,29,101


In [159]:
swbd_whd_df['other_wd_ct'] = swbd_whd_df.ttl - swbd_whd_df.wordid_count

In [166]:
swbd_whd_df['aug_ttl'] = swbd_whd_df.other_wd_ct + swbd_whd_df.new_wordid_ttl

In [169]:
swbd_whd_df['percent_aug_ttl'] = swbd_whd_df.new_wordid_ttl / swbd_whd_df.aug_ttl

In [173]:
swbd_whd_df['abs_percent_ttl_change'] = (swbd_whd_df.percent_aug_ttl - swbd_whd_df.percent_ttl)*100

In [176]:
swbd_whd_df.abs_percent_ttl_change.describe()

count    33.000000
mean      3.892790
std       4.946972
min       0.062422
25%       0.481100
50%       1.442308
75%       4.444444
max      17.657658
Name: abs_percent_ttl_change, dtype: float64

In [177]:
swbd_whd_df 

Unnamed: 0,wordid,swbd_wordid_cts,homograph,wordid_count,ttl,percent_ttl,accuracy,ttl_cap,allowed_new_samples,new_wordid_ttl,other_wd_ct,aug_ttl,percent_aug_ttl,abs_percent_ttl_change
0,use_vrb,358,use,37,91,0.406593,0.75,120,29,66,54,120,0.55,14.340659
1,read_present,229,read,60,112,0.535714,0.833333,120,8,68,52,120,0.566667,3.095238
2,read_past,134,read,52,112,0.464286,0.833333,120,8,60,60,120,0.5,3.571429
3,reading_en,101,reading,74,90,0.822222,0.777778,120,30,104,16,120,0.866667,4.444444
4,close_adj-nou,100,close,84,89,0.94382,0.9,120,31,115,5,120,0.958333,1.451311
5,lives_vrb,51,lives,57,91,0.626374,0.833333,120,29,86,34,120,0.716667,9.029304
6,lives_nou,34,lives,34,91,0.373626,0.5,120,29,63,57,120,0.525,15.137363
7,wind_vrb,21,wind,6,90,0.066667,0.0,120,30,27,84,111,0.243243,17.657658
8,present_adj-nou,18,present,83,90,0.922222,0.888889,120,30,101,7,108,0.935185,1.296296
9,separate_adj,16,separate,85,91,0.934066,0.9,120,29,101,6,107,0.943925,0.98593


In [178]:
swbd_whd_df.to_csv("C:/Users/jseal/Dev/dissertation/Data/SWBD/SWBD_Homographs/swbd_homs_meta.tsv", sep='\t')