In [1]:
import os
from datetime import datetime
import xml.etree.ElementTree as ET
import glob
from collections import Counter
import pandas as pd
from typing import List
pd.set_option('display.max_rows', None)

### Variables

In [2]:
#Paths
NXT_SWBD = '/Users/jen/Dev/Dissertation/Data/SWBD/nxt_1.4.4'
PHONWORDS = os.path.join(NXT_SWBD, 'xml/phonwords')
WORDS = os.path.join(NXT_SWBD, 'xml/terminals')
WHD = 'Dev/dissertation/Data/WikipediaHomographData/data'
WHD_CTS = '~/Dev/Dissertation/Data/WHD_CTS'
#NXT SWBD XML-specific variables
PW = 'phonword'
WD ='word'
ORTH = 'orth'
#Data
whd_df = pd.read_csv(os.path.join(WHD,'WikipediaHomographData.csv'))
nxt_whd_cts_df = whd_df.drop_duplicates(subset='homograph')

### Functions

In [3]:
def get_orth_list(PATH : str, item_type : str) -> List: 
    orths = []
    for f in glob.glob(os.path.join(PATH, '*')): 
        tree = ET.parse(f)
        root = tree.getroot()
        for item in root.findall(item_type): 
            orths.append(item.get(ORTH))
    return orths

In [4]:
def get_whd_subset(orths : list, item_type : str, nxt_whd_cts_df : pd.DataFrame) -> pd.DataFrame:
    orths_ctr = Counter(orths)
    nxt_whd_cts_df['{}_cts'.format(item_type)] = nxt_whd_cts_df['homograph'].apply(lambda hg : orths_ctr[hg])
    nxt_whd_cts_df = nxt_whd_cts_df[['homograph', '{}_cts'.format(item_type)]]
    return nxt_whd_cts_df

## Script

In [5]:
#Obtain lists of NXT SWBD graphemes
phonwords = get_orth_list(PHONWORDS, PW)
words = get_orth_list(WORDS, WD)
#Get subsets of NXT SWBD graphemes also in Wikipedia Homograph Data with instance counts
pw_cts_df = get_whd_subset(phonwords, PW, nxt_whd_cts_df)
wd_cts_df = get_whd_subset(words, WD, nxt_whd_cts_df)
#Serialize
cts_df = pd.merge(wd_cts_df, pw_cts_df, on='homograph')
DATETIME = datetime.now().strftime("%m_%d_%Y_%H_%M_%S")
cts_df.to_csv(os.path.join(WHD_CTS, 'nxt_whd_{}_cts.csv'.format(DATETIME)))
#Review data
#Ct percents (~35% Wikipedia graphemes have no instances in NXT SWBD; 14% have 1 instance)
print(cts_df['phonword_cts'].value_counts(normalize=True) * 100)
print(cts_df['word_cts'].value_counts(normalize=True) * 100)
#Phonwords with 40 greatest count values
cts_df.sort_values(by=['phonword_cts'], ascending=False).head(40)
#Phonwords with 0 instances in NXT (58 graphmes, ~36% of WHs)
zero_instances = cts_df[cts_df['phonword_cts'] == 0]
print(zero_instances)
print(zero_instances.shape)
print(zero_instances.shape[0]/nxt_whd_cts_df.shape[0] * 100)
#Phonwords with 1-9 instances in NXT (69 graphemes, ~43% of WHs)
single_digit_instances = cts_df[(cts_df['phonword_cts'] > 0) & (cts_df['phonword_cts'] < 10)].sort_values(by=['phonword_cts'], ascending=False)
print(single_digit_instances)
print(single_digit_instances.shape)
print(single_digit_instances.shape[0]/nxt_whd_cts_df.shape[0] *100)
#Phonwords with 10 or more instances in NXT (35 graphemes, ~22% of WHs)
more_instances = cts_df[cts_df['phonword_cts'] > 9].sort_values(by=['phonword_cts'], ascending=False)
print(more_instances)
print(more_instances.shape)
print(more_instances.shape[0]/nxt_whd_cts_df.shape[0] *100)


0      35.802469
1      14.814815
4       6.172840
2       4.938272
8       3.703704
9       3.703704
5       3.086420
6       2.469136
15      1.851852
21      1.851852
7       1.851852
3       1.851852
11      1.234568
56      1.234568
24      0.617284
12      0.617284
14      0.617284
16      0.617284
18      0.617284
20      0.617284
495     0.617284
533     0.617284
29      0.617284
483     0.617284
444     0.617284
147     0.617284
130     0.617284
370     0.617284
110     0.617284
98      0.617284
57      0.617284
52      0.617284
48      0.617284
43      0.617284
41      0.617284
39      0.617284
38      0.617284
31      0.617284
25      0.617284
Name: phonword_cts, dtype: float64
0      36.419753
1      14.197531
4       6.172840
2       4.938272
5       3.703704
8       3.703704
6       2.469136
7       2.469136
11      1.851852
15      1.851852
9       1.851852
3       1.851852
21      1.851852
44      1.234568
23      0.617284
13      0.617284
14      0.617284
16      0.617

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
