<h2>1. Load the Data</h2>

In [49]:
import pandas as pd
import numpy as np
# Read the ePSD compound chart and turn missing cells into empty strings, so
# that blank entries can be tested with == '' further down.
compound_list = pd.read_csv('data/epsd_compound_chart.csv').replace(np.nan, '', regex=True)

Load the ETCSL Corpus

In [2]:
# Load the ETCSL word table.
etcsl_words = pd.read_csv('data/etcsl_words.csv')
# Build the "cf[gw]" label column-wise instead of one chained lookup per row.
# map(str) stringifies every cell (missing values included), exactly as the
# element-wise str() calls did.
etcsl_words['cf_gw'] = etcsl_words['cf'].map(str) + '[' + etcsl_words['gw'].map(str) + ']'
etcsl_words['form'] = etcsl_words['form'].map(str)
# Collapse the words into one row per (text, line): space-joined labels and forms.
etcsl_lines = (
    etcsl_words
    .groupby(['id_text', 'line_id', 'line_label'])
    .agg({'cf_gw': ' '.join, 'form': ' '.join})
    .reset_index()
)

Load the ETCSRI Corpus

In [33]:
# Load the ETCSRI word table.
etcsri_words = pd.read_csv('data/etcsri_words.csv')
# Build the "cf[gw]" label column-wise instead of one chained lookup per row.
# map(str) stringifies every cell (missing values included), exactly as the
# element-wise str() calls did.
etcsri_words['cf_gw'] = etcsri_words['cf'].map(str) + '[' + etcsri_words['gw'].map(str) + ']'
etcsri_words['form'] = etcsri_words['form'].map(str)
# Collapse the words into one row per (text, line): space-joined labels and forms.
etcsri_lines = (
    etcsri_words
    .groupby(['id_text', 'line_id', 'line_label'])
    .agg({'cf_gw': ' '.join, 'form': ' '.join})
    .reset_index()
)

Define functions to search through the lines to find compounds

In [4]:
#limit rows for faster iteration
def limit_rows(comp_parts,corpus_lines):
    """Pre-filter corpus lines to those that mention both parts of a compound.

    Parameters
    ----------
    comp_parts : sequence of str
        Components of the compound; only the first two entries are used.
    corpus_lines : pandas.DataFrame
        Line table with a string column 'cf_gw'.

    Returns
    -------
    pandas.DataFrame
        The rows of ``corpus_lines`` whose 'cf_gw' contains both components
        as substrings.
    """
    # Match literally (regex=False) rather than hand-escaping brackets: the
    # components are plain text, and any other regex metacharacter in them
    # (e.g. '(', '+', '.') would otherwise be misread or raise re.error.
    lemmas = corpus_lines['cf_gw']
    has_first = lemmas.str.contains(comp_parts[0], regex=False)
    has_second = lemmas.str.contains(comp_parts[1], regex=False)
    return corpus_lines[has_first & has_second]

In [75]:
def search_compounds(comp1,comp2,wr1,lines):
    """Find occurrences of the compound ``comp1 ... comp2`` within each line.

    Each line is scanned left to right. A token equal to ``comp1`` opens a
    candidate (a later ``comp1`` restarts it); the next token equal to
    ``comp2`` closes it and records a hit. Tokens in between are collected as
    middle terms. A token whose label contains 'X' aborts the current
    candidate (presumably such labels mark unreadable words - confirm
    against the corpus).

    Parameters
    ----------
    comp1, comp2 : str
        "cf[gw]" labels of the first and second component.
    wr1 : str
        Written form of the first component; '' disables suffix extraction.
    lines : pandas.DataFrame
        Rows with 'cf_gw' and 'form' (space-joined, assumed to be aligned
        token by token) plus 'id_text', 'line_id' and 'line_label'.

    Returns
    -------
    list of dict
        One dict per hit with keys 'compound', 'comp_text', 'id_text',
        'line_id', 'line_label', 'middle_terms', 'nom_suff', 'all_middle'.
    """
    all_compounds = []
    for _, row in lines.iterrows():
        cf_gws = row['cf_gw'].split(' ')
        forms = row['form'].split(' ')
        # Scanner state is reset for every line.
        in_comp = False
        nominal = ''
        comp_text = ''
        middles = []
        for pos in range(len(cf_gws)):
            label = cf_gws[pos]
            if 'X' in label:
                # Abandon any candidate that would span this token.
                in_comp = False
                comp_text = ''
                middles = []
                continue
            if in_comp:
                comp_text = comp_text + ' ' + str(forms[pos])
                if comp2 == label:
                    nom_suff = ''
                    if wr1 != '' and wr1 in nominal:
                        # Keep everything after the FIRST occurrence of the
                        # written form; split(wr1)[1] would cut the suffix
                        # short whenever wr1 occurs again inside it
                        # (e.g. 'a-ba' with wr1 'a').
                        nom_suff = nominal.partition(wr1)[2]

                    #Found a compound! Package the information and append to the main list
                    comp_info = {}
                    comp_info['compound'] = comp1 + ' ' + comp2
                    comp_info['comp_text'] = comp_text
                    comp_info['id_text'] = row['id_text']
                    comp_info['line_id'] = row['line_id']
                    comp_info['line_label'] = row['line_label']
                    comp_info['middle_terms'] = ' '.join(middles)
                    comp_info['nom_suff'] = nom_suff
                    comp_info['all_middle'] = comp_info['nom_suff'] + ':' + comp_info['middle_terms']
                    all_compounds.append(comp_info)
                    #reset
                    in_comp = False
                    comp_text = ''
                else:
                    middles.append(label)
            if comp1 == label: #Start the compound
                comp_text = forms[pos]
                nominal = forms[pos]
                in_comp = True
                middles = []
    return all_compounds

Gather all the compounds from ETCSL into a Data Frame

In [76]:
all_compounds_etcsl = []
for i, row in compound_list.iterrows():
    # Use the ETCSL-specific form when the chart gives one, otherwise the ePSD form.
    comp_str = row['epsd_form'] if row['etcsl_form'] == '' else row['etcsl_form']
    comp_arr = comp_str.split(' ')

    # Narrow the corpus to candidate lines first, then collect every occurrence.
    comp_lines = limit_rows(comp_arr, etcsl_lines)
    all_compounds_etcsl += search_compounds(comp_arr[0], comp_arr[1], row['comp1_wr'], comp_lines)

In [77]:
# One row per ETCSL occurrence, plus a link built from the text id.
df_compounds = pd.DataFrame(all_compounds_etcsl)
df_compounds = df_compounds.assign(
    link='http://etcsl.orinst.ox.ac.uk/cgi-bin/etcsl.cgi?text=' + df_compounds['id_text'] + '&display=Crit&charenc=gcirc#'
)
df_compounds

Unnamed: 0,all_middle,comp_text,compound,id_text,line_id,line_label,middle_terms,nom_suff,link
0,:,a₂ aŋ₂-ŋa₂,a[arm] aŋ[measure],c.0.2.01,57,57,,,http://etcsl.orinst.ox.ac.uk/cgi-bin/etcsl.cgi...
1,:,a₂ aŋ₂,a[arm] aŋ[measure],c.0.2.13,15,15,,,http://etcsl.orinst.ox.ac.uk/cgi-bin/etcsl.cgi...
2,:,a₂ mu-un-da-an-aŋ₂,a[arm] aŋ[measure],c.1.1.3,40,40,,,http://etcsl.orinst.ox.ac.uk/cgi-bin/etcsl.cgi...
3,:,a₂ ša-ma-ni-ib-aŋ₂-e,a[arm] aŋ[measure],c.1.1.3,115,115,,,http://etcsl.orinst.ox.ac.uk/cgi-bin/etcsl.cgi...
4,:,a₂ ša-mu-un-aŋ₂-e,a[arm] aŋ[measure],c.1.1.3,170,170,,,http://etcsl.orinst.ox.ac.uk/cgi-bin/etcsl.cgi...
5,-ba:,a₂-ba mu-un-da-aŋ₂-e,a[arm] aŋ[measure],c.1.2.2,28,A28,,-ba,http://etcsl.orinst.ox.ac.uk/cgi-bin/etcsl.cgi...
6,:,a₂ aŋ₂-ŋa₂,a[arm] aŋ[measure],c.1.2.2,44,A44,,,http://etcsl.orinst.ox.ac.uk/cgi-bin/etcsl.cgi...
7,:,a₂ mu-da-an-aŋ₂-ta,a[arm] aŋ[measure],c.1.2.2,68,A74,,,http://etcsl.orinst.ox.ac.uk/cgi-bin/etcsl.cgi...
8,:,a₂ mu-da-aŋ₂-e,a[arm] aŋ[measure],c.1.2.2,83,A89,,,http://etcsl.orinst.ox.ac.uk/cgi-bin/etcsl.cgi...
9,:,a₂ im-ma-an-aŋ₂,a[arm] aŋ[measure],c.1.3.1,42,B13,,,http://etcsl.orinst.ox.ac.uk/cgi-bin/etcsl.cgi...


Gather all the compounds from ETCSRI into a Data Frame

In [78]:
# Same search as for ETCSL, run over the ETCSRI lines.
all_compounds_etcsri = []
for i, row in compound_list.iterrows():
    # Use the ETCSL-specific form when the chart gives one, otherwise the ePSD form.
    comp_str = row['epsd_form'] if row['etcsl_form'] == '' else row['etcsl_form']
    comp_arr = comp_str.split(' ')

    # Narrow the corpus to candidate lines first, then collect every occurrence.
    comp_lines = limit_rows(comp_arr, etcsri_lines)
    all_compounds_etcsri += search_compounds(comp_arr[0], comp_arr[1], row['comp1_wr'], comp_lines)

In [79]:
# One row per ETCSRI occurrence; no link column is added for this corpus.
df_compounds_etcsri = pd.DataFrame(data=all_compounds_etcsri)
df_compounds_etcsri

Unnamed: 0,all_middle,comp_text,compound,id_text,line_id,line_label,middle_terms,nom_suff
0,-bi:,a₂-bi mu-da-an-aŋ₂,a[arm] aŋ[measure],etcsri/Q000376,23,23,,-bi
1,-bi:,a₂-bi mu-da-an-aŋ₂,a[arm] aŋ[measure],etcsri/Q000376,31,31,,-bi
2,:,a₂ aŋ₂-ŋa₂-ŋa₂,a[arm] aŋ[measure],etcsri/Q000377,65,Cylinder A x 24,,
3,:,a₂ mu-da-aŋ₂,a[arm] aŋ[measure],etcsri/Q000377,122,Cylinder A xv 11,,
4,:,a₂ mu-da-aŋ₂,a[arm] aŋ[measure],etcsri/Q000377,126,Cylinder A xv 15,,
5,:,a₂ e-na-aŋ₂,a[arm] aŋ[measure],etcsri/Q001067,13,ii 3,,
6,:niŋhulu[evil] dim[create],a₂ niŋ₂-hul dim₂-ma ib₂-ši-aŋ₂-ŋa₂-a,a[arm] aŋ[measure],etcsri/Q001943,6,6,niŋhulu[evil] dim[create],
7,:,a₂ bad-a-ŋu₁₀,a[arm] bad[open],etcsri/Q000377,41,Cylinder A ix 26,,
8,:,a₂ bad,a[arm] bad[open],etcsri/Q001155,1,1,,
9,:bad[open] lu[person],a₂ bad-a-ŋu₁₀ lu₂ la-ba-ta-e₃,a[arm] e[leave],etcsri/Q000377,41,Cylinder A ix 26,bad[open] lu[person],


Combine them into one large Data Frame

In [80]:
# Stack the ETCSL and ETCSRI occurrences, ordered by compound text and then by
# middle terms. Each frame keeps its own row labels, so labels can repeat.
df_compounds_all = (
    pd.concat([df_compounds, df_compounds_etcsri])
    .sort_values(['comp_text', 'middle_terms'])
)
df_compounds_all

Unnamed: 0,all_middle,comp_text,compound,id_text,line_id,line_label,link,middle_terms,nom_suff
482,:10[10] gur[unit] ze[gall-bladder],a 10 gur-am₃ ze₂-bi mu-un-de₂,a[water] de[pour],c.1.8.2.2,67,67,http://etcsl.orinst.ox.ac.uk/cgi-bin/etcsl.cgi...,10[10] gur[unit] ze[gall-bladder],
679,:,a ab-ra-an,a[water] ri[impose],c.6.1.04,8,A8,http://etcsl.orinst.ox.ac.uk/cgi-bin/etcsl.cgi...,,
607,:ur[convulsed] id[river],a al-ur₄-ur₄-re id₂-da bi₂-i-gi-gi,a[water] gi[turn],c.1.4.1.3,76,85,http://etcsl.orinst.ox.ac.uk/cgi-bin/etcsl.cgi...,ur[convulsed] id[river],
670,:An[1],a an-ne₂ ru-a-ŋen,a[water] ri[impose],c.4.08.18,11,11,http://etcsl.orinst.ox.ac.uk/cgi-bin/etcsl.cgi...,An[1],
589,:,a ba-ab-ŋar,a[water] ŋar[place],c.2.2.5,64,D17,http://etcsl.orinst.ox.ac.uk/cgi-bin/etcsl.cgi...,,
498,:,a ba-an-de₂,a[water] de[pour],c.4.08.05,2,2,http://etcsl.orinst.ox.ac.uk/cgi-bin/etcsl.cgi...,,
564,:,a ba-an-dug₄,a[water] dug[speak],c.4.08.05,1,1,http://etcsl.orinst.ox.ac.uk/cgi-bin/etcsl.cgi...,,
565,:,a ba-an-dug₄,a[water] dug[speak],c.4.08.05,3,3,http://etcsl.orinst.ox.ac.uk/cgi-bin/etcsl.cgi...,,
566,:,a ba-an-dug₄,a[water] dug[speak],c.4.08.05,4,4,http://etcsl.orinst.ox.ac.uk/cgi-bin/etcsl.cgi...,,
567,:,a ba-an-dug₄,a[water] dug[speak],c.4.08.05,10,10,http://etcsl.orinst.ox.ac.uk/cgi-bin/etcsl.cgi...,,


For simplicity, let's only look at compounds that have at most one term between their components. For a later project we can examine compounds with multiple terms in between, but chances are many of them are not good examples of compounds.

In [81]:
# middle_terms is space-joined, so a space means two or more intervening terms.
df_compounds_oneterm = df_compounds_all.loc[lambda hits: ~hits['middle_terms'].str.contains(' ')]

<h2>2. Process the Data</h2>
<h3>2.1 Most Frequent Compounds</h3>
<p>There are 417 total compounds given by ePSD but to get a sense of what we're working with let's examine only the 30 most frequent</p>

In [18]:
# Occurrences per compound, most frequent first.
# NOTE(review): this counts df_compounds (ETCSL hits only), not
# df_compounds_all - confirm that ranking by ETCSL alone is intended.
compound_freqs = (
    df_compounds
    .groupby(['compound'])['compound']
    .agg('count')
    .sort_values(ascending=False)
)
compound_freqs_top30 = compound_freqs.nlargest(30).index.tolist()
compound_freqs_top30

['nam[fate] tar[cut]',
 'si[horn] sa[equal]',
 'ki[place] aŋ[measure]',
 'gu[voice] de[pour]',
 'igi[eye] du[spread]',
 'ki[place] ŋar[place]',
 'pa[branch] e[leave]',
 'šu[hand] du[push]',
 'saŋ[head] il[raise]',
 'igi[eye] bar[outside]',
 'mi[cvne] dug[speak]',
 'šu[hand] teŋ[approach]',
 'šu[hand] dug[speak]',
 'šag[heart] hul[rejoice]',
 'šu[hand] ŋar[place]',
 'a[arm] aŋ[measure]',
 'šu[hand] ŋal[be]',
 'dur[buttocks] ŋar[place]',
 'ki[place] us[follow]',
 'en[cvne] tar[cut]',
 'u[admiration] dug[speak]',
 'šu[hand] gi[turn]',
 'ni[fear] teŋ[approach]',
 'igi[eye] il[raise]',
 'er[tears] šeš[weep]',
 'saŋ[head] rig[donate]',
 'igi[eye] ŋal[be]',
 'ŋiri[foot] gub[stand]',
 'gu[neck] ŋar[place]',
 'gu[neck] la[hang]']

Save every occurrence of the 30 most frequent compounds

In [93]:
# Occurrences (both corpora) of the top-30 compounds, ordered by compound and
# then by their suffix/middle-term signature, written out as CSV.
df_compounds_top30 = (
    df_compounds_all[df_compounds_all['compound'].isin(compound_freqs_top30)]
    .sort_values(['compound', 'all_middle'])
)
df_compounds_top30.to_csv('output/top30_full.csv')

<h3>2.2 Middle Morpheme Frequencies</h3><br>
Find the frequencies of each middle element combination per compound

In [82]:
#Group compound data with a count aggregation
def count_middle_terms(df,comp = ''):
    """Count how often each 'all_middle' value occurs for each compound.

    Parameters
    ----------
    df : pandas.DataFrame
        Occurrence table with 'compound' and 'all_middle' columns.
    comp : str, optional
        When non-empty, only rows whose 'compound' equals it are counted.

    Returns
    -------
    pandas.DataFrame
        Columns 'compound_', 'all_middle_' and 'all_middle_count', sorted by
        compound (ascending) and then by count (descending).
    """
    subset = df[df['compound'] == comp] if comp else df
    middle_counts = subset.groupby(['compound','all_middle']).agg({'all_middle': ['count']}).reset_index()
    # The aggregation yields two-level column labels; flatten them by joining
    # the levels with '_' (hence the trailing underscore on the group keys).
    middle_counts.columns = ["_".join(levels) for levels in middle_counts.columns]
    return middle_counts.sort_values(['compound_','all_middle_count'], ascending = [True, False])

In [83]:
# Count suffix/middle-term combinations over the at-most-one-term subset,
# then display only the rows for the top-30 compounds.
term_count = count_middle_terms(df_compounds_oneterm)
term_count.loc[term_count['compound_'].isin(compound_freqs_top30)]

Unnamed: 0,compound_,all_middle_,all_middle_count
5,a[arm] aŋ[measure],:,95
1,a[arm] aŋ[measure],-bi:,18
3,a[arm] aŋ[measure],-še₃:,4
2,a[arm] aŋ[measure],-zu:,2
6,a[arm] aŋ[measure],:gal[big],2
0,a[arm] aŋ[measure],-ba:,1
4,a[arm] aŋ[measure],-še₃:kiŋ[work],1
7,a[arm] aŋ[measure],:kilulak[murder],1
8,a[arm] aŋ[measure],:nun[prince],1
216,dur[buttocks] ŋar[place],:,99


Save the full result

In [90]:
# Write the complete term_count table (not just the top-30 view) to disk.
term_count.to_csv('output/middle_counts.csv')

<h3>2.3 Specific Counts and Ratios</h3><br>
Find the specific counts for three categories:
<ol>
    <li>compounds without any interceding morphology</li>
    <li>compounds with suffixes on their nominal component and no interceding words</li>
    <li>compounds with interceding words</li>
</ol>

In [86]:
def middle_0(arr):
    """Return how many entries of ``arr`` are exactly ':' (no suffix, no middle term)."""
    return sum(1 for entry in arr if entry == ':')

def middle_term(arr):
    """Return how many entries of ``arr`` contain '[' (at least one middle term)."""
    return sum(1 for entry in arr if entry.find('[') > -1)

def middle_suff(arr):
    """Return how many entries of ``arr`` have no '[' and are not just ':'."""
    return len([entry for entry in arr if entry.find('[') == -1 and entry != ':'])

def ratio_middle_terms(df):
    """Tabulate, per compound, the three 'all_middle' category counts.

    Groups ``df['all_middle']`` by ``df['compound']`` and applies middle_0,
    middle_suff and middle_term to each group. Despite the name, no ratio is
    computed; the result holds raw counts only.

    Returns a DataFrame with columns 'compound', 'middle_0', 'middle_suff'
    and 'middle_term'.
    """
    per_compound = df['all_middle'].groupby(df['compound'])
    return per_compound.agg([middle_0,middle_suff,middle_term]).reset_index()

# Category counts over the at-most-one-term subset; display the top-30 compounds only.
middle_ratios = ratio_middle_terms(df_compounds_oneterm)
middle_ratios.loc[middle_ratios['compound'].isin(compound_freqs_top30)]

Unnamed: 0,compound,middle_0,middle_suff,middle_term
0,a[arm] aŋ[measure],95,25,5
58,dur[buttocks] ŋar[place],99,5,10
60,en[cvne] tar[cut],84,21,5
65,er[tears] šeš[weep],32,8,54
78,gu[neck] la[hang],17,48,12
85,gu[neck] ŋar[place],66,6,17
87,gu[voice] de[pour],279,0,31
113,igi[eye] bar[outside],92,3,63
115,igi[eye] du[spread],235,8,11
119,igi[eye] il[raise],62,6,31


Save the full result

In [91]:
# Write the complete middle_ratios table (not just the top-30 view) to disk.
middle_ratios.to_csv('output/middle_ratios.csv')