In [2]:
import xml.etree.ElementTree as et
import pandas as pd
import os
import re
import pdb

# xml namespace stuff
ns = {'': 'http://www.talkbank.org/ns/talkbank'}
ns_tag = lambda tag: '{' + ns[''] + '}' + tag

In [67]:
def load_transcript(transcript_file):
    #print(transcript_file)
    transcript = open(transcript_file).read()
    transcript = re.sub('</?g>', '', transcript) # g tags destroy everything
    return(et.fromstring(transcript))
    
def process_transcript(transcript_file, df_w, df_mor, df_mk):
    t = load_transcript(transcript_file)
    for u_index, u in enumerate(t.findall('./u', ns)):
        df_w, df_mor, df_mk = process_utterance(u, u_index, df_w, df_mor, df_mk)
                
    df_w = pd.DataFrame(df_w)
    df_mor = pd.DataFrame(df_mor)
    df_mk = pd.DataFrame(df_mk)
        
    df_w['transcript'] = transcript_file
    df_mor['transcript'] = transcript_file
    return(df_w, df_mor, df_mk)

def process_utterance(u, u_index, df_w, df_mor, df_mk, verbose = False):
    if verbose: print(u_index, end = ' ')

    # find all nodes in utterance that have tag w or tagMarker
    w_tags = [ns_tag(tag) for tag in ['w', 'tagMarker']]
    u_parts = [p for p in u.findall("./", ns) if p.tag in w_tags]
    if verbose: print(len(u_parts))

    # pdb.set_trace()
    for p in u_parts:
        if verbose: print(p)
        # tagMarkers don't go in df_w, only df_mor
        if p.tag == ns_tag("tagMarker"):
            mor = p.find('./mor', ns)
            if mor is not None:
                df_mor, df_mk = process_mor(mor, len(df_mor), False, u_index, None, df_mor, df_mk)
        else:
            if verbose: print("\t" + p.text)
            df_w, df_mor, df_mk = process_word(p, len(df_w), u_index, df_w, df_mor, df_mk)
            
    return(df_w, df_mor, df_mk)

In [55]:
def construct_word(w):
    # construct word string out of text and internal shortenings
    shorts = w.findall('./shortening', ns)
    word = "".join([w.text or ""] + [s.text for s in shorts] + [s.tail or "" for s in shorts[-1:]])
    return(word)

def process_word(w, w_index, u_id, df_w, df_mor, df_mk, verbose = False):

    gloss = construct_word(w)

    words = [w]
    # replacements contain 1 or more words
    rep_words = w.findall('./replacement/w', ns)
    if len(rep_words):
        # treat multiple words in replacement as a linkage
        rep_gloss = "_".join([construct_word(rep) for rep in rep_words])
        # mor tag can be child of w within replacement or after replacement
        words += rep_words
    else:
        rep_gloss = None
    
    if verbose: print(gloss)
    w_data = {
        'id': w_index,
        'u_fk': u_id,
        'gloss': gloss,
        'replacement': rep_gloss
    }
    df_w.append(w_data)

    for wo in words:
        #pdb.set_trace()
        # TODO: mor-pre?

        # 0 or 1 mor
        mor = wo.find('./mor', ns)
        if mor is not None:
            df_mor, df_mk = process_mor(mor, len(df_mor), False, u_id, w_index, df_mor, df_mk)

        # 0 or 1 or multiple mor-post (clitics)
        mor_posts = wo.findall('./mor/mor-post', ns)
        if mor_posts is not None:
            for mor_index, mor_post in enumerate(mor_posts, start = 1):
                df_mor, df_mk = process_mor(mor_post, len(df_mor) + mor_index, True, u_id, w_index, df_mor, df_mk)
    
    return(df_w, df_mor, df_mk)

In [57]:
def process_mor(mor, mor_index, is_post, u_id, w_id, df_mor, df_mk):
    
    # expect 0 or 1 of each of these tags
    assert len(mor.findall('./mw', ns)) <= 1
    assert len(mor.findall('./mwc', ns)) <= 1
    assert len(mor.findall('./gra', ns)) <= 1
    assert len(mor.findall('./mw/pos', ns)) <= 1
    assert len(mor.findall('./mw/pos/c', ns)) <= 1
    assert len(mor.findall('./mw/stem', ns)) <= 1

    # mw is inside mwc for compounds
    mwc = mor.find('./mwc', ns)
    mws = mor.findall('./mw', ns) if mwc is None else mwc.findall('./mw', ns)
    is_cmp = mwc is not None
    
    # 0 or 1 or multiple menx
    menxs = mor.findall('./menx', ns)
    menx = "_".join([menx.text for menx in menxs]) if menxs is not None else None
    
    # 0 or 1 gra
    gra = mor.find('./gra', ns)
    if gra is None:
        gra_index, gra_head, gra_relation = None, None, None
    else:
        gra_index, gra_head, gra_relation = gra.get('index'), gra.get('head'), gra.get('relation')
    
    for mw in mws:
        pos_c = mw.find('./pos/c', ns)
        pos_s = mw.findall('./pos/s', ns)
        pos = ":".join([pos_c.text] + [s.text for s in pos_s if s is not None])
        mor_data = {
            'id': mor_index,
            'u_fk': u_id,
            'w_fk': w_id,
            'pos': pos,
            'is_clitic': is_post,
            'is_compound': is_cmp,
            'stem': mw.find('./stem', ns).text,
            'english': menx,
            'gra_index': gra_index,
            'gra_head': gra_head,
            'gra_relation': gra_relation
        }
        #df_mor = df_mor.append(mor_data, ignore_index = True)
        df_mor.append(mor_data)
        df_mk = process_mw(mw, mor_index, df_mk)
    return(df_mor, df_mk)

In [6]:
def process_mw(mw, mor_id, df_mk):
    
    # prefixes
    for mpfx in mw.findall('./mpfx', ns):
        mpfx_data = {
            'id': len(df_mk),
            'mor_fk': mor_id,
            'affix': mpfx.text,
            'affix_type': 'prefix'
        }
        #df_mk = df_mk.append(mpfx_data, ignore_index = True)
        df_mk.append(mpfx_data)
        
    # suffixes
    for mk in mw.findall('./mk', ns):
        mk_data = {
            'id': len(df_mk),            
            'mor_fk': mor_id,
            'affix': mk.text,
            'affix_type': mk.get('type')
        }
        #df_mk = df_mk.append(mk_data, ignore_index = True)
        df_mk.append(mk_data)
        
    return(df_mk)

In [6]:
#df_w = pd.DataFrame(columns = ['id', 'u_fk', 'gloss', 'replacement'])
#df_mor = pd.DataFrame(columns = ['id', 'w_fk', 'pos', 'clitic', 'stem', 'english', 'gra_index', 'gra_head', 'gra_relation'])
#df_mk = pd.DataFrame(columns = ['id', 'mor_fk', 'affix', 'affix_type'])

# iterate over transcripts
#transcript_file = "corpora/Spanish/Aguirre/010801.xml"
#transcript_file = "corpora/HSLLD/HV1/BR/acebr1.xml"
#transcript = et.parse(transcript_file).getroot()

# iterate over utterances
#utterances = transcript.findall('./u', ns)
#df_w, df_mor, df_mk = process_utterance(utterances[68], 68, df_w, df_mor, df_mk)
#u = utterances[29]
#[w.text for w in u.findall('.//w', ns)]

# iterate over words
#words = utterances[68].findall('./w', ns)
#df_w, df_mor, df_mk = process_word(words[1], 1, 68, df_w, df_mor, df_mk)

#df_mor, df_mk = process_mor(words[6].find('./mor', ns), 0, False, 6, df_mor, df_mk)

In [7]:
#df_w = pd.DataFrame(columns = ['id', 'u_fk', 'gloss', 'replacement'])
#df_mor = pd.DataFrame(columns = ['id', 'u_fk', 'w_fk', 'pos', 'clitic', 'stem', 'english', 'gra_index', 'gra_head', 'gra_relation'])
#df_mk = pd.DataFrame(columns = ['id', 'mor_fk', 'affix', 'affix_type'])

#utterances = et.parse("corpora/Spanish/Aguirre/010801.xml").getroot().findall('./u', ns)
#u_index = 12
#u = utterances[u_index]
#process_utterance(u, u_index, df_w, df_mor, df_mk)

#w_tags = [ns_tag(tag) for tag in ['w', 'g', 'tagMarker']]
#u_parts = [p for p in u.findall("./", ns) if p.tag in w_tags]

#for p in u_parts:
#    if p.tag == ns_tag("tagMarker"):
#        mor = p.find('./mor', ns)
#        df_mor, df_mk = process_mor(mor, len(df_mor), False, None, df_mor, df_mk)
#    else:
#        if p.tag == ns_tag("g"): p = p.find("./w", ns)
#        df_w, df_mor, df_mk = process_word(p, len(df_w), u_index, df_w, df_mor, df_mk)

#df_w, df_mor, df_mk
#df_w, df_mor, df_mk
#p = u_parts[1].find("./w", ns)
#process_word(p, len(df_w), 228, df_w, df_mor, df_mk)

In [10]:
transcript_file = "corpora/Spanish/Aguirre/020726.xml"
t_df_w, t_df_mor, t_df_mk = process_transcript(transcript_file, list(), list(), list())

In [11]:
t_df_w

Unnamed: 0,id,u_fk,gloss,replacement,transcript
0,0,0,oye,,corpora/Spanish/Aguirre/020726.xml
1,1,0,oye,,corpora/Spanish/Aguirre/020726.xml
2,2,0,Magín,,corpora/Spanish/Aguirre/020726.xml
3,3,1,oye,,corpora/Spanish/Aguirre/020726.xml
4,4,2,adéjamelas,déjamelas,corpora/Spanish/Aguirre/020726.xml
...,...,...,...,...,...
3441,3441,919,la,,corpora/Spanish/Aguirre/020726.xml
3442,3442,919,aspiradora,,corpora/Spanish/Aguirre/020726.xml
3443,3443,920,xxx,,corpora/Spanish/Aguirre/020726.xml
3444,3444,921,aquí,,corpora/Spanish/Aguirre/020726.xml


In [12]:
t_df_mor

Unnamed: 0,id,u_fk,w_fk,pos,is_clitic,is_compound,stem,english,gra_index,gra_head,gra_relation,transcript
0,0,0,0.0,co,False,False,oye,listen,1,0,ROOT,corpora/Spanish/Aguirre/020726.xml
1,1,0,,cm,False,False,cm,,2,1,JCT,corpora/Spanish/Aguirre/020726.xml
2,2,0,1.0,co,False,False,oye,listen,3,2,POBJ,corpora/Spanish/Aguirre/020726.xml
3,3,0,,cm,False,False,cm,,4,3,JCT,corpora/Spanish/Aguirre/020726.xml
4,4,0,2.0,n:prop,False,False,Magín,,5,4,POBJ,corpora/Spanish/Aguirre/020726.xml
...,...,...,...,...,...,...,...,...,...,...,...,...
3735,3735,919,3440.0,inf,False,False,pasa,pass,6,4,COMP,corpora/Spanish/Aguirre/020726.xml
3736,3736,919,3441.0,det:art,False,False,el,the,7,8,DET,corpora/Spanish/Aguirre/020726.xml
3737,3737,919,3442.0,n,False,False,aspirador,ventilator,8,6,OBJ,corpora/Spanish/Aguirre/020726.xml
3738,3738,921,3444.0,adv,False,False,aquí,here,1,2,JCT,corpora/Spanish/Aguirre/020726.xml


In [13]:
t_df_mk

Unnamed: 0,id,mor_fk,affix,affix_type
0,0,6,2S,sfx
1,1,6,IMP,sfxf
2,2,10,f,sfxf
3,3,10,PL,sfxf
4,4,10,1P,sfx
...,...,...,...,...
3010,3010,3733,PRES,sfxf
3011,3011,3735,INF,sfx
3012,3012,3736,f,sfxf
3013,3013,3736,SG,sfxf


In [36]:
trans = 'corpora/Spanish/Remedi/020805.xml'
#t_df_w, t_df_mor, t_df_mk = process_transcript(trans, df_w(), df_mor(), df_mk())
utterances = load_transcript(trans).findall('./u', ns)
#utterances = et.parse(trans).getroot().findall('./u', ns)
u_index = 172
u = utterances[u_index]
#w_tags = [ns_tag(tag) for tag in ['w', 'tagMarker']]
#u_parts = [p for p in u.findall("./", ns) if p.tag in w_tags]

#[w.text for w in u.findall("./", ns)]

#u.find('./', ns)
u_w, u_mor, u_mk = process_utterance(u, u_index, list(), list(), list(), True)
#w_tags = [ns_tag(tag) for tag in ['w', 'g', 'tagMarker']]
#u_parts = [p for p in u.findall("./", ns) if p.tag in w_tags]
#u_parts
#u_parts[0].find("./w", ns) is None

172 4
<Element '{http://www.talkbank.org/ns/talkbank}w' at 0x11cf6ac20>
	manda
<Element '{http://www.talkbank.org/ns/talkbank}w' at 0x11cf5a9f0>
	con
<Element '{http://www.talkbank.org/ns/talkbank}w' at 0x11cf5a4a0>
	la
<Element '{http://www.talkbank.org/ns/talkbank}w' at 0x11cf5abd0>
	Alfon


In [37]:
pd.DataFrame(u_w)

Unnamed: 0,id,u_fk,gloss,replacement
0,0,172,mandame,dame
1,1,172,con,
2,2,172,la,
3,3,172,Alfon,


In [38]:
pd.DataFrame(u_mor)

Unnamed: 0,id,u_fk,w_fk,pos,is_clitic,is_compound,stem,english,gra_index,gra_head,gra_relation
0,0,172,0,imp,False,False,manda,,1,0,ROOT
1,2,172,0,pro:clit,True,False,1S,order,2,1,OBJ2
2,2,172,1,prep,False,False,con,with,3,1,JCT
3,3,172,2,det:art,False,False,el,the,4,5,DET
4,4,172,3,n:prop,False,False,Alfon,,5,3,POBJ


In [39]:
pd.DataFrame(u_mk)

Unnamed: 0,id,mor_fk,affix,affix_type
0,0,0,2S,sfx
1,1,0,IMP,sfxf
2,2,3,f,sfxf
3,3,3,SG,sfxf


In [14]:
def process_dir(corpus_dir):
    dir_df_w, dir_df_mor, dir_df_mk = list(), list(), list()
    for (dirpath, dirnames, filenames) in os.walk(corpus_dir):
        print(dirpath)
        dirnames.sort()
        files = [os.path.join(dirpath, file) for file in sorted(filenames) if os.path.splitext(file)[1] == '.xml']
        for file in files:
            #print(file)
            tra_df_w, tra_df_mor, tra_df_mk = process_transcript(file, list(), list(), list())
            dir_df_w.append(tra_df_w.copy())
            dir_df_mor.append(tra_df_mor.copy())
            dir_df_mk.append(tra_df_mk.copy())
    dir_df_w = pd.concat(dir_df_w)
    dir_df_mor = pd.concat(dir_df_mor)
    dir_df_mk = pd.concat(dir_df_mk)
    return(dir_df_w, dir_df_mor, dir_df_mk)

In [81]:
t_df_w, t_df_mor, t_df_mk = process_dir("corpora/Eng-NA/Brown")

corpora/Eng-NA/Brown
corpora/Eng-NA/Brown/Adam
corpora/Eng-NA/Brown/Eve
corpora/Eng-NA/Brown/Sarah


In [82]:
t_df_w

Unnamed: 0,id,u_fk,gloss,replacement,transcript
0,0,0,play,,corpora/Eng-NA/Brown/Adam/020304.xml
1,1,0,checkers,,corpora/Eng-NA/Brown/Adam/020304.xml
2,2,1,big,,corpora/Eng-NA/Brown/Adam/020304.xml
3,3,1,drum,,corpora/Eng-NA/Brown/Adam/020304.xml
4,4,2,big,,corpora/Eng-NA/Brown/Adam/020304.xml
...,...,...,...,...,...
3180,3180,708,here,,corpora/Eng-NA/Brown/Sarah/050106.xml
3181,3181,709,and,,corpora/Eng-NA/Brown/Sarah/050106.xml
3182,3182,709,what,,corpora/Eng-NA/Brown/Sarah/050106.xml
3183,3183,709,else,,corpora/Eng-NA/Brown/Sarah/050106.xml


In [83]:
t_df_mor

Unnamed: 0,id,u_fk,w_fk,pos,is_clitic,is_compound,stem,english,gra_index,gra_head,gra_relation,transcript
0,0,0,0.0,n,False,False,play,,1,2,MOD,corpora/Eng-NA/Brown/Adam/020304.xml
1,1,0,1.0,n,False,False,checker,,2,0,INCROOT,corpora/Eng-NA/Brown/Adam/020304.xml
2,2,1,2.0,adj,False,False,big,,1,2,MOD,corpora/Eng-NA/Brown/Adam/020304.xml
3,3,1,3.0,n,False,False,drum,,2,0,INCROOT,corpora/Eng-NA/Brown/Adam/020304.xml
4,4,2,4.0,adj,False,False,big,,1,2,MOD,corpora/Eng-NA/Brown/Adam/020304.xml
...,...,...,...,...,...,...,...,...,...,...,...,...
3355,3355,708,3179.0,adv,False,False,right,,4,5,JCT,corpora/Eng-NA/Brown/Sarah/050106.xml
3356,3356,708,3180.0,adv,False,False,here,,5,3,NJCT,corpora/Eng-NA/Brown/Sarah/050106.xml
3357,3357,709,3181.0,coord,False,False,and,,1,0,INCROOT,corpora/Eng-NA/Brown/Sarah/050106.xml
3358,3358,709,3182.0,pro:int,False,False,what,,2,1,COORD,corpora/Eng-NA/Brown/Sarah/050106.xml


In [84]:
t_df_mk

Unnamed: 0,id,mor_fk,affix,affix_type
0,0,1,PL,sfx
1,1,17,3S,sfxf
2,2,20,PRES,sfxf
3,3,21,PL,sfx
4,4,25,PL,sfx
...,...,...,...,...
467,467,3323,ZERO,sfxf
468,468,3325,PL,sfx
469,469,3337,3S,sfxf
470,470,3346,3S,sfxf


In [80]:
t_df_w.to_csv('data/French_w.csv')
t_df_mor.to_csv('data/French_mor.csv')
t_df_mk.to_csv('data/French_mk.csv')

In [79]:
t_df_mor.query('gra_index == gra_index')

Unnamed: 0,id,u_fk,w_fk,pos,is_clitic,is_compound,stem,english,gra_index,gra_head,gra_relation,transcript


In [77]:
t_df_mor = t_df_mor.astype({'gra_index': 'int32'})
t_df_mor['gra_fake'] = t_df_mor.groupby(['transcript', 'u_fk']).cumcount() + 1
t_df_mor['gra_wrong'] = t_df_mor['gra_fake'] != t_df_mor['gra_index']
u_agg = t_df_mor.groupby(['transcript', 'u_fk']).agg({'is_compound': 'any', 'gra_wrong': 'any'})
u_agg.query('not is_compound and gra_wrong')

TypeError: int() argument must be a string, a bytes-like object or a number, not 'NoneType'

In [51]:
u_agg.query('gra_wrong')

Unnamed: 0_level_0,Unnamed: 1_level_0,is_compound,gra_wrong
transcript,u_fk,Unnamed: 2_level_1,Unnamed: 3_level_1
corpora/Spanish/FernAguado/Daniel/030724a.xml,232.0,True,True
corpora/Spanish/FernAguado/Eneko/030112a.xml,60.0,True,True
corpora/Spanish/FernAguado/Eneko/030710a.xml,72.0,True,True
corpora/Spanish/FernAguado/Eneko/030710a.xml,73.0,True,True
corpora/Spanish/FernAguado/Fermin/021120a.xml,69.0,True,True
...,...,...,...
corpora/Spanish/OreaPine/Lucia/020414a.xml,183.0,True,True
corpora/Spanish/OreaPine/Lucia/020417.xml,347.0,True,True
corpora/Spanish/OreaPine/Lucia/020417.xml,348.0,True,True
corpora/Spanish/OreaPine/Lucia/020713.xml,182.0,True,True


In [42]:
#t_df_mor['gra_fake'] = t_df_mor.groupby('u_fk')['w_fk'].transform(lambda x: x - x.min() + 1)
#t_df_mor.query('gra_index != gra_fake | compound') #[['u_fk']].drop_duplicates()

Unnamed: 0_level_0,Unnamed: 1_level_0,is_compound,gra_wrong
transcript,u_fk,Unnamed: 2_level_1,Unnamed: 3_level_1


In [16]:
t_df_mor.query('transcript == "corpora/Spanish/Remedi/020805.xml" & u_fk == 173')

Unnamed: 0,id,u_fk,w_fk,pos,is_clitic,is_compound,stem,english,gra_index,gra_head,gra_relation,transcript


In [67]:
t_df_w.query('transcript == "corpora/Spanish/Remedi/020805.xml" & u_fk == 172')

Unnamed: 0,id,u_fk,gloss,replacement,transcript
605,605.0,172.0,mandame,dame,corpora/Spanish/Remedi/020805.xml
606,606.0,172.0,con,,corpora/Spanish/Remedi/020805.xml
607,607.0,172.0,la,,corpora/Spanish/Remedi/020805.xml
608,608.0,172.0,Alfon,,corpora/Spanish/Remedi/020805.xml


In [30]:
t_df_w[t_df_w.gloss.isna()]

Unnamed: 0,id,u_fk,gloss,replacement,transcript


In [31]:
t_df_mor[t_df_mor.stem.isna()]

AttributeError: 'DataFrame' object has no attribute 'stem'

In [217]:
t_df_mor[t_df_mor.pos.isna()]

Unnamed: 0,id,u_fk,w_fk,pos,clitic,compound,stem,english,gra_index,gra_head,gra_relation,transcript,gra_fake,gra_wrong


In [15]:
#df_w = lambda: pd.DataFrame(columns = ['id', 'u_fk', 'gloss', 'replacement'])
#df_mor = lambda: pd.DataFrame(columns = ['id', 'u_fk', 'w_fk', 'pos', 'is_clitic', 'is_compound', 'stem', 'english', 'gra_index', 'gra_head', 'gra_relation'])
#df_mk = lambda: pd.DataFrame(columns = ['id', 'mor_fk', 'affix', 'affix_type'])