In [2]:
import xml.etree.ElementTree as et
import pandas as pd
import os
import re
import pdb

# Default (prefix-less) XML namespace of the transcript files, in the mapping
# form that ElementTree's find/findall accept as their `namespaces` argument.
ns = {'': 'http://www.talkbank.org/ns/talkbank'}

def ns_tag(tag):
    """Return `tag` in ElementTree's qualified '{namespace-uri}tag' form."""
    return "".join(('{', ns[''], '}', tag))

In [13]:
def load_transcript(transcript_file):
    """Read an XML transcript from disk and return its parsed root element.

    Grouping tags (<g>, <pg>, <sg>, opening and closing) are stripped from the
    raw text before parsing, so the children of a group become direct children
    of whatever element contained the group.

    Parameters
    ----------
    transcript_file : path of the XML file to read (opened in text mode with
        the platform default encoding).

    Returns
    -------
    xml.etree.ElementTree.Element : root of the parsed document.
    """
    # Use a context manager so the handle is closed deterministically; this
    # function is called once per file while walking an entire corpus.
    with open(transcript_file) as transcript_handle:
        transcript = transcript_handle.read()
    transcript = re.sub('</?[ps]?g>', '', transcript) # get rid of groups (g, pg, sg)
    return et.fromstring(transcript)
    
def process_transcript(transcript_file, df_w, df_mor, df_mk, verbose = False):
    """Extract word, morphology and affix tables from one transcript file.

    The transcript is loaded with `load_transcript`, every `u` element that is
    a direct child of the root is passed to `process_utterance` together with
    its position, and the three accumulated record lists are then converted to
    DataFrames.

    Parameters
    ----------
    transcript_file : path of the XML transcript.
    df_w, df_mor, df_mk : lists of row dicts to which new rows are appended.
    verbose : forwarded to `process_utterance`.

    Returns
    -------
    tuple of three DataFrames (words, morphology, affixes), each carrying a
    'transcript' column set to `transcript_file`.
    """
    root = load_transcript(transcript_file)
    utterances = root.findall('./u', ns)
    for position, utterance in enumerate(utterances):
        df_w, df_mor, df_mk = process_utterance(utterance, position, df_w, df_mor, df_mk, verbose)

    frames = []
    for records in (df_w, df_mor, df_mk):
        frame = pd.DataFrame(records)
        # tag every row with the file it came from
        frame['transcript'] = transcript_file
        frames.append(frame)
    return tuple(frames)

def process_utterance(u, u_index, df_w, df_mor, df_mk, verbose = False):
    """Collect rows for every word and tag marker directly under an utterance.

    Parameters
    ----------
    u : the utterance element.
    u_index : index of the utterance; stored as the foreign key `u_fk` on the
        rows produced for it.
    df_w, df_mor, df_mk : lists of row dicts (words, morphology, affixes) that
        are appended to.
    verbose : when true, print the utterance index here and forward the flag
        to `process_word` so that it prints each word's gloss.

    Returns
    -------
    tuple (df_w, df_mor, df_mk) with the new rows appended.
    """
    if verbose: print('utterance', u_index)

    w_tag, tag_marker_tag = ns_tag('w'), ns_tag('tagMarker')

    # only direct children of the utterance are considered; any child that is
    # neither a w nor a tagMarker is ignored
    for p in u.findall("./", ns):
        if p.tag == tag_marker_tag:
            # tagMarkers don't go in df_w, only df_mor
            mor = p.find('./mor', ns)
            if mor is not None:
                df_mor, df_mk = process_mor(mor = mor, mor_index = len(df_mor),
                                            clitic_type = None,
                                            is_separated_prefix = False,
                                            u_id = u_index, w_id = None, df_mor = df_mor, df_mk = df_mk)
        elif p.tag == w_tag:
            # forward `verbose`: previously the flag was dropped here, so the
            # per-word output of process_word could never be enabled
            df_w, df_mor, df_mk = process_word(p, len(df_w), u_index, df_w, df_mor, df_mk, verbose)

    return(df_w, df_mor, df_mk)

In [4]:
def construct_gloss(w):
    """Build the surface string of a word element from its text and parts.

    The gloss starts from the element's own text and is extended, in document
    order, by each direct `shortening` / `wk` child:
      * child with type 'cmp' (compound)  -> '+' followed by the child's tail
      * child with type 'cli' (clitic)    -> '~' followed by the child's tail
      * `shortening` child                -> '(' text ')' followed by its tail

    Parameters
    ----------
    w : the word element.

    Returns
    -------
    tuple (gloss, has_compound, has_clitic, has_shortening).

    Raises
    ------
    ValueError : if a `wk` child has a type other than 'cmp' or 'cli'.
    """
    # construct word string out of text, compound parts, internal shortenings
    w_tags = [ns_tag(tag) for tag in ['shortening', 'wk']]
    w_parts = [p for p in w.findall("./", ns) if p.tag in w_tags]
    word = w.text or ""
    has_compound, has_clitic, has_shortening = False, False, False
    for p in w_parts:
        if p.get('type') == 'cmp': # compound
            word = word + '+' + (p.tail or "")
            has_compound = True
        elif p.get('type') == 'cli': # clitic
            word = word + '~' + (p.tail or "")
            has_clitic = True
        elif p.tag == ns_tag('shortening'): # shortening
            # an empty shortening element has text None; guard it the same way
            # as tail so it yields '()' instead of raising TypeError
            word = word + '(' + (p.text or "") + ')' + (p.tail or "")
            has_shortening = True
        else:
            raise ValueError('unknown component inside word')
    return(word, has_compound, has_clitic, has_shortening)

def process_word(w, w_index, u_id, df_w, df_mor, df_mk, verbose = False):
    """Append one word row, plus the morphology rows of the word and of any
    replacement words nested inside it.

    Parameters
    ----------
    w : the word element.
    w_index : id stored on the word row and used as `w_fk` on its morphology rows.
    u_id : id of the enclosing utterance (`u_fk`).
    df_w, df_mor, df_mk : lists of row dicts (words, morphology, affixes) that
        are appended to.
    verbose : when true, print the word's gloss indented by a tab.

    Returns
    -------
    tuple (df_w, df_mor, df_mk) with the new rows appended.
    """
    gloss, has_compound, has_clitic, has_shortening = construct_gloss(w)

    words = [w]
    # replacements contain 1 or more words
    rep_words = w.findall('./replacement/w', ns)
    if len(rep_words):
        # treat multiple words in replacement as a linkage (except separated prefix)
        rep_glosses = [construct_gloss(rep)[0] for rep in rep_words]
        if rep_words[0].get('separated-prefix') == 'true':
            rep_gloss = rep_glosses[0] + ' ' + "_".join(rep_glosses[1:])
        else:
            rep_gloss = "_".join(rep_glosses)
        # mor tag can be child of w within replacement or after replacement
        words += rep_words
    else:
        rep_gloss = None

    if verbose: print("\t", gloss)
    w_data = {
        'id': w_index,
        'u_fk': u_id,
        'gloss': gloss,
        'replacement': rep_gloss,
        'pos': w.get('pos'),
        'form_marker': w.get('formType'), # @x
        'form_marker_suffix': w.get('formSuffix'), # @x-s
        'incomplete_type': w.get('type'), # omission, fragment, filler, incomplete
        'unidentifiable_type': w.get('untranscribed'), # unintelligible, unintelligible-with-pho, untranscribed
        'has_compound': has_compound,
        'has_clitic': has_clitic,
        'has_shortening' : has_shortening
    }
    df_w.append(w_data)

    for wo in words:
        is_separated_prefix = wo.get('separated-prefix') == 'true'

        # 0 or 1 mor, only type mor (i.e. not training)
        mor = wo.findall('./mor[@type="mor"]', ns)
        assert len(mor) <= 1
        if not len(mor):
            continue
        mor = mor[0]
        df_mor, df_mk = process_mor(mor = mor, mor_index = len(df_mor), clitic_type = None,
                                    is_separated_prefix = is_separated_prefix,
                                    u_id = u_id, w_id = w_index, df_mor = df_mor, df_mk = df_mk)

        # 0 or 1 or multiple mor-pre or mor-post (clitics)
        clitics = mor.findall('./mor-pre', ns) + mor.findall('./mor-post', ns)
        for clitic in clitics:
            # 'pre' or 'post': the part of the tag after its last hyphen
            clitic_type = clitic.tag.rsplit('-', 1)[1]
            # The id must be taken from the current length of df_mor, exactly
            # as for the main mor above. df_mor has already grown by the time
            # each clitic is handled, so adding a running offset on top of
            # len(df_mor) skipped ids and later collided with the ids handed
            # out to the following words, making mor_fk in df_mk ambiguous.
            df_mor, df_mk = process_mor(mor = clitic, mor_index = len(df_mor),
                                        clitic_type = clitic_type, is_separated_prefix = is_separated_prefix,
                                        u_id = u_id, w_id = w_index, df_mor = df_mor, df_mk = df_mk)

    return(df_w, df_mor, df_mk)

In [5]:
def process_mor(mor, mor_index, clitic_type, is_separated_prefix, u_id, w_id, df_mor, df_mk):
    """Append the morphology rows for one mor (or mor-pre / mor-post) element.

    One row is appended to `df_mor` for every `mw` found (several when the
    `mw` elements sit inside an `mwc` compound); all rows produced by one call
    share `mor_index` as their id. The affixes of each `mw` are appended to
    `df_mk` through `process_mw`.

    Parameters
    ----------
    mor : the morphology element to read.
    mor_index : id stored on the rows and used as `mor_fk` for the affixes.
    clitic_type : stored as-is in the 'clitic_type' column.
    is_separated_prefix : stored as-is in the 'is_separated_prefix' column.
    u_id, w_id : foreign keys of the enclosing utterance and word.
    df_mor, df_mk : lists of row dicts that are appended to.

    Returns
    -------
    tuple (df_mor, df_mk) with the new rows appended.

    Raises
    ------
    AssertionError : if one of the elements checked below occurs more than once.
    AttributeError : if an `mw` has no `pos/c` or no `stem` child.
    """
    # expect 0 or 1 of each of these tags
    assert len(mor.findall('./mw', ns)) <= 1
    assert len(mor.findall('./mwc', ns)) <= 1
    assert len(mor.findall('./gra', ns)) <= 1
    assert len(mor.findall('./mw/pos', ns)) <= 1
    assert len(mor.findall('./mw/pos/c', ns)) <= 1
    assert len(mor.findall('./mw/stem', ns)) <= 1

    # mw is inside mwc for compounds
    # TODO: mwc has a pos and unbounded mpfx
    mwc = mor.find('./mwc', ns)
    mws = mor.findall('./mw', ns) if mwc is None else mwc.findall('./mw', ns)
    is_compound = mwc is not None

    # 0 or 1 or multiple menx (multiples separated by /)
    # findall always returns a list, so test for emptiness rather than None:
    # with no menx the value is None instead of an empty string
    menxs = mor.findall('./menx', ns)
    menx = "/".join([translation.text for translation in menxs]) if menxs else None

    # 0 or 1 gra
    gra = mor.findall('./gra[@type="gra"]', ns)
    assert len(gra) <= 1
    if len(gra):
        gra = gra[0]
        gra_index, gra_head, gra_relation = gra.get('index'), gra.get('head'), gra.get('relation')
    else:
        gra_index, gra_head, gra_relation = None, None, None

    for mw in mws:
        # part of speech: category followed by its subcategories, joined by ':'
        pos_c = mw.find('./pos/c', ns)
        pos_s = mw.findall('./pos/s', ns)
        pos = ":".join([pos_c.text] + [s.text for s in pos_s])
        mor_data = {
            'id': mor_index,
            'u_fk': u_id,
            'w_fk': w_id,
            'is_compound': is_compound,
            'clitic_type': clitic_type,
            'is_separated_prefix': is_separated_prefix,
            'pos': pos,
            'stem': mw.find('./stem', ns).text,
            'english': menx,
            'gra_index': gra_index,
            'gra_head': gra_head,
            'gra_relation': gra_relation
        }
        df_mor.append(mor_data)

        df_mk = process_mw(mw, mor_index, df_mk)

    return(df_mor, df_mk)

In [6]:
def process_mw(mw, mor_id, df_mk):
    """Append one affix row per prefix and per suffix of a morphological word.

    Prefixes (`mpfx` children) are recorded first with affix_type 'prefix',
    followed by suffixes (`mk` children), whose affix_type is taken from their
    'type' attribute. Each row's id is the length of `df_mk` at the moment the
    row is appended.

    Parameters
    ----------
    mw : the morphological word element.
    mor_id : stored as `mor_fk` on every row produced.
    df_mk : list of row dicts that is appended to.

    Returns
    -------
    the same `df_mk` list, with the new rows appended.
    """
    prefix_nodes = mw.findall('./mpfx', ns)
    suffix_nodes = mw.findall('./mk', ns)

    # (affix text, affix type) pairs: prefixes first, then suffixes
    affixes = [(node.text, 'prefix') for node in prefix_nodes]
    affixes += [(node.text, node.get('type')) for node in suffix_nodes]

    for affix_text, affix_kind in affixes:
        df_mk.append({
            'id': len(df_mk),
            'mor_fk': mor_id,
            'affix': affix_text,
            'affix_type': affix_kind
        })

    return df_mk

In [170]:
# transcript_file = "/Users/mikabr/childes/childes/Spanish/OreaPine/Juan/020317.xml"
# t_df_w, t_df_mor, t_df_mk = process_transcript(transcript_file, list(), list(), list())

In [7]:
def process_dir(corpus_dir):
    """Walk `corpus_dir` and write per-directory CSV tables of its transcripts.

    Each visited directory path is printed. For every directory that contains
    at least one '.xml' file, the files are processed in sorted order with
    `process_transcript` and the concatenated word, morphology and affix tables
    are written to 'w.csv', 'mor.csv' and 'mk.csv' inside that directory.

    Parameters
    ----------
    corpus_dir : root directory to walk.
    """
    for dirpath, dirnames, filenames in os.walk(corpus_dir):
        print(dirpath)
        # sorting in place makes os.walk descend into subdirectories in sorted order
        dirnames.sort()
        xml_paths = [os.path.join(dirpath, name)
                     for name in sorted(filenames)
                     if os.path.splitext(name)[1] == '.xml']
        if not len(xml_paths):
            continue

        word_frames, mor_frames, mk_frames = [], [], []
        for xml_path in xml_paths:
            words, mors, mks = process_transcript(xml_path, list(), list(), list())
            word_frames.append(words.copy())
            mor_frames.append(mors.copy())
            mk_frames.append(mks.copy())

        outputs = ((word_frames, "w.csv"), (mor_frames, "mor.csv"), (mk_frames, "mk.csv"))
        for frames, csv_name in outputs:
            pd.concat(frames).to_csv(os.path.join(dirpath, csv_name))

In [7]:
# Run the extraction over the corpus tree rooted at this path.
# NOTE(review): absolute, machine-specific path — consider reading the corpus
# root from a configuration constant so the notebook runs on other machines.
process_dir("/Users/mikabr/childes/phonbank")