In [1]:
import pandas as pd
import spacy
import numpy as np
import childespy
# Prerequisite: run `python -m spacy download en_core_web_lg` (and `python -m spacy download fr_core_news_lg` for the Lyon section) in the environment

# SpaCy test

In [31]:
# Load the English spaCy pipeline used for the test below and for the Providence section.
en_nlp = spacy.load("en_core_web_lg")

In [32]:
def spacy_extraction(obj, nlp):
    """Run a spaCy-style pipeline over one utterance and return one row per token.

    Parameters
    ----------
    obj : str or dict
        Either the raw utterance text, or an utterance record providing the
        keys 'gloss_with_punct' (the text to parse), 'id' and 'speaker_code'.
    nlp : callable
        Called as ``nlp(text)``. Iterating over its result must yield tokens
        exposing ``text``, ``lemma_``, ``pos_``, ``tag_``, ``dep_`` and ``morph``.

    Returns
    -------
    pandas.DataFrame
        Columns 'text', 'lemma', 'pos', 'tag', 'dependency', 'morph' (the
        ``token.morph`` value is stored as-is). For dict input the columns
        'utterance_id' and 'speaker_code' are appended, filled from the record.

    Raises
    ------
    TypeError
        If ``obj`` is neither a str nor a dict.
    """
    token_columns = ['text', 'lemma', 'pos', 'tag', 'dependency', 'morph']

    is_record = isinstance(obj, dict)
    if isinstance(obj, str):
        utt_str = obj
    elif is_record:
        utt_str = obj['gloss_with_punct']
    else:
        # Fail with a clear message instead of an UnboundLocalError further down.
        raise TypeError(
            'spacy_extraction expects a str or a dict, got ' + type(obj).__name__
        )

    doc = nlp(utt_str)
    rows = [{'text': token.text, 'lemma': token.lemma_, 'pos': token.pos_,
             'tag': token.tag_, 'dependency': token.dep_,
             'morph': token.morph} for token in doc]
    # Naming the columns keeps the schema stable even when the utterance has no tokens.
    rdf = pd.DataFrame(rows, columns=token_columns)

    if is_record:
        # Carry the utterance identity along so rows can be joined back later.
        rdf['utterance_id'] = obj['id']
        rdf['speaker_code'] = obj['speaker_code']
    return rdf

In [33]:
# Quick check of the extraction on a single raw string (the str input path).
spacy_extraction('what does the bus driver say?', en_nlp)

Unnamed: 0,text,lemma,pos,tag,dependency,morph
0,what,what,PRON,WP,dobj,()
1,does,do,AUX,VBZ,aux,"(Mood=Ind, Number=Sing, Person=3, Tense=Pres, ..."
2,the,the,DET,DT,det,"(Definite=Def, PronType=Art)"
3,bus,bus,NOUN,NN,compound,(Number=Sing)
4,driver,driver,NOUN,NN,nsubj,(Number=Sing)
5,say,say,VERB,VB,ROOT,(VerbForm=Inf)
6,?,?,PUNCT,.,punct,(PunctType=Peri)


# Run SpaCy on Providence Data

In [None]:
# Look up the id of the corpus named "Providence" (first matching row).
pvd_idx = (
    childespy
    .get_sql_query('select * from corpus where name = "Providence"')
    .iloc[0]['id']
)

In [None]:
# pvd_utts = childespy.get_sql_query('select * from utterance where corpus_id = '+str(pvd_idx) ,
#         db_version = "2020.1")

In [13]:
# Transcript whose utterances are analysed in this section.
selected_transcript_id = 42204

# Fetch every utterance row of that transcript, pinned to database version 2020.1.
pvd_utts = childespy.get_sql_query(
    'select * from utterance where transcript_id = ' + str(selected_transcript_id),
    db_version="2020.1",
)

R[write to console]: Using supported database version: '2020.1'.



In [18]:
# List the columns available on the utterance table.
pvd_utts.columns

Index(['id', 'gloss', 'stem', 'actual_phonology', 'model_phonology', 'type',
       'language', 'num_morphemes', 'num_tokens', 'utterance_order',
       'corpus_name', 'part_of_speech', 'speaker_code', 'speaker_name',
       'speaker_role', 'target_child_name', 'target_child_age',
       'target_child_sex', 'media_start', 'media_end', 'media_unit',
       'collection_name', 'collection_id', 'corpus_id', 'speaker_id',
       'target_child_id', 'transcript_id', 'punct', 'gloss_with_punct'],
      dtype='object')

In [14]:
def fix_gloss(gloss):
    """Return the gloss as a string with the '+' and '_' joiners removed.

    Compound markers are deleted rather than replaced by a space
    (black+bird -> blackbird). It might be better to split these glosses
    (black+bird -> black bird), but then we lose the alignment.

    A missing gloss (None or a float NaN) yields an empty string, so that the
    literal words 'None' / 'nan' never end up in the text handed to the parser.
    Any other non-string value is converted with str() first.
    """
    # NaN is the only float that is not equal to itself.
    if gloss is None or (isinstance(gloss, float) and gloss != gloss):
        return ''
    return str(gloss).replace('+', '').replace('_', '')
# Remove the '+' / '_' joiners from every gloss.
pvd_utts['gloss'] = [fix_gloss(x) for x in pvd_utts['gloss']]

# add back punctuation from the utterance type
punct_for_type = {
    'question':'?',
    'declarative':'.',
    'self interruption':'.',
    'interruption':'!',
    'trail off':'...',
    'interruption question':'?',
    'trail off question':'?',
    'imperative_emphatic':'!'
}
# Utterance types that are not in the mapping fall back to a full stop.
pvd_utts['punct'] = [punct_for_type.get(x, '.') for x in pvd_utts['type']]

# Keep only utterances that have a punctuation mark. With the '.' fallback above
# every row currently qualifies, so this is a safeguard. The explicit .copy()
# makes the result an independent frame, so the column assignment below does not
# write into a slice of the queried data.
pvd_utts = pvd_utts.loc[pvd_utts['punct'].notna()].copy()


# build a single form that is appropriate for running through the tokenizer
pvd_utts['gloss_with_punct'] = pvd_utts['gloss'] + pvd_utts['punct']
pvd_utts.shape

(722, 29)

In [15]:
# Display the prepared utterance table, including the new punct / gloss_with_punct columns.
pvd_utts

Unnamed: 0,id,gloss,stem,actual_phonology,model_phonology,type,language,num_morphemes,num_tokens,utterance_order,...,media_end,media_unit,collection_name,collection_id,corpus_id,speaker_id,target_child_id,transcript_id,punct,gloss_with_punct
1,16759250,where do you want me to go,where do you want me to go,,,question,eng,7,7,1,...,7.830,s,Eng-NA,21,328,22708,22704,42204,?,where do you want me to go?
2,16759261,anywhere you'll feel comfortable um anywhere,anywhere you feel comfort anywhere,,,declarative,eng,8,6,2,...,,,Eng-NA,21,328,22707,22704,42204,.,anywhere you'll feel comfortable um anywhere.
3,16759270,please don't do that,please do do that,,,declarative,eng,5,4,3,...,10.888,s,Eng-NA,21,328,22707,22704,42204,.,please don't do that.
4,16759279,this is,this be,,,self interruption,eng,3,2,4,...,15.525,s,Eng-NA,21,328,22707,22704,42204,.,this is.
5,16759300,go into the kitchen and check in about,go into the kitchen and check in about,,,self interruption,eng,8,8,5,...,,,Eng-NA,21,328,22708,22704,42204,.,go into the kitchen and check in about.
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
718,16769439,god bless you,god bless you,,,declarative,eng,3,3,718,...,3317.463,s,Eng-NA,21,328,22707,22704,42204,.,god bless you.
719,16769460,thank you,thank you,,,declarative,eng,2,2,719,...,3338.480,s,Eng-NA,21,328,22707,22704,42204,.,thank you.
720,16769486,thank you,thank you,,,declarative,eng,2,2,720,...,3347.488,s,Eng-NA,21,328,22707,22704,42204,.,thank you.
721,16769509,Alex where's your big truck,Alex where your big truck,,,question,eng,6,5,721,...,3358.255,s,Eng-NA,21,328,22707,22704,42204,?,Alex where's your big truck?


In [34]:
# Parse each utterance record and stack the per-utterance token tables into one frame.
spacy_examples = pd.concat(
    list(map(lambda utt: spacy_extraction(utt, en_nlp), pvd_utts.to_dict('records')))
)

In [28]:
# Write the token table to disk; index=False leaves the DataFrame index out of the file.
spacy_examples.to_csv('Providence_42204_spacy.csv', index=False)

# Run SpaCy on Lyon Data

In [36]:
# Load the French spaCy pipeline used for the Lyon section.
fr_nlp = spacy.load("fr_core_news_lg")

In [37]:
# Transcript whose utterances are analysed in this section.
selected_transcript_id = 44629

# Fetch every utterance row of that transcript, pinned to database version 2020.1.
lyon_utts = childespy.get_sql_query(
    'select * from utterance where transcript_id = ' + str(selected_transcript_id),
    db_version="2020.1",
)

R[write to console]: Using supported database version: '2020.1'.



In [38]:
# Remove the '+' / '_' joiners from every gloss.
lyon_utts['gloss'] = [fix_gloss(x) for x in lyon_utts['gloss']]

# add back punctuation from the utterance type
punct_for_type = {
    'question':'?',
    'declarative':'.',
    'self interruption':'.',
    'interruption':'!',
    'trail off':'...',
    'interruption question':'?',
    'trail off question':'?',
    'imperative_emphatic':'!'
}
# Utterance types that are not in the mapping fall back to a full stop.
lyon_utts['punct'] = [punct_for_type.get(x, '.') for x in lyon_utts['type']]

# Keep only utterances that have a punctuation mark. With the '.' fallback above
# every row currently qualifies, so this is a safeguard. The explicit .copy()
# makes the result an independent frame, so the column assignment below does not
# write into a slice of the queried data.
lyon_utts = lyon_utts.loc[lyon_utts['punct'].notna()].copy()


# build a single form that is appropriate for running through the tokenizer
lyon_utts['gloss_with_punct'] = lyon_utts['gloss'] + lyon_utts['punct']
lyon_utts.shape

(336, 29)

In [40]:
# Parse each utterance record with the French pipeline and stack the token tables.
lyon_examples = pd.concat(
    list(map(lambda utt: spacy_extraction(utt, fr_nlp), lyon_utts.to_dict('records')))
)

In [41]:
# Write the token table to disk; index=False leaves the DataFrame index out of the file.
lyon_examples.to_csv('Lyon_44629_spacy.csv', index=False)

# What Information is Present in SpaCy?

In [42]:
# Parse one sentence with the English pipeline so a single token can be inspected.
test_str = 'what do the bus drivers say?'
test_spacy = en_nlp(test_str)
# dir() lists every attribute name on the token at position 4
# (presumably 'drivers' -- confirm against the tokenisation).
dir(test_spacy[4])

['_',
 '__bytes__',
 '__class__',
 '__delattr__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__len__',
 '__lt__',
 '__ne__',
 '__new__',
 '__pyx_vtable__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__unicode__',
 'ancestors',
 'check_flag',
 'children',
 'cluster',
 'conjuncts',
 'dep',
 'dep_',
 'doc',
 'ent_id',
 'ent_id_',
 'ent_iob',
 'ent_iob_',
 'ent_kb_id',
 'ent_kb_id_',
 'ent_type',
 'ent_type_',
 'get_extension',
 'has_dep',
 'has_extension',
 'has_head',
 'has_morph',
 'has_vector',
 'head',
 'i',
 'idx',
 'iob_strings',
 'is_alpha',
 'is_ancestor',
 'is_ascii',
 'is_bracket',
 'is_currency',
 'is_digit',
 'is_left_punct',
 'is_lower',
 'is_oov',
 'is_punct',
 'is_quote',
 'is_right_punct',
 'is_sent_end',
 'is_sent_start',
 'is_space',
 'is_stop',
 'is_title',
 'is_upper',
 'lang',
 'lang_',
 'le

In [43]:
# Morphological features spaCy assigned to the token at position 4.
test_spacy[4].morph

Number=Plur

In [44]:
# Morphological features spaCy assigned to the token at position 5.
test_spacy[5].morph

Tense=Pres|VerbForm=Fin