In [1]:
import pandas as pd
import spacy
import numpy as np
import childespy
# Prerequisite: run `python -m spacy download en_core_web_lg` in this environment before loading the model.

# SpaCy test

In [5]:
# Load the "en_core_web_lg" spaCy pipeline once; the functions and cells below reuse
# this module-level `nlp` object. The model must already be downloaded (see the note
# in the imports cell).
nlp = spacy.load("en_core_web_lg")

In [31]:
def spacy_extraction(str):
    """Run the module-level spaCy pipeline on one string and tabulate its tokens.

    Parameters
    ----------
    str : the text to analyse. NOTE: the parameter name shadows the builtin
        ``str``; it is kept unchanged so existing keyword callers keep working,
        and the builtin is not used inside this function.

    Returns
    -------
    pandas.DataFrame with one row per token and the columns
    ``text``, ``lemma``, ``pos``, ``tag``, ``dependency`` and ``morph``
    (``morph`` holds the token's ``.morph`` object as returned by spaCy).
    """
    # Passing the column list explicitly keeps the schema stable: without it an
    # input that yields no tokens would produce a frame with no columns at all.
    columns = ['text', 'lemma', 'pos', 'tag', 'dependency', 'morph']
    doc = nlp(str)
    rows = [
        {
            'text': token.text,
            'lemma': token.lemma_,
            'pos': token.pos_,
            'tag': token.tag_,
            'dependency': token.dep_,
            'morph': token.morph,
        }
        for token in doc
    ]
    return pd.DataFrame(rows, columns=columns)

In [32]:
# Smoke test: tabulate the per-token annotations for a single sample question.
spacy_extraction('what does the bus driver say?')

Unnamed: 0,text,lemma,pos,tag,dependency,morph
0,what,what,PRON,WP,dobj,()
1,does,do,AUX,VBZ,aux,"(Mood=Ind, Number=Sing, Person=3, Tense=Pres, ..."
2,the,the,DET,DT,det,"(Definite=Def, PronType=Art)"
3,bus,bus,NOUN,NN,compound,(Number=Sing)
4,driver,driver,NOUN,NN,nsubj,(Number=Sing)
5,say,say,VERB,VB,ROOT,(VerbForm=Inf)
6,?,?,PUNCT,.,punct,(PunctType=Peri)


# Run SpaCy on Providence Data

In [9]:
# Look up the id of the Providence corpus (first matching row of the corpus table).
# The database version is pinned to the same release as the utterance query so the
# corpus id and the utterance rows always come from the same release, even after
# the package's default ("current") version moves on.
pvd_idx = childespy.get_sql_query('select * from corpus where name = "Providence"',
        db_version = "2020.1").iloc[0]['id']

R[write to console]: Using current database version: '2020.1'.



In [13]:
# Fetch every utterance row whose corpus_id matches the Providence corpus id,
# reading from database release 2020.1.
pvd_utts = childespy.get_sql_query(
    f'select * from utterance where corpus_id = {pvd_idx}',
    db_version="2020.1",
)

R[write to console]: Using supported database version: '2020.1'.



In [14]:
def fix_gloss(gloss):
    """Return ``gloss`` as a string with every '+' and '_' character deleted.

    The input is converted with ``str()`` first, so non-string values are
    accepted and come back as their string form.
    """
    # It might be better to split these glosses (black+bird -> black bird),
    # but then we lose the alignment.
    joiners_removed = str.maketrans('', '', '+_')
    return str(gloss).translate(joiners_removed)
# Normalise every gloss with fix_gloss (applied to each value, missing ones included).
pvd_utts['gloss'] = pvd_utts['gloss'].map(fix_gloss)

# add back punctuation from the utterance type
punct_for_type = {
    'question':'?',
    'declarative':'.',
    'self interruption':'.',
    'interruption':'!',
    'trail off':'...',
    'interruption question':'?',
    'trail off question':'?',
    'imperative_emphatic':'!' 
}
# Utterance types that are not in the table (or are missing) fall back to '.'.
pvd_utts['punct'] = pvd_utts['type'].map(punct_for_type).fillna('.')

# Keep only rows that have a punctuation mark. Because '.' is the fallback above,
# this currently keeps every row. The explicit .copy() makes the result an
# independent frame, so the column assignment below does not write into a slice
# (which raises SettingWithCopyWarning on pandas versions that emit it).
# NOTE(review): the previous comment here described adding a speaker code for a
# fine-tuned model; no speaker code is added anywhere in this cell.
pvd_utts = pvd_utts.loc[pvd_utts['punct'].notna()].copy()


# build a single form that is appropriate for running through the tokenizer
# (vectorised string concatenation instead of a Python loop over row dicts)
pvd_utts['gloss_with_punct'] = pvd_utts['gloss'] + pvd_utts['punct']
pvd_utts.shape

(460061, 29)

In [15]:
# Display the prepared utterance table, including the new punct and gloss_with_punct columns.
pvd_utts

Unnamed: 0,id,gloss,stem,actual_phonology,model_phonology,type,language,num_morphemes,num_tokens,utterance_order,...,media_end,media_unit,collection_name,collection_id,corpus_id,speaker_id,target_child_id,transcript_id,punct,gloss_with_punct
1,16759250,where do you want me to go,where do you want me to go,,,question,eng,7,7,1,...,7.830,s,Eng-NA,21,328,22708,22704,42204,?,where do you want me to go?
2,16759261,anywhere you'll feel comfortable um anywhere,anywhere you feel comfort anywhere,,,declarative,eng,8,6,2,...,,,Eng-NA,21,328,22707,22704,42204,.,anywhere you'll feel comfortable um anywhere.
3,16759270,please don't do that,please do do that,,,declarative,eng,5,4,3,...,10.888,s,Eng-NA,21,328,22707,22704,42204,.,please don't do that.
4,16759279,this is,this be,,,self interruption,eng,3,2,4,...,15.525,s,Eng-NA,21,328,22707,22704,42204,.,this is.
5,16759300,go into the kitchen and check in about,go into the kitchen and check in about,,,self interruption,eng,8,8,5,...,,,Eng-NA,21,328,22708,22704,42204,.,go into the kitchen and check in about.
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
460057,17280964,oh why lick hippo,oh why lick hippo,o waɪ liʔ ɪ,oʊ waɪ lɪk hɪ,self interruption,eng,4,4,756,...,2757.566,s,Eng-NA,21,328,22764,22764,42569,.,oh why lick hippo.
460058,17280992,hippo,hippo,hɪpo,hɪpoʊ,declarative,eng,1,1,757,...,,,Eng-NA,21,328,22764,22764,42569,.,hippo.
460059,17281011,xxx,,,,declarative,eng,-2147483648,1,758,...,,,Eng-NA,21,328,22769,22764,42569,.,xxx.
460060,17281030,xxx la la xxx,la la,* lɑ lɑ *,* lɑː lɑː *,declarative,eng,2,4,759,...,2787.225,s,Eng-NA,21,328,22764,22764,42569,.,xxx la la xxx.


In [33]:
# Annotate the first ten punctuated glosses and stack the per-utterance token tables.
# Each table keeps its own 0-based index, so index values repeat across utterances.
spacy_examples = pd.concat(
    list(map(spacy_extraction, pvd_utts['gloss_with_punct'].head(10)))
)

In [34]:
# Display the stacked token annotations for the sample utterances.
spacy_examples

Unnamed: 0,text,lemma,pos,tag,dependency,morph
0,where,where,ADV,WRB,advmod,()
1,do,do,AUX,VBP,aux,"(Mood=Ind, Tense=Pres, VerbForm=Fin)"
2,you,you,PRON,PRP,nsubj,"(Case=Nom, Person=2, PronType=Prs)"
3,want,want,VERB,VB,ROOT,(VerbForm=Inf)
4,me,I,PRON,PRP,nsubj,"(Case=Acc, Number=Sing, Person=1, PronType=Prs)"
...,...,...,...,...,...,...
4,he,he,PRON,PRP,nsubj,"(Case=Nom, Gender=Masc, Number=Sing, Person=3,..."
5,takes,take,VERB,VBZ,ccomp,"(Number=Sing, Person=Three, Tense=Pres, VerbFo..."
6,it,it,PRON,PRP,dobj,"(Case=Acc, Gender=Neut, Number=Sing, Person=3,..."
7,apart,apart,ADV,RB,advmod,()


# What Information is Present in SpaCy?

In [22]:
# Parse a sample question, then list every attribute and method available on one token.
test_str = 'what do the bus drivers say?'
test_spacy = nlp(test_str)
# Index 4 is presumably the token "drivers" — verify against the tokenizer output.
dir(test_spacy[4])

['_',
 '__bytes__',
 '__class__',
 '__delattr__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__len__',
 '__lt__',
 '__ne__',
 '__new__',
 '__pyx_vtable__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__unicode__',
 'ancestors',
 'check_flag',
 'children',
 'cluster',
 'conjuncts',
 'dep',
 'dep_',
 'doc',
 'ent_id',
 'ent_id_',
 'ent_iob',
 'ent_iob_',
 'ent_kb_id',
 'ent_kb_id_',
 'ent_type',
 'ent_type_',
 'get_extension',
 'has_dep',
 'has_extension',
 'has_head',
 'has_morph',
 'has_vector',
 'head',
 'i',
 'idx',
 'iob_strings',
 'is_alpha',
 'is_ancestor',
 'is_ascii',
 'is_bracket',
 'is_currency',
 'is_digit',
 'is_left_punct',
 'is_lower',
 'is_oov',
 'is_punct',
 'is_quote',
 'is_right_punct',
 'is_sent_end',
 'is_sent_start',
 'is_space',
 'is_stop',
 'is_title',
 'is_upper',
 'lang',
 'lang_',
 'le

In [29]:
# Morphological features of the token at index 4 (presumably "drivers" — verify).
test_spacy[4].morph

Number=Plur

In [30]:
# Morphological features of the token at index 5 (presumably "say" — verify).
test_spacy[5].morph

Tense=Pres|VerbForm=Fin