In [1]:
import spacy
import utils
import re

from collections import defaultdict, Counter
from minicons.utils import get_batch
from tqdm import tqdm

In [2]:
# spaCy setup: request GPU device index 2 (hard-coded here), then load the
# `en_core_web_trf` pipeline. `nlp` is the processor handed to `get_datives`
# and `get_postags` in the cells below.
# NOTE(review): `gpu` is not read again in the visible cells; presumably it
# records whether GPU activation succeeded -- confirm against the spaCy docs.
gpu = spacy.prefer_gpu(2)
nlp = spacy.load("en_core_web_trf")

In [3]:
# AO-CHILDES training split from the BabyLM 100M data directory.
# `corpus` is batched with `get_batch` and indexed by position (`corpus[i]`)
# further down, so it is presumably a list with one utterance string per
# entry -- TODO confirm against `utils.read_file`.
corpus = utils.read_file("../../smolm/data/corpora/babylm_data/babylm_100M/aochildes.train")

In [4]:
# Verb lemmas grouped by the dative frames they are expected to license.
# Entries are compared against spaCy lemmas, so every item must be a correctly
# spelled base form. Duplicates inside a list are harmless because the lists
# are merged through a set when `dative_verbs` is built.
# Spelling fixes relative to the earlier version: hoise -> hoist,
# esteeem -> esteem, recon -> reckon, chrip -> chirp.

# Verbs that alternate between the double-object and the to-PP frame.
alternating_verbs = '''feed,give,lease,lend,loan,pass,pay,peddle,refund,render,rent,repay,sell,serve,trade,advance,allocate,allot,assign,award,bequeath,cede,concede,extend,grant,guarantee,issue,leave,offer,owe,promise,vote,will,yield,bring,take,forward,hand,mail,post,send,ship,slip,smuggle,sneak,bounce,float,roll,slide,carry,drag,haul,heave,heft,hoist,kick,lug,pull,push,schlep,shove,tote,tow,tug,barge,bus,cart,drive,ferry,fly,row,shuttle,truck,wheel,wire,bash,bat,bunt,catapult,chuck,flick,fling,flip,hit,hurl,kick,lob,pass,pitch,punt,shoot,shove,slam,slap,sling,throw,tip,toss,ask,cite,pose,preach,quote,read,relay,show,teach,tell,write,cable,email,e-mail,fax,modem,netmail,phone,radio,relay,satellite,semaphore,sign,signal,telephone,telecast,telegraph,telex,wire,wireless'''.split(",")

# Verbs expected only in the double-object frame.
do_only_verbs = '''accord,ask,bear,begrudge,bode,cost,deny,envy,flash,forbid,forgive,guarantee,issue,refuse,save,spare,strike,vouchsafe,wish,write,bet,bill,charge,fine,mulct,overcharge,save,spare,tax,tip,undercharge,wager,acknowledge,adopt,appoint,consider,crown,deem,designate,elect,esteem,imagine,mark,nominate,ordain,proclaim,rate,reckon,report,want,anoint,baptize,brand,call,christen,consecrate,crown,decree,dub,label,make,name,nickname,pronounce,rule,stamp,style,term,vote,adjudge,adjudicate,assume,avow,believe,confess,declare,fancy,find,judge,presume,profess,prove,suppose,think,warrant'''.split(",")

# Verbs expected only in the prepositional (to-PP) frame.
pp_only_verbs = '''address,administer,broadcast,convey,contribute,delegate,deliver,denounce,demonstrate,describe,dictate,dispatch,display,distribute,donate,elucidate,exhibit,express,explain,explicate,forfeit,illustrate,introduce,narrate,portray,proffer,recite,recommend,refer,reimburse,remit,restore,return,sacrifice,submit,surrender,transfer,transport,admit,allege,announce,articulate,assert,communicate,confess,convey,declare,mention,propose,recount,repeat,report,reveal,say,state,babble,bark,bawl,bellow,bleat,boom,bray,burble,cackle,call,carol,chant,chatter,chirp,cluck,coo,croak,croon,crow,cry,drawl,drone,gabble,gibber,groan,growl,grumble,grunt,hiss,holler,hoot,howl,jabber,lilt,lisp,moan,mumble,murmur,mutter,purr,rage,rasp,roar,rumble,scream,screech,shout,shriek,sing,snap,snarl,snuffle,splutter,squall,squawk,squeak,squeal,stammer,stutter,thunder,tisk,trill,trumpet,twitter,wail,warble,wheeze,whimper,whine,whisper,whistle,whoop,yammer,yap,yell,yelp,yodel,drop,hoist,lift,lower,raise,credit,entrust,furnish,issue,leave,present,provide,serve,supply,trust'''.split(",")

In [5]:
# Alphabetically sorted, de-duplicated union of the three verb classes.
dative_verbs = sorted(set(alternating_verbs) | set(do_only_verbs) | set(pp_only_verbs))

In [6]:
def get_postags(texts, batch_size, processor):
    """Tag every text and return its tokens paired with their `tag_` values.

    Args:
        texts: iterable of strings handed to `processor.pipe`.
        batch_size: forwarded to `processor.pipe` as `batch_size`.
        processor: object exposing a spaCy-style `pipe(texts, disable=..., batch_size=...)`.

    Returns:
        A list with one entry per processed document; each entry is a list of
        `(token.text, token.tag_)` tuples in token order (empty for an empty doc).
    """
    tagged_docs = processor.pipe(
        texts,
        disable=["tok2vec", "parser", "attribute_ruler", "lemmatizer", "ner"],
        batch_size=batch_size,
    )
    # tqdm only wraps the stream for progress reporting.
    return [[(tok.text, tok.tag_) for tok in doc] for doc in tqdm(tagged_docs)]

# Depth-first (pre-order) flattening of everything below a token in the parse.
def get_children_flatten(token, depth=0, dep=False, return_tokens=False):
    """Return all descendants of `token`, each child followed by its own subtree.

    Args:
        token: object with a `children` iterable whose items expose `text`,
            `dep_`, `tag_`, `i` and `children` (e.g. a spaCy token).
        depth: depth value recorded for the direct children; grandchildren get
            `depth + 1`, and so on.
        dep: if False, each descendant is reported as its lower-cased text only.
            If True, as `(lower-cased text, dep_, tag_, depth, i)`.
        return_tokens: only honoured when `dep` is True; appends the child
            object itself as a sixth tuple element.

    Returns:
        A flat list in pre-order (a child is immediately followed by its descendants).
    """
    flattened = []
    for kid in token.children:
        if not dep:
            entry = kid.text.lower()
        else:
            entry = (kid.text.lower(), kid.dep_, kid.tag_, depth, kid.i)
            if return_tokens:
                entry = entry + (kid,)
        flattened.append(entry)
        flattened += get_children_flatten(kid, depth + 1, dep, return_tokens)
    return flattened

def _find_to_object(children_obj, top_depth):
    """Return the `pobj` entry governed by a top-level "to", or None.

    Relies on `children_obj` being in pre-order (each entry immediately
    followed by its own descendants), which is how `get_children_flatten`
    builds its list.
    """
    under_to = False
    for child in children_obj:
        if child[3] == top_depth:
            # A new direct dependent of the verb starts here.
            under_to = child[0] == "to" and child[1] in ("dative", "prep")
        elif under_to and child[3] == top_depth + 1 and child[1] == "pobj":
            return child
    return None


def collect_args(children_obj, hyp = 'do'):
    """Pick the theme and recipient of a dative verb from its flattened dependents.

    Args:
        children_obj: sequence of tuples shaped like the output of
            `get_children_flatten(..., dep=True)`, i.e.
            `(text, dep, tag, depth, index)`, optionally followed by the token
            itself. Text, tag, depth and index are read from fixed slots 0, 2,
            3 and 4, so both the 5- and the 6-element form work.
        hyp: 'do' for a double-object analysis, 'pp' for a prepositional one.
            Any other value returns the empty result.

    Returns:
        dict with keys 'theme', 'recipient', 'theme_pos', 'recipient_pos'.
        Values stay '' when two arguments cannot be identified (previously this
        raised IndexError).

    Role assignment:
        * 'do': among `dobj`/`dative` dependents, prefer the ones attached
          directly to the verb (shallowest depth); the earlier one in the
          sentence is the recipient, the next one the theme. Nested objects
          (e.g. inside a relative clause) are only used if fewer than two
          direct ones exist.
        * 'pp': the theme is the first direct `dobj`; the recipient is the
          `pobj` of a direct "to". This keeps fronted recipients
          ("who are you reading them to ?") from being mistaken for the theme.
          If that structure cannot be found, fall back to the positional
          heuristic (first candidate = theme, second = recipient).
    """
    TEXT, DEP, TAG, DEPTH, INDEX = 0, 1, 2, 3, 4
    args = {'theme': '', 'recipient': '', 'theme_pos': '', 'recipient_pos': ''}

    if hyp == "do":
        wanted = ("dobj", "dative")
    elif hyp == "pp":
        wanted = ("pobj", "dobj")
    else:
        return args

    children_obj = list(children_obj)
    # Depth of the verb's direct dependents (0 when produced by get_datives).
    top_depth = min((child[DEPTH] for child in children_obj), default=0)

    # sort by index
    candidates = sorted(
        (child for child in children_obj if child[DEP] in wanted),
        key=lambda child: child[INDEX],
    )

    theme = recipient = None
    if hyp == "do":
        direct = [child for child in candidates if child[DEPTH] == top_depth]
        pool = direct if len(direct) >= 2 else candidates
        if len(pool) >= 2:
            recipient, theme = pool[0], pool[1]
    else:
        theme = next(
            (child for child in candidates
             if child[DEP] == "dobj" and child[DEPTH] == top_depth),
            None,
        )
        recipient = _find_to_object(children_obj, top_depth)
        if theme is None or recipient is None:
            # Structure not recognised: keep the original positional behaviour.
            if len(candidates) >= 2:
                theme, recipient = candidates[0], candidates[1]
            else:
                theme = recipient = None

    if theme is not None and recipient is not None:
        args['theme'] = theme[TEXT]
        args['theme_pos'] = theme[TAG]
        args['recipient'] = recipient[TEXT]
        args['recipient_pos'] = recipient[TAG]

    return args

def get_datives(texts, batch_size, processor):
    """Parse `texts` and collect double-object (DO) and to-PP dative candidates.

    For each parsed document the tokens are scanned in order; the first token
    with `pos_ == "VERB"` whose flattened dependents satisfy one of the two
    patterns below is recorded, and the rest of that document is skipped
    (at most one record per document).

    * PP pattern (only tried when some dependent's text is "to"): a direct
      `dobj`, a direct `dative` or `prep`, a `pobj` one level down, and a "to"
      labelled `dative` or `prep` somewhere among the dependents.
    * DO pattern (only tried when no dependent's text is "to"): a direct `dobj`
      together with a direct `dative`, or at least two direct `dobj`s, and no
      "for" labelled `dative` or `dobj`.

    Args:
        texts: iterable of strings handed to `processor.pipe`.
        batch_size: forwarded to `processor.pipe`.
        processor: spaCy-style pipeline; called with `disable=["ner"]`.

    Returns:
        `(dos, pps)`: two lists of
        `(doc.text, verb.lemma_, verb.text, verb.tag_, children)` tuples, where
        `children` is the `get_children_flatten(verb, 0, dep=True)` list.
    """
    dos, pps = [], []
    parsed = processor.pipe(texts, disable=["ner"], batch_size=batch_size)
    for doc in tqdm(parsed):
        for entity in doc:
            if entity.pos_ != "VERB":
                continue
            children = get_children_flatten(entity, 0, dep=True)
            if not children:
                continue

            tokens, dep, _tags, depth, _positions = zip(*children)
            # "<dep>_<depth>" and "<text>_<dep>" keys make the pattern checks
            # simple membership tests.
            dep_depth = [f"{label}_{level}" for label, level in zip(dep, depth)]
            tok_dep = [f"{word}_{label}" for word, label in zip(tokens, dep)]
            direct_dobj = "dobj_0" in dep_depth

            if "to" in tokens:
                # possibility for pp
                direct_marker = "dative_0" in dep_depth or "prep_0" in dep_depth
                if not (direct_dobj and direct_marker and "pobj_1" in dep_depth):
                    continue
                if "to_dative" not in tok_dep and "to_prep" not in tok_dep:
                    continue
                pps.append((doc.text, entity.lemma_, entity.text, entity.tag_, children))
                break

            # possibility for DO
            dative_plus_dobj = direct_dobj and "dative_0" in dep_depth
            if not (dative_plus_dobj or dep_depth.count("dobj_0") >= 2):
                continue
            if "for_dative" in tok_dep or "for_dobj" in tok_dep:
                continue
            dos.append((doc.text, entity.lemma_, entity.text, entity.tag_, children))
            break

    return dos, pps

In [7]:
# Hand-written sentences used to smoke-test `get_datives`.
# Double-object frame: verb + recipient + theme.
do_examples = [
    "she gave me the ball.",
    "she gave mom the ball.",
    "she sent grandma the big book.",
    "she is tossing the dog the big ball.",
    "she had given me the ball.",
    "she has given mommy the ball.",
]

# Prepositional frame: verb + theme + "to" + recipient.
# NOTE(review): the last sentence contains "to" only as an infinitive marker
# and has an "on" PP; presumably it is a negative control -- confirm.
pp_examples = [
    "she gave the ball to me.",
    "she gave the ball to mom.",
    "she sent the big book to grandma.",
    "she is tossing the big ball to the dog.",
    "she had given the ball to me.",
    "she gave it to me.",
    "she had given it to mommy yesterday morning.",
    "slap the cheese on it so the cheese starts to melt ."
]

test_corpus = do_examples + pp_examples

# Run the extractor batch by batch and pool the per-batch results.
# `dos` / `pps` are notebook-level names and are overwritten on every batch.
DOS_test, PPS_test = [], []
for batch in get_batch(test_corpus, batch_size=6):
    dos, pps = get_datives(batch, 6, nlp)
    DOS_test.extend(dos)
    PPS_test.extend(pps)

6it [00:01,  4.07it/s]
6it [00:00, 347.45it/s]
2it [00:00, 167.22it/s]


In [8]:
# Sanity check: how many test sentences were classified as prepositional
# datives. Evaluate `DOS_test, PPS_test` instead to inspect the records.
len(PPS_test)

7

In [9]:
# Run the dative extractor over the whole corpus in chunks of 8192 utterances;
# the same number is passed on as the batch size for `processor.pipe`.
# `dos` / `pps` hold only the current chunk's results; DOS / PPS accumulate
# the (sentence, lemma, verb, verb tag, children) records across chunks.
DOS, PPS = [], []
for batch in get_batch(corpus, batch_size=8192):
    dos, pps = get_datives(batch, 8192, nlp)
    DOS.extend(dos)
    PPS.extend(pps)

0it [00:00, ?it/s]

8192it [00:06, 1307.49it/s]
8192it [00:05, 1374.28it/s]
8192it [00:05, 1423.86it/s]
8192it [00:05, 1372.44it/s]
8192it [00:04, 1641.78it/s]
8192it [00:05, 1373.45it/s]
8192it [00:05, 1575.72it/s]
8192it [00:05, 1608.96it/s]
8192it [00:05, 1577.71it/s]
8192it [00:05, 1512.57it/s]
8192it [00:05, 1488.35it/s]
8192it [00:05, 1427.51it/s]
8192it [00:05, 1513.67it/s]
8192it [00:05, 1432.42it/s]
8192it [00:05, 1448.38it/s]
8192it [00:05, 1589.01it/s]
8192it [00:05, 1485.13it/s]
8192it [00:05, 1456.86it/s]
8192it [00:05, 1425.13it/s]
8192it [00:05, 1495.01it/s]
8192it [00:05, 1488.73it/s]
8192it [00:05, 1546.36it/s]
8192it [00:05, 1615.41it/s]
8192it [00:05, 1518.49it/s]
8192it [00:05, 1587.99it/s]
8192it [00:05, 1432.78it/s]
8192it [00:05, 1607.56it/s]
8192it [00:05, 1501.56it/s]
8192it [00:05, 1511.95it/s]
8192it [00:05, 1458.69it/s]
8192it [00:05, 1455.94it/s]
8192it [00:05, 1511.99it/s]
8192it [00:05, 1482.04it/s]
8192it [00:05, 1489.87it/s]
8192it [00:05, 1436.98it/s]
8192it [00:05, 1508.

In [10]:
# Number of raw double-object / prepositional candidates before lemma filtering.
len(DOS), len(PPS)

(8172, 4102)

In [11]:
DOS_FILTERED, PPS_FILTERED = [], []

# Kept because a later inspection cell displays PPS_DISCARDED. Nothing is
# appended to these lists any more: the stricter filter on argument POS tags
# (theme in NN/NNS/NNP/PRP/DT, recipient in NN/NNS/NNP/PRP) that used to route
# records here is disabled.
DOS_DISCARDED, PPS_DISCARDED = [], []


def _keep_known_dative_verbs(records, hyp):
    """Keep records whose verb lemma is in `dative_verbs` and attach their arguments.

    Args:
        records: iterable of `(sentence, lemma, verb, verb_pos, children)` tuples
            as produced by `get_datives`.
        hyp: frame passed to `collect_args` ('do' or 'pp').

    Returns:
        List of `(sentence, lemma, verb, verb_pos, theme, recipient, theme_pos,
        recipient_pos)` tuples, in the input order.
    """
    known_lemmas = set(dative_verbs)  # constant-time membership tests
    kept = []
    for sentence, lemma, verb, verb_pos, children in records:
        if lemma not in known_lemmas:
            continue
        args = collect_args(children, hyp)
        kept.append((sentence, lemma, verb, verb_pos, args['theme'], args['recipient'], args['theme_pos'], args['recipient_pos']))
    return kept


DOS_FILTERED.extend(_keep_known_dative_verbs(DOS, "do"))
PPS_FILTERED.extend(_keep_known_dative_verbs(PPS, "pp"))

In [12]:
# Number of candidates left after keeping only verbs listed in `dative_verbs`.
len(DOS_FILTERED), len(PPS_FILTERED)

(6181, 3075)

In [13]:
# write both to csv in data/
import csv

# Column names match the tuple layout of DOS_FILTERED / PPS_FILTERED.
_DATIVE_CSV_HEADER = ["sentence", "lemma", "verb", "verb_pos", "theme", "recipient", "theme_pos", "recipient_pos"]

for _out_path, _rows in (
    ("../data/aochildes_dos.csv", DOS_FILTERED),
    ("../data/aochildes_pps.csv", PPS_FILTERED),
):
    # newline="" lets the csv module control line endings (avoids blank rows
    # on platforms that translate "\n"); an explicit encoding keeps the files
    # identical across machines.
    with open(_out_path, "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(_DATIVE_CSV_HEADER)
        writer.writerows(_rows)

In [31]:
# Inspect prepositional candidates rejected by the argument-POS filter.
# NOTE(review): in the filtering cell as it stands, every statement that
# appends to PPS_DISCARDED is commented out, so a fresh run displays an empty
# list here; the saved output presumably comes from an earlier version.
PPS_DISCARDED

[('who are you reading them to ?',
  'read',
  'reading',
  'who',
  'them',
  'WP',
  'PRP'),
 ("are you going to tell her about all you've done today .",
  'tell',
  'tell',
  'her',
  'all',
  'PRP',
  'DT'),
 ('okay try not to make a mess with this .',
  'make',
  'make',
  'mess',
  'this',
  'NN',
  'DT'),
 ('is that what they said to you ?',
  'say',
  'said',
  'what',
  'you',
  'WP',
  'PRP'),
 ('thomas is coming to say hello to us .',
  'say',
  'say',
  'hello',
  'us',
  'UH',
  'PRP'),
 ('so that brings it down to four .',
  'bring',
  'brings',
  'it',
  'four',
  'PRP',
  'CD'),
 ('mm what do you say to grandma ?',
  'say',
  'say',
  'what',
  'grandma',
  'WP',
  'NN'),
 ('oh thank you are you giving that to mommy ?',
  'give',
  'giving',
  'you',
  'that',
  'PRP',
  'DT'),
 ('what are you saying to me my friend ?',
  'say',
  'saying',
  'what',
  'me',
  'WP',
  'PRP'),
 ('you want to give some of those to your papa ?',
  'give',
  'give',
  'some',
  'those',
  '

In [33]:
# Each DOS_FILTERED row has eight fields (sentence, lemma, verb, verb_pos,
# theme, recipient, theme_pos, recipient_pos); only the first three are needed
# here. The starred target absorbs however many columns follow, so this no
# longer fails with "too many values to unpack".
dos_sents, dos_lemmas, dos_verbs, *_ = zip(*DOS_FILTERED)
# Number of distinct verb lemmas seen in the double-object frame.
len(Counter(dos_lemmas))

48

In [34]:
# Each PPS_FILTERED row has eight fields (sentence, lemma, verb, verb_pos,
# theme, recipient, theme_pos, recipient_pos); only the first three are needed
# here. The starred target absorbs however many columns follow, so this no
# longer fails with "too many values to unpack".
pps_sents, pps_lemmas, pps_verbs, *_ = zip(*PPS_FILTERED)
# Number of distinct verb lemmas seen in the prepositional frame.
len(Counter(pps_lemmas))

69

In [37]:
# Split the observed lemmas by the frame(s) in which they were found.
_dos_lemma_set, _pps_lemma_set = set(dos_lemmas), set(pps_lemmas)

alternating = _dos_lemma_set & _pps_lemma_set   # seen in both frames
do_only = _dos_lemma_set - _pps_lemma_set       # double-object frame only
pp_only = _pps_lemma_set - _dos_lemma_set       # prepositional frame only

len(alternating), len(do_only), len(pp_only)

(38, 10, 31)

In [None]:
# Linearise the corpus: one space-separated string of tags per utterance, in
# corpus order (an utterance without tokens becomes the empty string).
postags = []
for batch in get_batch(corpus, batch_size=256):
    tagged_batch = get_postags(batch, 256, nlp)
    postags.extend(
        " ".join(tag for _word, tag in tagged_utterance)
        for tagged_utterance in tagged_batch
    )

In [80]:
# Debugging aid: for every verb in the double-object examples, print the
# flattened dependents as (text, dep, tag, depth, index, token) tuples, with a
# blank line after each sentence.
for sentence in do_examples:
    doc = nlp(sentence)
    for entity in doc:
        if entity.pos_ == "VERB":
            # Earlier exploration, kept for reference: inspect only the direct
            # children and their coarse POS.
            # print(entity.text, [(c, c.dep_, c.head) for c in entity.children])
            # for child in entity.children:
            #     if child.dep_ == "dative" and child.pos_ in ["NOUN", "PRON", "PROPN"]\
            #     or child.dep_ == "dobj" and child.pos_ in ["NOUN", "PRON", "PROPN"]:
            #         print(entity.text, child.text, child.pos_)

            # print("")
            children = get_children_flatten(entity, dep=True, return_tokens=True)
            print(children)
    print("")

[('she', 'nsubj', 'PRP', 0, 0, she), ('me', 'dative', 'PRP', 0, 2, me), ('ball', 'dobj', 'NN', 0, 4, ball), ('the', 'det', 'DT', 1, 3, the), ('.', 'punct', '.', 0, 5, .)]

[('she', 'nsubj', 'PRP', 0, 0, she), ('mom', 'dative', 'NN', 0, 2, mom), ('ball', 'dobj', 'NN', 0, 4, ball), ('the', 'det', 'DT', 1, 3, the), ('.', 'punct', '.', 0, 5, .)]

[('she', 'nsubj', 'PRP', 0, 0, she), ('grandma', 'dative', 'NN', 0, 2, grandma), ('book', 'dobj', 'NN', 0, 5, book), ('the', 'det', 'DT', 1, 3, the), ('big', 'amod', 'JJ', 1, 4, big), ('.', 'punct', '.', 0, 6, .)]

[('she', 'nsubj', 'PRP', 0, 0, she), ('is', 'aux', 'VBZ', 0, 1, is), ('dog', 'dative', 'NN', 0, 4, dog), ('the', 'det', 'DT', 1, 3, the), ('ball', 'dobj', 'NN', 0, 7, ball), ('the', 'det', 'DT', 1, 5, the), ('big', 'amod', 'JJ', 1, 6, big), ('.', 'punct', '.', 0, 8, .)]

[('she', 'nsubj', 'PRP', 0, 0, she), ('had', 'aux', 'VBD', 0, 1, had), ('me', 'dative', 'PRP', 0, 3, me), ('ball', 'dobj', 'NN', 0, 5, ball), ('the', 'det', 'DT', 1, 4,

In [88]:
# Prototype of the DO / PP classification on the hand-written examples.
# Each sentence lands in `dos` or `pps` (DO wins if both flags were set).
# Differences from `get_datives`, visible in the code below:
#   * the PP test only checks that the labels occur somewhere among the
#     dependents; it ignores depth and which token carries the label;
#   * there is no "for" exclusion in the DO test;
#   * there is no guard for a verb without dependents, so the tuple unpacking
#     of `zip(*children)` would fail for such a verb.
# Note that this cell overwrites the notebook-level names `dos` and `pps`.
dos = []
pps = []

for sentence in do_examples + pp_examples:
    doc = nlp(sentence)
    do = False
    pp = False
    for entity in doc:
        if entity.pos_ == "VERB":
            # Earlier exploration, kept for reference:
            # print(entity.text, [(c, c.dep_, c.head) for c in entity.children])
            # for child in entity.children:
            #     if (child.dep_ == "dative" or child.dep_ == "prep") and child.text_ == "to":
            #         pobj = ""
            #         for grandchild in child.children:
            #             if grandchild.dep_ == "pobj":
            #                 pobj = grandchild.text_
            #         print(entity.text, child.text, child.pos_)

            children = get_children_flatten(entity, 0, dep=True)
            # print(children)
            tokens, dep, pos_string, depth, index = list(zip(*children))
            if "to" in tokens:
                if ("dative" in dep and "pobj" in dep) or ("prep" in dep and "pobj" in dep):
                    pp = True
                    # pps.append(sentence)
                    break
                # possibility for PP
            else:
                # possibility for DO
                # concatenation of dep and depth
                dep_depth = [f"{d}_{str(depth[i])}" for i, d in enumerate(dep)]
                if ("dobj_0" in dep_depth and "dative_0" in dep_depth) or Counter(dep_depth)['dobj_0'] >= 2:
                    do = True
                    # dos.append(sentence)
                    break
    # Record the sentence once, after all of its verbs have been examined.
    if do:
        dos.append(sentence)
    elif pp:
        pps.append(sentence)
            # for child in children:

[('she', 'nsubj', 'PRP', 0, 0), ('me', 'dative', 'PRP', 0, 2), ('ball', 'dobj', 'NN', 0, 4), ('the', 'det', 'DT', 1, 3), ('.', 'punct', '.', 0, 5)]
[('she', 'nsubj', 'PRP', 0, 0), ('mom', 'dative', 'NN', 0, 2), ('ball', 'dobj', 'NN', 0, 4), ('the', 'det', 'DT', 1, 3), ('.', 'punct', '.', 0, 5)]
[('she', 'nsubj', 'PRP', 0, 0), ('grandma', 'dative', 'NN', 0, 2), ('book', 'dobj', 'NN', 0, 5), ('the', 'det', 'DT', 1, 3), ('big', 'amod', 'JJ', 1, 4), ('.', 'punct', '.', 0, 6)]
[('she', 'nsubj', 'PRP', 0, 0), ('is', 'aux', 'VBZ', 0, 1), ('dog', 'dative', 'NN', 0, 4), ('the', 'det', 'DT', 1, 3), ('ball', 'dobj', 'NN', 0, 7), ('the', 'det', 'DT', 1, 5), ('big', 'amod', 'JJ', 1, 6), ('.', 'punct', '.', 0, 8)]
[('she', 'nsubj', 'PRP', 0, 0), ('had', 'aux', 'VBD', 0, 1), ('me', 'dative', 'PRP', 0, 3), ('ball', 'dobj', 'NN', 0, 5), ('the', 'det', 'DT', 1, 4), ('.', 'punct', '.', 0, 6)]
[('she', 'nsubj', 'PRP', 0, 0), ('has', 'aux', 'VBZ', 0, 1), ('mommy', 'dative', 'NN', 0, 3), ('ball', 'dobj', 'N

In [89]:
# Show which example sentences the prototype assigned to each frame.
dos, pps

(['she gave me the ball.',
  'she gave mom the ball.',
  'she sent grandma the big book.',
  'she is tossing the dog the big ball.',
  'she had given me the ball.',
  'she has given mommy the ball'],
 ['she gave the ball to me.',
  'she gave the ball to mom.',
  'she sent the big book to grandma.',
  'she is tossing the big ball to the dog.',
  'she had given the ball to me.'])

In [79]:
# Minimal example pairs used to derive the tag patterns for the regex approach.
do_examples = [
    "she gave me the ball.",
    "she gave mom the ball.",
    "she sent grandma the big book.",
    "she is tossing the dog the big ball.",
    "she had given me the ball.",
    "she has given mommy the ball"
]

pp_examples = [
    "she gave the ball to me.",
    "she gave the ball to mom.",
    "she sent the big book to grandma.",
    "she is tossing the big ball to the dog.",
    "she had given the ball to me.",
]

# One space-separated tag string per example (empty string for an example
# without tokens).
do_tags = get_postags(do_examples, 4, nlp)
do_postags = [" ".join(tag for _word, tag in tagged) for tagged in do_tags]

pp_tags = get_postags(pp_examples, 4, nlp)
pp_postags = [" ".join(tag for _word, tag in tagged) for tagged in pp_tags]


6it [00:00, 273.99it/s]
5it [00:00, 275.22it/s]


In [39]:
# Print the linearised tag string of every double-object example, one per
# line, to eyeball the patterns the regexes need to cover.
for dp in do_postags:
    print(dp)

PRP VBD PRP DT NN .
PRP VBD NN DT NN .
PRP VBD NN DT JJ NN .
PRP VBZ VBG DT NN DT JJ NN .
PRP VBD VBN PRP DT NN .


In [42]:
# Tag-sequence heuristics applied to the space-separated tag strings.
# Both patterns: a past / progressive / perfect verb group, then a noun phrase
# (optional "DT" or "DT JJ" followed by a pronoun or noun tag).
#   do_regex: ... immediately followed by a second noun phrase.
#   pp_regex: ... followed by "IN" and a second noun phrase.
# Caveats:
#   * "IN" is matched whatever the preposition is, so pp_regex is not
#     restricted to "to".
#   * The patterns are not anchored on tag boundaries, so the final noun
#     alternative may stop at a prefix of a longer tag. The saved results
#     include "VBN PRP IN PRP" matched where the tags read "PRP$ NN", and
#     "DT NN" matched where they read "DT NNS".
do_regex = re.compile(r"(VBD|VBZ VBG|VBD VBN|VBN) (DT\s|DT JJ\s)?(PRP|NN|NNP|NNPS|NNS) (DT\s|DT JJ\s)?(PRP|NN|NNP|NNPS|NNS)")

pp_regex = re.compile(r"(VBD|VBZ VBG|VBD VBN|VBN) (DT\s|DT JJ\s)?(PRP|NN|NNP|NNPS|NNS) IN (DT\s|DT JJ\s)?(PRP|NN|NNP|NNPS|NNS)")

In [54]:
# Scan every linearised tag string with both patterns. A hit is stored as
# (utterance, tag string, match object); one utterance may land in both lists.
# `postags` and `corpus` are assumed to be aligned by position.
childes_do, childes_pp = [], []

for idx, tag_string in enumerate(tqdm(postags)):
    for pattern, bucket in ((do_regex, childes_do), (pp_regex, childes_pp)):
        hit = pattern.search(tag_string)
        if hit:
            bucket.append((corpus[idx], tag_string, hit))

  0%|          | 0/763989 [00:00<?, ?it/s]

100%|██████████| 763989/763989 [00:00<00:00, 1600973.91it/s]


In [55]:
# Number of utterances matched by the DO and PP tag patterns respectively.
len(childes_do), len(childes_pp)

(4738, 4050)

In [56]:
# Preview only the first few PP-pattern hits; the full list holds thousands of
# entries and would flood the notebook output.
childes_pp[:10]

[('it was a piece of a hanger .',
  'PRP VBD DT NN IN DT NN .',
  <re.Match object; span=(4, 22), match='VBD DT NN IN DT NN'>),
 ("you've got it in your mouth like that .",
  'PRP VBP VBN PRP IN PRP$ NN IN DT .',
  <re.Match object; span=(8, 22), match='VBN PRP IN PRP'>),
 ('that was grandma on the phone !',
  'DT VBD NN IN DT NN .',
  <re.Match object; span=(3, 18), match='VBD NN IN DT NN'>),
 ('have you about had it with the rings .',
  'VBP PRP RB VBN PRP IN DT NNS .',
  <re.Match object; span=(11, 27), match='VBN PRP IN DT NN'>),
 ('there you got him by the nose .',
  'RB PRP VBD PRP IN DT NN .',
  <re.Match object; span=(7, 23), match='VBD PRP IN DT NN'>),
 ('i put some chocolate on you .',
  'PRP VBD DT NN IN PRP .',
  <re.Match object; span=(4, 20), match='VBD DT NN IN PRP'>),
 ('you bumped yourself in the mouth .',
  'PRP VBD PRP IN DT NN .',
  <re.Match object; span=(4, 20), match='VBD PRP IN DT NN'>),
 ('i heard catherine say they put the tiger in the zoo !',
  'PRP VBD NNP V