In [1]:
import json
import os
import random
import re
import spacy

import numpy as np

from collections import defaultdict, Counter
from nltk.tree import *
from ordered_set import OrderedSet

random.seed(42)

In [6]:
# spacy setup
gpu = spacy.prefer_gpu(0)
nlp = spacy.load("en_core_web_trf")

  from .autonotebook import tqdm as notebook_tqdm
  model.load_state_dict(torch.load(filelike, map_location=device))


In [7]:
alternating_verbs = '''feed,give,lease,lend,loan,pass,pay,peddle,refund,render,rent,repay,sell,serve,trade,advance,allocate,allot,assign,award,bequeath,cede,concede,extend,grant,guarantee,issue,leave,offer,owe,promise,vote,will,yield,bring,take,forward,hand,mail,post,send,ship,slip,smuggle,sneak,bounce,float,roll,slide,carry,drag,haul,heave,heft,hoise,kick,lug,pull,push,schlep,shove,tote,tow,tug,barge,bus,cart,drive,ferry,fly,row,shuttle,truck,wheel,wire,bash,bat,bunt,catapult,chuck,flick,fling,flip,hit,hurl,kick,lob,pass,pitch,punt,shoot,shove,slam,slap,sling,throw,tip,toss,ask,cite,pose,preach,quote,read,relay,show,teach,tell,write,cable,email,e-mail,fax,modem,netmail,phone,radio,relay,satellite,semaphore,sign,signal,telephone,telecast,telegraph,telex,wire,wireless'''.split(",")

do_only_verbs = '''accord,ask,bear,begrudge,bode,cost,deny,envy,flash,forbid,forgive,guarantee,issue,refuse,save,spare,strike,vouchsafe,wish,write,bet,bill,charge,fine,mulct,overcharge,save,spare,tax,tip,undercharge,wager,acknowledge,adopt,appoint,consider,crown,deem,designate,elect,esteeem,imagine,mark,nominate,ordain,proclaim,rate,recon,report,want,anoint,baptize,brand,call,christen,consecrate,crown,decree,dub,label,make,name,nickname,pronounce,rule,stamp,style,term,vote,adjudge,adjudicate,assume,avow,believe,confess,declare,fancy,find,judge,presume,profess,prove,suppose,think,warrant'''.split(",")

pp_only_verbs = '''address,administer,broadcast,convey,contribute,delegate,deliver,denounce,demonstrate,describe,dictate,dispatch,display,distribute,donate,elucidate,exhibit,express,explain,explicate,forfeit,illustrate,introduce,narrate,portray,proffer,recite,recommend,refer,reimburse,remit,restore,return,sacrifice,submit,surrender,transfer,transport,admit,allege,announce,articulate,assert,communicate,confess,convey,declare,mention,propose,recount,repeat,report,reveal,say,state,babble,bark,bawl,bellow,bleat,boom,bray,burble,cackle,call,carol,chant,chatter,chrip,cluck,coo,croak,croon,crow,cry,drawl,drone,gabble,gibber,groan,growl,grumble,grunt,hiss,holler,hoot,howl,jabber,lilt,lisp,moan,mumble,murmur,mutter,purr,rage,rasp,roar,rumble,scream,screech,shout,shriek,sing,snap,snarl,snuffle,splutter,squall,squawk,squeak,squeal,stammer,stutter,thunder,tisk,trill,trumpet,twitter,wail,warble,wheeze,whimper,whine,whisper,whistle,whoop,yammer,yap,yell,yelp,yodel,drop,hoist,lift,lower,raise,credit,entrust,furnish,issue,leave,present,provide,serve,supply,trust'''.split(",")

dative_verbs = sorted(list(set(alternating_verbs + do_only_verbs + pp_only_verbs)))

In [8]:
def get_children_flatten(token, depth=0, dep=False, return_tokens=False):
    children = []
    for child in token.children:
        if dep:
            if return_tokens:
                children.append((child.text.lower(), child.dep_, child.tag_, depth, child.i, child))
            else:
                children.append((child.text.lower(), child.dep_, child.tag_, depth, child.i))
        else:
            children.append(child.text.lower())
        children.extend(get_children_flatten(child, depth+1, dep, return_tokens))
    return children


def detect_dative(sentence,nlp):
    dative=False
    doc = nlp(sentence)
    for token in doc:
        if token.pos_ == "VERB":
            children = get_children_flatten(token, 0, dep=True)
            if len(children) > 0:
                tokens, dep, pos_string, depth, index = list(zip(*children))
                if "to" in tokens:
                    dep_depth = [f"{d}_{str(depth[i])}" for i, d in enumerate(dep)]
                    tok_dep = [f"{tokens[i]}_{dep[i]}" for i in range(len(tokens))]
                    if ("dobj_0" in dep_depth and "dative_0" in dep_depth and "pobj_1" in dep_depth) or ("dobj_0" in dep_depth and "prep_0" in dep_depth and "pobj_1" in dep_depth):
                        if "to_dative" in tok_dep or "to_prep" in tok_dep:
                            dative=True
                else:
                    dep_depth = [f"{d}_{str(depth[i])}" for i, d in enumerate(dep)]
                    tokens_dep = [f"{tokens[i]}_{dep[i]}" for i in range(len(tokens))]
                    if ("dobj_0" in dep_depth and "dative_0" in dep_depth) or Counter(dep_depth)['dobj_0'] >= 2:
                        if 'for_dative' not in tokens_dep and 'for_dobj' not in tokens_dep:
                            dative=True


    return dative

In [2]:
def read_childes(path):
    data = []
    with open(path, "r") as f:
        for line in f:
            data.append(line.strip().lower())

    return data

# with open("../data/lexicon/adaptation.json") as f:
#     adaptation = json.load(f)

adaptation_vocab = OrderedSet()
# for k, v in adaptation.items():
#     for vv in v:
#         vv = re.sub(r'(the\s|of)+', '', vv).strip()
#         words = vv.split()
#         for word in words:
#             # if word not in ['I', 'me', 'her', 'she', 'it']:
#             adaptation_vocab.add(word.lower())

In [3]:
adaptation_vocab

OrderedSet()

In [4]:
childes_train = read_childes(
    "../data/corpora/aochildes.train"
)

In [5]:
filenames = [
    "brown-adam.parsed",
    "brown-eve+animacy+theta.parsed",
    "brown-sarah.parsed",
    "soderstrom.parsed",
    "suppes.parsed",
    "valian+animacy+theta.parsed",
    "hslld-hv1-er.parsed",
    "hslld-hv1-mt.parsed",
]

cats = ["n", "v", "adj", "adv"]

_TAG_TO_CAT = {
    "NN": "n",
    "NNS": "n",
    "VB": "v",
    #     'VBD': 'v',
    "JJ": "adj",
    "RB": "adv",
}

relevant = list(_TAG_TO_CAT.keys())

TREEBANK_PATH = "../data/corpora/childes-treebank"

trees = []
tree = ""

for file in os.listdir(TREEBANK_PATH):
    #     if "parsed" in file:
    if file in filenames:
        with open(f"{TREEBANK_PATH}/{file}", "r") as f:
            for line in f:
                # If you have completed a tree, add it to the list of trees
                if line.strip() == "":
                    if (
                        tree.strip() != ""
                        and tree.count(")") == tree.count("(")
                        and tree.count("ROOT") == 1
                    ):
                        trees.append(tree)
                    tree = ""
                else:
                    tree += line.strip()

In [6]:
parsed = Tree.fromstring(trees[11])
parsed.leaves()

_, linearised_pos = list(zip(*parsed.pos()))

" ".join(linearised_pos)


def clean_sentence(sentence):
    sentence = (
        sentence.replace(" 're", "'re")
        .replace(" 's", "'s")
        .replace(" 'll", "'ll")
        .replace(" 've", "'ve")
        .replace("_", " ")
        .replace("ING", "ing")
        .replace(" 't", "'t")
        .replace(" 'm", "'m")
        .replace("*pro*", "")
        .replace("*t*-1", " ")
        .replace("*", " ")
        .replace("^P^", " ")
        .replace(" n't", "n't")
        .replace(" 'd", "'d")
        .lower()
        .strip()
    )
    sentence = re.sub(" {2,}", " ", sentence).strip().replace("t -1", " ")
    return sentence


def linearize(tree):
    parsed = Tree.fromstring(tree)
    leaves, pos = list(zip(*parsed.pos()))
    # sent = " ".join(leaves)
    return leaves, " ".join(pos)

In [7]:
def insert_verb(sent, pos, span):
    start_idx = len(pos[: span[0]-1].split(" "))
    initial = list(sent[:start_idx])
    end_idx = len(pos[span[1]+1:].split(" "))
    end = list(sent[-end_idx:])
    verb = sent[start_idx:start_idx+1]
    return verb, initial + ['[verb]'] + end

In [8]:
# regex = r'(?<=*)(VB|VBZ|VBD)(?=^(PRP))'
regex = r"(?<=\w\W)(\b(VBD|VBZ|VB)\b)(?=((?!(\s)?(PRP|\.|$|VB))))"
vb_regex = r"(?<=\w\W)(\bVB\b)(?=((?!(\s)?(PRP|\.|,|\!|\?|$|VB|-<V\d>))))"
vbd_regex = r"(?<=\w\W)(\bVBD\b)(?=((?!(\s)?(PRP|\.|,|\!|\?|$|VB|-<V\d>))))"
vbz_regex = r"(?<=\w\W)(\bVBZ\b)(?=((?!(\s)?(PRP|\.|,|\!|\?|$|VB|-<V\d>))))"
nn_regex = r"(?<=\w\W)(\bNN\b)(?=.*)"
adj_regex = r"(?<=\w\W)(\bJJ\b)(?=.*)"

verbs = {
    "VB": [],
    "VBD": [],
    "VBZ": [],
}

non_verbs = {"NN": [], "JJ": []}

for i, tree in enumerate(trees):
    try:
        sent, pos = linearize(tree)
        regex_searches = {
            "VB": re.search(vb_regex, pos),
            "VBD": re.search(vbd_regex, pos),
            "VBZ": re.search(vbz_regex, pos),
        }
        for k, v in regex_searches.items():
            if v:
                if "W" not in pos and "?" not in sent:
                    span = v.span()
                    # verbs[k].append((sent, pos, span))
                    verb, inserted = insert_verb(sent, pos, span)
                    sentence = clean_sentence(" ".join(inserted))
                    sent_words = sentence.split(" ")
                    # dont add sentences with adaptation words
                    found = False
                    for word in sent_words:
                        if word in adaptation_vocab:
                            found = True
                            break
                    if not found:
                        sentence = re.sub(',', ' ', sentence)
                        sentence = re.sub(r'\s+', ' ', sentence)
                        if "[verb] to" not in sentence and "[verb]n't" not in sentence and "[verb] off" not in sentence and "[verb] up" not in sentence and "[verb] down" not in sentence and "[verb] out" not in sentence and "[verb] in" not in sentence and "[verb] for" not in sentence and "[verb] down" not in sentence and "[verb] over" not in sentence and "[verb] under" not in sentence and "[verb] back" not in sentence and "[verb] on" not in sentence:
                            if verb[0] not in ['came', 'falled', 'become', 'became', 'was', 'were', 'seemed', 'knew', 'did']:
                                verbs[k].append([i, sentence, verb])

        # noun regexes
        # nn_search = re.search(nn_regex, pos)
        non_verb_searches = {
            "NN": re.search(nn_regex, pos),
            "JJ": re.search(adj_regex, pos),
        }
        for k, v in non_verb_searches.items():
            if v:
                if "W" not in pos and "?" not in sent:
                    span = v.span()
                    # non_verbs[k].append((sent, pos, span))
                    non_verb, inserted = insert_verb(sent, pos, span)
                    sentence = clean_sentence(" ".join(inserted))
                    sent_words = sentence.split(" ")
                    # dont add sentences with adaptation words
                    found = False
                    for word in sent_words:
                        if word in adaptation_vocab:
                            found = True
                            break
                    if not found:
                        sentence = re.sub(',', ' ', sentence)
                        sentence = re.sub(r'\s+', ' ', sentence)
                        if "[verb] to" not in sentence:
                            non_verbs[k].append([i, sentence, non_verb])
    except:
        continue

    # if re.search(vbd_regex, pos):
    #     verbs["VBD"].append((sent, pos))
    # if re.search(vbz_regex, pos):
    #     verbs["VBZ"].append((sent, pos))
    # if re.search(vb_regex, pos):
    #     verbs["VB"].append((sent, pos))
    # if re.search(regex, pos):
    #     print(sent)
    #     print(pos)
    #     print("\n")

In [9]:
{k : len(v) for k, v in verbs.items()}, {k : len(v) for k, v in non_verbs.items()}

({'VB': 6608, 'VBD': 1499, 'VBZ': 1025}, {'NN': 29452, 'JJ': 13826})

In [10]:
# verbs['VBD']

verb_sents = defaultdict(OrderedSet)
for idx, sentence, word in verbs['VBD']:
    for w in word:
        verb_sents[w].add(sentence)
verb_sents = dict(verb_sents)

In [21]:
is_dative = defaultdict(list)

for verb, sents in verb_sents.items():
    for sent in sents:
        dative = detect_dative(sent, nlp)
        is_dative[verb].append(dative)
is_dative = dict(is_dative)

In [11]:
# verbs['VBD']
# sample 150 verbs
verb_sentences = []
for k,v in verb_sents.items():
    verb_sentences.extend(v)

sampled_verbs = random.sample(verb_sentences, 200)
# sampled_verbs = []
# for k, v in verbs.items():
    # sampled_verbs.extend(random.sample(v, 50))

# sample 150 non-verbs
sampled_non_verbs = []
for k, v in non_verbs.items():
    sampled_non_verbs.extend(random.sample(v, 100))

In [13]:
# sampled_verbs
sampled_non_verbs

[[38656, 'you little [verb] .', ('kid',)],
 [107820, 'but a [verb] throws with his', ('seal',)],
 [37026, "i don't have a wide [verb] .", ('pencil',)],
 [1037, 'well wait for just a [verb] .', ('second',)],
 [10882, 'and a [verb] .', ('banana',)],
 [129180,
  'he was talking about the first [verb] of school about a couple of days',
  ('day',)],
 [115691, "you wouldn't go inside the [verb] .", ('cage',)],
 [8601, "i don't think it's really a [verb] .", ('horse',)],
 [38452, 'ya got her head right in the [verb] .', ('water',)],
 [10260, "that's from [verb] of the books .", ('one',)],
 [4481, "okay eve's [verb] .", ('turn',)],
 [157705, 'be here all [verb] .', ('day',)],
 [55965,
  "they started out with about five in the [verb] and i guess gradually they've able dropped out and finally she's alone .",
  ('class',)],
 [10871, "they're peeking out of the [verb] door .", ('kitchen',)],
 [92413,
  'and then we went outside and colby fell in the [verb] and got all',
  ('ocean',)],
 [39971, "s

In [14]:
# for i,s,v in sampled_verbs:
#     if "-1" in s:
#         print(s, v)

# sampled_non_verbs
val_set = {
    'good': [s for s in sampled_verbs],
    'bad': [s for i, s, v in sampled_non_verbs]
}

In [15]:
with open("../data/experiments/verbhood.json", "w") as f:
    json.dump(val_set, f, indent=4)

In [14]:
linearize(trees[81762])

(('and', 'the', 'pedalpushers', 'go', 'in', 'your', '.'),
 'CC DT NNS VB-<V1> IN PRP$ .')

In [151]:
clean_sentence(non_verbs['NN'][0][0])

"and now it's [verb] for miss catherine !"

In [122]:
# pos_seq = 'NNP , PRP$ NN VBD RB .'
# sentence = 'Joseph , your balloon fell down .'
# span = (14,17)
# start_idx = len(pos_seq[:13].split(" "))
# initial = sentence.split(" " )[:start_idx]

# end_idx = len(pos_seq[18:].split(" "))
# end = sentence.split(" ")[-end_idx:]

# initial + ['[verb]'] + end
# sentence.split(" ")[4:5]

In [127]:
# for sent, pos, span in verbs['VBD'][:20]:
#     verb, inserted = insert_verb(sent, pos, span)
#     print(" ".join(inserted), verb)