In [1]:
import json
import os
import random
import re

from collections import defaultdict
from nltk.tree import *
from ordered_set import OrderedSet

random.seed(42)

In [2]:
def read_childes(path):
    data = []
    with open(path, "r") as f:
        for line in f:
            data.append(line.strip().lower())

    return data

with open("../data/lexicon/adaptation.json") as f:
    adaptation = json.load(f)

adaptation_vocab = OrderedSet()
for k, v in adaptation.items():
    for vv in v:
        vv = re.sub(r'(the\s|of)+', '', vv).strip()
        words = vv.split()
        for word in words:
            # if word not in ['I', 'me', 'her', 'she', 'it']:
            adaptation_vocab.add(word.lower())

In [3]:
adaptation_vocab

OrderedSet(['me', 'her', 'she', 'i', 'it', 'elmo', 'bert', 'mommy', 'grandpa', 'big', 'bear', 'little', 'kitty', 'bird', 'dog', 'ball', 'doll', 'candy', 'teddy', 'juice', 'some', 'chocolate', 'beautiful', 'red'])

In [4]:
childes_train = read_childes(
    "../../smolm/data/corpora/babylm_data/babylm_100M/aochildes.train"
)

In [5]:
filenames = [
    "brown-adam.parsed",
    "brown-eve+animacy+theta.parsed",
    "brown-sarah.parsed",
    "soderstrom.parsed",
    "suppes.parsed",
    "valian+animacy+theta.parsed",
    "hslld-hv1-er.parsed",
    "hslld-hv1-mt.parsed",
]

cats = ["n", "v", "adj", "adv"]

_TAG_TO_CAT = {
    "NN": "n",
    "NNS": "n",
    "VB": "v",
    #     'VBD': 'v',
    "JJ": "adj",
    "RB": "adv",
}

relevant = list(_TAG_TO_CAT.keys())

TREEBANK_PATH = "../../smolm/data/corpora/childes-treebank"

trees = []
tree = ""

for file in os.listdir(TREEBANK_PATH):
    #     if "parsed" in file:
    if file in filenames:
        with open(f"{TREEBANK_PATH}/{file}", "r") as f:
            for line in f:
                # If you have completed a tree, add it to the list of trees
                if line.strip() == "":
                    if (
                        tree.strip() != ""
                        and tree.count(")") == tree.count("(")
                        and tree.count("ROOT") == 1
                    ):
                        trees.append(tree)
                    tree = ""
                else:
                    tree += line.strip()

In [6]:
parsed = Tree.fromstring(trees[11])
parsed.leaves()

_, linearised_pos = list(zip(*parsed.pos()))

" ".join(linearised_pos)


def clean_sentence(sentence):
    sentence = (
        sentence.replace(" 're", "'re")
        .replace(" 's", "'s")
        .replace(" 'll", "'ll")
        .replace(" 've", "'ve")
        .replace("_", " ")
        .replace("ING", "ing")
        .replace(" 't", "'t")
        .replace(" 'm", "'m")
        .replace("*pro*", "")
        .replace("*t*-1", " ")
        .replace("*", " ")
        .replace("^P^", " ")
        .replace(" n't", "n't")
        .replace(" 'd", "'d")
        .lower()
        .strip()
    )
    sentence = re.sub(" {2,}", " ", sentence).strip().replace("t -1", " ")
    return sentence


def linearize(tree):
    parsed = Tree.fromstring(tree)
    leaves, pos = list(zip(*parsed.pos()))
    # sent = " ".join(leaves)
    return leaves, " ".join(pos)

In [7]:
def insert_verb(sent, pos, span):
    start_idx = len(pos[: span[0]-1].split(" "))
    initial = list(sent[:start_idx])
    end_idx = len(pos[span[1]+1:].split(" "))
    end = list(sent[-end_idx:])
    verb = sent[start_idx:start_idx+1]
    return verb, initial + ['[verb]'] + end

In [8]:
# regex = r'(?<=*)(VB|VBZ|VBD)(?=^(PRP))'
regex = r"(?<=\w\W)(\b(VBD|VBZ|VB)\b)(?=((?!(\s)?(PRP|\.|$|VB))))"
vb_regex = r"(?<=\w\W)(\bVB\b)(?=((?!(\s)?(PRP|\.|,|\!|\?|$|VB|-<V\d>))))"
vbd_regex = r"(?<=\w\W)(\bVBD\b)(?=((?!(\s)?(PRP|\.|,|\!|\?|$|VB|-<V\d>))))"
vbz_regex = r"(?<=\w\W)(\bVBZ\b)(?=((?!(\s)?(PRP|\.|,|\!|\?|$|VB|-<V\d>))))"
nn_regex = r"(?<=\w\W)(\bNN\b)(?=.*)"
adj_regex = r"(?<=\w\W)(\bJJ\b)(?=.*)"

verbs = {
    "VB": [],
    "VBD": [],
    "VBZ": [],
}

non_verbs = {"NN": [], "JJ": []}

for i, tree in enumerate(trees):
    try:
        sent, pos = linearize(tree)
        regex_searches = {
            "VB": re.search(vb_regex, pos),
            "VBD": re.search(vbd_regex, pos),
            "VBZ": re.search(vbz_regex, pos),
        }
        for k, v in regex_searches.items():
            if v:
                if "W" not in pos and "?" not in sent:
                    span = v.span()
                    # verbs[k].append((sent, pos, span))
                    verb, inserted = insert_verb(sent, pos, span)
                    sentence = clean_sentence(" ".join(inserted))
                    sent_words = sentence.split(" ")
                    # dont add sentences with adaptation words
                    found = False
                    for word in sent_words:
                        if word in adaptation_vocab:
                            found = True
                            break
                    if not found:              
                        verbs[k].append([i, sentence, verb])

        # noun regexes
        # nn_search = re.search(nn_regex, pos)
        non_verb_searches = {
            "NN": re.search(nn_regex, pos),
            "JJ": re.search(adj_regex, pos),
        }
        for k, v in non_verb_searches.items():
            if v:
                if "W" not in pos and "?" not in sent:
                    span = v.span()
                    # non_verbs[k].append((sent, pos, span))
                    non_verb, inserted = insert_verb(sent, pos, span)
                    sentence = clean_sentence(" ".join(inserted))
                    sent_words = sentence.split(" ")
                    # dont add sentences with adaptation words
                    found = False
                    for word in sent_words:
                        if word in adaptation_vocab:
                            found = True
                            break
                    if not found:     
                        non_verbs[k].append([i, sentence, non_verb])
    except:
        continue

    # if re.search(vbd_regex, pos):
    #     verbs["VBD"].append((sent, pos))
    # if re.search(vbz_regex, pos):
    #     verbs["VBZ"].append((sent, pos))
    # if re.search(vb_regex, pos):
    #     verbs["VB"].append((sent, pos))
    # if re.search(regex, pos):
    #     print(sent)
    #     print(pos)
    #     print("\n")

In [9]:
{k : len(v) for k, v in verbs.items()}, {k : len(v) for k, v in non_verbs.items()}

({'VB': 6969, 'VBD': 1433, 'VBZ': 837}, {'NN': 25575, 'JJ': 12732})

In [10]:
verbs

{'VB': [[21, "let's get [verb] if thomas will like those .", ('see',)],
  [62, 'we [verb] to get !', ('need',)],
  [77, "i'm going to [verb] down there and get .", ('run',)],
  [79, "let's [verb] if you have anything clean .", ('see',)],
  [182, "he doesn't [verb] back up anymore .", ('go',)],
  [202,
   "let's [verb] if we have any clean pants that thomas can wear .",
   ('see',)],
  [226, "or you'd [verb] to eat your socks .", ('wanT',)],
  [241, 'we have to [verb] this microphone because .', ('move',)],
  [245,
   "now you're going to [verb] because you don't like your shirt off , huh .",
   ('cry',)],
  [294, 'one about to [verb] in .', ('go',)],
  [333, 'yes , you [verb] to let go !', ('need',)],
  [366, 'oh no , you may not [verb] with that now .', ('play',)],
  [374, "we'll [verb] about that .", ('see',)],
  [383, 'that one you can [verb] on .', ('chew',)],
  [386, "i'm going to [verb] down and get the phone .", ('go',)],
  [422, 'a cup would [verb] this easier .', ('make',)],
 

In [11]:
# verbs['VBD']
# sample 150 verbs
sampled_verbs = []
for k, v in verbs.items():
    sampled_verbs.extend(random.sample(v, 50))

# sample 150 non-verbs
sampled_non_verbs = []
for k, v in non_verbs.items():
    sampled_non_verbs.extend(random.sample(v, 75))

In [14]:
# for i,s,v in sampled_verbs:
#     if "-1" in s:
#         print(s, v)

# sampled_non_verbs
val_set = {
    'good': [s for i, s, v in sampled_verbs],
    'bad': [s for i, s, v in sampled_non_verbs]
}

In [15]:
val_set

{'good': ['oh , you have to [verb] one more day .',
  "yeah , let's [verb] with those .",
  "you're just going to have to [verb] at them , and then if",
  "because then you wouldn't [verb] any to drink .",
  'no [verb] ahead .',
  "he'll [verb] on your finger .",
  "you're not going to [verb] any in five years .",
  "tony tony don't [verb] any fingers .",
  "well , i'm afraid you're going   to [verb] the plate .",
  "don't [verb] that .",
  'and later on [verb] to put them in the dryer .',
  "he cann't [verb] cecile .",
  "he cann't [verb] out .",
  "don't [verb] with that .",
  "they'll probably [verb] to sleep .",
  'you did [verb] a mouth full of sand chi .',
  "because we've seen them used that way and just wanted to [verb] if",
  "i'll [verb] this one .",
  'mkay , now you can [verb] at this .',
  'santa claus will [verb] and take everything back .',
  'no he had to [verb] to the north pole .',
  "ok ok , let's [verb] nina have a turn first .",
  "no , leaves don't [verb] splinter

In [16]:
with open("../data/experiments/validation.json", "w") as f:
    json.dump(val_set, f, indent=4)

In [14]:
linearize(trees[81762])

(('and', 'the', 'pedalpushers', 'go', 'in', 'your', '.'),
 'CC DT NNS VB-<V1> IN PRP$ .')

In [151]:
clean_sentence(non_verbs['NN'][0][0])

"and now it's [verb] for miss catherine !"

In [122]:
# pos_seq = 'NNP , PRP$ NN VBD RB .'
# sentence = 'Joseph , your balloon fell down .'
# span = (14,17)
# start_idx = len(pos_seq[:13].split(" "))
# initial = sentence.split(" " )[:start_idx]

# end_idx = len(pos_seq[18:].split(" "))
# end = sentence.split(" ")[-end_idx:]

# initial + ['[verb]'] + end
# sentence.split(" ")[4:5]

In [127]:
# for sent, pos, span in verbs['VBD'][:20]:
#     verb, inserted = insert_verb(sent, pos, span)
#     print(" ".join(inserted), verb)