In [1]:
import math
import operator
import random
import utils

from collections import defaultdict, Counter
from functools import reduce
from itertools import product, cycle, islice
from ordered_set import OrderedSet
from string import Template

In [2]:
prenom = utils.read_json("../data/new-lexicon/prenom-modifiers.json")
pp = utils.read_json("../data/new-lexicon/pp-modifiers.json")

lexicon = utils.read_csv_dict("../data/new-lexicon/datives-lexicon-new.csv")

In [3]:
lexicon[0]

{'lemma': 'him',
 'singular': 'him',
 'plural': 'NA',
 'animacy': 'animate',
 'pronominality': 'pronoun',
 'definiteness_flexibility': 'fixed',
 'definiteness': 'definite'}

In [4]:
def prenomify(key, form):
    modifiers = prenom[key]
    modified = []
    for m in modifiers:
        modified.append(f"{m} {form}")

    return modified

def ppmodify(key, form):
    modifiers = pp[key]
    modified = []
    for m in modifiers:
        modified.append(f"{form} {m}")

    return modified

print(f"Prenominal Modification Example: {prenomify('ball', 'ball')}")

Prenominal Modification Example: ['red ball', 'blue ball', 'green ball', 'little ball']


In [5]:
do_template = Template("$agent [verb] $recipient $theme .")
po_template = Template("$agent [verb] $theme to $recipient .")

In [6]:
lexicon[0]

{'lemma': 'him',
 'singular': 'him',
 'plural': 'NA',
 'animacy': 'animate',
 'pronominality': 'pronoun',
 'definiteness_flexibility': 'fixed',
 'definiteness': 'definite'}

In [7]:
def possible_definiteness_forms(entry):
    return {
        "singular": [
            (f"the {entry['singular']}", "definite"),
            (f"a {entry['singular']}", "indefinite"),
        ],
        "plural": [
            (f"the {entry['plural']}", "definite"),
            (f"some {entry['plural']}", "indefinite"),
        ],
    }


def definiteness_forms(word, number):
    if number == "singular":
        return [(f"a {word}", "indefinite"), (f"the {word}", "definite")]
    else:
        return [(f"some {word}", "indefinite"), (f"the {word}", "definite")]

In [8]:
definiteness_forms("balls", "plural")

[('some balls', 'indefinite'), ('the balls', 'definite')]

In [9]:
"""
Pipeline:

for entry in lexicon:
    create singular unique values w/ definiteness expansion
        add
    create plural unique values w/ definiteness expansion
        add

    for entry in singular and plural:
        access prenoms
            add
        access pp modifiers
            add
"""

definiteness = {"definite": OrderedSet(), "indefinite": OrderedSet()}
animacy = {"animate": OrderedSet(), "inanimate": OrderedSet()}
pronominality = {"pronoun": OrderedSet(), "noun": OrderedSet()}
unique_args = OrderedSet()
form2lemma = {}
lemma2forms = defaultdict(list)

for entry in lexicon:
    lemma, singular, plural, anim, defness, pronom = (
        entry['lemma'],
        entry["singular"],
        entry["plural"],
        entry["animacy"],
        entry["definiteness"],
        entry["pronominality"],
    )
    # add all the cases that are not supposed 
    # to be modified and have fixed definiteness
    if entry["definiteness_flexibility"] == "fixed":
        definiteness[defness].add(singular)
        animacy[anim].add(singular)
        pronominality[pronom].add(singular)
        unique_args.add(singular)
        form2lemma[singular] = lemma
        lemma2forms[lemma].append(singular)
    else:
        defs = possible_definiteness_forms(entry)
        # add the unmodified forms first
        for k,v in defs.items():
            for form, definess in v:
                definiteness[definess].add(form)
                animacy[anim].add(form)
                pronominality[pronom].add(form)
                unique_args.add(form)
                form2lemma[form] = lemma
                lemma2forms[lemma].append(form)

        # modify and then add definiteness
        all_modified_singular = []
        all_modified_plural = []

        # basic prenom modification
        all_modified_singular.extend(prenomify(lemma, singular))
        all_modified_plural.extend(prenomify(lemma, plural)) # comment out if we only want singular.
        
        try:
            # basic pp modification (I think singular only for these)
            all_modified_singular.extend(ppmodify(lemma, singular))
        except:
            continue

        # # combine the two
        # for entry in prenomify(lemma, singular):
        #     try:
        #         pped = ppmodify(lemma, entry)
        #         all_modified_singular.extend(pped)
        #     except:
        #         continue

        # add definiteness to all modified
        for entry in all_modified_singular:
            for form, definess in definiteness_forms(entry, "singular"):
                definiteness[definess].add(form)
                animacy[anim].add(form)
                pronominality[pronom].add(form)
                unique_args.add(form)
                form2lemma[form] = lemma
                lemma2forms[lemma].append(form)

        for entry in all_modified_plural:
            for form, definess in definiteness_forms(entry, "plural"):
                definiteness[definess].add(form)
                animacy[anim].add(form)
                pronominality[pronom].add(form)
                unique_args.add(form)
                form2lemma[form] = lemma
                lemma2forms[lemma].append(form)

lemma2forms = dict(lemma2forms)

In [10]:
# sampling based on feature combinations. 
# Unsure what to do with length

x = '''
Sampling Strategies:

Option 1: just sample randomly by only varying pronominality x animacy x definiteness

Option 2: bucket forms into bins (short/long) and then include them in the combinations, i.e., 
pronominality x animacy x definiteness x length_bin

Option 3: get all possible length differences, include these into combination, then sample smaller amounts so as to not blow up the final stimuli


Meta-Strategy: first see how many unique combinations we get in each case.

Final decision: start with the setting in option1; then for each hypothesis, sample possible length-diffs.
'''

print(x)


Sampling Strategies:

Option 1: just sample randomly by only varying pronominality x animacy x definiteness

Option 2: bucket forms into bins (short/long) and then include them in the combinations, i.e., 
pronominality x animacy x definiteness x length_bin

Option 3: get all possible length differences, include these into combination, then sample smaller amounts so as to not blow up the final stimuli


Meta-Strategy: first see how many unique combinations we get in each case.

Final decision: start with the setting in option1; then for each hypothesis, sample possible length-diffs.



In [11]:
option1 = []

pronominality_features = pronominality.keys()
animacy_features = animacy.keys()
definiteness_features = definiteness.keys()

single_combo = list(product(pronominality_features, animacy_features, definiteness_features))

argument_combos = list(product(single_combo, single_combo))

len(single_combo), len(argument_combos)

(8, 64)

In [12]:
pronominality_features = pronominality.keys()
animacy_features = animacy.keys()
definiteness_features = definiteness.keys()
length_bins = ["short", "long"]

single_combo = list(product(pronominality_features, animacy_features, definiteness_features, length_bins))

argument_combos = list(product(single_combo, single_combo))

len(single_combo), len(argument_combos)

(16, 256)

In [13]:
pronominality_features = pronominality.keys()
animacy_features = animacy.keys()
definiteness_features = definiteness.keys()

arg_lengths = OrderedSet([len(item.split(" ")) for item in unique_args])
length_bins = sorted(OrderedSet([first - second for first, second in product(arg_lengths, arg_lengths)]))

single_combo = list(product(pronominality_features, animacy_features, definiteness_features))

argument_combos = list(product(single_combo, single_combo))

# argument_combos = list(product(argument_combos, length_bins))

len(single_combo), len(argument_combos)

(8, 64)

In [14]:
# pronominality['pronoun'].intersection(animacy['animate']).intersection(definiteness['definite'])
all_features = dict()
for feature_set in [pronominality, animacy, definiteness]:
    for k, v in feature_set.items():
        all_features[k] = v

def feature_intersection(features: tuple):
    items = [all_features[f] for f in features]
    intersected = OrderedSet.intersection(*items)

    return intersected

def feature_string(features):
    string = "".join([f[0] for f in features])
    return string

In [15]:
'''
Basic sampling pipeline

for feature values in the set of all feature combos:
    items_arg = get their intersection => this would be that argument's space
    pairs = product(items_arg1, items_arg2)
    for pair in all possible pairs of items:
        if lemma is not equal:
            throw into sampling space
        if empty, do nothing

    sample = sample.sample(n)


Basic sampling pipeline that includes all possible length diffs

for feature values in the set of all feature combos:
    items_arg = get their intersection => this would be that argument's space
    pairs = product(items_arg1, items_arg2)
    initial_sampling_space = defaultdict(diff -> list)
    for pair in all possible pairs of items:
        if lemma is not equal:
            initial_sampling_space[diff].append(pair)
        if empty, do nothing, and move onto the next feature value

    # sampled = flatten([sample(v, min(n, len(v))) for k,v in initial_sampling_space.items()])
    for k, v in initial_sampling_space.items():
        key = stringify(feature_values, k)
        value = sample(v, min(n, len(v)))

        # do it
'''

# theme_features, recipient_features = argument_combos[18]

# print(theme_features, recipient_features)
random.seed(1024)

unique_combos = 0
N = 8

string2combo = defaultdict(list)

samples = defaultdict(list)
for theme_features, recipient_features in argument_combos:

    string = f"{feature_string(theme_features)}{feature_string(recipient_features)}"
    string2combo[string] = [theme_features, recipient_features]

    theme_features = feature_intersection(theme_features)
    recipient_features = feature_intersection(recipient_features)

    pairs = product(theme_features, recipient_features)
    initial_sampling_space = defaultdict(list)

    for item1, item2 in pairs:
        if item1 != item2: # eliminate the obvious
            lemma1, lemma2 = form2lemma[item1], form2lemma[item2]

            if lemma1 != lemma2:
                if not ((lemma1 == "me" and lemma2 == "us") or (lemma1 == "us" and lemma2 == "me")):
                    length_diff = len(item1.split(" ")) - len(item2.split(" "))
                    initial_sampling_space[length_diff].append((item1, item2))
    initial_sampling_space = dict(initial_sampling_space)

    for k, v in initial_sampling_space.items():
        key = str(k)
        sampled = random.sample(v, min(N, len(v)))
        if len(sampled) < N:
            sampled = list(islice(cycle(sampled), N))
        samples[f"{string}_{k}"] = sampled

    unique_combos += len(initial_sampling_space)

string2combo = dict(string2combo)

In [16]:
samples

defaultdict(list,
            {'padpad_0': [('him', 'her'),
              ('them-a', 'her'),
              ('us', 'her'),
              ('her', 'me'),
              ('us', 'them-a'),
              ('them-a', 'me'),
              ('me', 'him'),
              ('him', 'me')],
             'padpai_0': [('us', 'someone'),
              ('me', 'someone'),
              ('them-a', 'someone'),
              ('her', 'someone'),
              ('him', 'someone'),
              ('us', 'someone'),
              ('me', 'someone'),
              ('them-a', 'someone')],
             'padpid_0': [('him', 'it'),
              ('her', 'them-i'),
              ('us', 'them-i'),
              ('him', 'them-i'),
              ('her', 'it'),
              ('them-a', 'it'),
              ('me', 'it'),
              ('me', 'them-i')],
             'padpii_0': [('us', 'something'),
              ('me', 'something'),
              ('them-a', 'something'),
              ('him', 'something'),
              ('her',

In [17]:
unique_combos

366

In [18]:
length_distribution = defaultdict(int)
non_length_feature_distribution = defaultdict(int)
lemma_distribution_theme = defaultdict(int)
lemma_distribution_recipient = defaultdict(int)

for k, v in samples.items():
    feature, length = k.split("_")
    length_distribution[length] += len(v)
    non_length_feature_distribution[feature] += len(v)
    for vv in v:
        theme, recipient = form2lemma[vv[0]], form2lemma[vv[1]]
        lemma_distribution_theme[theme] += 1
        lemma_distribution_recipient[recipient] += 1


length_distribution = Counter(length_distribution)
non_length_feature_distribution = Counter(non_length_feature_distribution)
lemma_distribution_theme = Counter(lemma_distribution_theme)
lemma_distribution_recipient = Counter(lemma_distribution_recipient)

In [19]:
lemma_distribution_theme.most_common()

[('someone', 208),
 ('something', 208),
 ('ball', 183),
 ('dog', 180),
 ('girl', 178),
 ('book', 176),
 ('cookie', 168),
 ('horse', 156),
 ('cup', 155),
 ('boy', 142),
 ('cat', 136),
 ('bear', 136),
 ('box', 114),
 ('pencil', 112),
 ('them-i', 109),
 ('toy', 108),
 ('it', 107),
 ('him', 48),
 ('daddy', 48),
 ('us', 47),
 ('them-a', 46),
 ('me', 40),
 ('her', 35),
 ('mommy', 32),
 ('lego', 22),
 ('chair', 21),
 ('food', 13)]

In [20]:
lemma_distribution_recipient.most_common()

[('someone', 208),
 ('something', 208),
 ('dog', 178),
 ('cat', 169),
 ('ball', 167),
 ('cookie', 165),
 ('book', 162),
 ('girl', 158),
 ('cup', 148),
 ('bear', 147),
 ('horse', 140),
 ('boy', 138),
 ('box', 136),
 ('toy', 127),
 ('it', 113),
 ('them-i', 103),
 ('pencil', 97),
 ('me', 48),
 ('us', 45),
 ('him', 44),
 ('them-a', 40),
 ('mommy', 40),
 ('her', 39),
 ('daddy', 38),
 ('lego', 30),
 ('chair', 27),
 ('food', 13)]

In [21]:
# length_distribution
# non_length_feature_distribution
# samples['paipii_0']
# samples

for k, v in length_distribution.items():
    print(f"{k},{v}")


0,304
-1,256
-2,256
-4,256
-5,208
-3,256
-6,80
1,256
2,256
4,256
5,208
3,256
6,80


In [22]:
# for k, v in non_length_feature_distribution.items():
#     print(f"{k},{v}")

# non_length_feature_distribution.most_common(20)
non_length_feature_distribution

Counter({'nadnad': 88,
         'nadnid': 88,
         'nadnii': 88,
         'nidnad': 88,
         'nidnid': 88,
         'nidnii': 88,
         'niinad': 88,
         'niinid': 88,
         'niinii': 88,
         'nadnai': 80,
         'nainad': 80,
         'nainid': 80,
         'nainii': 80,
         'nidnai': 80,
         'niinai': 80,
         'nainai': 72,
         'padnad': 48,
         'padnid': 48,
         'padnii': 48,
         'painad': 48,
         'painid': 48,
         'painii': 48,
         'pidnad': 48,
         'pidnid': 48,
         'pidnii': 48,
         'piinad': 48,
         'piinid': 48,
         'piinii': 48,
         'nadpad': 48,
         'nadpai': 48,
         'nadpid': 48,
         'nadpii': 48,
         'nidpad': 48,
         'nidpai': 48,
         'nidpid': 48,
         'nidpii': 48,
         'niipad': 48,
         'niipai': 48,
         'niipid': 48,
         'niipii': 48,
         'padnai': 40,
         'painai': 40,
         'pidnai': 40,
         'p

In [23]:
hims = ["Ross", "Joseph", "Ethan", "Peter", "Thomas"]  # sample some names
hers = ["Lily", "Nina", "Eve", "Catherine", "Sally"]  # sample some names
thems_animate = ["those people", "the children", "the birds", "the ducks", "the pigs"]
thems_inanimate = [
    "the funny pictures",
    "the crayons",
    "the photographs",
    "the candies",
    "the socks",
]
its = ["the picture", "the milk", "the paper", "the apple", "the coffee"]  # sample some object names

given_items = {
    "him": hims,
    "her": hers,
    "them-animate": thems_animate,
    "them-inanimate": thems_inanimate,
    "it": its,
}

# prior mention instead of "given"
given_templates = {
    1: {
        'agent-only': Template("Do you see $agent ?"),
        'agent-1arg': Template("Do you see $agent and $arg1 ?"),
        'agent-2arg': Template("Do you see $agent $arg1 and $arg2 ?"),
    },
    2: {
        'agent-only': Template("Look it's $agent !"),
        'agent-1arg': Template("Look it's $agent and $arg1 !"),
        'agent-2arg': Template("Look it's $agent $arg1 and $arg2 !"),
    },
    3: {
        'agent-only': Template("Here's $agent !"),
        'agent-1arg': Template("Here's $agent with $arg1 !"),
        'agent-2arg': Template("Here's $agent $arg1 and $arg2 !"),
    }
}

# # agent-only, agent-theme, agent-recipient, agent-theme-recipient
# def generate_given_prefix(agent, arg1, arg2, type="agent-only", template=1):
#     template = given_templates[template][type]
#     if type == "agent-only":
#         prefix = template.substitute(agent=entry['agent'])
#     elif type == 'agent-arg2'

In [24]:
# random.sample(range(1, 10), 5)
list(range(1,11))

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

In [25]:
# len(samples)
# samples
agents = ["Laura", "Mark", "Sarah", "William", "Alex"]

agents = list(islice(cycle(agents), 10))

raw_stimuli = []
idx = 1

for h_id, (combo, items) in enumerate(samples.items()):
    random.shuffle(agents)

    # different givenness orders when both are given:
    theme_recipient = random.sample(range(1, 11), 5)
    recipient_theme = [x for x in range(1, 11) if x not in theme_recipient]

    for h_item, (agent, item) in enumerate(zip(agents, items)):
        # raw_stimuli.append(item)
        theme, recipient = item

        do_sentence = do_template.substitute(
            agent=agent, theme=item[0], recipient=item[1]
        )
        po_sentence = po_template.substitute(
            agent=agent, theme=item[0], recipient=item[1]
        )

        string, length_diff = combo.split("_")
        length_diff = int(length_diff)
        theme_features, recipient_features = string2combo[string]

        for template_num, type_templates in given_templates.items():
            for typ, template in type_templates.items():
                if typ == "agent-only":
                    prefix = template.substitute(agent=agent)
                    raw_stimuli.append(
                        {   
                            "template_id": template_num,
                            "item": idx,
                            "hypothesis_id": h_id + 1,
                            "hypothesis_item": h_item + 1,
                            "template_type": "agent-only",
                            "template": typ,
                            "combo": combo,
                            "agent": agent,
                            "theme": item[0],
                            "recipient": item[1],
                            "prefix": prefix,
                            "do_sentence": do_sentence,
                            "po_sentence": po_sentence,
                            "theme_pronominality": theme_features[0],
                            "theme_animacy": theme_features[1],
                            "theme_definiteness": theme_features[2],
                            "recipient_pronominality": recipient_features[0],
                            "recipient_animacy": recipient_features[1],
                            "recipient_definiteness": recipient_features[2],
                            "length_diff": length_diff,
                        }
                    )
                elif typ == "agent-1arg":
                    # theme: theme def recipient indef
                    # theme: theme def recipient def
                    if theme_features[2] == "definite":
                        if theme_features[0] == "pronoun":
                            if theme in ["him", "her", "them-a", "them-i", "it"]:
                                if theme == "them-a":
                                    theme_arg = "them-animate"
                                elif theme == "them-i":
                                    theme_arg = "them-inanimate"
                                else:
                                    theme_arg = theme
                                given_theme = random.sample(given_items[theme_arg], 1)[0]
                        else:
                            given_theme = theme
                        prefix = template.substitute(agent=agent, arg1=given_theme)
                        raw_stimuli.append(
                            {   
                                "template_id": template_num,
                                "item": idx,
                                "hypothesis_id": h_id + 1,
                                "hypothesis_item": h_item + 1,
                                "template_type": "agent-1arg",
                                "template": "agent-theme",
                                "combo": combo,
                                "agent": agent,
                                "theme": item[0],
                                "recipient": item[1],
                                "prefix": prefix,
                                "do_sentence": do_sentence,
                                "po_sentence": po_sentence,
                                "theme_pronominality": theme_features[0],
                                "theme_animacy": theme_features[1],
                                "theme_definiteness": theme_features[2],
                                "recipient_pronominality": recipient_features[0],
                                "recipient_animacy": recipient_features[1],
                                "recipient_definiteness": recipient_features[2],
                                "length_diff": length_diff,
                            }
                        )

                    # recipient: theme def recipient def
                    # recipient: theme indef recipient def
                    if recipient_features[2] == "definite":
                        if recipient_features[0] == "pronoun":
                            if recipient in ["him", "her", "them-a", "them-i", "it"]:
                                if recipient == "them-a":
                                    recipient_arg = "them-animate"
                                elif recipient == "them-i":
                                    recipient_arg = "them-inanimate"
                                else:
                                    recipient_arg = recipient
                                given_recipient = random.sample(given_items[recipient_arg], 1)[0]
                        else:
                            given_recipient = recipient
                        prefix = template.substitute(agent=agent, arg1=given_recipient)
                        raw_stimuli.append(
                            {   
                                "template_id": template_num,
                                "item": idx,
                                "hypothesis_id": h_id + 1,
                                "hypothesis_item": h_item + 1,
                                "template_type": "agent-1arg",
                                "template": "agent-recipient",
                                "combo": combo,
                                "agent": agent,
                                "theme": item[0],
                                "recipient": item[1],
                                "prefix": prefix,
                                "do_sentence": do_sentence,
                                "po_sentence": po_sentence,
                                "theme_pronominality": theme_features[0],
                                "theme_animacy": theme_features[1],
                                "theme_definiteness": theme_features[2],
                                "recipient_pronominality": recipient_features[0],
                                "recipient_animacy": recipient_features[1],
                                "recipient_definiteness": recipient_features[2],
                                "length_diff": length_diff,
                            }
                        )
                elif typ == "agent-2arg":
                    # half = theme and recipient
                    # half = recipient and theme
                    if theme_features[2] == "definite" and recipient_features[2] == "definite":
                        if theme_features[0] == "pronoun":
                            if theme in ["him", "her", "them-a", "them-i", "it"]:
                                if theme == "them-a":
                                    theme_arg = "them-animate"
                                elif theme == "them-i":
                                    theme_arg = "them-inanimate"
                                else:
                                    theme_arg = theme
                                given_theme = random.sample(given_items[theme_arg], 1)[0]
                        else:
                            given_theme = theme
                            
                        if recipient_features[0] == "pronoun":
                            if recipient in ["him", "her", "them-a", "them-i", "it"]:
                                if recipient == "them-a":
                                    recipient_arg = "them-animate"
                                elif recipient == "them-i":
                                    recipient_arg = "them-inanimate"
                                else:
                                    recipient_arg = recipient
                                given_recipient = random.sample(given_items[recipient_arg], 1)[0]
                        else:
                            given_recipient = recipient
                        
                        # -- actual stimuli
                        if idx+1 in theme_recipient:
                            prefix = template.substitute(agent=agent, arg1=given_theme, arg2=given_recipient)
                            raw_stimuli.append(
                                {   
                                    "template_id": template_num,
                                    "item": idx,
                                    "hypothesis_id": h_id + 1,
                                    "hypothesis_item": h_item + 1,
                                    "template_type": "agent-2arg",
                                    "template": "agent-theme-recipient",
                                    "combo": combo,
                                    "agent": agent,
                                    "theme": item[0],
                                    "recipient": item[1],
                                    "prefix": prefix,
                                    "do_sentence": do_sentence,
                                    "po_sentence": po_sentence,
                                    "theme_pronominality": theme_features[0],
                                    "theme_animacy": theme_features[1],
                                    "theme_definiteness": theme_features[2],
                                    "recipient_pronominality": recipient_features[0],
                                    "recipient_animacy": recipient_features[1],
                                    "recipient_definiteness": recipient_features[2],
                                    "length_diff": length_diff,
                                }
                            )
                        else:
                            premise = template.substitute(agent=agent, arg1=given_recipient, arg2=given_theme)
                            raw_stimuli.append(
                                {   
                                    "template_id": template_num,
                                    "item": idx,
                                    "hypothesis_id": h_id + 1,
                                    "hypothesis_item": h_item + 1,
                                    "template_type": "agent-2arg",
                                    "template": "agent-recipient-theme",
                                    "combo": combo,
                                    "agent": agent,
                                    "theme": item[0],
                                    "recipient": item[1],
                                    "prefix": prefix,
                                    "do_sentence": do_sentence,
                                    "po_sentence": po_sentence,
                                    "theme_pronominality": theme_features[0],
                                    "theme_animacy": theme_features[1],
                                    "theme_definiteness": theme_features[2],
                                    "recipient_pronominality": recipient_features[0],
                                    "recipient_animacy": recipient_features[1],
                                    "recipient_definiteness": recipient_features[2],
                                    "length_diff": length_diff,
                                }
                            )


        # raw_stimuli.append((idx, h_id+1, h_item+1, combo, agent, item[0], item[1], do_sentence, po_sentence))
        idx += 1


# len(raw_stimuli) * 2 * 3

In [26]:
4300 * 9 * 2

77400

In [32]:
len(raw_stimuli) * 2
# raw_stimuli

40224

In [28]:
(len(raw_stimuli) * 2) * 45 * 10

18100800

In [None]:
# raw_stimuli

[{'template_id': 1,
  'item': 1,
  'hypothesis_id': 1,
  'hypothesis_item': 1,
  'template_type': 'agent-only',
  'template': 'agent-only',
  'combo': 'padpad_0',
  'agent': 'Mark',
  'theme': 'him',
  'recipient': 'her',
  'prefix': 'Do you see Mark ?',
  'do_sentence': 'Mark [verb] her him .',
  'po_sentence': 'Mark [verb] him to her .',
  'theme_pronominality': 'pronoun',
  'theme_animacy': 'animate',
  'theme_definiteness': 'definite',
  'recipient_pronominality': 'pronoun',
  'recipient_animacy': 'animate',
  'recipient_definiteness': 'definite',
  'length_diff': 0},
 {'template_id': 1,
  'item': 1,
  'hypothesis_id': 1,
  'hypothesis_item': 1,
  'template_type': 'agent-1arg',
  'template': 'agent-theme',
  'combo': 'padpad_0',
  'agent': 'Mark',
  'theme': 'him',
  'recipient': 'her',
  'prefix': 'Do you see Mark and Peter ?',
  'do_sentence': 'Mark [verb] her him .',
  'po_sentence': 'Mark [verb] him to her .',
  'theme_pronominality': 'pronoun',
  'theme_animacy': 'animate',
  

In [34]:
form2lemma[item['theme']]

'book'

In [37]:
final_lemma_distribution_theme = defaultdict(int)
final_lemma_distribution_recipient = defaultdict(int)

# for k, v in samples.items():
#     feature, length = k.split("_")
#     for vv in v:
#         theme, recipient = form2lemma[vv[0]], form2lemma[vv[1]]
#         lemma_distribution_theme[theme] += 1
#         lemma_distribution_recipient[recipient] += 1

for item in raw_stimuli:
    final_lemma_distribution_theme[form2lemma[item['theme']]] +=1
    final_lemma_distribution_recipient[form2lemma[item['recipient']]] +=1


# length_distribution = Counter(length_distribution)
# non_length_feature_distribution = Counter(non_length_feature_distribution)
final_lemma_distribution_theme = Counter(final_lemma_distribution_theme)
final_lemma_distribution_recipient = Counter(final_lemma_distribution_recipient)

In [38]:
final_lemma_distribution_theme

Counter({'book': 2064,
         'girl': 1914,
         'cookie': 1728,
         'cat': 1608,
         'dog': 1578,
         'bear': 1560,
         'horse': 1482,
         'box': 1476,
         'cup': 1470,
         'them-i': 1416,
         'it': 1404,
         'someone': 1380,
         'something': 1380,
         'ball': 1380,
         'boy': 1344,
         'pencil': 1146,
         'toy': 972,
         'them-a': 672,
         'him': 660,
         'us': 648,
         'her': 504,
         'mommy': 468,
         'daddy': 396,
         'me': 336,
         'lego': 168,
         'chair': 156,
         'food': 150})

In [39]:
final_lemma_distribution_recipient

Counter({'dog': 1890,
         'ball': 1818,
         'book': 1758,
         'girl': 1746,
         'cup': 1674,
         'cat': 1608,
         'it': 1482,
         'box': 1470,
         'horse': 1452,
         'boy': 1434,
         'cookie': 1422,
         'bear': 1392,
         'someone': 1380,
         'something': 1380,
         'them-i': 1338,
         'pencil': 1140,
         'toy': 984,
         'us': 780,
         'her': 624,
         'mommy': 540,
         'me': 492,
         'him': 468,
         'them-a': 456,
         'daddy': 288,
         'lego': 192,
         'chair': 138,
         'food': 114})