In [1]:
import config
import inflect
import json
import math
import os
import random

from string import Template
from dataclasses import dataclass
from itertools import product
from ordered_set import OrderedSet
from collections import defaultdict

In [3]:
# Lexical items for the adaptation set, keyed by feature ('pronoun', 'animate',
# 'inanimate') or by argument role ('agent', 'theme', 'recipient'). Each value is
# the list obtained by splitting a comma-separated string of expressions.
adaptation_lexicon = {
    'pronoun': 'me,her,she,I,it'.split(","),
    'animate': 'me,her,she,elmo,bert,mommy,grampa,the boy,the big bear,the little kittie,the tiny bird,the little dog'.split(","),
    'inanimate': 'it,the ball,the doll,the candy,the teddy,the juice,some of the juice,some of the chocolate,the big doll,the beautiful doll,the red ball'.split(','),
    # 'agent': 'I,it,she,elmo,bert,mommy,grandpa,the big bear,the little kitty,the little bird,the little doggy,the ball,the doll,the candy,the teddy,the big doll,the beautiful doll,the red ball'.split(','),
    'agent': 'I,she,elmo,bert,mommy,grampa,grandpa,nina,mom,you,the kittie,the bear,the boy,teddy,ernie,nonna,becky,nancy,ryan'.split(','),
    'theme': 'me,it,her,elmo,bert,mommy,grampa,the big bear,the little kittie,the tiny bird,the little dog,the ball,the doll,the boy,the candy,the teddy,the juice,some of the juice,some of the chocolate,the big doll,the beautiful doll,the red ball'.split(','),
    'recipient': 'me,it,her,elmo,bert,mommy,grampa,the big bear,the little kittie,the tiny bird,the little dog,the ball,the doll,the boy,the candy,the teddy,the juice,some of the juice,some of the chocolate,the big doll,the beautiful doll,the red ball'.split(',')
}

# Lexical items for the generalization set, with the same keys as above.
# NOTE(review): 'them' is listed twice under 'animate' and 'it' twice under
# 'theme' and 'recipient'; the duplicates collapse once read_lexicon wraps the
# lists in OrderedSet, but they are present in the JSON written below.
# NOTE(review): 'animate' lists 'animals' while 'theme' and 'recipient' list
# 'the animals', so neither spelling survives an intersection of a feature set
# with a role set -- presumably they were meant to be the same item; confirm.
generalization_lexicon = {
    'pronoun': 'him,them,we,us,he,it,they'.split(","),
    'animate': 'him,them,we,us,he,they,them,cookie monster,daddy,lucy,grandma,david,barney,nemo,pooh,the cute puppy,the small puppy,the cute baby,the small baby,the cute cat,the small cat,the girl,animals'.split(','),
    'inanimate': 'it,the book,the milk,the soup,the cake,the cheerios,the toys,the car,a bit of the milk,a few toys,a bit of the soup,a piece of cake,the small cake,a bit of the cheerios,a few cheerios'.split(','),
    # 'agent': 'we,he,they,it,cookie monster,daddy,lucy,grandma,david,barney,nemo,bugs bunny,the cute puppy,the small puppy,the cute baby,the small baby,the cute cat,the small cat,the book,the milk,the soup,the cake,the cheerios,the toys,the car'.split(','),
    'agent': 'we,he,they,cookie monster,daddy,lucy,grandma,winnie the pooh,the girl,danny,dad,papa,pinnochio,ben,charlie,jenny,sam,sammy,sarah,mark,jimmy'.split(','),
    'theme': 'us,him,them,it,cookie monster,daddy,lucy,grandma,david,barney,nemo,pooh,the cute puppy,the small puppy,the cute baby,the small baby,the cute cat,the small cat,the girl,it,the book,the milk,the soup,the cake,the cheerios,the toys,the car,a bit of the milk,a few toys,a bit of the soup,a piece of cake,the small cake,a bit of the cheerios,a few cheerios,the animals'.split(','),
    'recipient': 'us,him,them,it,cookie monster,daddy,lucy,grandma,david,barney,nemo,pooh,the cute puppy,the small puppy,the cute baby,the small baby,the cute cat,the small cat,the girl,it,the book,the milk,the soup,the cake,the cheerios,the toys,the car,a bit of the milk,a few toys,a bit of the soup,a piece of cake,the small cake,a bit of the cheerios,a few cheerios,the animals'.split(',')   
}

# Persist both lexicons as JSON under ../data/lexicon/ so they can be reloaded
# through read_lexicon later in the notebook.
lexicon_dumps = (
    ('../data/lexicon/adaptation.json', adaptation_lexicon),
    ('../data/lexicon/generalization.json', generalization_lexicon),
)
for json_path, entries in lexicon_dumps:
    with open(json_path, 'w') as f:
        json.dump(entries, f)


In [3]:
@dataclass
class Dative:
    """A dative sentence in double-object or prepositional form.

    Attributes:
        dative: construction type, "do" (double object: agent verb recipient
            theme) or "pp" (prepositional: agent verb theme to recipient).
        verb: the verb string, inserted as given.
        agent: expression filling the agent slot.
        theme: expression filling the theme slot.
        recipient: expression filling the recipient slot.
    """
    dative: str
    verb: str
    agent: str
    theme: str
    recipient: str

    def generate(self, marked_theme=False, marked_recipient=False):
        """Render the sentence, store it on ``self.sentence`` and return it.

        Args:
            marked_theme: when true, the theme is rendered with a leading "the ".
            marked_recipient: when true, the recipient is rendered with a
                leading "the ".

        Returns:
            The rendered sentence, terminated by a full stop.

        Raises:
            ValueError: if ``self.dative`` is neither "do" nor "pp".
        """
        if self.dative == "do":
            template = Template("$agent $verb $recipient $theme.")
        elif self.dative == "pp":
            template = Template("$agent $verb $theme to $recipient.")
        else:
            raise ValueError(
                f"unknown dative type {self.dative!r}; expected 'do' or 'pp'"
            )

        # Build the marked forms in locals: the stored fields stay untouched so
        # that calling generate() repeatedly never stacks "the the ...".
        theme = f"the {self.theme}" if marked_theme else self.theme
        recipient = f"the {self.recipient}" if marked_recipient else self.recipient

        self.sentence = template.substitute(
            agent=self.agent,
            verb=self.verb,
            theme=theme,
            recipient=recipient,
        )
        return self.sentence

    def givenness(self, discourse_sentence=None):
        """Placeholder for a givenness manipulation; not implemented yet.

        Raises:
            NotImplementedError: always.
        """
        raise NotImplementedError

In [4]:
def read_lexicon(path):
    """Load a lexicon JSON file and add the derived feature sets.

    Args:
        path: path to a JSON file mapping each key to a list of expressions;
            the keys 'animate', 'inanimate' and 'pronoun' must be present.

    Returns:
        A dict mapping every key of the file to an OrderedSet of its
        expressions, plus three derived keys built from the union of
        'animate' and 'inanimate':
        'long' (more than two space-separated words), 'short' (two words or
        fewer) and 'nominal' (everything not listed under 'pronoun').
    """
    with open(path, "r") as f:
        raw_entries = json.load(f)

    lexicon = {key: OrderedSet(values) for key, values in raw_entries.items()}

    # Every entity expression, animate items first.
    entities = lexicon['animate'].union(lexicon['inanimate'])

    long_items, short_items = [], []
    for expression in entities:
        if len(expression.split(" ")) > 2:
            long_items.append(expression)
        else:
            short_items.append(expression)

    lexicon['long'] = OrderedSet(long_items)
    lexicon['short'] = OrderedSet(short_items)
    lexicon['nominal'] = OrderedSet(list(entities - lexicon['pronoun']))
    return lexicon

In [5]:
# Reload both lexicons from disk through read_lexicon, rebinding the names that
# held the plain dict literals. From here on every value is an OrderedSet and
# the derived 'long', 'short' and 'nominal' keys are available.
adaptation_lexicon = read_lexicon("../data/lexicon/adaptation.json")
generalization_lexicon = read_lexicon("../data/lexicon/generalization.json")

In [6]:
# lexicon_files = os.listdir("../data/lexicon/")
# lexicon = {}
# for file in lexicon_files:
#     with open(f"../data/lexicon/{file}", "r") as f:
#         if 'txt' in file:
#             key = file.replace(".txt", "")
#             lexicon[key] = OrderedSet()
#             for line in f:
#                 lexicon[key].add(line.strip())

# long = OrderedSet([x for x in lexicon['animate'].union(lexicon['inanimate']) if len(x.split(" ")) > 2])
# short = OrderedSet([x for x in lexicon['animate'].union(lexicon['inanimate']) if len(x.split(" ")) <= 2])
# nominals = OrderedSet([x for x in lexicon['animate'].union(lexicon['inanimate']) - lexicon['pronoun']])
# lexicon.update({'long': long, 'short': short, 'nominal': nominals})

# expressions = [v for k,v in lexicon.items()]
# expressions = OrderedSet().union(*expressions)

# Combined lexicon: for every key, the adaptation items united with the
# generalization items stored under the same key.
lexicon = {
    feature: adaptation_items.union(generalization_lexicon[feature])
    for feature, adaptation_items in adaptation_lexicon.items()
}


In [7]:
{k: len(v) for k,v in lexicon.items()}, {k: len(v) for k,v in adaptation_lexicon.items()}, {k: len(v) for k,v in generalization_lexicon.items()}

({'pronoun': 11,
  'animate': 31,
  'inanimate': 25,
  'agent': 42,
  'theme': 52,
  'recipient': 52,
  'long': 22,
  'short': 34,
  'nominal': 46},
 {'pronoun': 5,
  'animate': 11,
  'inanimate': 11,
  'agent': 18,
  'theme': 21,
  'recipient': 21,
  'long': 9,
  'short': 13,
  'nominal': 18},
 {'pronoun': 7,
  'animate': 20,
  'inanimate': 15,
  'agent': 25,
  'theme': 32,
  'recipient': 32,
  'long': 13,
  'short': 22,
  'nominal': 28})

In [8]:
# The three binary feature dimensions; each value is also a key of the lexicon.
pronominality, animacy, length = (
    ['pronoun', 'nominal'],
    ['animate', 'inanimate'],
    ['long', 'short'],
)

# Shallow copy of the combined lexicon.
lex = dict(lexicon)

# Every (pronominality, animacy, length) triple, in the same order that
# itertools.product would yield them; impossible triples are pruned later.
features = [
    (pronominality_value, animacy_value, length_value)
    for pronominality_value in pronominality
    for animacy_value in animacy
    for length_value in length
]

def generate_feature_combinations(lex = lexicon):
    """List the (theme_features, recipient_features) pairs that `lex` can realise.

    Every pair drawn from ``features`` x ``features`` is considered. A pair is
    kept when both feature triples select at least one item from `lex`, except
    when each of them selects exactly one item.

    Args:
        lex: mapping from feature name to an OrderedSet of expressions. The
            default is the object `lexicon` was bound to when this function
            was defined.

    Returns:
        The surviving pairs, in the order produced by ``product``.
    """
    def matching_items(feature_triple):
        # Items that carry all three features of the triple.
        return OrderedSet.intersection(*[lex[name] for name in feature_triple])

    kept_pairs = []
    for feature_pair in product(features, features):
        theme_triple, recipient_triple = feature_pair
        theme_count = len(matching_items(theme_triple))
        recipient_count = len(matching_items(recipient_triple))

        if theme_count < 1 or recipient_count < 1:
            continue  # one side has nothing to sample from
        if theme_count == 1 and recipient_count == 1:
            continue  # a single candidate on each side
        kept_pairs.append(feature_pair)
    return kept_pairs

# Keep only the feature pairs that the adaptation lexicon can realise, then
# display how many survive the pruning.
feature_combinations = generate_feature_combinations(adaptation_lexicon)
len(feature_combinations)

35

In [9]:
def generate_feature_space(feature_combo, lex):
    """Return the sampling spaces for one (theme, recipient) feature pair.

    Args:
        feature_combo: pair ``(theme_features, recipient_features)``, each an
            iterable of feature names that are keys of `lex`.
        lex: mapping from feature/role name to an OrderedSet of expressions.

    Returns:
        A tuple ``(agents, themes, recipients)``: ``lex['agent']`` unchanged,
        the items of ``lex['theme']`` that carry every theme feature, and the
        items of ``lex['recipient']`` that carry every recipient feature.
    """
    theme_keys, recipient_keys = feature_combo
    theme_sets = [lex[key] for key in (*theme_keys, 'theme')]
    recipient_sets = [lex[key] for key in (*recipient_keys, 'recipient')]

    agent_space = lex['agent']
    theme_space = OrderedSet.intersection(*theme_sets)
    recipient_space = OrderedSet.intersection(*recipient_sets)
    return agent_space, theme_space, recipient_space

In [10]:
# def sample_items(agents, themes, recipients, N):
#     sampled_agents, sampled_themes, sampled_recipients = [], [], []
#     for _ in range(N):
        

In [11]:
def sample_items(agents, themes, recipients, N):
    """Draw N (agent, theme, recipient) triples of distinct, non-conflicting items.

    Each draw picks a theme, then a recipient that is neither the theme nor in
    conflict with it, then an agent that is neither of the two nor in conflict
    with either of them. Draws are independent: the input spaces are never
    modified, so the same item may be sampled again in a later draw.

    Args:
        agents: OrderedSet of candidate agents.
        themes: OrderedSet of candidate themes.
        recipients: OrderedSet of candidate recipients.
        N: number of triples to draw.

    Returns:
        Three parallel lists ``(sampled_agents, sampled_themes,
        sampled_recipients)``, each of length N.

    Raises:
        IndexError: if a draw has no candidate left for one of the slots (the
            same exception type ``random.choice`` raises on an empty sequence).
    """
    def conflicts_of(item):
        # config.CONFLICTS is assumed to map an expression to the expressions
        # it must not co-occur with -- TODO confirm against config.
        return OrderedSet(config.CONFLICTS[item] if item in config.CONFLICTS.keys() else [])

    theme_pool = list(themes)  # loop-invariant, so built once
    sampled_agents, sampled_themes, sampled_recipients = [], [], []
    for _ in range(N):
        if not theme_pool:
            raise IndexError("cannot sample a theme from an empty theme space")
        sampled_theme = random.choice(theme_pool)

        conflict_set = conflicts_of(sampled_theme)
        recipient_space = recipients - OrderedSet([sampled_theme]) - conflict_set
        if not recipient_space:
            raise IndexError(f"no recipient available for theme {sampled_theme!r}")
        sampled_recipient = random.choice(list(recipient_space))

        # The agent must be compatible with the recipient as well. Re-adding the
        # theme's conflicts at this point would be a no-op, so the recipient's
        # conflicts are the ones that extend the exclusion set.
        conflict_set = conflict_set.union(conflicts_of(sampled_recipient))
        agent_space = agents - OrderedSet([sampled_theme, sampled_recipient]) - conflict_set
        if not agent_space:
            raise IndexError(
                f"no agent available for theme {sampled_theme!r} "
                f"and recipient {sampled_recipient!r}"
            )
        sampled_agent = random.choice(list(agent_space))

        sampled_agents.append(sampled_agent)
        sampled_themes.append(sampled_theme)
        sampled_recipients.append(sampled_recipient)
    return sampled_agents, sampled_themes, sampled_recipients

In [12]:
# Scratch check of a single feature pair: theme and recipient are both short
# inanimate pronouns.
fc = (('pronoun', 'inanimate', 'short'), ('pronoun', 'inanimate', 'short'))

# (agents, themes, recipients) spaces for that pair in the generalization lexicon.
# NOTE(review): 'it' looks like the only item listed under both 'pronoun' and
# 'inanimate' there, so both spaces would contain just 'it' and sampling a
# recipient distinct from the theme could not succeed -- presumably why the
# sample_items call below is commented out; confirm.
feature_space = generate_feature_space(fc, generalization_lexicon)
# sample items
# print(feature_space)
# sample_items(*feature_space, 1)

# Display the pruned list of feature pairs.
feature_combinations

[(('pronoun', 'animate', 'short'), ('pronoun', 'animate', 'short')),
 (('pronoun', 'animate', 'short'), ('pronoun', 'inanimate', 'short')),
 (('pronoun', 'animate', 'short'), ('nominal', 'animate', 'long')),
 (('pronoun', 'animate', 'short'), ('nominal', 'animate', 'short')),
 (('pronoun', 'animate', 'short'), ('nominal', 'inanimate', 'long')),
 (('pronoun', 'animate', 'short'), ('nominal', 'inanimate', 'short')),
 (('pronoun', 'inanimate', 'short'), ('pronoun', 'animate', 'short')),
 (('pronoun', 'inanimate', 'short'), ('nominal', 'animate', 'long')),
 (('pronoun', 'inanimate', 'short'), ('nominal', 'animate', 'short')),
 (('pronoun', 'inanimate', 'short'), ('nominal', 'inanimate', 'long')),
 (('pronoun', 'inanimate', 'short'), ('nominal', 'inanimate', 'short')),
 (('nominal', 'animate', 'long'), ('pronoun', 'animate', 'short')),
 (('nominal', 'animate', 'long'), ('pronoun', 'inanimate', 'short')),
 (('nominal', 'animate', 'long'), ('nominal', 'animate', 'long')),
 (('nominal', 'anima

In [15]:
# Generate the generalization set: `items_per_combination` double-object datives
# for every feature combination, sampled from the generalization lexicon.
# NOTE(review): feature_combinations was pruned against the adaptation lexicon;
# a combination that the generalization lexicon cannot realise makes
# sample_items fail -- confirm this is the intended pruning.
items_per_combination = 100
generalization_set = []
for fc in feature_combinations:
    feature_space = generate_feature_space(fc, generalization_lexicon)
    sampled_items = sample_items(*feature_space, items_per_combination)

    for a, t, r in zip(*sampled_items):
        dative = Dative("do", "pilked", a, t, r)
        print(dative.generate())
        # Keep the generated item (its text is on dative.sentence) so the set
        # is not empty once the loop finishes.
        generalization_set.append(dative)

daddy pilked them him.
we pilked them him.
the small baby pilked him them.
the small cat pilked him them.
daddy pilked us him.
nemo pilked them him.
the cake pilked them us.
the small cat pilked him them.
barney pilked them him.
the cute baby pilked him us.
david pilked him them.
cookie monster pilked him us.
we pilked them him.
cookie monster pilked them him.
lucy pilked him them.
the small puppy pilked us him.
the cake pilked us them.
the milk pilked us him.
the cheerios pilked him them.
it pilked them him.
we pilked him them.
they pilked him us.
the cake pilked him them.
we pilked them him.
the book pilked us him.
the toys pilked him them.
the small puppy pilked him them.
we pilked him them.
the toys pilked us them.
the book pilked him them.
david pilked him them.
cookie monster pilked them him.
the cute baby pilked us him.
the milk pilked him them.
the cake pilked him them.
the book pilked them us.
grandma pilked him them.
the cake pilked us them.
the car pilked them him.
the cake 

In [325]:
# Split the combined lexicon into an adaptation part (a random fraction of each
# feature cell) and a generalization part (everything else).
# Seed the global generator so the draws below are repeatable when this cell is
# run from the same starting state.
random.seed(42)

# Rebinds adaptation_lexicon: items are accumulated as plain lists per key first.
adaptation_lexicon = defaultdict(list)
# Fraction of each feature cell (and of the agents) assigned to adaptation.
adaptation_pct = 0.4

# NOTE: this loop rebinds the notebook-level names p, a and l.
for p, a, l in features:
    # Items that carry all three features of this triple.
    space = OrderedSet.intersection(lexicon[p], lexicon[a], lexicon[l])
    space_length = len(space)

    # Draw floor(adaptation_pct * size) items without replacement; with 0.4 a
    # cell holding fewer than three items contributes nothing.
    sampled_items = random.sample(list(space), math.floor(adaptation_pct*space_length))
    # Record the sampled items under each of the three feature keys.
    for item in [p, a, l]:
        adaptation_lexicon[item].extend(sampled_items)

    # Only the sampled items that the combined lexicon lists as themes /
    # recipients are added to those roles.
    adaptation_lexicon['theme'].extend(list(lexicon['theme'].intersection(sampled_items)))
    adaptation_lexicon['recipient'].extend(list(lexicon['recipient'].intersection(sampled_items)))

# Agents are sampled separately from the feature cells, using the same fraction.
adaptation_lexicon['agent'] = random.sample(list(lexicon['agent']), math.floor(adaptation_pct*len(lexicon['agent'])))

# Freeze into a plain dict of OrderedSets, which also drops repeated items.
adaptation_lexicon = {k: OrderedSet(v) for k,v in adaptation_lexicon.items()}

# Generalization lexicon: per key, whatever was not assigned to adaptation.
generalization_lexicon = {k: v - adaptation_lexicon[k] for k,v in lexicon.items()}


In [326]:
adaptation_lexicon

{'pronoun': OrderedSet(['me', 'us']),
 'animate': OrderedSet(['me', 'us', 'the little puppy', 'the little kitty', 'grandma', 'the bird', 'mommy', 'the kitty', 'becky', 'santa claus', 'the dog']),
 'long': OrderedSet(['the little puppy', 'the little kitty', 'some of the soup', 'the beautiful doll', 'a few toys', 'the big teddy']),
 'theme': OrderedSet(['me', 'us', 'the little kitty', 'the little puppy', 'becky', 'mommy', 'grandma', 'the kitty', 'the dog', 'the bird', 'santa claus', 'the big teddy', 'the beautiful doll', 'some of the soup', 'a few toys', 'the ball', 'the car', 'the lollipop', 'the juice', 'the pooh', 'the soup', 'the cake', 'pooh', 'donald duck', 'some milk', 'some candy']),
 'recipient': OrderedSet(['me', 'us', 'the little kitty', 'the little puppy', 'becky', 'mommy', 'grandma', 'the kitty', 'the dog', 'the bird', 'santa claus', 'the big teddy', 'the beautiful doll', 'the ball', 'the car', 'the lollipop', 'the juice', 'pooh', 'donald duck']),
 'short': OrderedSet(['me',

In [333]:
generalization_lexicon

{'theme': OrderedSet(['him', 'her', 'them', 'it', 'mary', 'grandpa', 'the book', 'the toy', 'the tiger', 'the lion', 'the bear', 'rudolph', 'king kong']),
 'animate': OrderedSet(['him', 'her', 'them', 'mary', 'grandpa']),
 'recipient': OrderedSet(['him', 'her', 'them', 'it', 'mary', 'grandpa', 'the book', 'the toy', 'the tiger', 'the lion', 'the bear', 'rudolph', 'king kong']),
 'agent': OrderedSet(['he', 'she', 'we', 'mary', 'grandpa', 'king kong']),
 'pronoun': OrderedSet(['he', 'him', 'she', 'her', 'them', 'it', 'we']),
 'inanimate': OrderedSet(['it', 'the book', 'the toy', 'the tiger', 'the lion', 'the bear', 'rudolph', 'king kong']),
 'long': OrderedSet(),
 'short': OrderedSet(['him', 'her', 'them', 'mary', 'grandpa', 'it', 'the book', 'the toy', 'the tiger', 'the lion', 'the bear', 'rudolph', 'king kong']),
 'nominal': OrderedSet(['mary', 'grandpa', 'the book', 'the toy', 'the tiger', 'the lion', 'the bear', 'rudolph', 'king kong'])}

In [327]:
# Every expression that occurs anywhere in the adaptation lexicon.
adaptation_expressions = OrderedSet().union(*list(adaptation_lexicon.values()))

# Word-level vocabulary of those expressions, minus a few function words.
function_words = OrderedSet(['the', 'of', 'a', 'an'])
adaptation_tokens = []
for phrase in adaptation_expressions:
    adaptation_tokens.extend(phrase.split())
adaptation_vocab = OrderedSet(adaptation_tokens) - function_words

In [328]:
# Every expression of the combined lexicon. It is bound here because no
# un-commented cell above defines `expressions`, which made this cell fail with
# NameError on a fresh kernel.
expressions = OrderedSet().union(*lexicon.values())

# Expressions that share at least one word with the adaptation vocabulary.
ignore_list = [
    exp
    for exp in expressions
    if len(OrderedSet(exp.split()).intersection(adaptation_vocab)) > 0
]

len(ignore_list)

59

In [329]:
# Generalization lexicon: the combined lexicon with every ignored expression
# removed from each key.
ignored_expressions = OrderedSet(ignore_list)
generalization_lexicon = {
    key: items - ignored_expressions
    for key, items in lexicon.items()
}

In [332]:
adaptation_lexicon

{'pronoun': OrderedSet(['me', 'us']),
 'animate': OrderedSet(['me', 'us', 'the little puppy', 'the little kitty', 'grandma', 'the bird', 'mommy', 'the kitty', 'becky', 'santa claus', 'the dog']),
 'long': OrderedSet(['the little puppy', 'the little kitty', 'some of the soup', 'the beautiful doll', 'a few toys', 'the big teddy']),
 'theme': OrderedSet(['me', 'us', 'the little kitty', 'the little puppy', 'becky', 'mommy', 'grandma', 'the kitty', 'the dog', 'the bird', 'santa claus', 'the big teddy', 'the beautiful doll', 'some of the soup', 'a few toys', 'the ball', 'the car', 'the lollipop', 'the juice', 'the pooh', 'the soup', 'the cake', 'pooh', 'donald duck', 'some milk', 'some candy']),
 'recipient': OrderedSet(['me', 'us', 'the little kitty', 'the little puppy', 'becky', 'mommy', 'grandma', 'the kitty', 'the dog', 'the bird', 'santa claus', 'the big teddy', 'the beautiful doll', 'the ball', 'the car', 'the lollipop', 'the juice', 'pooh', 'donald duck']),
 'short': OrderedSet(['me',

In [310]:
# OrderedSet.intersection(*recipient_features)
# feature_combinations

# Feature pair used by the sampling cells below, as (theme_features,
# recipient_features): a short animate pronoun theme with a short inanimate
# nominal recipient.
fc = (('pronoun', 'animate', 'short'), ('nominal', 'inanimate', 'short'))
# theme_features, recipient_features = fc
# theme_features = [lexicon[feature] for feature in theme_features] + [lexicon['theme']]
# recipient_features = [lexicon[feature] for feature in recipient_features] + [lexicon['recipient']]

# OrderedSet.intersection(*theme_features), OrderedSet.intersection(*recipient_features)


# NOTE(review): this redefines generate_feature_space with the same behaviour as
# the definition in an earlier cell; one of the two cells can be removed.
def generate_feature_space(feature_combo, lex):
    """Return the sampling spaces for one (theme, recipient) feature pair.

    Args:
        feature_combo: pair ``(theme_features, recipient_features)``, each an
            iterable of feature names that are keys of `lex`.
        lex: mapping from feature/role name to an OrderedSet of expressions.

    Returns:
        A tuple ``(agents, themes, recipients)``: ``lex['agent']`` unchanged,
        the items of ``lex['theme']`` that carry every theme feature, and the
        items of ``lex['recipient']`` that carry every recipient feature.
    """
    theme_keys, recipient_keys = feature_combo
    theme_sets = [lex[key] for key in (*theme_keys, 'theme')]
    recipient_sets = [lex[key] for key in (*recipient_keys, 'recipient')]

    agent_space = lex['agent']
    theme_space = OrderedSet.intersection(*theme_sets)
    recipient_space = OrderedSet.intersection(*recipient_sets)
    return agent_space, theme_space, recipient_space
    

In [311]:
# for fc in feature_combinations:
#     theme_features, recipient_features = fc
#     theme_features = [lexicon[feature] for feature in theme_features] + [lexicon['theme']]
#     recipient_features = [lexicon[feature] for feature in recipient_features] + [lexicon['recipient']]
#     theme = random.choice(list(OrderedSet.intersection(*theme_features)))
#     recipient = random.choice(list(OrderedSet.intersection(*recipient_features)))
#     print(theme, recipient)

# random.seed(1234)

def sample_items(agents, themes, recipients, N):
    """Draw N (agent, theme, recipient) triples of distinct, non-conflicting items.

    Each draw picks a theme, then a recipient that is neither the theme nor in
    conflict with it, then an agent that is neither of the two nor in conflict
    with either of them. Draws are independent: the input spaces are never
    modified, so the same item may be sampled again in a later draw.

    Args:
        agents: OrderedSet of candidate agents.
        themes: OrderedSet of candidate themes.
        recipients: OrderedSet of candidate recipients.
        N: number of triples to draw.

    Returns:
        Three parallel lists ``(sampled_agents, sampled_themes,
        sampled_recipients)``, each of length N.

    Raises:
        IndexError: if a draw has no candidate left for one of the slots (the
            same exception type ``random.choice`` raises on an empty sequence).
    """
    def conflicts_of(item):
        # config.CONFLICTS is assumed to map an expression to the expressions
        # it must not co-occur with -- TODO confirm against config.
        return OrderedSet(config.CONFLICTS[item] if item in config.CONFLICTS.keys() else [])

    theme_pool = list(themes)  # loop-invariant, so built once
    sampled_agents, sampled_themes, sampled_recipients = [], [], []
    for _ in range(N):
        if not theme_pool:
            raise IndexError("cannot sample a theme from an empty theme space")
        sampled_theme = random.choice(theme_pool)

        # Per-draw spaces are kept in their own names: rebinding `recipients`
        # or `agents` here would shrink the spaces for every later draw.
        conflict_set = conflicts_of(sampled_theme)
        recipient_space = recipients - OrderedSet([sampled_theme]) - conflict_set
        if not recipient_space:
            raise IndexError(f"no recipient available for theme {sampled_theme!r}")
        sampled_recipient = random.choice(list(recipient_space))

        # The agent must be compatible with the recipient as well. Re-adding the
        # theme's conflicts at this point would be a no-op, so the recipient's
        # conflicts are the ones that extend the exclusion set.
        conflict_set = conflict_set.union(conflicts_of(sampled_recipient))
        agent_space = agents - OrderedSet([sampled_theme, sampled_recipient]) - conflict_set
        if not agent_space:
            raise IndexError(
                f"no agent available for theme {sampled_theme!r} "
                f"and recipient {sampled_recipient!r}"
            )
        sampled_agent = random.choice(list(agent_space))

        sampled_agents.append(sampled_agent)
        sampled_themes.append(sampled_theme)
        sampled_recipients.append(sampled_recipient)
    return sampled_agents, sampled_themes, sampled_recipients


In [312]:
# generate_feature_space(fc, adaptation_lexicon)
# Smoke test: draw 5 triples for the current `fc` from the adaptation lexicon.
# NOTE(review): the result is neither assigned nor the last expression of the
# cell, so it is not displayed under IPython's default display setting.
sample_items(*generate_feature_space(fc, adaptation_lexicon), 5)

# generate generalization items
# NOTE(review): sampled_items is overwritten on every iteration and nothing is
# appended to generalization_set, so the list is still empty after the loop.
# The loop also rebinds the notebook-level `fc`.
generalization_set = []
for fc in feature_combinations:
    sampled_items = sample_items(*generate_feature_space(fc, generalization_lexicon), 10)

(['the little bird',
  'grandpa',
  'the little puppy',
  'the puppy',
  'the little bird'],
 ['us', 'me', 'me', 'me', 'me'],
 ['the ball', 'the juice', 'the ball', 'the juice', 'the ball'])

In [231]:
# Sample a small adaptation set for the current feature pair `fc`.
# generate_feature_space(fc, lexicon) returns lexicon['agent'] together with the
# theme and recipient intersections, i.e. the three spaces this call needs,
# without relying on theme_features / recipient_features, which no un-commented
# cell defines.
sampled_instances = sample_items(*generate_feature_space(fc, lexicon), 5)

# generate sentences and store

# Word-level vocabulary of the sampled agents, themes and recipients, minus a
# few function words.
adaptation_vocab = OrderedSet([xx for x in OrderedSet(sum(sampled_instances, [])) for xx in x.split()]) - OrderedSet(['the', 'of', 'a', 'an'])

# Every expression of the combined lexicon (no un-commented cell binds this name).
expressions = OrderedSet().union(*lexicon.values())

# lexical expressions that have at least one token overlap with the sampled instances, we want to remove all of these when sampling the generalization sets.
ignore_list = [
    exp
    for exp in expressions
    if len(OrderedSet(exp.split()).intersection(adaptation_vocab)) > 0
]

generalization_lexicon = {k: v - OrderedSet(ignore_list) for k, v in lexicon.items()}

# proceed with generating generalization sets.
for fc_g in feature_combinations:
    ag, tg, rg = generate_feature_space(fc_g, generalization_lexicon)
    try:
        sampled_instances_gen = sample_items(ag, tg, rg, min(len(ag), len(tg), len(rg), 5))
    except IndexError:
        # A sampling space ran empty for this combination; skip it. Any other
        # error is a real bug and is allowed to surface.
        continue
    print(len(sampled_instances_gen[0]))

2
1
2
2
2
2
1
1
1
1
1
2
1
5
5
2
5
2
1
5
5
2
5
2
1
5
5
2
5
2
1
5
5
2
5


In [188]:
sampled_instances

(['david', 'mommy', 'the little cat', 'the little puppy', 'david'],
 ['him', 'me', 'her', 'her', 'them'],
 ['the toy', 'the lion', 'the tiger', 'the doll', 'donald duck'])

In [191]:
# Render every sampled triple as a prepositional dative and print it.
for agent_item, theme_item, recipient_item in zip(*sampled_instances):
    dative = Dative("pp", "is pilking", agent_item, theme_item, recipient_item)
    rendered_sentence = dative.generate()
    print(rendered_sentence)

david is pilking him to the toy.
mommy is pilking me to the lion.
the little cat is pilking her to the tiger.
the little puppy is pilking her to the doll.
david is pilking them to donald duck.


In [172]:
# Word-level vocabulary of the sampled agents, themes and recipients, minus a
# few function words.
adaptation_vocab = OrderedSet([xx for x in OrderedSet(sum(sampled_instances, [])) for xx in x.split()]) - OrderedSet(['the', 'of', 'a', 'an'])

# Every expression of the combined lexicon (no un-commented cell binds this name).
expressions = OrderedSet().union(*lexicon.values())

# Rebuild the list from scratch so that re-running the cell does not keep
# appending the same expressions to an existing ignore_list.
ignore_list = [
    exp
    for exp in expressions
    if len(OrderedSet(exp.split()).intersection(adaptation_vocab)) > 0
]

In [175]:
# NOTE(review): `removal` is not defined in any visible cell, so this expression
# raises NameError on a fresh kernel; presumably it held the sampled adaptation
# items -- confirm, then define it explicitly or drop this cell.
removal.intersection(OrderedSet(ignore_list))

OrderedSet(['david', 'mommy', 'the little cat', 'the little puppy', 'him', 'me', 'her', 'them', 'the toy', 'the lion', 'the tiger', 'the doll', 'donald duck'])

In [179]:
ignore_list

OrderedSet(['the cat', 'the puppy', 'the little bird', 'the little kitty', 'the little doggy', 'the little dog', 'the little baby', 'the duck', 'the beautiful doll', 'the yellow duck', 'a little soup', 'a little cake'])

In [152]:
sampled_instances

(['david', 'mommy', 'the little cat', 'the little puppy', 'david'],
 ['him', 'me', 'her', 'her', 'them'],
 ['the toy', 'the lion', 'the tiger', 'the doll', 'donald duck'])

In [None]:
'''
Algo:

sample N from each:
    remove conflicts and repetitions
    return agent, theme, recipient

loop through all feature combinations:

agent space = lexicon['agent']
theme space = lexicon['theme']
recipient space = lexicon['recipient']
sample(agent_space, theme_space, recipient_space, N=k)

generalization_lexicon = lexicon - all items in the sampled adaptation agent, theme and recipient set
get generalization_spaces
for each combination of features:
    sample(*generalization_spaces, N=n)


TODO:
sample function
prune features_combinations for impossible cases, e.g. pronoun + inanimate + long

'''