In [3]:
import argparse
import config
# import inflect
import json
import math
# import os
import random
import utils
import pathlib

from collections import defaultdict
from string import Template
from dataclasses import dataclass
from itertools import product
from ordered_set import OrderedSet

In [4]:
@dataclass
class Dative:
    """A single dative sentence specification.

    ``dative`` selects the surface frame used by :meth:`generate`:
    ``"do"`` renders ``agent verb recipient theme .`` and ``"pp"`` renders
    ``agent verb theme to recipient .``.
    """

    dative: str  # frame selector: "do" or "pp"
    verb: str
    agent: str
    theme: str
    recipient: str

    def generate(self, marked_theme=False, marked_recipient=False):
        """Render the sentence, store it on ``self.sentence`` and return it.

        Args:
            marked_theme: when true, the theme is prefixed with ``"the "``.
            marked_recipient: when true, the recipient is prefixed with ``"the "``.

        Returns:
            The rendered sentence string.

        Raises:
            ValueError: if ``self.dative`` is neither ``"do"`` nor ``"pp"``.
        """
        if self.dative == "do":
            template = Template("$agent $verb $recipient $theme .")
        elif self.dative == "pp":
            template = Template("$agent $verb $theme to $recipient .")
        else:
            # Fail with a clear message instead of an UnboundLocalError on `template`.
            raise ValueError(
                f"unknown dative type {self.dative!r}; expected 'do' or 'pp'"
            )

        # Work on local copies: writing the marked forms back to the instance
        # would stack another "the " on every subsequent call.
        theme = f"the {self.theme}" if marked_theme else self.theme
        recipient = f"the {self.recipient}" if marked_recipient else self.recipient

        self.sentence = template.substitute(
            agent=self.agent, verb=self.verb, theme=theme, recipient=recipient
        )
        return self.sentence

    def givenness(self, discourse_sentence=None):
        """Placeholder; not implemented yet.

        Raises:
            NotImplementedError: always.
        """
        raise NotImplementedError


def read_lexicon(path):
    """Load a lexicon JSON file and add derived ``long``/``short``/``nominal`` sets.

    Every top-level value in the JSON object is converted to an ``OrderedSet``.
    Three entries are then derived from the union of the ``"animate"`` and
    ``"inanimate"`` entries (in that order):

    * ``"long"``    - items with more than two space-separated tokens,
    * ``"short"``   - items with at most two space-separated tokens,
    * ``"nominal"`` - items that are not in the ``"pronoun"`` entry.

    Args:
        path: path of the JSON file to read.

    Returns:
        dict mapping each category name to an ``OrderedSet`` of strings.

    Raises:
        KeyError: if ``"animate"``, ``"inanimate"`` or ``"pronoun"`` is missing.
    """
    # JSON text is UTF-8; do not depend on the platform's locale encoding.
    with open(path, "r", encoding="utf-8") as f:
        raw = json.load(f)

    lexicon = {k: OrderedSet(v) for k, v in raw.items()}

    # Build the animate + inanimate pool once and reuse it for all three splits.
    referents = lexicon["animate"].union(lexicon["inanimate"])

    long = OrderedSet([x for x in referents if len(x.split(" ")) > 2])
    short = OrderedSet([x for x in referents if len(x.split(" ")) <= 2])
    nominals = OrderedSet([x for x in referents - lexicon["pronoun"]])

    lexicon.update({"long": long, "short": short, "nominal": nominals})
    return lexicon


def generate_feature_combinations(lex, features):
    """Return the (theme, recipient) feature pairings that the lexicon can fill.

    Every ordered pair drawn from ``features`` x ``features`` is considered.
    For each side, the lexicon entries named by that side's features are
    intersected. A pairing is kept when both intersections are non-empty and
    they are not both exactly one item long.

    Args:
        lex: mapping from feature name to an ``OrderedSet`` of lexical items.
        features: iterable of feature-name tuples.

    Returns:
        list of ``(theme_features, recipient_features)`` tuples, in
        ``itertools.product`` order.
    """
    kept = []
    for pair in product(features, features):
        theme_spec, recipient_spec = pair

        # Look up both sides before intersecting either one.
        theme_sets = [lex[name] for name in theme_spec]
        recipient_sets = [lex[name] for name in recipient_spec]

        n_theme = len(OrderedSet.intersection(*theme_sets))
        n_recipient = len(OrderedSet.intersection(*recipient_sets))

        # At least one side has no candidate item at all.
        if n_theme < 1 or n_recipient < 1:
            continue
        # Both sides are singletons: skipped.
        if n_theme == 1 and n_recipient == 1:
            continue

        kept.append(pair)
    return kept

In [5]:
# Load both lexicons from JSON. read_lexicon also derives the "long", "short"
# and "nominal" entries that are used as feature names below.
adaptation_lexicon = read_lexicon("../data/lexicon/adaptation.json")
generalization_lexicon = read_lexicon("../data/lexicon/generalization.json")

# Disabled: key-wise union of the two lexicons loaded above.
# full_lexicon = {
#     k: adaptation_lexicon[k].union(generalization_lexicon[k])
#     for k in adaptation_lexicon.keys()
# }

# Feature dimensions. Each value is used as a key into the lexicon dict.
pronominality = ["pronoun", "nominal"]
animacy = ["animate", "inanimate"]
length = ["long", "short"]

# generate all possible combinations of features for theme and recipient and then prune
# Each element of `features` is one (pronominality, animacy, length) triple.
features = list(product(pronominality, animacy, length))

# NOTE: pruning is done against the adaptation lexicon only;
# generalization_lexicon is not consulted in this cell.
feature_combinations = generate_feature_combinations(adaptation_lexicon, features)

In [6]:
# Sort every feature combination, per dative, into a "plausible" or
# "implausible" bucket. Combinations are stored by the identifier returned
# from utils.generate_acronym_tuple; membership in config.IMPLAUSIBLE[dative]
# decides the bucket.
plausibility = {
    "do": {"plausible": [], "implausible": []},
    "pp": {"plausible": [], "implausible": []},
}
for dative in ("do", "pp"):
    for fc in feature_combinations:
        fc_id = utils.generate_acronym_tuple(fc)
        is_implausible = fc_id in config.IMPLAUSIBLE[dative]
        plausibility[dative]["implausible" if is_implausible else "plausible"].append(fc_id)

In [7]:
# Number of feature combinations flagged implausible for the 'do' dative.
len(plausibility['do']['implausible'])

19

In [8]:
def specify_sample_size(plausibility, N=20):
    """Assign a sample size to every combination; implausible ones get zero.

    For each dative, the ``N`` samples that each implausible combination would
    have received are spread evenly over the plausible combinations. Every
    plausible combination therefore gets
    ``floor(N + n_implausible * N / n_plausible)`` samples and every
    implausible combination gets 0. If a dative has no plausible combinations
    there is nothing to redistribute to, and all of its entries are 0.

    Args:
        plausibility: mapping ``dative -> {"plausible": [...], "implausible": [...]}``
            of combination identifiers. Any dative labels are accepted.
        N: base number of samples per combination.

    Returns:
        dict mapping each dative to a plain dict ``{identifier: sample_size}``,
        plausible identifiers first, then implausible ones.

    Side effects:
        Prints ``<dative> <total samples over plausible combinations>`` once
        per dative.
    """
    sample_sizes = {}
    for dative, splits in plausibility.items():
        plausible, implausible = splits['plausible'], splits['implausible']
        n_plausible, n_implausible = len(plausible), len(implausible)

        if n_plausible:
            addition = n_implausible * N / n_plausible
            per_plausible = math.floor(N + addition)
        else:
            # Avoid dividing by zero when every combination is implausible.
            per_plausible = 0

        plausible_amt = int(n_plausible * per_plausible)
        print(dative, plausible_amt)

        sizes = {acronym: per_plausible for acronym in plausible}
        # Written second so an identifier present in both lists ends up at 0.
        sizes.update(dict.fromkeys(implausible, 0))
        sample_sizes[dative] = sizes
    return sample_sizes

In [10]:
# Compute per-combination sample sizes with a base of N=20 per combination.
sample_sizes = specify_sample_size(plausibility, 20)

do 688
pp 680


In [12]:
# Inspect the per-combination sample sizes for the 'do' dative.
sample_sizes['do']

{'nalpas': 43,
 'nalpis': 43,
 'nalnal': 43,
 'nalnas': 43,
 'naspas': 43,
 'naspis': 43,
 'nasnal': 43,
 'nasnas': 43,
 'nilpas': 43,
 'nilpis': 43,
 'nilnal': 43,
 'nilnas': 43,
 'nispas': 43,
 'nispis': 43,
 'nisnal': 43,
 'nisnas': 43,
 'paspas': 0,
 'paspis': 0,
 'pasnal': 0,
 'pasnas': 0,
 'pasnil': 0,
 'pasnis': 0,
 'pispas': 0,
 'pisnal': 0,
 'pisnas': 0,
 'pisnil': 0,
 'pisnis': 0,
 'nalnil': 0,
 'nalnis': 0,
 'nasnil': 0,
 'nasnis': 0,
 'nilnil': 0,
 'nilnis': 0,
 'nisnil': 0,
 'nisnis': 0}