In [1]:
import inflect
from pattern.en import singularize, pluralize
import glob
import re
from collections import defaultdict
import csv
import random

In [2]:
# Single inflect engine shared by the cells below (plural_noun / plural_verb calls).
inflector = inflect.engine()

In [3]:
def is_plural(plural_form):
    """Report whether ``singularize`` changes the given word.

    Returns True when the singularized form differs from ``plural_form``
    and False when the word comes back unchanged.
    """
    return plural_form != singularize(plural_form)

## predicates

- **sports, activity:** involve/s [VBG], require/s [VBG], improve/s.
- **fruits, vegetables, clothes, toys:** can be [VBN], cause/s [VBG].
- **vehicles, weapons, plants, trees, shapes:** can [VB], can be [VBN], use [NNS] 
- **birds, animals, fish, mammal:** can [VB], can be [VBN], use [NNS], love [NN/NNS]

## Verbs

dax, blick, wif, gyre, gimble

```
verbs = [{"dax": {"VB": "dax", "VBG": "daxing", "VBD": "daxed"}}, {"blick": {...}}]

or

vb
```

## Nouns

wug, zup, blicket, fep, 

In [4]:
# Singular category labels.
categories = [
    "furniture", "fruit", "vehicle", "weapon", "vegetable",
    "tool", "bird", "sport", "toy", "clothing",
]

# A predicate frame is a (verb, arg) pair: `verb` is the literal predicate text
# and `arg` is a bracketed tag naming the morpho-syntactic form (of a verb,
# noun or adjective) that fills the blank.
_ACTIVITY_FRAMES = (
    ('involves', '[VBG]'), ('requires', '[VBG]'), ('includes', '[VBG]'), ('is', '[JJ]'),
)
_ANIMATE_FRAMES = (
    ('can', '[VB]'), ('can be', '[VBD]'), ('are', '[JJ]'),
    ('use a', '[NN]'), ('love', '[NNS]'), ('have', '[NNS]'),
)
_ARTIFACT_FRAMES = (
    ('can', '[VB]'), ('can be', '[VBD]'), ('are', '[JJ]'),
    ('use a', '[NN]'), ('have', '[NNS]'),
)
_PASSIVE_FRAMES = (
    ('can be', '[VBD]'), ('are', '[JJ]'), ('have', '[NNS]'),
)

# Plural category labels grouped by the frame set they share. The group order
# and the label order inside each group fix the key order of `predicates`.
# "toys" is deliberately absent (it used to carry only 'can be' and 'are').
_FRAME_GROUPS = (
    (("sports", "activities"), _ACTIVITY_FRAMES),
    (("birds", "animals", "mammals", "fish"), _ANIMATE_FRAMES),
    (("vehicles", "weapons", "tools", "furniture", "plants", "trees"), _ARTIFACT_FRAMES),
    (("shapes", "fruits", "vegetables", "clothes"), _PASSIVE_FRAMES),
)

# Every category gets its own list so that entries stay independent objects.
predicates = {
    label: list(frames)
    for labels, frames in _FRAME_GROUPS
    for label in labels
}

In [5]:
# Nonce-word paradigms. Each row holds the forms of one made-up lemma, so the
# same list position in related tags (e.g. '[VB]' / '[VBD]' / '[VBG]') refers
# to the same lemma.
_NONCE_VERBS = (  # (base, -ed form, -ing form)
    ("dax", "daxed", "daxing"),
    ("wif", "wiffed", "wiffing"),
    ("blick", "blicked", "blicking"),
    ("gyre", "gyred", "gyring"),
    ("gimble", "gimbled", "gimbling"),
)
_NONCE_NOUNS = (  # (singular, plural)
    ("wug", "wugs"),
    ("fep", "feps"),
    ("blicket", "blickets"),
    ("jabberwock", "jabberwocks"),
    ("tove", "toves"),
)
_NONCE_ADJECTIVES = ("beamish", "slithy", "mimsy", "vorpal", "frabjous")

# Maps each argument tag used in `predicates` to its candidate fillers.
args = {
    '[VB]': [forms[0] for forms in _NONCE_VERBS],
    '[VBD]': [forms[1] for forms in _NONCE_VERBS],
    '[VBG]': [forms[2] for forms in _NONCE_VERBS],
    '[JJ]': list(_NONCE_ADJECTIVES),
    '[NN]': [forms[0] for forms in _NONCE_NOUNS],
    '[NNS]': [forms[1] for forms in _NONCE_NOUNS],
}

In [6]:
# Plural surface forms of the categories covered by the stimuli.
category_texts = ["birds", "weapons", "vegetables", "tools" , "furniture", "vehicles", "clothes", "fruits", "sports"]

# For each category, the category named in its control conclusion.
control = {
    'birds': 'vegetables',
    'weapons': 'vehicles',
    'furniture': 'fruits',
    'clothes': 'tools',
    'vehicles': 'birds',
    'fruits': 'furniture',
    'vegetables': 'weapons',
    'tools': 'clothes',
    'sports': 'birds'
}

# Items that are kept in their listed form (never pluralised) and that take
# singular verb agreement in the premise.
singulars = ["broccoli", "parsley", "kale", "rutabaga", "garlic", "rice"]

# Singular counterparts of the predicates that inflect for number; predicates
# not listed here (e.g. "can", "can be") are used unchanged.
_SINGULAR_AGREEMENT = {"are": "is", "have": "has", "use a": "uses a"}

stimuli =  []
items = defaultdict(list)

# glob returns paths in arbitrary, filesystem-dependent order; sorting keeps
# the order of the generated rows stable from run to run.
for file in sorted(glob.glob("../data/rosch1975/*.txt")):
    # The category is the file stem. Anchor on the final path component so the
    # match works with either path separator and does not depend on the
    # directory's name.
    category = re.search(r'([^/\\]+)\.txt$', file).group(1)
    if category == "toy":
        continue

    # Irregular category plurals are spelled out; everything else is inflected.
    if category == "furniture":
        plural_category = "furniture"
    elif category == "clothing":
        plural_category = "clothes"
    else:
        plural_category = inflector.plural_noun(category)

    # Sports/activities keep the item as listed and use singular predicates.
    is_activity = category in ["sport", "activity"]

    with open(file, "r") as f:
        for line in f:
            word = line.strip()
            if not word:
                # Skip blank lines (e.g. a trailing newline at end of file)
                # instead of generating stimuli for an empty item.
                continue

            if is_activity or word in singulars:
                word_text = word
            else:
                word_text = inflector.plural_noun(word)

            for i, (predicate, argument) in enumerate(predicates[plural_category]):
                if is_activity:
                    # Premise keeps the singular verb; the "All <category>"
                    # conclusions need the plural verb form.
                    predicate_text = predicate
                    conclusion_predicate = inflector.plural_verb(predicate)
                else:
                    if word_text in singulars:
                        predicate_text = _SINGULAR_AGREEMENT.get(predicate, predicate)
                    else:
                        predicate_text = predicate
                    conclusion_predicate = predicate

                conclusion = f"All {plural_category} {conclusion_predicate} {argument}."
                control_conclusion = f"All {control[plural_category]} {conclusion_predicate} {argument}."
                premise = f"{word_text.capitalize()} {predicate_text} {argument}."

                # One row per filler of this predicate's argument tag. Fields:
                # premise, conclusion, control, item, category, blank predicate,
                # item as written in the premise, category as written in the
                # conclusion, 1-based predicate id, 1-based argument id.
                stimuli.extend(
                    (
                        premise.replace(argument, form),
                        conclusion.replace(argument, form),
                        control_conclusion.replace(argument, form),
                        word,
                        category,
                        predicate,
                        word_text.capitalize(),
                        plural_category,
                        i + 1,
                        j + 1,
                    )
                    for j, form in enumerate(args[argument])
                )

In [8]:
# Reduce every stimulus to its conclusion-level fields by dropping the
# item-specific ones: index 0 (premise), 3 (item) and 6 (item as written).
# What remains is: conclusion, control, category, blank predicate,
# category word, predicate id, argument id.
_ITEM_FIELDS = (0, 3, 6)
_conclusion_rows = (
    tuple(value for position, value in enumerate(row) if position not in _ITEM_FIELDS)
    for row in stimuli
)

# De-duplicate while keeping first-seen order. dict preserves insertion order
# (Python 3.7+), whereas going through set() would reorder the rows
# differently on every run because string hashing is randomised per process.
conclusion_only = list(dict.fromkeys(_conclusion_rows))

In [9]:
# Number of distinct conclusion-level rows left after de-duplication.
len(conclusion_only)

195

In [11]:
# stimuli
# Write one CSV row per stimulus tuple; the header names the ten tuple fields
# in order. newline="" leaves line endings to the csv module, as its
# documentation requires; without it, platforms that translate "\n" emit an
# extra blank line after every row.
with open("../data/premiseconclusion.csv", "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["premise", "conclusion", "control", "item", "category", "blankpredicate", "item_word", "category_word", "predicate_id", "argument_id"])
    writer.writerows(stimuli)

In [94]:
# conclusion_only
# Each row of `conclusion_only` has seven fields: the stimulus tuple minus its
# premise, item and item_word fields, so the control conclusion is still
# present. The header therefore has to name "control" as well; otherwise every
# column after the first is labelled one position off.
# newline="" leaves line endings to the csv module, as its documentation
# requires.
with open("../data/conclusiononly.csv", "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["conclusion", "control", "category", "blankpredicate", "category_word", "predicate_id", "argument_id"])
    writer.writerows(conclusion_only)

In [38]:
# Scratch: put a space before the period so that split() returns it as its own token.
sentence = "Brussels sprouts can dax.".replace(".", " .")

In [39]:
# Whitespace-tokenise the padded sentence.
sentence.split()

['Brussels', 'sprouts', 'can', 'dax', '.']

In [18]:
# Seed Python's global RNG so that results drawn through the `random` module
# (e.g. random.shuffle) are repeatable.
random.seed(1234)

In [15]:
def shuffle_sentence(sentence, word):
    """Return `sentence` with its tokens in random order.

    All periods are removed, the whitespace-separated tokens are shuffled
    with the global `random` state, and the result is re-joined,
    capitalised via ``str.capitalize`` (so every character after the first
    is lower-cased) and closed with a single period.

    When `word` consists of several whitespace-separated parts, its
    occurrences in the sentence are glued together with "@" before
    tokenising so that it moves as one unit; every "@" is turned back into
    a space afterwards.
    """
    stripped = sentence.replace(".", "")
    parts = word.split()
    if len(parts) > 1:
        # Make the multi-word phrase look like a single token to split().
        stripped = stripped.replace(word, "@".join(parts))
    tokens = stripped.split()
    random.shuffle(tokens)

    scrambled = " ".join(tokens).replace("@", " ")
    return scrambled.capitalize() + "."

In [16]:
# Print ten scrambles of one premise to check by eye that the two-word item
# stays adjacent in every output.
demo_premise = "Brussels sprouts can dax."
demo_item = "Brussels sprouts"
for trial in range(10):
    print(shuffle_sentence(demo_premise, demo_item))

Dax brussels sprouts can.
Can dax brussels sprouts.
Can brussels sprouts dax.
Can brussels sprouts dax.
Dax can brussels sprouts.
Can dax brussels sprouts.
Can dax brussels sprouts.
Brussels sprouts dax can.
Brussels sprouts can dax.
Can dax brussels sprouts.


In [24]:
separated = list(zip(*stimuli))
sentences = separated[0]
# Field 6 is the item exactly as it is written at the start of the premise
# (capitalised surface form), which is what shuffle_sentence has to find in
# the sentence to keep a multi-word item together. Field 5 is the predicate,
# not the item.
words = separated[6]

shuffled_premise = []
values = []

# Upper bound on re-draws per sentence; it keeps the loop finite for premises
# whose every ordering renders as the original string.
MAX_SHUFFLE_ATTEMPTS = 20

for s, w in zip(sentences, words):
    # Re-draw until the scramble differs from the original premise. With only
    # three or four movable tokens, a fixed two attempts leaves a real chance
    # of ending up with the unshuffled sentence.
    candidate = shuffle_sentence(s, w)
    attempts = 1
    while candidate == s and attempts < MAX_SHUFFLE_ATTEMPTS:
        candidate = shuffle_sentence(s, w)
        attempts += 1
    shuffled_premise.append(candidate)



In [23]:
# Scratch draw from the global RNG; randint includes both endpoints.
random.randint(0, 10)

6