# PR-like models in language


### Generate PR-like models with Masked LM 

In [1]:
import itertools
import numpy as np
import inflect
import random
import json

import torch
from transformers import AutoModelForMaskedLM, AutoTokenizer
from transformers import logging
logging.set_verbosity_error()

In [2]:
def get_probs(sentences_raw, options, model, tokenizer, mask_placeholder='_'):
    """Score candidate fillers for every masked position of each sentence.

    Args:
        sentences_raw: A single sentence (str) or a sequence of sentences.
            Each occurrence of `mask_placeholder` is replaced by the
            tokenizer's mask token.
        options: Either one flat sequence of candidate words (shared by all
            sentences) or one sequence of candidate words per sentence.
        model: Masked language model; called as ``model(**inputs)`` and
            expected to expose ``.logits``.
        tokenizer: Tokenizer matching `model`; must provide ``tokenize``,
            ``vocab``, ``mask_token`` and ``mask_token_id``.
        mask_placeholder: Substring marking the positions to predict.

    Returns:
        For a sequence input: one list per sentence, holding one
        ``{option: probability}`` dict per mask in that sentence (in order of
        appearance). For a single string input: just that sentence's list.
        The probabilities are a softmax restricted to the given options, so
        each dict sums to 1.

    Note:
        Only the first token that ``tokenizer.tokenize`` produces for an
        option is scored, so a word split into several word pieces is
        represented by its first piece.
    """
    # Remember the input kind so the return shape depends on the type that
    # was passed in, not on how many sentences the batch happens to contain.
    single_sentence = isinstance(sentences_raw, str)
    if single_sentence:
        sentences_raw = [sentences_raw]
    sentences_raw = list(sentences_raw)
    if not sentences_raw:
        return []
    if isinstance(options[0], str):
        # A flat list of words is shared by every sentence.
        options = [options] * len(sentences_raw)

    # Convert the option words into vocabulary ids (first token only).
    options_id = [
        [tokenizer.vocab[tokenizer.tokenize(op)[0]] for op in ops]
        for ops in options
    ]

    # Replace mask placeholders with the mask token used by the given tokenizer
    sentences = [s.replace(mask_placeholder, tokenizer.mask_token) for s in sentences_raw]
    inputs = tokenizer(sentences, return_tensors='pt', padding=True)

    # (sentence index, token position) of every mask token in the batch.
    mask_indices = torch.where(inputs["input_ids"] == tokenizer.mask_token_id)

    # Pure inference: no autograd graph is needed.
    with torch.no_grad():
        logits = model(**inputs).logits
    mask_logits = logits[mask_indices]

    probs = [[] for _ in sentences]
    for s_idx, row in zip(mask_indices[0].tolist(), mask_logits):
        dist = torch.softmax(row[options_id[s_idx]], dim=-1).detach().cpu().numpy()
        probs[s_idx].append(dict(zip(options[s_idx], dist)))

    return probs[0] if single_sentence else probs

In [3]:
# Initialise the language model and its tokenizer
# Both are loaded from the same checkpoint name, so changing `lm_name`
# switches the masked LM and its tokenizer together.
lm_name = 'bert-base-uncased'
mlm = AutoModelForMaskedLM.from_pretrained(lm_name)
tokenizer = AutoTokenizer.from_pretrained(lm_name)

In [4]:
## Playground
### Uncomment this cell to play with the get_probs function 
#outcomes = ['apple', 'strawberry']
#observables = ['sweet', 'red', 'round']
#intro = f"There is an {outcomes[0]} and a {outcomes[1]}."
#contexts = [f"The _ is {observables[0]} and {observables[1]}.",
#            f"The _ is {observables[1]} and {observables[2]}.",
#            f"The _ is {observables[2]} but the other is {observables[0]}."]
#probs = get_probs([f"{c}" for c in contexts], [outcomes for i in range(3)], mlm, tokenizer)
#for i, context in enumerate(contexts):
#    print(context)
#    print(probs[i])

## The following handles a batch of examples
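Examples should be given in a text file where each row has the format:

```outcome1, outcome2: observable1, observable2, observable3, observable4, ...```

For example,

```apple, strawberry: sweet, red, round, green, big```

Outcomes and observables must be comma-separated: the loader splits each side of the colon on commas, so a space-separated list is read as a single item. For each row, all ordered combinations of 3 observables will be considered in the following.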

In [5]:
def _split_topic_items(text):
    """Split a comma-separated string into unique, non-empty, stripped items."""
    stripped = (item.strip() for item in text.split(','))
    # dict.fromkeys de-duplicates while keeping the order of the file, so
    # repeated runs produce the same topics (a set would not guarantee that).
    return list(dict.fromkeys(item for item in stripped if item))


def process_topics_file(file_name):
    """Parse a topics file into a list of ``(outcomes, observables)`` pairs.

    Each non-blank row must have the form
    ``outcome, outcome, ...: observable, observable, ...``.
    Items are stripped of surrounding whitespace, empty items are dropped and
    duplicates are removed while preserving their first-seen order.

    Args:
        file_name: Path of the text file to read.

    Returns:
        A list with one ``(outcomes, observables)`` tuple of lists per row.

    Raises:
        ValueError: If a non-blank row does not contain exactly one ``:``.
    """
    topics = []
    with open(file_name, encoding='utf-8') as file:
        for line_no, row in enumerate(file, start=1):
            if not row.strip():
                # Tolerate blank lines, e.g. a trailing newline at end of file.
                continue
            outcomes_str, colon, observables_str = row.partition(':')
            if not colon or ':' in observables_str:
                raise ValueError(
                    f"{file_name}, line {line_no}: expected exactly one ':' "
                    f"separating outcomes from observables, got {row.strip()!r}"
                )
            topics.append(
                (_split_topic_items(outcomes_str), _split_topic_items(observables_str))
            )
    return topics

def process_schemas_file(file_name):
    """Load a schemas JSON file and return its ``options`` and ``schemas``.

    Args:
        file_name: Path of a JSON file whose top-level object has the keys
            ``'options'`` and ``'schemas'``.

    Returns:
        A ``(schema_options, schemas)`` tuple with the values stored under
        those two keys.
    """
    with open(file_name) as handle:
        payload = json.load(handle)
    return payload['options'], payload['schemas']

In [10]:
def generate_scenarios(topics, schema):
    """Enumerate ``(outcome pair, observable triple)`` scenarios for all topics.

    Args:
        topics: Iterable of ``(outcomes, observables)`` pairs.
        schema: ``'adj'`` pairs the outcomes in both orders (permutations);
            ``'adj_no_intro'`` pairs them in one order only (combinations).
            Any other value yields no scenarios.

    Returns:
        A list of ``(outcome_pair, observable_triple)`` tuples, where the
        observable triples are all ordered selections of 3 observables.
    """
    if schema == 'adj':
        pair_outcomes = itertools.permutations
    elif schema == 'adj_no_intro':
        pair_outcomes = itertools.combinations
    else:
        return []

    return [
        scenario
        for outcomes, observables in topics
        for scenario in itertools.product(
            pair_outcomes(outcomes, 2),
            itertools.permutations(observables, 3),
        )
    ]

In [11]:
# Lazily created inflect engine shared by all calls to `a`; building a new
# engine for every word is unnecessary work.
_INFLECT_ENGINE = None


def a(word):
    """Return the result of inflect's ``engine.a`` for `word`.

    NOTE(review): per the inflect documentation this is the word preceded by
    its indefinite article ("a"/"an"); the caller relies on that when it
    builds the intro sentence.
    """
    global _INFLECT_ENGINE
    if _INFLECT_ENGINE is None:
        _INFLECT_ENGINE = inflect.engine()
    return _INFLECT_ENGINE.a(word)

def generate_sentences(scenario, schema):
    """Build the three masked sentences for one scenario.

    Args:
        scenario: ``(outcomes, observables)``; the first two outcomes and the
            first three observables are used.
        schema: ``'adj'`` prefixes every sentence with an intro naming both
            outcomes; ``'adj_no_intro'`` omits the intro. Any other value
            yields an empty list.

    Returns:
        A list of three sentences, each containing one ``_`` placeholder, or
        an empty list for an unknown schema.
    """
    outcomes, obs = scenario

    if schema == 'adj':
        intro = f"There is {a(outcomes[0])} and {a(outcomes[1])}."
        lead = f"{intro} "
    elif schema == 'adj_no_intro':
        lead = ""
    else:
        return []

    # The three sentences pair up the observables cyclically: (0, 1), (1, 2)
    # and (2, 0), the last one attributing the two observables to different
    # outcomes.
    bodies = (
        f"The _ is {obs[0]} and {obs[1]}.",
        f"The _ is {obs[1]} and {obs[2]}.",
        f"The _ is {obs[2]} and the other one is {obs[0]}.",
    )
    return [lead + body for body in bodies]

def get_mask_options(scenario, schema):
    """Return the mask candidates for each of a scenario's three sentences.

    Every sentence gets the scenario's outcomes as candidates; `schema` is
    accepted but not consulted.
    """
    outcomes, _ = scenario
    return [outcomes] * 3

## Load topics files

In [12]:
# Choose the schema and load the topics (outcomes + observables) it is run on.
schema = 'adj'
topics = process_topics_file('adjectives.txt')

print(f'Loaded topics for schema `{schema}`:')
for topic in topics:
    print(topic)

Loaded topics for schema `adj`:
(['apple', 'strawberry'], ['round', 'red', 'sweet'])
(['cat', 'dog'], ['cute furry lovely friendly sweet'])
(['yam', 'potato'], ['healthy', 'orange', 'startchy'])
(['courgette', 'cucumber'], ['long', 'green', 'juicy'])
(['daisy', 'marigold'], ['small', 'yellow', 'beautiful'])
(['coreopsis', 'daisy', 'sunflower'], ['small', 'yellow', 'beautiful'])
(['butterfly', 'moth'], ['winged', 'light', 'colorful'])
(['porpoise', 'dolphin'], ['wet', 'slippery', 'grey'])


In [19]:
scenarios = generate_scenarios(topics, schema)

# Scoring every scenario would take too long, so shuffle them and keep only
# the first 30 as a random subset.
random.shuffle(scenarios)
del scenarios[30:]

In [20]:
# Build the sentences and mask candidates of every scenario, score them all
# in one batched call, then regroup the results three-per-scenario.
n_scenario = len(scenarios)

sentences = [generate_sentences(scen, schema) for scen in scenarios]
sentences_flat = [sentence for group in sentences for sentence in group]

mask_options = [get_mask_options(scen, schema) for scen in scenarios]
mask_options_flat = [opts for group in mask_options for opts in group]

probs_flat = get_probs(sentences_flat, mask_options_flat, mlm, tokenizer)
probs = [probs_flat[start:start + 3] for start in range(0, 3 * n_scenario, 3)]

In [22]:
from contextuality.model import Model, CyclicScenario

# Build one Model per sampled scenario from the masked-LM probabilities and
# print the scenarios whose signalling fraction is below 1/2 - 1/12.
models = []
for i in range(n_scenario):
    outcomes, observables = scenarios[i]
    tri_scenario = CyclicScenario(observables, 2)
    o0, o1 = outcomes
    # x0, x1, x2 are unpacked but not used below.
    x0, x1, x2 = observables
    
    # One table row per sentence of the scenario; probs[i][k][0] is the
    # {outcome: probability} dict of the first mask in the k-th sentence.
    # NOTE(review): the four entries per row presumably follow the column
    # order (0, 0), (0, 1), (1, 0), (1, 1) shown when a Model is printed --
    # confirm against contextuality.model.Model.
    table = []
    # Sentences 0 and 1 describe two observables of the same outcome, so the
    # probabilities go in the first and last entries.
    table.append([probs[i][0][0][o0], 0, 0, probs[i][0][0][o1]])
    table.append([probs[i][1][0][o0], 0, 0, probs[i][1][0][o1]])
    # Sentence 2 assigns its second observable to "the other one", so the
    # probabilities go in the two middle entries instead.
    table.append([0, probs[i][2][0][o0], probs[i][2][0][o1], 0])
    
    model = Model(tri_scenario, table)
    models.append(model)
    if model.signalling_fraction() < 1/2 - 1/12:
        print(outcomes, observables)

In [18]:
# Show each model's table followed by its signalling fraction.
for model in models:
    print(model, model.signalling_fraction(), sep='\n')

               (0, 0) (0, 1) (1, 0) (1, 1)
(juicy, green) 0.0964 0.0000 0.0000 0.9036
(green, long) 0.0680 0.0000 0.0000 0.9320
(long, juicy) 0.0000 0.2430 0.7570 0.0000

0.864006027554681
                 (0, 0) (0, 1) (1, 0) (1, 1)
(slippery, grey) 0.9975 0.0000 0.0000 0.0025
(grey, wet) 0.9911 0.0000 0.0000 0.0089
(wet, slippery) 0.0000 1.0000 0.0000 0.0000

0.9999312653788658
                    (0, 0) (0, 1) (1, 0) (1, 1)
(yellow, small) 0.0029 0.0000 0.0000 0.9971
(small, beautiful) 0.0026 0.0000 0.0000 0.9974
(beautiful, yellow) 0.0000 0.0001 0.9999 0.0000

0.9998066469524941
