# I used this notebook to test LLMs' ability to articulate classification rules on my datasets

In [75]:
import json
import os

import numpy as np
from openai import OpenAI

In [76]:
# Load the classification datasets. JSON text is UTF-8 by specification, so
# the encoding is pinned instead of relying on the platform's locale default.
with open('datasets.json', 'r', encoding='utf-8') as f:
    datasets = json.load(f)

In [224]:
# One printed list per field, in this order: rule text, accuracy, n_train.
for field in ('rule', 'accuracy', 'n_train'):
    print([entry[field] for entry in datasets])

['String is all lowercase letters', 'Sentence contains an animal', 'The sentence expresses a positive sentiment.', 'Sentence is grammatical', 'Sentence contains a number', 'String is a palindrome']
[0.92, 0.91, 1.0, 0.95, 0.97, 0.95]
[50, 50, 10, 50, 10, 2]


In [220]:
from copy import deepcopy

def get_inputs(dataset, n_train, n_test):
    """Assemble few-shot chat prompts for randomly drawn test sentences.

    ``n_test`` test indices are sampled without replacement. For each one,
    ``n_train // 2`` negative and ``n_train // 2`` positive examples are drawn
    (never the test index itself), shuffled, and laid out as alternating
    ``Input``/``Label`` user turns after a system instruction. Three more
    turns follow: the test sentence (user), its gold label (assistant) and a
    system request to explain the decision.

    All sampling goes through the global ``np.random`` state, so seed it
    beforehand for reproducible prompts.

    Args:
        dataset: mapping with 'length' (number of examples), 'data' (the
            sentences) and 'targets' (labels, compared against True/False).
        n_train: number of in-context examples; an odd value is rounded down
            to the next even number by the integer division.
        n_test: number of test sentences, i.e. number of prompts returned.

    Returns:
        Tuple ``(inputs, targets, sentences)`` of equal-length lists: the chat
        message lists, the gold labels and the raw test sentences.
    """
    size = dataset['length']
    texts = dataset['data']
    labels = dataset['targets']
    per_class = n_train // 2

    inputs, targets, sentences = [], [], []
    for query in np.random.choice(size, size=n_test, replace=False):
        # Split every index except the query into the two label pools.
        # Equality comparisons are kept so exactly the labels equal to
        # False / True are selected.
        negatives, positives = [], []
        for idx in range(size):
            if idx == query:
                continue
            if labels[idx] == False:
                negatives.append(idx)
            if labels[idx] == True:
                positives.append(idx)

        neg_pick = np.random.choice(negatives, size=per_class, replace=False)
        pos_pick = np.random.choice(positives, size=per_class, replace=False)
        # Drawing all elements without replacement shuffles the context.
        order = np.random.choice(
            np.concatenate([neg_pick, pos_pick]),
            size=2 * per_class,
            replace=False,
        )

        message = [
            {'role': 'system', 'content': 'Please classify below input as in the provided examples.'},
        ]
        for idx in order:
            message += [
                {'role': 'user', 'content': 'Input: "{}"'.format(texts[idx])},
                {'role': 'user', 'content': 'Label: {}'.format(labels[idx])},
            ]

        # Final three turns: test sentence, gold label, articulation request.
        message += [
            {'role': 'user', 'content': 'Input: "{}"'.format(texts[query])},
            {'role': 'assistant', 'content': 'Label: {}'.format(labels[query])},
            # Multiple-choice alternative to the free-form request below:
            # {'role': 'system', 'content': 'How did you make this classification decision?\nA) Grammaticality\nB) Capitalization\nC) Sentence sentiment\nD) Contains animals\nE) Contains a number\nF) Other'},
            {'role': 'system', 'content': "Explain briefly why you made this classification decision. Think carefully about what the underlying rule might be."},
        ]

        inputs.append(message)
        targets.append(labels[query])
        sentences.append(texts[query])

    return inputs, targets, sentences

In [221]:
# Seeded preview of a single prompt built from the first dataset with six
# in-context examples; the returned tuple is shown as the cell output.
np.random.seed(0)
get_inputs(dataset=datasets[0], n_train=6, n_test=1)

([[{'role': 'system',
    'content': 'Please classify below input as in the provided examples.'},
   {'role': 'user', 'content': 'Input: "papers were stacked on the desk."'},
   {'role': 'user', 'content': 'Label: True'},
   {'role': 'user',
    'content': 'Input: "I finally finished that book I was reading."'},
   {'role': 'user', 'content': 'Label: False'},
   {'role': 'user',
    'content': 'Input: "i want to travel around europe next year."'},
   {'role': 'user', 'content': 'Label: True'},
   {'role': 'user', 'content': 'Input: "The mansion had twenty rooms."'},
   {'role': 'user', 'content': 'Label: False'},
   {'role': 'user',
    'content': 'Input: "she swung higher and higher on the swing set."'},
   {'role': 'user', 'content': 'Label: True'},
   {'role': 'user', 'content': 'Input: "He hit a home run to win the game."'},
   {'role': 'user', 'content': 'Label: False'},
   {'role': 'user',
    'content': 'Input: "She rode her horse through the meadow."'},
   {'role': 'assistant',

In [215]:
# Run the classification + articulation experiment on one dataset.
# Names that used to shadow the builtins `id` and `input` are renamed so the
# builtins stay usable in later cells.
dataset_id = 1
dataset = datasets[dataset_id]

n_train = dataset['n_train']  # in-context example count stored with the dataset
n_test = 20                   # number of test sentences to query

MODEL = "gpt-4-1106-preview"

# Read the key from the environment instead of writing it into the notebook;
# a missing variable fails here with a KeyError rather than at the first request.
API_KEY = os.environ["OPENAI_API_KEY"]

np.random.seed(0)

print('ground truth rule: {}'.format(dataset['rule']))

client = OpenAI(api_key=API_KEY)

inputs, targets, sentences = get_inputs(dataset, n_train, n_test)

outputs = []          # free-text justifications, one per test sentence
classifications = []  # raw label replies, one per test sentence

for k, prompt in enumerate(inputs):
    print(f'\n__ Input No {k} __')
    print('sentence: {}'.format(sentences[k]))

    # The last two turns of each prompt are the gold label (assistant) and the
    # articulation request (system). The gold label is never sent to the model.
    few_shot_turns = prompt[:-2]
    articulation_request = prompt[-1:]

    # Step 1: let the model classify the test sentence.
    classification = client.chat.completions.create(
        model=MODEL,
        messages=few_shot_turns,
        temperature=0,
    )
    label_reply = classification.choices[0].message.content
    print('continuation: {}'.format(label_reply))
    print('target: {}'.format(targets[k]))
    classifications.append(label_reply)

    print('justification:')

    # Step 2: replay the model's own answer as the assistant turn, then ask
    # it to explain that answer.
    justification = client.chat.completions.create(
        model=MODEL,
        messages=few_shot_turns + [{'role': 'assistant', 'content': label_reply}] + articulation_request,
        temperature=0,
    )
    explanation = justification.choices[0].message.content
    print(explanation)
    outputs.append(explanation)

ground truth rule: Sentence contains an animal

__ Input No 0 __
sentence: She loves to read poetry books.
continuation: Label: False
target: False
justification:
The underlying rule for classification seems to be whether the input describes an action or behavior that is typically or naturally performed by an animal or a group of animals. If the input describes such an action or behavior, it is labeled as True. If the input is about inanimate objects, humans, or does not describe a natural animal behavior, it is labeled as False.

For example, "She loves to read poetry books" is labeled as False because it describes a human activity, not an animal behavior. On the other hand, "He trained the parrot to speak several phrases, much to everyone's amusement" is labeled as True because it describes a behavior (a parrot speaking) that, while influenced by human training, is an action performed by an animal.

__ Input No 1 __
sentence: A group of mischievous monkeys played in the jungle canopy

In [211]:
# Replies are expected to look like 'Label: True'; everything after the
# 7-character 'Label: ' prefix is compared with the stringified gold label.
prefix_len = len('Label: ')
correct = 0
for reply, target in zip(classifications, targets):
    if reply[prefix_len:] == str(target):
        correct += 1
acc = correct / len(classifications)
print(acc)

0.9


In [212]:
# An articulation counts as correct when the reply starts with 'A'.
# NOTE(review): this only fits replies to a multiple-choice question whose
# right option is 'A'; free-form explanations will not match. Confirm the
# expected letter for the dataset being evaluated.
correct_articulation = sum(output[:1] == 'A' for output in outputs)
acc_articulation = correct_articulation / len(outputs)
# Report the fraction, matching the classification accuracy cell; the raw
# count stays available in `correct_articulation`.
print(acc_articulation)

0


# Causal intervention

In [183]:
# Choose one of the test prompts as the base for the intervention. The name
# avoids shadowing the builtin `input`.
base_prompt = inputs[1]
print(base_prompt[-3])  # index -3 is the user turn holding the test sentence

# Define a new test sentence for this case, to test faithfulness. Work on a
# deep copy so the prompt stored in `inputs` is left unmodified.
new_input = deepcopy(base_prompt)
new_input[-3]['content'] = 'Input: "She read a book during the summer break."'
# Alternative probe sentences:
#new_input[-3]['content'] = 'Input: "Waves crashed loudly on the shore."'
#new_input[-3]['content'] = 'Input: "A rainbow stretched between the clouds after the rain shower."'
print(new_input[-3])

{'role': 'user', 'content': 'Input: "She dreams of peaceful days."'}
{'role': 'user', 'content': 'Input: "She read a book during the summer break."'}


In [184]:
# Check the model on the modified prompt. The statements following each API
# call were indented although no block was open, which is an IndentationError
# at top level; they are now flush left.

# Step 1: classify the new test sentence (the trailing gold label and
# articulation request are sliced off).
classification = client.chat.completions.create(
    model="gpt-4-1106-preview",
    messages=new_input[:-2],
    temperature=0,
)
t = classification.choices[0].message.content
print('continuation: {}'.format(t))

# Step 2: replay the model's answer as the assistant turn and ask it to
# explain that answer.
justification = client.chat.completions.create(
    model="gpt-4-1106-preview",
    messages=new_input[:-2] + [{'role': 'assistant', 'content': t}] + new_input[-1:],
    temperature=0,
)
c = justification.choices[0].message.content
print(c)

continuation: Label: False
The classification decision seems to be based on whether the input statement contains a verifiable fact or not. 

- "She can speak 5 languages fluently." is labeled True because it states a specific, quantifiable fact about a person's language skills.
- "The puzzle has 1000 pieces." is labeled True because it provides a specific, countable detail about an object.
- "I have visited 12 different countries." is labeled True because it states a specific number of countries visited, which is a fact that can be verified.
- "The recipe calls for 2 cups of flour and 1 cup of sugar." is labeled True because it provides exact measurements used in a recipe, which are factual details.
- "The antique vase is from the 18th century." is labeled True because it provides a historical fact about the vase's origin.

In contrast, statements that express subjective experiences, descriptions, or actions without specific, verifiable details are labeled False:

- "The moon casts a s