# Debating Quantitative Experimentation 1: **Political compass** population under **action correction**.

Can we use TinyTroupe to have genuine debates?

In [1]:
import sys

import random

from pprint import pprint

from tinytroupe.agent import TinyPerson
from tinytroupe.environment import TinyWorld
from tinytroupe.experimentation import InPlaceExperimentRunner
from tinytroupe.steering import Intervention
from tinytroupe.examples import *
from tinytroupe.validation import propositions
from tinytroupe.extraction import ResultsExtractor
from tinytroupe.utils.parallel import parallel_map_dict, parallel_map_cross
from tinytroupe.validation import hard_persona_adherence, persona_adherence, self_consistency, fluency, task_completion, divergence

# specific utilities
from common_utils import *


!!!!
DISCLAIMER: TinyTroupe relies on Artificial Intelligence (AI) models to generate content. 
The AI models are not perfect and may produce inappropriate or inacurate results. 
For any serious or consequential use, please review the generated content before using it.
!!!!

Looking for default config on: c:\Users\pdasilva\AppData\Local\anaconda3\envs\py310\lib\site-packages\tinytroupe\utils\..\config.ini
Found custom config on: c:\Users\pdasilva\OneDrive - Microsoft\TinyTroupe (shared)\Paper artifacts\Working examples\config.ini

Current TinyTroupe configuration 
[OpenAI]
api_type = openai
azure_api_version = 2024-08-01-preview
model = gpt-4o-mini
reasoning_model = o3-mini
embedding_model = text-embedding-3-small
max_tokens = 16000
temperature = 1.2
freq_penalty = 0.0
presence_penalty = 0.0
timeout = 60
max_attempts = 5
waiting_time = 0
exponential_backoff_factor = 5
reasoning_effort = high
cache_api_calls = False
cache_file_name = openai_api_cache.pickle
max_content_display_length =

## Parameters

In [2]:
full_mode = False  # set to True to run the full mode with all agents and tasks

# Flag on TinyPerson governing whether agent communication is displayed.
# NOTE(review): the earlier comment here said the display is avoided "to make the output cleaner for eval",
# but the flag is set to True -- confirm which of the two is intended.
TinyPerson.communication_display = True

In [3]:
# Size of the run. Only the quick (non-full) mode caps the population and the list of proposals.
if full_mode:
    repetitions_per_task = 5
    simulation_steps = 15
    # No caps in full mode. These are still defined (None is a valid "take everything" slice bound)
    # so that toggling full_mode and re-running this cell never leaves stale or missing values behind.
    qty_agents = None
    qty_proposals = None

else:
    repetitions_per_task = 2
    simulation_steps = 5
    qty_agents = 20
    qty_proposals = 3


## Experiment setup

In [4]:
# The runner is handed the path of a JSON file (presumably where it keeps its state -- confirm),
# and the two conditions being compared are registered on it.
experiment_runner = InPlaceExperimentRunner("./debating_quantitative_experimentation_1c.json")

for experiment_name in ("Control", "Treatment"):
    experiment_runner.add_experiment(experiment_name)



In [5]:
# Ask the runner to activate the next experiment for this run of the notebook.
experiment_runner.activate_next_experiment()

# Manual alternatives, left disabled (presumably they pin a specific experiment -- confirm before use):
#experiment_runner.fix_active_experiment("Control")
#experiment_runner.fix_active_experiment("Treatment")

In [6]:
# Report which experiment is active for this run (shows None when there is none).
print("Running experiment", experiment_runner.get_active_experiment())

Running experiment None


## Agents and populations

In [7]:
# Build the debating population. When every experiment has already finished, nothing is loaded
# and `people` stays empty.
people = []
if not experiment_runner.has_finished_all_experiments():
    people = TinyPerson.load_specifications_from_folder("./population/political_compass")

    # Seed before shuffling so the shuffled order is the same on every run.
    random.seed(42)
    random.shuffle(people)

    # Quick mode works with just the first `qty_agents` agents of the shuffled list.
    people = people if full_mode else people[:qty_agents]

    # Print a short bio per agent, asking for its political positioning.
    for person in people:
        person_minibio = person.minibio(
            requirements="focus on the political inclinations of the agent, trying to position it in the political compass (left vs right, authoritarian vs libertarian)."
        )
        print(person_minibio)


In [8]:
# Number of agents taking part in this run (0 when nothing was loaded in the previous cell).
len(people)

0

In [9]:
# Partition the population into consecutive groups of 5 agents; the last group is smaller
# when the population size is not a multiple of 5.
people_groups = [people[start:start + 5] for start in range(0, len(people), 5)]

len(people_groups)

0

In [10]:
# Per-experiment settings applied to every agent's action generator:
#   - Control:   reasoning step and quality checks both disabled.
#   - Treatment: reasoning step disabled, quality checks enabled together with regeneration.
# When the active experiment is neither of these, no setting is applied.
action_generator_settings = {
    "Control": {
        "enable_reasoning_step": False,
        "enable_quality_checks": False,
    },
    "Treatment": {
        "enable_reasoning_step": False,
        "enable_quality_checks": True,
        "max_attempts": 2,
        "enable_regeneration": True,
        "quality_threshold": 5,
    },
}

selected_settings = action_generator_settings.get(experiment_runner.get_active_experiment(), {})

for person in people:
    for setting_name, setting_value in selected_settings.items():
        setattr(person.action_generator, setting_name, setting_value)

## Proposals

In [11]:

# Controversial proposals for the debate, to stress the agents.
# Each entry has a short "theme" (used when agents introduce themselves) and the full "details"
# text of the situation that is actually debated.
proposals = [
    {"theme": "Politics and housing policy",
     "details":\
    """
    In the city of São Paulo, Brazil, a new law is about to be passed that will require all buildings created for the rich to also include 
    a certain number of affordable housing units in the same building. This law is designed to be very 
    strict, offering no way to avoid the requirement. If the requirement is not met, the building will be
    demolished and the owner will be fined. Such draconian measures are being proposed because the new government
    believes that housing is a human right, and currently there's simply not enough affordable housing to go around,
    and it is argued that the market alone cannot solve this problem.

    DEBATE: is this a good idea? Why or why not? You must take a side and defend it, trying to convince the other side.
    """},

    {"theme": "Politics and income inequality",
        "details":\
    """
    The European Union is about to pass a new law that will require all companies to cap their CEO salaries to 10 times the salary of the lowest paid employee in the company.
    This law is designed to be very strict, offering no way to avoid the requirement. If the requirement is not met, the company will be fined 40% of its annual revenue.
    Such draconian measures are being proposed because the new government believes that income inequality is a human rights violation
    
    DEBATE: is this a good idea? Why or why not? You must take a side and defend it, trying to convince the other side.
    """},


    {"theme": "Technology, health and income inequality",
    "details":\
    """
    A new biotech company has developed a new technology that allows them to genetically modify human embryos to make them more intelligent.
    This technology is very controversial, and many people are against it. The company is planning to offer this service to the public at a very high price.
    The company argues that this technology will help to solve many of the world's problems, such as poverty and crime, by creating a more intelligent population.
    However, many people are against this technology, arguing that it is unethical to play God and that it will only benefit the rich.

    DEBATE: is this a good idea? Why or why not? You must take a side and defend it, trying to convince the other side.
    """}
]

# Quick mode keeps only the first `qty_proposals` proposals. With the quick-mode value of 3 and the
# three proposals above, this currently keeps all of them.
if not full_mode:
    proposals = proposals[:qty_proposals]

## Auxiliary functions

In [12]:
def debate_battery(agents, proposals, interventions, agent_propositions, environment_propositions,
                          repetitions = 5, simulation_steps=10):
    """
    Run a battery of debates and collect the proposition scores obtained along the way.

    Every proposal is debated `repetitions` times. Each repetition clears the episodic memory of the
    agents, creates a new TinyWorld with the given agents and interventions, runs one introduction step
    followed by `simulation_steps` debate steps, asks the first agent to consolidate the points of view,
    and finally scores the environment propositions against the world and the agent propositions
    against every agent.

    Args:
        agents (list): the debating agents. The first one also acts as rapporteur.
        proposals (list): dicts, each with a "theme" and a "details" entry.
        interventions (list): interventions handed to every TinyWorld created here.
        agent_propositions (dict): proposition name -> proposition scored against each agent.
        environment_propositions (dict): proposition name -> proposition scored against the world.
        repetitions (int): how many times each proposal is debated.
        simulation_steps (int): number of world steps of the debate itself (the introduction step is extra).

    Returns:
        tuple: (agent_propositions_scores, environment_propositions_scores). Both map a proposition
        name to the list of values collected for it. The environment dict additionally holds
        "opinions_qty", the number of consolidated opinions extracted in each debate.
    """

    agent_propositions_scores = {}
    environment_propositions_scores = {}

    experiments_count = 0
    total_expected_experiments = len(proposals) * repetitions

    # loop over proposals and repetitions
    for proposal in proposals:

        details = proposal["details"]
        theme = proposal["theme"]

        for i in range(repetitions):
            print("\n############## STARTING A NEW RESEARCH SESSION #################")
            print(f"Overall experiment number: {experiments_count+1} / {total_expected_experiments}")
            print(f"Discussion objective: {details}")
            print(f"Trial number: {i+1}")
            print(f"Agents: {agents}")

            # clear the episodic memory of all agents, so each debate starts afresh
            for person in agents:
                person.clear_episodic_memory()

            world = TinyWorld(agents=agents, interventions=interventions)
            
            # Participants introduce themselves
            world.broadcast(f"""
                Hello everyone! We are going to have a debate. Before that, however, let's start by introducing ourselves, 
                and mentioning our general views and opinions about the following theme: {theme}
                
                Please:
                  - present yourself and your background;
                  - present some of your core beliefs and values related to the theme;
                  - present any personality traits or characteristics that you think are relevant to the theme;
                  - present some key personal problems related to the theme, if any;
                  
                Don't discuss about the theme yet, just introduce yourself and your background, to get everyone on the same page.
                """)
            world.run(1)
            
            # now to the debate itself
            world.broadcast(f"""
                Great! So here is the exact situation you are going to debate:
                        ```
                        {details}
                        ```

                Your main objective is to take a side and WIN THE DEBATE, at all costs, no matter what. No compromises, no middle ground.

                Rules of the debate:
                    - There are only two possible sites you can take: it is a GOOD IDEA or a BAD IDEA, nothing else.
                    - No other opinions are possible, such as "the idea would be good if changed in this way ...". This is meant
                      to have a very sharp debate, so you must take a side and defend it.
                    - You will argue according to your own inner convictions, and will aim to persuade the both parliament and the public.
                    - You will not form any kind of collaboration with the other experts. This is a debate only, not a negotiation.
                    - You **must** take a side and defend it, trying to convince the other side at all costs. 
                   
                Please start the discussion now.
                """)
            world.run(simulation_steps)

            # extract and count the consolidated opinions
            rapporteur = agents[0]  # the first agent is the rapporteur
            rapporteur.listen_and_act("Can you please consolidate all points of view during the debate? Provide a lot of details on point of view, and the arguments for and agains it.")
            opinions = ResultsExtractor().extract_results_from_agent(rapporteur, 
                                    extraction_objective="Consolidates the points of views that the group came up with, explaining each one as an item of a list." \
                                                        "Add information about: what is the point of view; what are the arguments favoring it." \
                                                        "what are the arguments agains it.", 
                                    situation="A debate about a controversial issue.",
                                    fields= ["opinion", "pros", "cons"],
                                    fields_hints={"opinions": "must be the root of the resulting dictionary."},)
            pprint(opinions)

            # The extraction may come back without the expected "opinions" root. Losing one count is
            # preferable to aborting the battery and discarding every score collected so far.
            opinions_qty_scores = environment_propositions_scores.setdefault("opinions_qty", [])
            try:
                opinions_qty_scores.append(len(opinions["opinions"]))
            except (TypeError, KeyError):
                print("*****WARNING:***** Could not count the extracted opinions; skipping opinions_qty for this debate.")

            # Evaluate environment propositions in parallel
            env_results = parallel_map_dict(
                environment_propositions,
                lambda item: item[1].copy().score(
                    world, 
                    claim_variables={"task_description": f"A debate was run about: {details}."}, 
                    return_full_response=True
                )
            )
            
            # Process environment results (a missing result is reported and skipped, as for agents below)
            for k, result in env_results.items():
                proposition_scores = environment_propositions_scores.setdefault(k, [])
                if result is not None:
                    proposition_scores.append(result["value"])
                    print("value: ", result["value"])
                    print("justification: ", result["justification"])
                    print("reasoning: ", result["reasoning"])
                else:
                    print(f"*****WARNING:***** Environment did not respond to proposition {k}.")

            # Evaluate agent propositions across all agents in parallel
            agent_results = parallel_map_cross(
                [agents, agent_propositions.items()],
                lambda agent, prop_item: (
                    prop_item[0],  # proposition key
                    prop_item[1].copy().score(agent, return_full_response=True)  # result
                )
            )
            
            # Process agent results
            for k, result in agent_results:
                proposition_scores = agent_propositions_scores.setdefault(k, [])
                if result is not None:
                    proposition_scores.append(result["value"])
                    print("value: ", result["value"])
                    print("justification: ", result["justification"])
                    print("reasoning: ", result["reasoning"])
                    print("\n\n")
                else:
                    print(f"*****WARNING:***** Agent did not respond to proposition {k}.")
            
            experiments_count += 1
            print("\n\n")

    return agent_propositions_scores, environment_propositions_scores



## Perform experiment

In [13]:
# Notebook-level accumulators: every debate() call merges its scores into these two dicts.
agent_propositions_scores, environment_propositions_scores = {}, {}

In [14]:
def debate(people):
    """
    Run the debate battery for one group of agents and merge its scores into the notebook-level
    accumulators `agent_propositions_scores` and `environment_propositions_scores`.

    Args:
        people (list): the agents that debate together.

    Returns:
        The updated (agent_propositions_scores, environment_propositions_scores) pair, or None
        when all experiments have already finished (in which case nothing is run).
    """
    global agent_propositions_scores, environment_propositions_scores

    # Guard: there is nothing to run once every experiment is done.
    if experiment_runner.has_finished_all_experiments():
        return None

    # No interventions are applied in this experiment.
    debate_interventions = []

    # Propositions scored against each agent ...
    per_agent_propositions = {
        "Hard Persona Adherence": hard_persona_adherence,
        "Self-consistency": self_consistency,
        "Fluency": fluency,
    }
    # ... and against the world as a whole.
    per_environment_propositions = {
        "Task Completion": task_completion,
        "Divergence": divergence,
    }

    new_agent_scores, new_environment_scores = debate_battery(
        agents=people,
        proposals=proposals,
        interventions=debate_interventions,
        agent_propositions=per_agent_propositions,
        environment_propositions=per_environment_propositions,
        repetitions=repetitions_per_task,
        simulation_steps=simulation_steps,
    )

    pprint("NEW AGENT PROPOSITIONS SCORES")
    pprint(new_agent_scores)
    print("\n\n")
    pprint("NEW ENVIRONMENT PROPOSITIONS SCORES")
    pprint(new_environment_scores)

    # Merge the freshly obtained score lists with those accumulated so far.
    agent_propositions_scores = merge_dicts_of_lists(new_agent_scores, agent_propositions_scores)
    environment_propositions_scores = merge_dicts_of_lists(new_environment_scores, environment_propositions_scores)

    return agent_propositions_scores, environment_propositions_scores

In [15]:
# Debate within the first group, when there is one; the cell shows whatever debate() returns.
None if len(people_groups) <= 0 else debate(people_groups[0])

In [16]:
# Debate within the second group, when there is one; the cell shows whatever debate() returns.
None if len(people_groups) <= 1 else debate(people_groups[1])

In [17]:
# Debate within the third group, when there is one; the cell shows whatever debate() returns.
None if len(people_groups) <= 2 else debate(people_groups[2])

In [18]:
# Debate within the fourth group, when there is one; the cell shows whatever debate() returns.
None if len(people_groups) <= 3 else debate(people_groups[3])

In [19]:
# Debate within the fifth group, when there is one; the cell shows whatever debate() returns.
None if len(people_groups) <= 4 else debate(people_groups[4])

## Extract results and analyze

In [20]:
# TODO adapt to debate case
#
#from market_research_utils import *
#extract_and_analyze_results(people, "Everyone")


In [21]:
# Hand this run's scores to the runner and plot them, but only while one of the two experiments is active.
active_experiment_name = experiment_runner.get_active_experiment()

if active_experiment_name not in ("Control", "Treatment"):
    print("Experiment finished. No more experiments to run.")

else:
    # Agent-level and environment-level scores go into a single dictionary.
    combined_scores = dict(agent_propositions_scores)
    combined_scores.update(environment_propositions_scores)

    experiment_runner.add_experiment_results(combined_scores, experiment_name=active_experiment_name)

    plot_scores(combined_scores)

Experiment finished. No more experiments to run.


In [22]:
# Final report: once all experiments are finished, print the statistical tests that use "Control"
# as the control experiment and plot the scores of both experiments. Otherwise, ask for a rerun.
if experiment_runner.has_finished_all_experiments():
    print("All experiments have been finished.")
    # Lead-in for the dictionary printed next, whose top-level keys name the compared experiments.
    print("STATISTICS: Control vs")
    pprint(experiment_runner.run_statistical_tests(control_experiment_name='Control'))

    # plot scores of both experiments
    experiment_control_scores = experiment_runner.get_experiment_results("Control")
    experiment_treatment_scores = experiment_runner.get_experiment_results("Treatment")
    
    
    plot_scores(experiment_control_scores)
    plot_scores(experiment_treatment_scores)

else:
    print("Not all experiments have been finished. RESTART AND RERUN.")

All experiments have been finished.
STATISTICTS: Control vs
{'Treatment': {'Divergence': {'confidence_interval': (-1.443379398124243,
                                                      1.193379398124243),
                              'confidence_level': 0.95,
                              'control_mean': 5.041666666666667,
                              'control_sample_size': 24,
                              'control_std': 2.293263606961592,
                              'degrees_of_freedom': 45.97846180281037,
                              'effect_size': -0.05509417975904704,
                              'mean_difference': -0.125,
                              'p_value': 0.849481919176915,
                              'percent_change': -2.4793388429752063,
                              'significant': False,
                              't_statistic': 0.19085183708800466,
                              'test_type': 'Welch t-test (unequal variance)',
                              

Unnamed: 0,Proposition,Average Score,Standard Deviation,Count
0,Hard Persona Adherence,6.158333,1.59303,120.0
1,Self-consistency,5.808333,2.373863,120.0
2,Fluency,6.483333,1.036991,120.0
3,opinions_qty,2.25,0.793999,24.0
4,Task Completion,7.333333,1.167184,24.0
5,Divergence,5.041667,2.293264,24.0


{'Divergence': [6,
                7,
                5,
                7,
                5,
                0,
                7,
                9,
                5,
                4,
                4,
                3,
                4,
                7,
                4,
                6,
                7,
                1,
                7,
                7,
                5,
                4,
                1,
                3],
 'Fluency': [9,
             8,
             8,
             7,
             8,
             5,
             5,
             7,
             7,
             7,
             5,
             8,
             6,
             7,
             5,
             5,
             6,
             5,
             7,
             5,
             6,
             7,
             5,
             6,
             6,
             6,
             5,
             6,
             5,
             5,
             5,
             6,
             6,
             9,

Unnamed: 0,Proposition,Average Score,Standard Deviation,Count
0,Hard Persona Adherence,6.625,1.341406,120.0
1,Self-consistency,6.791667,1.634172,120.0
2,Fluency,6.375,1.195668,120.0
3,opinions_qty,2.208333,0.779028,24.0
4,Task Completion,7.583333,0.974308,24.0
5,Divergence,4.916667,2.244155,24.0


In [23]:
# Ask the runner to finish the active experiment. As the cell's last expression, the call's
# return value is what the cell displays.
experiment_runner.finish_active_experiment()

False