In [1]:
from itertools import chain
from random import random

import matplotlib.pyplot as plt
import numpy as np
from tqdm.notebook import tqdm

from society.action import Action
from society.agent import Agent, TrainableAgent
from society.simulations.weighted import WeightedNetworkSimulation
from society.strategies.gameplay.constant import AllC, AllD
from society.strategies.gameplay.qlearning import TabularQLearningGameplayStrategy
from society.strategies.gameplay.random import RandomGameplayStrategy
from society.strategies.gameplay.tft import TitForTat
from society.visualisation.network import *

plt.rcParams["figure.figsize"] = (10, 6)

In [2]:
def generate_population(communities, size):
    """Create Q-learning agents laid out on a connected caveman graph.

    Args:
        communities: Number of cliques, forwarded to
            ``nx.connected_caveman_graph``.
        size: Number of nodes per clique, forwarded to
            ``nx.connected_caveman_graph``.

    Returns:
        A tuple ``(agents, weights_matrix, G)`` where ``agents`` is a list of
        ``communities * size`` agents, ``weights_matrix`` is a square NumPy
        array holding 1.0 for every pair of nodes joined by an edge (in both
        directions) and 0.0 elsewhere, and ``G`` is the graph with a
        ``"weight"`` attribute set on every edge.
    """
    population = communities * size

    agents = [
        Agent(TabularQLearningGameplayStrategy(lookback=2, epsilon=0.2), i, population)
        for i in range(population)
    ]

    # NOTE(review): `nx` is not imported explicitly in this notebook; it is
    # presumably brought into scope by the star import from
    # society.visualisation.network - confirm and import it explicitly.
    G = nx.connected_caveman_graph(communities, size)

    weights_matrix = np.zeros((population, population))
    for u, v, d in G.edges(data=True):
        # Edges are undirected, so mirror the weight across the diagonal.
        weights_matrix[u, v] = weights_matrix[v, u] = 1.0
        # The entry was written on the previous line, so this lookup cannot
        # fail; the former bare `except:` fallback was unreachable and would
        # only have masked unrelated errors.
        d["weight"] = weights_matrix[u, v]

    return agents, weights_matrix, G

In [3]:
def compute_policies(agents):
    """Summarise each agent's greedy policy as a 16-character C/D string.

    For every agent the Q-table (``agent.gameplay_strategy._q_table``) is
    reduced with ``argmax`` over its last axis. The resulting entries at
    indices ``[0..3, 0..3]`` are then read in row-major order, with index 0
    rendered as ``"C"`` and index 1 as ``"D"``.

    Args:
        agents: Iterable of agents exposing ``gameplay_strategy._q_table``.

    Returns:
        A list with one policy string per agent, in the order given.
    """
    action_symbols = ("C", "D")
    summaries = []

    for agent in agents:
        greedy_actions = agent.gameplay_strategy._q_table.argmax(axis=-1)

        symbols = []
        for row in range(4):
            for col in range(4):
                # A third index (k) would be needed for a deeper table.
                symbols.append(action_symbols[greedy_actions[row, col]])

        summaries.append("".join(symbols))

    return summaries

def calculate_cooperativeness(history):
    """Return the fraction of entries in ``history`` equal to ``Action.COOPERATE``.

    Args:
        history: A sequence of actions supporting ``count`` and ``len``.

    Returns:
        The share of cooperative actions as a float in ``[0, 1]``. An empty
        history yields ``0.0`` rather than raising ``ZeroDivisionError``.
    """
    # Guard first so an agent that never played does not crash the report.
    if len(history) == 0:
        return 0.0

    count = history.count(Action.COOPERATE)

    return count / len(history)

In [4]:
# Alternative labelling kept from the original notebook:
# PAYOFF_LABELS = ["(C, C)", "(C, D)", "(D, C)", "(D, D)"]

PAYOFF_LABELS = ["R", "S", "T", "P"]

# Experiment configuration.
POPULATION = (2, 32)  # (communities, size), unpacked into generate_population
ROUNDS = 10_000  # rounds played per run
N_RUNS = 20  # number of independent runs, each with a fresh population

# Per-run results; one entry is appended to each list per run.
run_rewards = []
run_cumulative_rewards = []  # not populated in this cell; kept for other cells
run_action_histories = []
run_reward_histories = []
run_policies = []


def _describe_agent(label, ranked_entry, action_histories, policies):
    """Format one summary line: total reward, cooperativeness and policy.

    Args:
        label: Text printed before the colon (e.g. ``"BEST AGENT"``).
        ranked_entry: An ``(agent index, cumulative reward)`` pair.
        action_histories: Per-agent collections of action sequences; the
            sequences of the selected agent are flattened before scoring.
        policies: Per-agent policy strings, indexed by agent index.

    Returns:
        The formatted line.
    """
    index, total_reward = ranked_entry
    cooperativeness = calculate_cooperativeness(list(chain(*action_histories[index])))

    return f"{label}: {total_reward} ({cooperativeness}, {policies[index]})"


for run in range(N_RUNS):
    # Generate a new population
    agents, weights_matrix, G = generate_population(*POPULATION)

    # Run a number of rounds
    sim = WeightedNetworkSimulation(agents, weights_matrix)

    # Training phase ( // 2)
    sim.reset()
    for i in tqdm(range(ROUNDS), desc=f"Run {run + 1}"):
        sim.play_round(train=True)

    # Evaluation phase (currently disabled)
    # for agent in agents:
    #     agent.gameplay_strategy._epsilon = 0.0

    # sim.reset()
    # for i in tqdm(range(ROUNDS // 2), desc=f"Evaluation Run {run + 1}"):
    #     sim.play_round(train=False)

    # Store results
    run_rewards.append(sim.rewards)
    run_action_histories.append(sim.action_histories)
    run_reward_histories.append(sim.reward_histories)
    run_policies.append(compute_policies(agents))

    # Output statistics for the best and worst agents:
    # (agent index, cumulative reward) pairs sorted from highest to lowest.
    cumulative_rewards = [(i, sum(history)) for i, history in enumerate(sim.reward_histories)]
    cumulative_rewards.sort(key=lambda x: x[1], reverse=True)

    tqdm.write(
        _describe_agent("BEST AGENT", cumulative_rewards[0], sim.action_histories, run_policies[-1])
    )
    tqdm.write(
        _describe_agent("WORST AGENT", cumulative_rewards[-1], sim.action_histories, run_policies[-1])
    )


Run 1:   0%|          | 0/10000 [00:00<?, ?it/s]

BEST AGENT: 67622 (0.42354633855923796, DDDDCCDCCCDDCDDC)
WORST AGENT: 48497 (0.7995577000402091, DCCCDDDCDCCCDDCC)


Run 2:   0%|          | 0/10000 [00:00<?, ?it/s]

BEST AGENT: 67091 (0.3202536069092906, DCDDDCCCDCDDCCDD)
WORST AGENT: 45475 (0.8101221289641655, CDCCCDCCCCCCCDCC)


Run 3:   0%|          | 0/10000 [00:00<?, ?it/s]

BEST AGENT: 73048 (0.1774306248752246, DDDDDCCDDCDDCCDD)
WORST AGENT: 46066 (0.8316666666666667, CDCCCDCCCDCCCCCD)


Run 4:   0%|          | 0/10000 [00:00<?, ?it/s]

BEST AGENT: 67748 (0.22155267650898866, DCDDDCCCDCDDCCDD)
WORST AGENT: 44472 (0.8389605074687948, CDCCCDCCCCCCCCCC)


Run 5:   0%|          | 0/10000 [00:00<?, ?it/s]

BEST AGENT: 71991 (0.2144161935324611, DCDDDCCDDCDDCCDD)
WORST AGENT: 46818 (0.8113014049841042, CDCCCDCCCCCCCCCC)


Run 6:   0%|          | 0/10000 [00:00<?, ?it/s]

BEST AGENT: 74497 (0.2000300045006751, DDDDCCDCDCDDDCDC)
WORST AGENT: 44835 (0.8170018469115534, DCCCDCCCDCCCDCCC)


Run 7:   0%|          | 0/10000 [00:00<?, ?it/s]

BEST AGENT: 67704 (0.32433110787611064, DCDDDCDCCCDDDCDC)
WORST AGENT: 46857 (0.8345230369889682, DCCCDCCCDCCCCCCC)


Run 8:   0%|          | 0/10000 [00:00<?, ?it/s]

BEST AGENT: 68841 (0.3459408213163016, DCDDDCDCDCDDCDDC)
WORST AGENT: 42404 (0.7989133168989284, DCCCDDCCCCCCDCCC)


Run 9:   0%|          | 0/10000 [00:00<?, ?it/s]

BEST AGENT: 69176 (0.2996196576919227, DCDDDCDCDCDDDCDC)
WORST AGENT: 46720 (0.7956768142835648, DDCCDDCCCDCCCDCD)


Run 10:   0%|          | 0/10000 [00:00<?, ?it/s]

BEST AGENT: 65990 (0.37746225377462256, DDDCCCDDDCDCCCCC)
WORST AGENT: 41363 (0.8387550907536829, DDCCDCCCCCCCCCCC)


Run 11:   0%|          | 0/10000 [00:00<?, ?it/s]

BEST AGENT: 67946 (0.37733598409542746, DDDDCDDCCCDDDCDC)
WORST AGENT: 45267 (0.7954510448220978, DDCCDDDCCDCCCCCD)


Run 12:   0%|          | 0/10000 [00:00<?, ?it/s]

BEST AGENT: 62807 (0.4365015573194012, DCDDDCCCCCDDDDDC)
WORST AGENT: 45963 (0.805380305859789, DCCCDDCCDDCCDCCC)


Run 13:   0%|          | 0/10000 [00:00<?, ?it/s]

BEST AGENT: 72597 (0.21786927843294024, DCDDDCCCDCDDDCDC)
WORST AGENT: 45725 (0.8240606756149893, DCCCDDCCDCCCDCCC)


Run 14:   0%|          | 0/10000 [00:00<?, ?it/s]

BEST AGENT: 66706 (0.40729773137924113, DCDCDCCCCCDCCCCC)
WORST AGENT: 49724 (0.8427578621068946, CDCCCDCCCCCCCCCC)


Run 15:   0%|          | 0/10000 [00:00<?, ?it/s]

BEST AGENT: 70499 (0.20046957738035767, DDDDCDDDDCDDCCDD)
WORST AGENT: 43176 (0.82005295498826, DDCCDDCCCCCCCDCC)


Run 16:   0%|          | 0/10000 [00:00<?, ?it/s]

BEST AGENT: 66646 (0.3611754050518265, DCDDDDDCDCDDDDDC)
WORST AGENT: 46682 (0.8149544928847992, DCCCDDCCCCCCDCCC)


Run 17:   0%|          | 0/10000 [00:00<?, ?it/s]

BEST AGENT: 66561 (0.3836802081248749, DCDCCDDDCDDCCDCC)
WORST AGENT: 46215 (0.8448798925747252, DDCCCDCCCCCCCCCC)


Run 18:   0%|          | 0/10000 [00:00<?, ?it/s]

BEST AGENT: 73993 (0.18613357292602575, DDDDDDDCDDDDDCDC)
WORST AGENT: 47126 (0.8062748252803057, DCCCDDCCDDCCDCCC)


Run 19:   0%|          | 0/10000 [00:00<?, ?it/s]

BEST AGENT: 71027 (0.24811504468966894, DDDDDCCCDCDDDCDC)
WORST AGENT: 43804 (0.846012832263978, CDCCCCCCCCCCDCCC)


Run 20:   0%|          | 0/10000 [00:00<?, ?it/s]

BEST AGENT: 77031 (0.11573958593165377, DDDDDDDCDDDDDCDD)
WORST AGENT: 45954 (0.835759352178449, CDCCCDCCCCCCCCCC)


In [8]:
# Group, by final policy string, the within-run rank (1 = highest cumulative
# reward) and the cumulative reward of every agent across all runs.
cumulative_reward_policy_ranks = {}
cumulative_reward_policies = {}

for policies, reward_histories in zip(run_policies, run_reward_histories):
    # (agent index, cumulative reward) pairs, highest reward first.
    cumulative_rewards = sorted(
        [(i, sum(history)) for i, history in enumerate(reward_histories)],
        key=lambda entry: entry[1],
        reverse=True,
    )

    for rank, (i, cumulative_reward) in enumerate(cumulative_rewards):
        policy = policies[i]

        # Both dictionaries always receive the same keys, in the same order.
        cumulative_reward_policy_ranks.setdefault(policy, []).append(rank + 1)
        cumulative_reward_policies.setdefault(policy, []).append(cumulative_reward)

# Average each policy's ranks and rewards over all of its occurrences.
mean_cumulative_reward_policy_ranks = dict(
    (policy, np.mean(ranks)) for policy, ranks in cumulative_reward_policy_ranks.items()
)
mean_cumulative_reward_policies = dict(
    (policy, np.mean(rewards)) for policy, rewards in cumulative_reward_policies.items()
)

In [7]:
# List policies from the best (lowest) to the worst mean rank.
for policy, _ in sorted(mean_cumulative_reward_policy_ranks.items(), key=lambda item: item[1]):
    print(f"{mean_cumulative_reward_policy_ranks[policy]:<24} {policy}")

1.0                      DDDDCCDCCCDDCDDC
1.0                      DCDDDCCCDCDDCCDD
1.0                      DDDDDCCDDCDDCCDD
1.0                      DCDDDCCDDCDDCCDD
1.0                      DCDDDCDCCCDDDCDC
1.0                      DCDDDCDCDCDDCDDC
1.0                      DCDDDCDCDCDDDCDC
1.0                      DDDCCCDDDCDCCCCC
1.0                      DDDDCDDCCCDDDCDC
1.0                      DCDCDCCCCCDCCCCC
1.0                      DDDDCDDDDCDDCCDD
1.0                      DCDDDDDCDCDDDDDC
1.0                      DCDCCDDDCDDCCDCC
1.0                      DDDDDDDCDDDDDCDC
1.0                      DDDDDCCCDCDDDCDC
1.0                      DDDDDDDCDDDDDCDD
2.0                      DCDDDCCDDCDDCDDD
2.0                      DCDCDDDCDCDCDCCC
2.0                      DCDDDCDDDCDDCDDC
2.0                      DCDDDCDDDCDDCCDC
2.0                      CDCCCDDCCCDDCCDC
2.0                      DCDDDCCCCCDDDDDC
2.0                      DCDDDCDDDCDCDCCC
2.0                      DCCDDCCCD

In [10]:
# List policies from the highest to the lowest mean cumulative reward.
for policy, _ in sorted(mean_cumulative_reward_policies.items(), key=lambda item: item[1], reverse=True):
    print(f"{mean_cumulative_reward_policies[policy]:<24} {policy}")

77031.0                  DDDDDDDCDDDDDCDD
73993.0                  DDDDDDDCDDDDDCDC
73048.0                  DDDDDCCDDCDDCCDD
71991.0                  DCDDDCCDDCDDCCDD
71027.0                  DDDDDCCCDCDDDCDC
70666.0                  DDDCCCCCDCDCDCCC
70499.0                  DDDDCDDDDCDDCCDD
69176.0                  DCDDDCDCDCDDDCDC
68866.0                  DDDDCCDCDCDDDCDC
68841.0                  DCDDDCDCDCDDCDDC
68470.0                  DCDDDCDDDCDDCCDC
67946.0                  DDDDCDDCCCDDDCDC
67704.0                  DCDDDCDCCCDDDCDC
67622.0                  DDDDCCDCCCDDCDDC
67419.5                  DCDDDCCCDCDDCCDD
66842.0                  DCDDDCCDDCDDCDDD
66706.0                  DCDCDCCCCCDCCCCC
66646.0                  DCDDDDDCDCDDDDDC
66561.0                  DCDCCDDDCDDCCDCC
66475.83333333333        DCDDDCCCDCDDDCDC
66330.0                  DDDCDDCCDCDCDCCC
66086.0                  DCDDDCDDCCDDCCDC
65990.0                  DDDCCCDDDCDCCCCC
65741.0                  DCDDDDCDC