In [1]:
from itertools import chain
from random import random

import matplotlib.pyplot as plt
import numpy as np
from tqdm.notebook import tqdm

from society.action import Action
from society.agent import Agent, TrainableAgent
from society.simulations.weighted import WeightedNetworkSimulation
from society.strategies.gameplay.constant import AllC, AllD
from society.strategies.gameplay.qlearning import TabularQLearningGameplayStrategy
from society.strategies.gameplay.random import RandomGameplayStrategy
from society.strategies.gameplay.tft import TitForTat
from society.visualisation.network import *

plt.rcParams["figure.figsize"] = (10, 6)

In [2]:
def generate_population(communities, size):
    population = communities * size

    agents = [
        Agent(TabularQLearningGameplayStrategy(lookback=2, epsilon=0.2), i, population)
        for i in range(population)
    ]

    G = nx.connected_caveman_graph(communities, size)

    weights_matrix = np.zeros((population, population))
    for u, v, d in G.edges(data=True):
        weights_matrix[u, v] = weights_matrix[v, u] = 1.0
        try:
            d["weight"] = weights_matrix[u, v]
        except:
            d["weight"] = 0

    return agents, weights_matrix, G

In [3]:
def compute_policies(agents, rankings):
    policies = [
        (agents[partner[0]].gameplay_strategy._q_table.argmax(axis=-1), partner[1])
        for partner in rankings
    ]

    return [
        (
            "".join(
                [
                    ("C", "D")[policy[0][i, j]]  # , k
                    for i in range(4)
                    for j in range(4)
                    # for k in range(4)
                ]
            ),
            policy[1],
        )
        for policy in policies
    ]

def calculate_cooperativeness(history):
    count = history.count(Action.COOPERATE)

    return count / len(history)

In [4]:
# PAYOFF_LABELS = ["(C, C)", "(C, D)", "(D, C)", "(D, D)"]

PAYOFF_LABELS = ["R", "S", "T", "P"]

POPULATION = (2, 32)
ROUNDS = 10_000

cumulative_reward_results = []
# cooperativeness_results = []

for run in range(20):
    # Generate a new population
    agents, weights_matrix, G = generate_population(*POPULATION)

    # Run a number of rounds
    sim = WeightedNetworkSimulation(agents, weights_matrix)

    # Training phase
    sim.reset()
    for i in tqdm(range(ROUNDS // 2), desc=f"Training Run {run + 1}"):
        sim.play_round(train=True)

    # Evaluation phase
    for agent in agents:
        agent.gameplay_strategy._epsilon = 0.0

    sim.reset()
    for i in tqdm(range(ROUNDS // 2), desc=f"Evaluation Run {run + 1}"):
        sim.play_round(train=False)

    # Rank agents by cumulative reward
    cumulative_rewards_matrix = [
        [sum(r) if len(r) > 0 else 0 for r in agent] for agent in sim.rewards
    ]
    cumulative_reward_rankings = sorted(
        [(i, sum(cumulative_rewards_matrix[i])) for i in range(len(agents))],
        key=lambda x: x[1],
        reverse=True,
    )

    cumulative_reward_results.append(compute_policies(agents, cumulative_reward_rankings))

    # Rank agents by cooperativeness
    # cooperativeness_rankings = sorted(
    #     [
    #         (i, calculate_cooperativeness(list(chain(*history))))
    #         for i, history in enumerate(sim.action_histories)
    #     ],
    #     key=lambda x: x[1],
    #     reverse=True,
    # )

    tqdm.write(
        f"BEST REWARD: {cumulative_reward_rankings[0][1]} ({calculate_cooperativeness(list(chain(*sim.action_histories[cumulative_reward_rankings[0][0]])))}) \t\t\t WORST REWARD: {cumulative_reward_rankings[-1][1]} ({calculate_cooperativeness(list(chain(*sim.action_histories[cumulative_reward_rankings[-1][0]])))})"
    )

    # tqdm.write(
    #     f"BEST COOPERATOR: {cumul} ({cooperativeness_rankings[0][1]:<24}) \t\t\t WORST COOPERATOR: ({cooperativeness_rankings[-1][1]})"
    # )

    # cooperativeness_results.append(compute_policies(agents, cooperativeness_rankings))


Training Run 1:   0%|          | 0/5000 [00:00<?, ?it/s]

Evaluation Run 1:   0%|          | 0/5000 [00:00<?, ?it/s]

BEST REWARD: 43674 (0.0) 			 WORST REWARD: 21438 (1.0)


Training Run 2:   0%|          | 0/5000 [00:00<?, ?it/s]

Evaluation Run 2:   0%|          | 0/5000 [00:00<?, ?it/s]

BEST REWARD: 43091 (0.1330093235469153) 			 WORST REWARD: 23310 (1.0)


Training Run 3:   0%|          | 0/5000 [00:00<?, ?it/s]

Evaluation Run 3:   0%|          | 0/5000 [00:00<?, ?it/s]

BEST REWARD: 41000 (0.19441962069307914) 			 WORST REWARD: 22140 (1.0)


Training Run 4:   0%|          | 0/5000 [00:00<?, ?it/s]

Evaluation Run 4:   0%|          | 0/5000 [00:00<?, ?it/s]

BEST REWARD: 43173 (0.03988375588736346) 			 WORST REWARD: 22257 (1.0)


Training Run 5:   0%|          | 0/5000 [00:00<?, ?it/s]

Evaluation Run 5:   0%|          | 0/5000 [00:00<?, ?it/s]

BEST REWARD: 42441 (0.0) 			 WORST REWARD: 19530 (1.0)


Training Run 6:   0%|          | 0/5000 [00:00<?, ?it/s]

Evaluation Run 6:   0%|          | 0/5000 [00:00<?, ?it/s]

BEST REWARD: 38635 (0.2256743256743257) 			 WORST REWARD: 24441 (1.0)


Training Run 7:   0%|          | 0/5000 [00:00<?, ?it/s]

Evaluation Run 7:   0%|          | 0/5000 [00:00<?, ?it/s]

BEST REWARD: 43641 (0.06615930999492643) 			 WORST REWARD: 23982 (1.0)


Training Run 8:   0%|          | 0/5000 [00:00<?, ?it/s]

Evaluation Run 8:   0%|          | 0/5000 [00:00<?, ?it/s]

BEST REWARD: 46417 (0.006358668653750621) 			 WORST REWARD: 22803 (1.0)


Training Run 9:   0%|          | 0/5000 [00:00<?, ?it/s]

Evaluation Run 9:   0%|          | 0/5000 [00:00<?, ?it/s]

BEST REWARD: 42947 (0.0) 			 WORST REWARD: 21732 (1.0)


Training Run 10:   0%|          | 0/5000 [00:00<?, ?it/s]

Evaluation Run 10:   0%|          | 0/5000 [00:00<?, ?it/s]

BEST REWARD: 45636 (0.08793944826212528) 			 WORST REWARD: 23691 (1.0)


Training Run 11:   0%|          | 0/5000 [00:00<?, ?it/s]

Evaluation Run 11:   0%|          | 0/5000 [00:00<?, ?it/s]

BEST REWARD: 43982 (0.10058997050147493) 			 WORST REWARD: 19755 (1.0)


Training Run 12:   0%|          | 0/5000 [00:00<?, ?it/s]

Evaluation Run 12:   0%|          | 0/5000 [00:00<?, ?it/s]

BEST REWARD: 37346 (0.3092555331991952) 			 WORST REWARD: 24435 (1.0)


Training Run 13:   0%|          | 0/5000 [00:00<?, ?it/s]

Evaluation Run 13:   0%|          | 0/5000 [00:00<?, ?it/s]

BEST REWARD: 41935 (0.0) 			 WORST REWARD: 22623 (1.0)


Training Run 14:   0%|          | 0/5000 [00:00<?, ?it/s]

Evaluation Run 14:   0%|          | 0/5000 [00:00<?, ?it/s]

BEST REWARD: 40898 (0.15705864721750423) 			 WORST REWARD: 19512 (1.0)


Training Run 15:   0%|          | 0/5000 [00:00<?, ?it/s]

Evaluation Run 15:   0%|          | 0/5000 [00:00<?, ?it/s]

BEST REWARD: 46025 (0.0) 			 WORST REWARD: 22053 (1.0)


Training Run 16:   0%|          | 0/5000 [00:00<?, ?it/s]

Evaluation Run 16:   0%|          | 0/5000 [00:00<?, ?it/s]

BEST REWARD: 42952 (0.04110687788249449) 			 WORST REWARD: 22053 (1.0)


Training Run 17:   0%|          | 0/5000 [00:00<?, ?it/s]

Evaluation Run 17:   0%|          | 0/5000 [00:00<?, ?it/s]

BEST REWARD: 38095 (0.2576510333863275) 			 WORST REWARD: 24057 (1.0)


Training Run 18:   0%|          | 0/5000 [00:00<?, ?it/s]

Evaluation Run 18:   0%|          | 0/5000 [00:00<?, ?it/s]

BEST REWARD: 40868 (0.2005579356381389) 			 WORST REWARD: 24285 (1.0)


Training Run 19:   0%|          | 0/5000 [00:00<?, ?it/s]

Evaluation Run 19:   0%|          | 0/5000 [00:00<?, ?it/s]

BEST REWARD: 40549 (0.1863817097415507) 			 WORST REWARD: 22428 (1.0)


Training Run 20:   0%|          | 0/5000 [00:00<?, ?it/s]

Evaluation Run 20:   0%|          | 0/5000 [00:00<?, ?it/s]

BEST REWARD: 38006 (0.20423312268030896) 			 WORST REWARD: 19782 (1.0)


In [5]:
cumulative_reward_policy_ranks = {}

for result in cumulative_reward_results:
    for rank, (policy, score) in enumerate(result):
        if policy not in cumulative_reward_policy_ranks:
            cumulative_reward_policy_ranks[policy] = []

        cumulative_reward_policy_ranks[policy].append(rank + 1)

mean_cumulative_reward_policy_ranks = {policy: np.mean(ranks) for policy, ranks in cumulative_reward_policy_ranks.items()}

for policy in sorted(mean_cumulative_reward_policy_ranks, key=lambda x: mean_cumulative_reward_policy_ranks[x]):
    print(f"{mean_cumulative_reward_policy_ranks[policy]:<24} {policy}")

1.0                      DDDDCCCCDCDDCDDD
1.0                      DDDDDDDCCCDCCCDC
1.0                      DCDCCDDCDCDCDCDC
1.0                      DCDCCCDDDCDDDDDC
1.0                      DCDDDCDCCCDDCCDD
1.0                      DCDCCCCDCCDCDCDC
1.0                      DCDDCCCDCCDDCCDC
1.0                      DCDDDDDDCCDDDDDC
1.0                      DCDDDCDCDCDDCCDD
1.0                      DDDCCDCCCDDCDCDC
1.0                      DCDCCDDCDCDCDDCC
1.0                      DCDCCCDDDCDCCCCC
1.0                      DDDDCDDCCCDDCCDD
1.0                      DCDCCDCCCDDCDDCC
1.0                      DDDDCCDCCDDDCCDD
1.0                      DDDDCDCCCCDDDCDC
1.0                      DCDDDDCCCCDDCDCD
1.0                      DCDCCCDDCCDDCCDD
1.0                      DDDDDCCCDCDCCCCD
1.5                      DCDCDDCCDCDCDCCC
2.0                      DDDCCCCCCCDCDCDC
2.0                      DCDDDDCDDCDDCDDC
2.0                      DDDCDCCDDCCCCCCC
2.0                      DCDDDCCCC

In [6]:
cooperativeness_policy_ranks = {}
cooperativeness_policy_scores = {}

for result in cooperativeness_results:
    for rank, (policy, score) in enumerate(result):
        if policy not in cooperativeness_policy_ranks:
            cooperativeness_policy_ranks[policy] = []
            cooperativeness_policy_scores[policy] = []

        cooperativeness_policy_ranks[policy].append(rank + 1)
        cooperativeness_policy_scores[policy].append(score)        

mean_cooperativeness_policy_ranks = {policy: (np.mean(ranks), len(ranks)) for policy, ranks in cooperativeness_policy_ranks.items()}

mean_cooperativeness_policy_scores = {policy: (np.mean(scores), len(scores)) for policy, scores in cooperativeness_policy_scores.items()}

for policy in sorted(mean_cooperativeness_policy_ranks, key=lambda x: mean_cooperativeness_policy_ranks[x][0]):
    print(f"{mean_cooperativeness_policy_ranks[policy][0]:<24} {policy:<24} {mean_cooperativeness_policy_ranks[policy][1]:<8} {mean_cooperativeness_policy_scores[policy][0]}")

NameError: name 'cooperativeness_results' is not defined

In [None]:
for policy in sorted(mean_cooperativeness_policy_scores, key=lambda x: mean_cooperativeness_policy_scores[x][0], reverse=True):
    print(f"{mean_cooperativeness_policy_ranks[policy][0]:<24} {policy:<24} {mean_cooperativeness_policy_ranks[policy][1]:<8} {mean_cooperativeness_policy_scores[policy][0]}")