In [1]:
from itertools import chain
from random import random

import matplotlib.pyplot as plt
import numpy as np
from tqdm.notebook import tqdm

from society.action import Action
from society.agent import Agent, TrainableAgent
from society.simulations.weighted import WeightedNetworkSimulation
from society.strategies.gameplay.constant import AllC, AllD
from society.strategies.gameplay.qlearning import TabularQLearningGameplayStrategy
from society.strategies.gameplay.random import RandomGameplayStrategy
from society.strategies.gameplay.tft import TitForTat
from society.visualisation.network import *

plt.rcParams["figure.figsize"] = (10, 6)

In [2]:
def generate_population(communities, size):
    population = communities * size

    agents = [
        Agent(TabularQLearningGameplayStrategy(lookback=3, epsilon=0.1, learning_rate=0.1), i, population)
        for i in range(population)
    ]

    G = nx.connected_caveman_graph(communities, size)

    weights_matrix = np.zeros((population, population))
    for u, v, d in G.edges(data=True):
        weights_matrix[u, v] = weights_matrix[v, u] = 1.0
        try:
            d["weight"] = weights_matrix[u, v]
        except:
            d["weight"] = 0

    return agents, weights_matrix, G

In [3]:
def compute_policies(agents):
    policies = [
        agent.gameplay_strategy._q_table.argmax(axis=-1)
        for agent in agents
    ]

    return [
        "".join(
            [
                ("C", "D")[policy[i, j, k]]
                for i in range(4)
                for j in range(4)
                for k in range(4)
            ]
        )
        for policy in policies
    ]

def calculate_cooperativeness(history):
    count = history.count(Action.COOPERATE)

    return count / len(history)

In [4]:
PAYOFF_LABELS = ["(C, C)", "(C, D)", "(D, C)", "(D, D)"]
# PAYOFF_LABELS = ["R", "S", "T", "P"]

In [5]:
POPULATION = (1, 16)
ROUNDS = 25_000

run_rewards = []
run_cumulative_rewards = []
run_action_histories = []
run_reward_histories = []
run_policies = []

for run in range(1):
    # Generate a new population
    agents, weights_matrix, G = generate_population(*POPULATION)

    # Run a number of rounds
    sim = WeightedNetworkSimulation(agents, weights_matrix)

    # Run the simulation
    sim.reset()
    for i in tqdm(range(ROUNDS), desc=f"Run {run + 1}"):
        sim.play_round(train=True)

    # Store results
    run_rewards.append(sim.rewards)
    run_action_histories.append(sim.action_histories)
    run_reward_histories.append(sim.reward_histories)
    run_policies.append(compute_policies(agents))

    # Output statistics for the best and worst agents
    cumulative_rewards = [(i, sum(history)) for i, history in enumerate(sim.reward_histories)]
    cumulative_rewards.sort(key=lambda x: x[1], reverse=True)

    # TODO: maybe I should switch to mean rewards (?)
    # mean_rewards = [(i, np.mean(history)) for i, history in enumerate(sim.reward_histories)]
    # mean_rewards.sort(key=lambda x: x[1], reverse=True)

    tqdm.write(
        f"BEST AGENT: {cumulative_rewards[0][1]} ({calculate_cooperativeness(list(chain(*sim.action_histories[cumulative_rewards[0][0]])))}, {run_policies[-1][cumulative_rewards[0][0]]})"
    )
    tqdm.write(
        f"WORST AGENT: {cumulative_rewards[-1][1]} ({calculate_cooperativeness(list(chain(*sim.action_histories[cumulative_rewards[-1][0]])))}, {run_policies[-1][cumulative_rewards[-1][0]]})"
    )

    # for rh in sim.reward_histories:
    #     plt.plot(np.cumsum(rh))
    
    # plt.show()


Run 1:   0%|          | 0/25000 [00:00<?, ?it/s]

BEST AGENT: 151531 (0.8748782185107864, CCCCDCCCCCCCCCCCCCCCCCCCCCCCCCCCDCCCDCCCCCCCCDCCCCCCCCCCCCCCCCCC)
WORST AGENT: 143615 (0.937342061802336, DCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCDCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC)


In [6]:
agents[cumulative_rewards[0][0]].gameplay_strategy._q_table

array([[[[258.85244791, 256.07213307],
         [263.14667497, 176.88293418],
         [267.55473068, 255.29884888],
         [249.42754356,  37.11568336]],

        [[254.43528773, 269.29796889],
         [248.06996277,  61.00552607],
         [238.01940145,  11.24905851],
         [ 65.01964217,   0.        ]],

        [[267.50826318, 265.91731419],
         [265.85541276,  77.60073739],
         [262.28001163,  92.38964146],
         [126.80438918,   0.5       ]],

        [[257.17937024,  98.03574845],
         [209.38802128,  10.19324086],
         [ 89.04572376,   3.49449315],
         [ 17.01751472,   0.        ]]],


       [[[258.36292192, 211.21590507],
         [212.04746573,   0.        ],
         [267.92187826, 246.38918269],
         [239.53263243,   8.0254341 ]],

        [[256.78407515, 172.78224022],
         [154.2284775 ,   6.27062235],
         [152.45109014,   0.        ],
         [  8.17536402,   0.        ]],

        [[263.79925288, 112.84056938],
         [1

In [7]:
from datetime import datetime
import pickle

with open(f"results - {datetime.isoformat(datetime.now()).replace(':', '-')}.pickle", "wb") as f:
    pickle.dump({
        "rewards": run_rewards,
        "cumulative_rewards": run_cumulative_rewards,
        # "action_histories": run_action_histories,
        "reward_histories": run_reward_histories,
        "policies": run_policies,
    }, f)


In [8]:
cumulative_reward_policy_ranks = {}
cumulative_reward_policies = {}

for policies, reward_histories in zip(run_policies, run_reward_histories):
    cumulative_rewards = [(i, sum(history)) for i, history in enumerate(reward_histories)]
    cumulative_rewards.sort(key=lambda x: x[1], reverse=True)

    for rank, (i, cumulative_reward) in enumerate(cumulative_rewards):
        policy = policies[i]

        if policy not in cumulative_reward_policy_ranks:
            cumulative_reward_policy_ranks[policy] = []
            cumulative_reward_policies[policy] = []

        cumulative_reward_policy_ranks[policy].append(rank + 1)
        cumulative_reward_policies[policy].append(cumulative_reward)

mean_cumulative_reward_policy_ranks = {policy: np.mean(ranks) for policy, ranks in cumulative_reward_policy_ranks.items()}
mean_cumulative_reward_policies = {policy: np.mean(rewards) for policy, rewards in cumulative_reward_policies.items()}

In [9]:
for policy in sorted(mean_cumulative_reward_policy_ranks, key=lambda x: mean_cumulative_reward_policy_ranks[x]):
    print(f"{mean_cumulative_reward_policy_ranks[policy]:<24} {policy}")

1.0                      CCCCDCCCCCCCCCCCCCCCCCCCCCCCCCCCDCCCDCCCCCCCCDCCCCCCCCCCCCCCCCCC
2.0                      DDDCCDCCCCCCCCCCCCCCCCCCCCCCCCCCDCCCCCCCCCCCCCCCCDCCCCDCCCCCCCCC
3.0                      DCCDCCCCCCCCCCCCCCCCCCCCCCCCCCCCDCCDCCCCCCCCCCCCCCDCCCCCCCCCCCCC
4.0                      DCCCCCCCCDCCCCCCCCCCCCCDCCCCDCCCDCCDCCCCCDCCCCCCCCCCCCCCCCCCCCCC
5.0                      CCCCDCCCCDCDCCCCCCCCCCCCCCCCCDCCCCCCCCCCCDCCCCCCCCCCCCCCCCCCDCCC
6.0                      DCCCDCCCCCCCCCCCCCCCCCCCCCCCCCCCDCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
7.0                      CCCCCCCCCCCCCDCCCCCCCCCCCCCCCCCCDCCCCCCCCCCCCCCCDCCCCCCCCCCCCCCC
8.0                      DCCCCCCCCCCCCCCCCCCCCCCDCCCCCCCCDCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
9.0                      CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
10.0                     DCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCDCCCCCCCCCCCDCDCCCCCDCCCDCCCCCCC
11.0                     DCCCCCCCCCCCCCCCDCCCCCCCCCDCCCCCDCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
12.0      

In [10]:
for policy in sorted(mean_cumulative_reward_policies, key=lambda x: mean_cumulative_reward_policies[x], reverse=True):
    print(f"{mean_cumulative_reward_policies[policy]:<24} {policy}")

151531.0                 CCCCDCCCCCCCCCCCCCCCCCCCCCCCCCCCDCCCDCCCCCCCCDCCCCCCCCCCCCCCCCCC
150599.0                 DDDCCDCCCCCCCCCCCCCCCCCCCCCCCCCCDCCCCCCCCCCCCCCCCDCCCCDCCCCCCCCC
146640.0                 DCCDCCCCCCCCCCCCCCCCCCCCCCCCCCCCDCCDCCCCCCCCCCCCCCDCCCCCCCCCCCCC
145859.0                 DCCCCCCCCDCCCCCCCCCCCCCDCCCCDCCCDCCDCCCCCDCCCCCCCCCCCCCCCCCCCCCC
145435.0                 CCCCDCCCCDCDCCCCCCCCCCCCCCCCCDCCCCCCCCCCCDCCCCCCCCCCCCCCCCCCDCCC
145373.0                 DCCCDCCCCCCCCCCCCCCCCCCCCCCCCCCCDCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
145288.0                 CCCCCCCCCCCCCDCCCCCCCCCCCCCCCCCCDCCCCCCCCCCCCCCCDCCCCCCCCCCCCCCC
145173.0                 DCCCCCCCCCCCCCCCCCCCCCCDCCCCCCCCDCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
144777.0                 CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
144514.0                 DCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCDCCCCCCCCCCCDCDCCCCCDCCCDCCCCCCC
144352.0                 DCCCCCCCCCCCCCCCDCCCCCCCCCDCCCCCDCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
144054.0  

In [11]:
compute_policies([agents[cumulative_rewards[0][0]]])

['CCCCDCCCCCCCCCCCCCCCCCCCCCCCCCCCDCCCDCCCCCCCCDCCCCCCCCCCCCCCCCCC']

In [12]:
agents[cumulative_rewards[0][0]].gameplay_strategy._q_table

array([[[[258.85244791, 256.07213307],
         [263.14667497, 176.88293418],
         [267.55473068, 255.29884888],
         [249.42754356,  37.11568336]],

        [[254.43528773, 269.29796889],
         [248.06996277,  61.00552607],
         [238.01940145,  11.24905851],
         [ 65.01964217,   0.        ]],

        [[267.50826318, 265.91731419],
         [265.85541276,  77.60073739],
         [262.28001163,  92.38964146],
         [126.80438918,   0.5       ]],

        [[257.17937024,  98.03574845],
         [209.38802128,  10.19324086],
         [ 89.04572376,   3.49449315],
         [ 17.01751472,   0.        ]]],


       [[[258.36292192, 211.21590507],
         [212.04746573,   0.        ],
         [267.92187826, 246.38918269],
         [239.53263243,   8.0254341 ]],

        [[256.78407515, 172.78224022],
         [154.2284775 ,   6.27062235],
         [152.45109014,   0.        ],
         [  8.17536402,   0.        ]],

        [[263.79925288, 112.84056938],
         [1

In [13]:
from itertools import product

P = compute_policies([agents[cumulative_rewards[0][0]]])[0]

for m, p in zip(product(PAYOFF_LABELS, PAYOFF_LABELS, PAYOFF_LABELS, PAYOFF_LABELS), P):
    print(m, "=>", p)

('(C, C)', '(C, C)', '(C, C)', '(C, C)') => C
('(C, C)', '(C, C)', '(C, C)', '(C, D)') => C
('(C, C)', '(C, C)', '(C, C)', '(D, C)') => C
('(C, C)', '(C, C)', '(C, C)', '(D, D)') => C
('(C, C)', '(C, C)', '(C, D)', '(C, C)') => D
('(C, C)', '(C, C)', '(C, D)', '(C, D)') => C
('(C, C)', '(C, C)', '(C, D)', '(D, C)') => C
('(C, C)', '(C, C)', '(C, D)', '(D, D)') => C
('(C, C)', '(C, C)', '(D, C)', '(C, C)') => C
('(C, C)', '(C, C)', '(D, C)', '(C, D)') => C
('(C, C)', '(C, C)', '(D, C)', '(D, C)') => C
('(C, C)', '(C, C)', '(D, C)', '(D, D)') => C
('(C, C)', '(C, C)', '(D, D)', '(C, C)') => C
('(C, C)', '(C, C)', '(D, D)', '(C, D)') => C
('(C, C)', '(C, C)', '(D, D)', '(D, C)') => C
('(C, C)', '(C, C)', '(D, D)', '(D, D)') => C
('(C, C)', '(C, D)', '(C, C)', '(C, C)') => C
('(C, C)', '(C, D)', '(C, C)', '(C, D)') => C
('(C, C)', '(C, D)', '(C, C)', '(D, C)') => C
('(C, C)', '(C, D)', '(C, C)', '(D, D)') => C
('(C, C)', '(C, D)', '(C, D)', '(C, C)') => C
('(C, C)', '(C, D)', '(C, D)', '(C