In [1]:
import math
import os
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
from math import ceil, floor
from collections import deque
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical
from torch.distributions import Normal

In [2]:
# Name of the directory where image files will be stored
IMAGE_DIRECTORY = 'images'

# Names of the possible game scenarios
SELFISH = 'selfish'
PROSOCIAL = 'prosocial'
MIXED = 'mixed'

# Create the image directory if it does not exist yet. `exist_ok=True` makes
# the call idempotent (safe under Restart & Run All) and avoids the race
# between a separate existence check and the creation.
os.makedirs(IMAGE_DIRECTORY, exist_ok=True)

In [3]:
class Environment:
    """Verbose placeholder environment for the pirate game.

    The observable state is a one-hot vector of length ``n_pirates`` whose
    single 1 sits at the index of the current decider (``decider_rank``).
    Every transition is traced to stdout.

    NOTE(review): this looks like a debugging stub. ``step`` only prints
    ``plan`` and ``votes``, hands out a constant reward of 1.0 to everyone,
    never reports an episode as done, and none of the methods here modify
    ``living_pirates`` after it is created.
    """

    def __init__(self, n_pirates=10, n_coins=5):
        """Store the game size and place the decider at rank 0."""
        self.n_pirates = n_pirates
        self.n_coins = n_coins
        self._point_state_at(0)
        # Boolean mask with one entry per pirate, all True initially.
        self.living_pirates = np.ones(n_pirates, dtype=bool)
        print(f"[INIT] n_pirates={n_pirates}, n_coins={n_coins}")
        print(f"[INIT] decider_rank={self.decider_rank}, state={self.state}")

    def _point_state_at(self, rank):
        """Make ``rank`` the decider and rebuild the one-hot state around it."""
        self.decider_rank = rank
        one_hot = np.zeros(self.n_pirates)
        one_hot[rank] = 1
        self.state = one_hot

    def reset(self):
        """Return to the initial configuration and return the state vector."""
        self._point_state_at(0)
        self.living_pirates = np.ones(self.n_pirates, dtype=bool)
        print(f"[RESET] decider_rank={self.decider_rank}, state={self.state}")
        return self.state

    def step(self, plan, votes):
        """Advance the decider by one rank.

        Args:
            plan: only printed.
            votes: only printed.

        Returns:
            ``(state, rewards, dones)``: the new one-hot state, a float array
            of ones and a boolean array of ``False``, each of length
            ``n_pirates``.
        """
        print(f"[STEP] decider_rank={self.decider_rank}")
        print(f"[STEP] plan={plan}")
        print(f"[STEP] votes={votes}")

        rewards = np.ones(self.n_pirates)
        dones = np.zeros(self.n_pirates, dtype=bool)

        # The decider moves one rank further but never past the last pirate.
        last_rank = self.n_pirates - 1
        self._point_state_at(min(self.decider_rank + 1, last_rank))

        print(f"[STEP] new decider_rank={self.decider_rank}, new state={self.state}")
        print(f"[STEP] rewards={rewards}, dones={dones}")

        return self.state, rewards, dones

In [4]:
# Build a 10-pirate, 5-coin environment and print its (default) representation.
env = Environment(n_pirates=10, n_coins=5)
print(env)

[INIT] n_pirates=10, n_coins=5
[INIT] decider_rank=0, state=[1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
<__main__.Environment object at 0x000001A918C03C10>


In [5]:
# state, rewards, dones = env.step(plan, votes)

In [6]:
# Use the first CUDA device when one is available, otherwise fall back to CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")


class Planner(nn.Module):
    """Placeholder policy that proposes which pirate receives each coin.

    No layers are defined: ``forward`` returns a constant tensor of ones, so
    the resulting policy is uniform over pirates for every coin. All calls
    are traced to stdout.
    """

    def __init__(self, n_pirates=10, n_coins=5):
        super().__init__()
        self.n_pirates = n_pirates
        self.n_coins = n_coins
        # Probability of sampling the plan from a uniform distribution
        # instead of the policy when `act` is called with add_noise=True.
        self.epsilon = 0.1
        print(f"[Planner INIT] n_pirates={n_pirates}, n_coins={n_coins}")

    def forward(self, x, decider_rank):
        """Return constant logits of shape ``(1, n_pirates * n_coins)``.

        ``x`` and ``decider_rank`` are only printed; they do not influence
        the output.
        """
        print(f"[Planner FORWARD] Input state: {x}")
        print(f"[Planner FORWARD] Decider rank: {decider_rank}")

        # Hard-coded output: a flat, uniform set of logits with no computation.
        n_logits = self.n_pirates * self.n_coins
        out = torch.ones(1, n_logits, device=device)
        print(f"[Planner FORWARD] Output logits: {out}")
        return out

    def act(self, state, add_noise=True):
        """Sample one pirate index per coin.

        Args:
            state: numpy vector; the index of its maximum is used as the
                decider rank.
            add_noise: when True, with probability ``epsilon`` the indices
                are drawn from a uniform distribution instead of the policy.

        Returns:
            ``(action, log_prob)``: two tensors with one entry per coin.

        NOTE(review): in the exploration branch ``log_prob`` is computed
        under the uniform distribution, not under the policy. Confirm that
        this is what the training loop expects.
        """
        decider_rank = np.argmax(state)
        print(f"[Planner ACT] Decider rank: {decider_rank}")
        print(f"[Planner ACT] Raw state: {state}")

        batch = torch.from_numpy(state).float().unsqueeze(0).to(device)
        flat_logits = self.forward(batch, decider_rank).cpu()
        # One row per coin, one column per pirate.
        per_coin_logits = flat_logits.view(-1, self.n_pirates)
        probs_total = per_coin_logits.softmax(dim=1)

        print(f"[Planner ACT] Softmax probabilities: {probs_total}")

        policy = Categorical(probs_total)

        explore = add_noise and np.random.rand() < self.epsilon
        if explore:
            print("[Planner ACT] Using uniform random action due to epsilon")
            proba = torch.ones([self.n_coins, self.n_pirates]) / self.n_pirates
            sampler = Categorical(proba)
        else:
            sampler = policy

        action = sampler.sample()
        log_prob = sampler.log_prob(action)

        print(f"[Planner ACT] Action: {action}")
        print(f"[Planner ACT] Log prob: {log_prob}")

        return action, log_prob

In [7]:
# Build the planner for a 10-pirate, 5-coin game and print its module summary.
planer = Planner(n_pirates=10, n_coins=5)
print(planer)

[Planner INIT] n_pirates=10, n_coins=5
Planner()


In [8]:
# Hard-coded state: a one-hot vector marking the first pirate as the decision maker.
state = np.zeros(10)
state[0] = 1

# Sample a plan with epsilon exploration switched off.
action, log_prob = planer.act(state=state, add_noise=False)

[Planner ACT] Decider rank: 0
[Planner ACT] Raw state: [1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[Planner FORWARD] Input state: tensor([[1., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])
[Planner FORWARD] Decider rank: 0
[Planner FORWARD] Output logits: tensor([[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
         1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
         1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]])
[Planner ACT] Softmax probabilities: tensor([[0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000,
         0.1000],
        [0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000,
         0.1000],
        [0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000,
         0.1000],
        [0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000,
         0.1000],
        [0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000,
         0.1000]])
[P

In [9]:
class Voter():
    """Rule-based voter that accepts or rejects a proposed coin split.

    ``V`` is a zero-initialised table of shape ``(n_pirates + 1, n_pirates)``.
    A selfish voter compares its share of the plan with the entry of ``V`` in
    the row after the current highest rank, which the trace labels as the
    "max reward if planner dies". All decisions are traced to stdout.

    NOTE(review): the defaults here are 5 pirates / 10 coins, whereas
    ``Environment`` and ``Planner`` default to 10 pirates / 5 coins.
    """

    def __init__(self, n_pirates=5, n_coins=10, scenario=SELFISH):
        self.scenario = scenario
        self.n_pirates = n_pirates
        self.n_coins = n_coins
        # Probability of casting a random vote when `act` runs with add_noise=True.
        self.epsilon = 0.01
        self.V = np.zeros([self.n_pirates + 1, self.n_pirates])
        print(f"[Voter INIT] n_pirates={n_pirates}, n_coins={n_coins}, scenario={scenario}")

    def act(self, voter_rank, plan, n_living_pirates, add_noise=True):
        """Return this voter's vote on ``plan`` as a bool.

        Args:
            voter_rank: index of the voting pirate; also its index in ``plan``.
            plan: indexable collection holding the share offered to each pirate.
            n_living_pirates: number of pirates still alive.
            add_noise: when True, with probability ``epsilon`` the vote is a
                fair coin flip.

        Raises:
            Exception: if the scenario is unknown, or if the selected
                behavior is prosocial (that rule is currently disabled).
        """
        print(f"\n[Voter ACT] Voter rank: {voter_rank}")
        print(f"[Voter ACT] Plan: {plan}")
        print(f"[Voter ACT] Living pirates: {n_living_pirates}")

        pirate_highest_rank = self.n_pirates - n_living_pirates
        print(f"[Voter ACT] Pirate highest rank: {pirate_highest_rank}")

        # Pick the behavior: pure scenarios map to themselves; in the mixed
        # scenario only the pirate at the highest rank is selfish.
        if self.scenario in (SELFISH, PROSOCIAL):
            behavior = self.scenario
        elif self.scenario == MIXED:
            behavior = SELFISH if voter_rank == pirate_highest_rank else PROSOCIAL
        else:
            raise Exception("Scenario %s not implemented" % self.scenario)

        print(f"[Voter ACT] Behavior: {behavior}")

        if add_noise and np.random.rand() < self.epsilon:
            print("[Voter ACT] Random noise triggered")
            coin_flip = Categorical(0.5 * torch.ones([1, 2]))
            action = coin_flip.sample()
        elif behavior == SELFISH:
            gain_plan = plan[voter_rank]
            reward_max = self.V[pirate_highest_rank + 1, voter_rank]
            print(f"[Voter ACT] Gain from plan: {gain_plan}")
            print(f"[Voter ACT] Max reward if planner dies: {reward_max}")
            # Accept when the plan beats the fallback reward, or when the
            # voter is offered every coin.
            action = gain_plan > reward_max or gain_plan == self.n_coins
        # Disabled prosocial rule, kept for reference (its helpers are not
        # defined in the visible part of the notebook):
        # elif behavior == PROSOCIAL:
        #     jain = get_jain_index(plan[-n_living_pirates:])
        #     jain_max = get_jain_index_max(n_living_pirates, self.n_coins)
        #     jain = round_float(jain, 2)
        #     jain_max = round_float(jain_max, 2)
        #     first_max = are_max_values_first(plan[-n_living_pirates:])
        #     print(f"[Voter ACT] Jain index: {jain}, Max Jain: {jain_max}")
        #     print(f"[Voter ACT] First max value: {first_max}")
        #     action = (jain == jain_max) and first_max
        else:
            raise Exception("Scenario %s not implemented" % self.scenario)

        vote = bool(action)
        print(f"[Voter ACT] Final vote: {vote}")
        return vote

In [10]:
# Selfish voter for a 5-pirate, 10-coin game.
voter = Voter(n_pirates=5, n_coins=10, scenario=SELFISH)
# Hypothetical payoffs each pirate would get if the planner dies. Row 1 is
# the row consulted while all five pirates are alive.
voter.V[1] = [1, 1, 2, 3, 3]

# Proposed split indexed by rank: pirate 0 gets 1 coin, pirate 1 gets 8,
# pirate 2 gets 1, and the remaining pirates get nothing.
plan = [1, 8, 1, 0, 0]
vote = voter.act(1, plan, 5)
print(f"Pirate 1 vote: {vote}")

[Voter INIT] n_pirates=5, n_coins=10, scenario=selfish

[Voter ACT] Voter rank: 1
[Voter ACT] Plan: [1, 8, 1, 0, 0]
[Voter ACT] Living pirates: 5
[Voter ACT] Pirate highest rank: 0
[Voter ACT] Behavior: selfish
[Voter ACT] Gain from plan: 8
[Voter ACT] Max reward if planner dies: 1.0
[Voter ACT] Final vote: True
Pirate 1 vote: True
