In [None]:
import numpy as np
from numba import jit
import csv
import json
import sys
import matplotlib.pyplot as plt
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.distributions import Categorical

In [None]:
class policy_estimator():
    """Small feed-forward network mapping an agent's state to a new belief.

    The state has two components (the agent's current belief and its payoff)
    and the network emits a single non-negative number (the output layer is
    followed by a ReLU), which callers interpret as the agent's new belief.
    """

    def __init__(self):
        self.n_inputs = 2   # agent's belief, payoff
        self.n_outputs = 1  # new belief
        hidden_width = 16

        # Layers are created in forward order so that parameter
        # initialisation draws from the torch RNG in the same sequence.
        layers = [
            nn.Linear(self.n_inputs, hidden_width),
            nn.ReLU(),
            nn.Linear(hidden_width, self.n_outputs),
            nn.ReLU(),
        ]
        self.network = nn.Sequential(*layers)

    def predict(self, state):
        """Run the network on `state` and return the raw output tensor.

        `state` is converted with `torch.FloatTensor`, so it may be a single
        `[belief, payoff]` pair or a batch of such pairs.
        """
        state_tensor = torch.FloatTensor(state)
        return self.network(state_tensor)

In [None]:
def discount_rewards(rewards, gamma):
    """Return mean-centred, discounted reward-to-go values.

    Each reward at step ``t`` is first scaled by ``gamma**t``; entry ``t`` of
    the result is the sum of the scaled rewards from ``t`` to the end of the
    sequence, minus the mean of all such sums.
    """
    scaled = np.array([gamma**step * reward
                       for step, reward in enumerate(rewards)])
    # Suffix sums: flip, accumulate from the end, flip back.
    reward_to_go = np.flip(np.cumsum(np.flip(scaled)))
    return reward_to_go - np.mean(reward_to_go)

In [None]:
class Two_Agent_Model:
    """Two-player 2x2 game in which learning agents adapt a scalar belief.

    Strategy 0 is "cooperate" and strategy 1 is "defect". Each learning agent
    holds a belief (the weight it puts on the opponent playing strategy 0) and
    plays the strategy with the higher expected payoff under that belief.
    A policy network per learning agent proposes new beliefs; the networks are
    trained in `reinforce`.

    With two learning agents both players best-respond to their own belief;
    otherwise the second player always plays `fixed_strategy`.
    """

    def __init__(self, R, S, T, P, initial_belief, number_of_learning_agents, fixed_strategy):
        """Build the payoff matrix, beliefs, policies and optimizers.

        Args:
            R, S, T, P: payoffs; the matrix is ``[[R, S], [T, P]]`` indexed by
                (own strategy, opponent strategy).
            initial_belief: starting belief shared by all learning agents.
            number_of_learning_agents: 1 or 2 learning agents.
            fixed_strategy: strategy (0 or 1) of the non-learning opponent,
                used whenever there are not exactly two learning agents.
        """
        # 0 is cooperate
        # 1 is defect

        # Game payoff matrix
        self.game = np.array([[R, S], [T, P]])

        self.payoffs = np.zeros(2, dtype=float)
        self.fitnesses = np.zeros(2, dtype=float)

        self.belief = np.full(number_of_learning_agents, initial_belief, dtype=float)

        self.fixed_strategy = fixed_strategy
        self.number_of_learning_agents = number_of_learning_agents

        # One policy network and one Adam optimizer per learning agent.
        # All policies are created before any optimizer, as before.
        self.policies = [policy_estimator()
                         for _ in range(number_of_learning_agents)]
        self.optimizers = [optim.Adam(policy.network.parameters(), lr=0.01)
                           for policy in self.policies]

    def _best_response(self, index):
        """Return the strategy with the higher expected payoff for agent `index`.

        Ties (and NaN beliefs, for which the comparison is False) resolve to
        strategy 1.
        """
        opponent_mix = np.array([self.belief[index], 1.0 - self.belief[index]])
        cooperate_value = np.dot(self.game[0], opponent_mix)
        defect_value = np.dot(self.game[1], opponent_mix)
        return 0 if cooperate_value > defect_value else 1

    @staticmethod
    def _clip_action(action):
        """Force a proposed belief into the open interval (0, 1)."""
        if action >= 1:
            action = np.array([0.99])
        if action <= 0:
            action = np.array([0.01])
        return action

    def encounter(self, index_focal, index_other):
        """Play one round and return ``(payoff_focal, payoff_other)``.

        The focal agent always best-responds to its own belief. The other
        player does the same only when there are exactly two learning agents;
        otherwise it plays `self.fixed_strategy` and `index_other` is not used
        to read a belief.
        """
        choice_focal = self._best_response(index_focal)

        if self.number_of_learning_agents == 2:
            choice_other = self._best_response(index_other)
        else:
            choice_other = self.fixed_strategy

        return self.game[choice_focal, choice_other], self.game[choice_other, choice_focal]

    def compute_payoff(self, samples):
        """Store in `self.payoffs` the mean payoff of both players over `samples` rounds."""
        self.payoffs = np.zeros(2)
        focal_index = 0
        other_index = 1
        for _ in range(samples):
            payoff_focal, payoff_other = self.encounter(focal_index, other_index)
            self.payoffs[focal_index] = self.payoffs[focal_index] + payoff_focal
            self.payoffs[other_index] = self.payoffs[other_index] + payoff_other
        self.payoffs = self.payoffs / samples

    def _update_policy(self, agent, states, actions, returns):
        """Take one optimizer step for `agent` on a single episode's data.

        Args:
            agent: index of the learning agent to update.
            states: list of ``[belief, payoff]`` pairs visited in the episode.
            actions: list of the actions (size-1 arrays) taken in each state.
            returns: list of discounted, centred rewards for each step.
        """
        optimizer = self.optimizers[agent]
        optimizer.zero_grad()
        state_tensor = torch.FloatTensor(states)
        reward_tensor = torch.FloatTensor(returns)
        # Actions are used as indices, must be LongTensor
        # NOTE(review): the actions are continuous beliefs clipped to
        # [0.01, 0.99]; presumably the integer cast maps them all to index 0,
        # so the gather below always selects the single output column.
        # Confirm this is the intended objective.
        action_tensor = torch.LongTensor(actions)

        # Calculate loss
        logprob = torch.log(self.policies[agent].predict(state_tensor))

        if len(action_tensor.size()) == 1:
            action_tensor = action_tensor.unsqueeze(1)

        selected_logprobs = reward_tensor * \
            torch.gather(logprob, 1, action_tensor).squeeze()
        loss = -selected_logprobs.mean()

        # Calculate gradients
        loss.backward()
        # Apply gradients
        optimizer.step()

    def reinforce(self, policy_estimator, number_of_episodes, number_of_steps,
                  number_of_pretraining_episodes, number_of_pretraining_steps,
                  batch_size, gamma, epsilon):
        """Pretrain and then train the agents' policies.

        During pretraining each proposed belief is tried for one encounter and
        the agent's belief is then restored to its starting value, so
        pretraining updates the networks without moving the beliefs. During
        training the proposed belief (optionally replaced by uniform noise
        with probability `epsilon`) is kept.

        Args:
            policy_estimator: unused; kept for backward compatibility. The
                policies in `self.policies` are the ones trained.
            number_of_episodes: number of training episodes requested (see the
                NOTE(review) in the body: only the first one is executed).
            number_of_steps: steps per training episode.
            number_of_pretraining_episodes: number of pretraining episodes.
            number_of_pretraining_steps: steps per pretraining episode.
            batch_size: rounds averaged by `compute_payoff` at each step.
            gamma: discount factor passed to `discount_rewards`.
            epsilon: probability of replacing an action by uniform noise.

        Returns:
            Tuple ``(actual_rewards, beliefs, last_pretraining_action)``:
            per-agent episode reward totals divided by `number_of_steps`, the
            per-agent list of beliefs recorded at the start and after every
            episode, and the last action chosen during pretraining (``None``
            if no pretraining step was executed).
        """
        ep = 0
        n_agents = self.number_of_learning_agents

        # Per-agent histories.
        total_rewards = [[] for _ in range(n_agents)]
        beliefs = [[self.belief[agent]] for agent in range(n_agents)]

        # Snapshot (not an alias) so the pretraining reset below really
        # restores the starting beliefs.
        initial_belief = self.belief.copy()

        # Stays None when there is no pretraining step at all.
        last_pretraining_action = None

        # pre training
        for _ in range(number_of_pretraining_episodes):
            states = [[] for _ in range(n_agents)]
            rewards = [[] for _ in range(n_agents)]
            actions = [[] for _ in range(n_agents)]

            for _ in range(number_of_pretraining_steps):
                # Play a round of games
                self.compute_payoff(batch_size)

                for agent in range(n_agents):
                    current_state = [self.belief[agent], self.payoffs[agent]]

                    # Select action
                    action = self._clip_action(
                        self.policies[agent].predict(current_state).detach().numpy())

                    # Calculate reward under the proposed belief
                    self.belief[agent] = action.item()
                    payoff_rewards = self.encounter(0, 1)

                    states[agent].append(current_state)
                    rewards[agent].append(payoff_rewards[agent])
                    actions[agent].append(action)

                    # Pretraining must not move the belief.
                    self.belief[agent] = initial_belief[agent]
                    last_pretraining_action = action

            # update policies
            for agent in range(n_agents):
                beliefs[agent].append(self.belief[agent])
                self._update_policy(
                    agent, states[agent], actions[agent],
                    list(discount_rewards(rewards[agent], gamma)))

        for _ in range(number_of_episodes):
            states = [[] for _ in range(n_agents)]
            rewards = [[] for _ in range(n_agents)]
            actions = [[] for _ in range(n_agents)]

            for _ in range(number_of_steps):
                # Play a round of games
                self.compute_payoff(batch_size)

                for agent in range(n_agents):
                    current_state = [self.belief[agent], self.payoffs[agent]]

                    # Select action
                    action = self._clip_action(
                        self.policies[agent].predict(current_state).detach().numpy())

                    # Introduce some noise (the draw happens on every step so
                    # the random stream does not depend on epsilon)
                    x = np.random.uniform()
                    if x < epsilon:
                        action = np.random.uniform(size=1)
                    action = self._clip_action(action)

                    # Calculate reward; the new belief is kept
                    self.belief[agent] = action.item()
                    payoff_rewards = self.encounter(0, 1)

                    states[agent].append(current_state)
                    rewards[agent].append(payoff_rewards[agent])
                    actions[agent].append(action)

            # update policies
            for agent in range(n_agents):
                total_rewards[agent].append(sum(rewards[agent]))
                beliefs[agent].append(self.belief[agent])
                self._update_policy(
                    agent, states[agent], actions[agent],
                    list(discount_rewards(rewards[agent], gamma)))

                if (ep+1) % 10 == 0:
                    avg_rewards = np.mean(total_rewards[agent][-100:])
                    print('\rEpisode {}\tAverage reward of last 100: {:.2f}'.format(ep+1, avg_rewards))
                    print("Current belief ", self.belief[0])
                ep += 1

            # NOTE(review): this return sits inside the episode loop, so only
            # the first training episode is ever executed. It is preserved
            # because callers in this notebook size their result arrays for
            # exactly one training episode; confirm the intent before moving
            # it out of the loop.
            actual_rewards = np.array(total_rewards)/number_of_steps
            return actual_rewards, beliefs, last_pretraining_action

        # Reached only when number_of_episodes == 0: still return a tuple.
        actual_rewards = np.array(total_rewards)/number_of_steps
        return actual_rewards, beliefs, last_pretraining_action

In [None]:
def plot_results(beliefs):
    """Draw a belief trajectory against episode index and show the figure.

    The y-axis is fixed to the range [0, 1].
    """
    y_limits = [0, 1]
    plt.plot(beliefs)
    plt.ylim(y_limits)
    plt.xlabel('Episode')
    plt.ylabel('Beliefs')
    plt.show()

In [None]:
if __name__ == "__main__":
    # Monte Carlo study: each experiment repeats the simulation several times
    # and plots the belief trajectory averaged over the runs. Trajectories are
    # averaged at whatever length `reinforce` returns, so nothing here depends
    # on a hard-coded number of recorded episodes.

    NUMBER_OF_RUNS = 10

    # --- Experiment 1: one learning agent vs. a dummy that always plays 0 ---
    count_nonzero = 0
    count_zero = 0
    trajectories_all = []
    trajectories_nonzero = []
    for i in range(NUMBER_OF_RUNS):
        model = Two_Agent_Model(4, 1, 3, 2, 0.9, 1, 0)
        policy_est = policy_estimator()
        rewards, beliefs, last_pretraining_action = model.reinforce(policy_est, 100, 1000, 100, 1000, 10, 0.99, 0.00)
        beliefs = beliefs[0]
        # A last pretraining action of exactly 0.01 is the lower clipping
        # value, i.e. the network proposed a non-positive belief.
        if last_pretraining_action == 0.01:
            count_zero += 1
        else:
            count_nonzero += 1
            trajectories_nonzero.append(beliefs)
        trajectories_all.append(beliefs)
    mc_beleifs1 = np.mean(np.array(trajectories_all, dtype=float), axis=0)
    # Avoid a 0/0 average (an all-NaN curve) when every run collapsed.
    if count_nonzero > 0:
        mc_beleifs_nonzero = np.mean(np.array(trajectories_nonzero, dtype=float), axis=0)
    else:
        mc_beleifs_nonzero = None

    # --- Experiment 2: one learning agent vs. a dummy that always plays 1 ---
    # (this experiment uses 100 steps per episode, unlike the other two)
    trajectories_all = []
    for i in range(NUMBER_OF_RUNS):
        model = Two_Agent_Model(4, 1, 3, 2, 0.9, 1, 1)
        policy_est = policy_estimator()
        rewards, beliefs, last_pretraining_action = model.reinforce(policy_est, 100, 100, 100, 100, 10, 0.99, 0.00)
        beliefs = beliefs[0]
        trajectories_all.append(beliefs)
    mc_beleifs2 = np.mean(np.array(trajectories_all, dtype=float), axis=0)

    # --- Experiment 3: two learning agents ---
    trajectories_agent0 = []
    trajectories_agent1 = []
    for i in range(NUMBER_OF_RUNS):
        model = Two_Agent_Model(4, 1, 3, 2, 0.9, 2, 1)
        policy_est = policy_estimator()
        rewards, beliefs, last_pretraining_action = model.reinforce(policy_est, 100, 1000, 100, 1000, 10, 0.99, 0.00)
        trajectories_agent0.append(beliefs[0])
        trajectories_agent1.append(beliefs[1])
    mc_beleifs3 = np.mean(np.array(trajectories_agent0, dtype=float), axis=0)
    mc_beleifs4 = np.mean(np.array(trajectories_agent1, dtype=float), axis=0)

    print("1 learning agent, 1 dummy agent with fixed startegy 0")
    plot_results(mc_beleifs1)
    print("Number of non-defective mc runs: ", str(count_nonzero))
    print("Just non-defective mc runs")
    if mc_beleifs_nonzero is not None:
        plot_results(mc_beleifs_nonzero)
    else:
        print("No non-defective mc runs to plot")
    print("1 learning agent, 1 dummy agent with fixed startegy 1")
    plot_results(mc_beleifs2)
    print("2 learning agents")
    plot_results(mc_beleifs3)
    plot_results(mc_beleifs4)

In [None]:
# Disabled legacy entry point. Everything between the triple quotes is a bare
# string literal, so none of it executes. It refers to a `Model` class and a
# `run_simulation` method that are not defined in the visible cells.
'''
def main(config_file_path):
    with open(config_file_path, 'r') as config_file:
        config = json.load(config_file)
    model = Model(config["number_of_agents"],
                  config["R"], config["S"],
                  config["T"], config["P"],
                  config["tag0_initial_ingroup_belief"],
                  config["tag0_initial_outgroup_belief"],
                  config["tag1_initial_ingroup_belief"],
                  config["tag1_initial_outgroup_belief"],
                  config["initial_number_of_0_tags"])
    model.run_simulation(config["random_seed"], config["number_of_steps"],
                         config["rounds_per_step"],
                         config["selection_intensity"],
                         config["perturbation_probability"],
                         config["perturbation_scale"], config["data_recording"],
                         config["data_file_path"], config["write_frequency"])


if __name__ == "__main__":
    main(sys.argv[1])
'''