# Simulate the Boutilier Coordination Game

In [1]:
from environment import Boutilier
from agent import RandomAgent, QLearner
from HystereticQLearner import HystereticQLearner
from policy import epsilon_greedy
from utils import simulate_task
import matplotlib.pyplot as plt
import numpy as np
import random
random.seed()

# Deterministic

## Test Bed

In [2]:
# Simulate
## Parameters

exploration_rate = None
def discount_rate(_): return 0.9
def learning_rate(_): return 0.1
def decrease_rate(_): return 0.01
policy = epsilon_greedy(0.05)
temperature_start = None  # Replace T = num with T = temperature_start maybe?
timesteps = 10000
num_simulations = 200

# Check greedy joint policy

## k = 0

In [3]:
num_converge = 0
k = 0
for i in range(num_simulations):
    task = Boutilier(k)

    agents = [HystereticQLearner(task, exploration_rate, learning_rate, discount_rate, decrease_rate, 
                             policy=policy),
          HystereticQLearner(task, exploration_rate, learning_rate, discount_rate, decrease_rate, 
                             policy=policy)]
#     print("#------------------------#")
#     print("| Simulation {}          |".format(i))
#     print("#------------------------#")

    t = 0
    while(t < timesteps):
        task = Boutilier(0)
        t, _ = simulate_task(agents, task, t, None, timesteps=timesteps)
        
    def find_greedy_action_for_state(agent, state):
        return max(agent.q_values[state], key=agent.q_values[state].get)
    
    # Check if joint greedy action leads to optimal state
    task = Boutilier(0)
    state = 1
    joint_action = [find_greedy_action_for_state(agent, state) for agent in agents]
    state = task.get_new_state(joint_action, state)
    joint_action = [find_greedy_action_for_state(agent, state) for agent in agents]
    state = task.get_new_state(joint_action, state)

    if(state == 4):
        num_converge += 1

print("Percentage of simulations converged: {}%".format(num_converge / num_simulations * 100))

Percentage of simulations converged: 100.0%


## K = -100

In [4]:
num_converge = 0
k = -100
for i in range(num_simulations):
    task = Boutilier(k)

    agents = [HystereticQLearner(task, exploration_rate, learning_rate, discount_rate, decrease_rate, 
                             policy=policy),
          HystereticQLearner(task, exploration_rate, learning_rate, discount_rate, decrease_rate, 
                             policy=policy)]
#     print("#------------------------#")
#     print("| Simulation {}          |".format(i))
#     print("#------------------------#")

    t = 0
    while(t < timesteps):
        task = Boutilier(0)
        t, _ = simulate_task(agents, task, t, None, timesteps=timesteps)
        
    def find_greedy_action_for_state(agent, state):
        return max(agent.q_values[state], key=agent.q_values[state].get)
    
    # Check if joint greedy action leads to optimal state
    task = Boutilier(0)
    state = 1
    joint_action = [find_greedy_action_for_state(agent, state) for agent in agents]
    state = task.get_new_state(joint_action, state)
    joint_action = [find_greedy_action_for_state(agent, state) for agent in agents]
    state = task.get_new_state(joint_action, state)

    if(state == 4):
        num_converge += 1

print("Percentage of simulations converged: {}%".format(num_converge / num_simulations * 100))

Percentage of simulations converged: 100.0%


# Stochastic

In [5]:
# Simulate
## Parameters

exploration_rate = None
def discount_rate(_): return 0.9
def learning_rate(_): return 0.1
def decrease_rate(_): return 0.05
policy = epsilon_greedy(0.05)
temperature_start = None  # Replace T = num with T = temperature_start maybe?
timesteps = 10000
num_simulations = 200

# Check greedy joint policy

## k = 0

In [None]:
num_converge = 0
num_six = 0
k = 0
for i in range(num_simulations):
    task = Boutilier(k, part_stochastic=True)

    agents = [HystereticQLearner(task, exploration_rate, learning_rate, discount_rate, decrease_rate, 
                             policy=policy),
          HystereticQLearner(task, exploration_rate, learning_rate, discount_rate, decrease_rate, 
                             policy=policy)]
#     print("#------------------------#")
#     print("| Simulation {}          |".format(i))
#     print("#------------------------#")

    t = 0
    while(t < timesteps):
        task = Boutilier(k, part_stochastic=True)
        t, _ = simulate_task(agents, task, t, None, timesteps=timesteps)
        
    def find_greedy_action_for_state(agent, state):
        return max(agent.q_values[state], key=agent.q_values[state].get)
    
    # Check if joint greedy action leads to optimal state
    task = Boutilier(k, part_stochastic=True)
    state = 1
    joint_action = [find_greedy_action_for_state(agent, state) for agent in agents]
    state = task.get_new_state(joint_action, state)
    joint_action = [find_greedy_action_for_state(agent, state) for agent in agents]
    state = task.get_new_state(joint_action, state)

    if(state == 4):
        num_converge += 1
        
    elif(state == 6):
        num_six += 1

print("Percentage of simulations converged: {}%".format(num_converge / num_simulations * 100))
print("Percentage of simulations converged to sub-optimal nash equilibrium: {}%".format(num_six / num_simulations * 100))

## k = -100

In [None]:
num_converge = 0
num_six = 0
k = -100
for i in range(num_simulations):
    task = Boutilier(k, part_stochastic=True)

    agents = [HystereticQLearner(task, exploration_rate, learning_rate, discount_rate, decrease_rate, 
                             policy=policy),
          HystereticQLearner(task, exploration_rate, learning_rate, discount_rate, decrease_rate, 
                             policy=policy)]
#     print("#------------------------#")
#     print("| Simulation {}          |".format(i))
#     print("#------------------------#")

    t = 0
    while(t < timesteps):
        task = Boutilier(k, part_stochastic=True)
        t, _ = simulate_task(agents, task, t, None, timesteps=timesteps)
        
    def find_greedy_action_for_state(agent, state):
        return max(agent.q_values[state], key=agent.q_values[state].get)
    
    # Check if joint greedy action leads to optimal state
    task = Boutilier(k, part_stochastic=True)
    state = 1
    joint_action = [find_greedy_action_for_state(agent, state) for agent in agents]
    state = task.get_new_state(joint_action, state)
    joint_action = [find_greedy_action_for_state(agent, state) for agent in agents]
    state = task.get_new_state(joint_action, state)

    if(state == 4):
        num_converge += 1
    elif(state == 6):
        num_six += 1
print("Percentage of simulations converged to optimal: {}%".format(num_converge / num_simulations * 100))
print("Percentage of simulations converged to sub-optimal nash equilibrium: {}%".format(num_six / num_simulations * 100))

### Distributed Q-learning and stochastic

In [None]:
from agent import QLearner

# Simulate
## Parameters

exploration_rate = None
def discount_rate(_): return 0.9
def learning_rate(_): return 0.1

policy = epsilon_greedy(0.05)
temperature_start = None  # Replace T = num with T = temperature_start maybe?
timesteps = 10000
num_simulations = 200

# Check greedy joint policy

num_converge = 0
num_six = 0
k = -100
for i in range(num_simulations):
    task = Boutilier(k, part_stochastic=True)

    agents = [QLearner(task, exploration_rate, learning_rate, discount_rate, 
                             policy=policy),
          QLearner(task, exploration_rate, learning_rate, discount_rate, 
                             policy=policy)]
#     print("#------------------------#")
#     print("| Simulation {}          |".format(i))
#     print("#------------------------#")

    t = 0
    while(t < timesteps):
        task = Boutilier(k, part_stochastic=True)
        t, _ = simulate_task(agents, task, t, None, timesteps=timesteps)
        
    def find_greedy_action_for_state(agent, state):
        return max(agent.q_values[state], key=agent.q_values[state].get)
    
    # Check if joint greedy action leads to optimal state
    task = Boutilier(k, part_stochastic=True)
    state = 1
    joint_action = [find_greedy_action_for_state(agent, state) for agent in agents]
    state = task.get_new_state(joint_action, state)
    joint_action = [find_greedy_action_for_state(agent, state) for agent in agents]
    state = task.get_new_state(joint_action, state)

    if(state == 4):
        num_converge += 1
    elif(state == 6):
        num_six += 1
    
print("Percentage of simulations converged to optimal: {}%".format(num_converge / num_simulations * 100))
print("Percentage of simulations converged to sub-optimal nash equilibrium: {}%".format(num_six / num_simulations * 100))