In [3]:
import numpy as np
from scipy.optimize import linprog

In [4]:

def compute_policy(s, Q, agent_actions, opponent_actions):
    """
    Pick, among the candidate policies, the one with the best worst-case value.

    Every policy produced by generate_candidate_policies(agent_actions) is
    scored by the smallest expected Q-value the opponent can force, and the
    first policy achieving the strictly highest score is returned.

    Args:
        s (int): The current state (key into Q).
        Q (dict): Nested dictionary where Q[s][a][o] is the Q-value for
                  state s, agent action a and opponent action o.
        agent_actions (list): Actions available to the agent.
        opponent_actions (list): Actions available to the opponent.

    Returns:
        dict: The selected policy mapping each agent action to a probability,
              or None when no candidate scores above negative infinity
              (for instance when there are no candidates at all).
    """

    def guaranteed_value(pi):
        # Expected payoff of pi against each single opponent action.
        payoffs = (
            sum(pi[a] * Q[s][a][o] for a in agent_actions)
            for o in opponent_actions
        )
        # Seeding with +inf keeps the result defined when the opponent
        # has no actions, exactly like a running minimum started at +inf.
        return min([float('inf'), *payoffs])

    chosen, chosen_value = None, float('-inf')
    for pi in generate_candidate_policies(agent_actions):
        value = guaranteed_value(pi)
        # Strict comparison: on ties the earliest candidate is kept.
        if value > chosen_value:
            chosen, chosen_value = pi, value

    return chosen



In [5]:

def generate_candidate_policies(actions):
    """
    Build one deterministic policy per action.

    Each returned policy is a dictionary over all actions that puts
    probability 1.0 on a single action and 0.0 on every other one.
    Mixed (randomised) policies are not generated.

    Args:
        actions (list): List of possible actions.

    Returns:
        list: One policy dictionary per entry of `actions`, in the same order.
    """
    # Template with zero mass everywhere; copied (never shared) per policy.
    zero_everywhere = dict.fromkeys(actions, 0.0)
    return [
        {**zero_everywhere, actions[idx]: 1.0}
        for idx in range(len(actions))
    ]

In [6]:
def compute_policy_lp(s, Q, agent_actions, opponent_actions):
    """
    Compute the maximin mixed policy π[s, .] using linear programming.

    The program is solved over x = [v, π[a_1], ..., π[a_n]]:

        maximise   v
        subject to v <= sum_a π[a] * Q[s][a][o]   for every opponent action o
                   sum_a π[a] = 1
                   π[a] >= 0                      for every agent action a

    so v is the value the agent can guarantee against any opponent action.

    Args:
        s (int): The current state (key into Q).
        Q (dict): A nested dictionary Q[s][a][o], where Q[s][a][o] gives the
                  Q-value for state s, action a, and opponent action o.
        agent_actions (list): List of possible actions for the agent.
        opponent_actions (list): List of possible actions for the opponent.

    Returns:
        dict: A dictionary π[s, a] mapping each agent action to its
              probability as a plain float. Probabilities are never negative
              and never the negative zero that the solver may emit.

    Raises:
        ValueError: If the solver does not report success, for example when
                    the program is infeasible or unbounded.
    """
    num_actions = len(agent_actions)

    # linprog minimises, so maximising v means minimising -v.
    c = [-1] + [0] * num_actions

    # One row per opponent action: v - sum_a π[a] * Q[s][a][o] <= 0.
    A_ub = [
        [1] + [-Q[s][a][o] for a in agent_actions]
        for o in opponent_actions
    ]
    b_ub = [0] * len(A_ub)

    # Probabilities sum to one; v does not take part in this constraint.
    A_eq = [[0] + [1] * num_actions]
    b_eq = [1]

    # v is free, every probability is bounded below by zero. Using variable
    # bounds replaces one explicit inequality row per action.
    bounds = [(None, None)] + [(0, None)] * num_actions

    result = linprog(
        c,
        # An empty constraint list is passed as None so that the solver sees
        # "no inequality constraints" rather than a malformed matrix.
        A_ub=A_ub or None,
        b_ub=b_ub or None,
        A_eq=A_eq,
        b_eq=b_eq,
        bounds=bounds,
        method='highs',
    )

    if not result.success:
        raise ValueError("Linear programming failed to find a solution.")

    # max(0.0, p) clamps solver round-off below zero and, because 0.0 is the
    # first argument, also turns -0.0 into +0.0.
    return {
        a: max(0.0, float(prob))
        for a, prob in zip(agent_actions, result.x[1:])
    }

In [9]:
# Example usage
if __name__ == "__main__":
    # Define Q-values: Q[s][a][o] = Q-value for (state, action, opponent action)
    Q = {
        0: {  # State 0
            'N': {'N': 1, 'S': -1, 'E': 0, 'W': 2, 'Stand': 3},
            'S': {'N': 0, 'S': 2, 'E': -2, 'W': 1, 'Stand': -1},
            'E': {'N': -1, 'S': 3, 'E': 1, 'W': -2, 'Stand': 0},
            'W': {'N': 2, 'S': 1, 'E': -3, 'W': 0, 'Stand': 4},
            'Stand': {'N': 3, 'S': -2, 'E': 0, 'W': 1, 'Stand': 2},
        }
    }

    # Define possible actions for the agent and opponent
    actions = ['N', 'S', 'E', 'W', 'Stand']

    # Compute the policy for state 0
    state = 0

    # Enumeration over the candidate policies. Kept under its own name so it
    # is not overwritten by the LP result and stays available for comparison.
    pure_policy = compute_policy(state, Q, actions, actions)

    # Mixed policy obtained from the linear program; this is the one reported.
    optimal_policy = compute_policy_lp(state, Q, actions, actions)
    print(f"Optimal policy for state {state}: {optimal_policy}")

Optimal policy for state 0: {'N': 0.5384615384615384, 'S': -0.0, 'E': 0.3846153846153846, 'W': -0.0, 'Stand': 0.07692307692307693}


In [23]:
# Look up the probability stored under action 'N' in optimal_policy.
optimal_policy['N']

0.5384615384615384