In [1]:
from scipy.optimize import linprog
import numpy as np


In [2]:
class AdversarialQAgent:
    """
    Tabular Q-learning agent for a game played against an opponent.

    The agent stores a Q-table indexed by (state, own action, opponent action),
    a state-value table V, and a mixed policy per state. The policy of a state
    is obtained from a linear program that maximizes the expected Q-value
    under the opponent action that is worst for the agent (maximin).
    """

    def __init__(self, states, actions, opponent_actions, alpha=0.1, gamma=0.9, decay=0.99):
        """
        Initialize the Adversarial Q-Learning agent.

        Args:
            states (list): List of possible states (used as dictionary keys).
            actions (list): List of possible actions for the agent.
            opponent_actions (list): List of possible actions for the opponent.
            alpha (float): Learning rate.
            gamma (float): Discount factor.
            decay (float): Multiplicative learning rate decay.
        """
        self.states = states
        self.actions = actions
        self.opponent_actions = opponent_actions
        self.alpha = alpha
        self.gamma = gamma
        self.decay = decay

        # Q[s][a][o] and V[s] start at zero; the policy starts uniform.
        self.Q = {s: {a: {o: 0.0 for o in opponent_actions} for a in actions} for s in states}
        self.V = {s: 0.0 for s in states}
        self.policy = {s: {a: 1.0 / len(actions) for a in actions} for s in states}

    def update_q_value(self, s, a, o, s_prime, reward):
        """
        Update the Q-value based on the reward and next state value.

        Applies Q[s][a][o] <- (1 - alpha) * Q[s][a][o]
                              + alpha * (reward + gamma * V[s_prime]).

        Args:
            s: Current state.
            a: Action taken by the agent.
            o: Action taken by the opponent.
            s_prime: Next state.
            reward (float): Received reward.

        Raises:
            KeyError: If s, a, o or s_prime is not a known state/action.
        """
        target = reward + self.gamma * self.V[s_prime]
        self.Q[s][a][o] = (1 - self.alpha) * self.Q[s][a][o] + self.alpha * target

    def compute_policy_lp(self, s):
        """
        Compute the maximin policy π[s, .] using linear programming.

        Solves
            maximize    v
            subject to  v <= Σ_a π[a] * Q[s, a, o]   for every opponent action o
                        Σ_a π[a] = 1,  π[a] >= 0.

        Args:
            s: The current state.

        Returns:
            dict: Maps each agent action to its probability. The values are
            plain Python floats, non-negative, and sum to 1.

        Raises:
            ValueError: If the linear program cannot be solved for state s.
        """
        num_actions = len(self.actions)

        # Decision vector x = [v, π[a_1], ..., π[a_n]]. linprog minimizes, so
        # maximizing v is expressed as minimizing -v.
        c = [-1.0] + [0.0] * num_actions

        # One row per opponent action o:  v - Σ_a π[a] * Q[s, a, o] <= 0
        A_ub = [
            [1.0] + [-self.Q[s][a][o] for a in self.actions]
            for o in self.opponent_actions
        ]
        b_ub = [0.0] * len(A_ub)

        # Probability distribution constraint: Σ_a π[a] = 1
        A_eq = [[0.0] + [1.0] * num_actions]
        b_eq = [1.0]

        # v is free; non-negativity of π is expressed through variable bounds
        # rather than through additional inequality rows.
        bounds = [(None, None)] + [(0.0, None)] * num_actions

        result = linprog(
            c,
            A_ub=A_ub or None,  # no opponent actions -> no inequality rows
            b_ub=b_ub or None,
            A_eq=A_eq,
            b_eq=b_eq,
            bounds=bounds,
            method='highs',
        )

        if not result.success:
            raise ValueError(f"Linear programming failed at state {s}")

        # Solver output can contain -0.0 or tiny negative values; clamp them to
        # +0.0 and renormalize so callers always receive a valid distribution.
        probs = [float(p) if p > 0.0 else 0.0 for p in result.x[1:]]
        total = sum(probs)
        if total <= 0.0:
            raise ValueError(f"Linear programming failed at state {s}")

        return {a: p / total for a, p in zip(self.actions, probs)}

    def update_policy_and_value(self, s):
        """
        Update the policy π[s, .] using linear programming and compute the state value V[s].

        V[s] is set to the smallest expected Q-value, over all opponent
        actions, obtained when the agent follows the new policy.

        Args:
            s: The state for which to update the policy and value function.

        Raises:
            ValueError: If the linear program cannot be solved for state s.
        """
        policy = self.compute_policy_lp(s)
        self.policy[s] = policy

        # Worst-case expected value over the opponent's actions.
        self.V[s] = min(
            sum(policy[a] * self.Q[s][a][o] for a in self.actions)
            for o in self.opponent_actions
        )

    def decay_learning_rate(self):
        """
        Decay the learning rate alpha by multiplying it with the decay factor.
        """
        self.alpha *= self.decay


In [3]:
# Example Usage
if __name__ == "__main__":
    # Toy problem: two states, and both players choose from the same move set.
    states = [0, 1]
    actions = ['N', 'S', 'E', 'W', 'Stand']
    opponent_actions = list(actions)

    agent = AdversarialQAgent(
        states=states,
        actions=actions,
        opponent_actions=opponent_actions,
    )

    # A single observed transition:
    # (state, own action, opponent action, next state, reward)
    state, action, opponent_action, next_state, reward = 0, 'N', 'S', 1, -1

    # One learning step: refresh the Q-entry, re-solve the policy and value of
    # the visited state, then shrink the learning rate.
    agent.update_q_value(s=state, a=action, o=opponent_action, s_prime=next_state, reward=reward)
    agent.update_policy_and_value(s=state)
    agent.decay_learning_rate()

    # Report what was learned for the visited state.
    print(f"Updated Q-value for (s={state}, a={action}, o={opponent_action}): {agent.Q[state][action][opponent_action]}")
    print(f"Optimal policy for state {state}: {agent.policy[state]}")
    print(f"Value function V[{state}]: {agent.V[state]}")

Updated Q-value for (s=0, a=N, o=S): -0.1
Optimal policy for state 0: {'N': -0.0, 'S': 1.0, 'E': -0.0, 'W': -0.0, 'Stand': -0.0}
Value function V[0]: 0.0
