In [None]:
# task:
# buyer prompt stage 1:
    # go and pick object (apple, orange)
    # cost: d(object)
    # reward: r(object)
    # immediate utility: U,B,1 from item i1 in {apple, orange} equals reward - cost (r(i,1) - d(i,1))

    # seller observes the buyer's choice and knows the travel cost

# seller prompt stage 2:
    # use observations from stage one to set price for future purchase of one of the two items m(i,3)
    # this requires inferences over buyer's preferences from observed action (selection of one item) such that prices are set in a way to maximise seller's reward/utility
    # this requires model of buyer's behavior

# buyer prompt stage 3:
    # buyer purchases one of the items for a price m(i,3) and then consumes it
    # again receives reward r(i,3), and utility now is U,B,3 = r(i,3) - m(i,3) 

# discounted accumulated utility is as follows
# buyer: U(i1, i3, d, m) = U,B,1(i1, d(i1)) + U,B,3(i3, m(i3)) 
# seller: U,S(i3) = m(i3)

# d(i1) is set by environment
# m(i3) is set by seller
# preferences sum to 10, walking distances sum to 10, and prices sum to 10

In [171]:
import numpy as np

np.random.seed(42)

class Game():
    def __init__(
        self,
        _id: str = "0",
        max_size: int = 10,
        max_reward: int = 10,
        max_price: int = 10,
        cost: int = -1,  
    ):
        """
        Initializes buyer-seller a game.

        Args:
            _id (str): ID of the game.
            max_size (int): Maximum size of the environment.
            max_reward (int): Maximum reward.
            max_price (int): Maximum price.
            cost (int): Cost of travel for buyer (i.e. one step in environment)
        Returns:
            None
        """
        self._id = _id
        self.max_size = max_size
        self.max_reward = max_reward
        self.max_price = max_price
        self.cost = cost
        # environment (three rows, one for travel cost, rewards, buyer location each)
        self.env = np.zeros((3, self.max_size))
        # set travel cost
        self.env[0,:] = self.cost
        # set rewards
        r_i = np.random.uniform(self.max_reward)
        r_j = self.max_reward - r_i
        self.env[1,0], self.env[1,-1] = r_i, r_j
        # set buyer location
        buyer_location = np.random.randint(self.max_size)
        self.env[2,buyer_location] = 1

    @classmethod
    def create(
        cls, 
        _id: str = "0", 
        max_size: int = 10, 
        max_reward: int = 10, 
        max_price: int = 10, 
        cost: int = -1,
    ) -> "Game":
        """
        Creates and returns an instance of the Game class.

        Args:
            _id (str): ID of the game.
            max_size (int): Maximum size of the environment.
            max_reward (int): Maximum reward.
            max_price (int): Maximum price.
            cost (int): Cost of travel for buyer (i.e. one step in environment)
        Returns:
            Game: A new instance of the Game class.
        """
        return cls(_id, max_size, max_reward, max_price, cost)

game = Game.create()

class BuyerToM_NegOne():
    """ToM -1 Buyer using RL."""
    def __init__(
        self,
        game: Game,
    ) -> None:
        self.game = game
        self.cost = game.env[0,:]
        self.rewards = game.env[1,:]
        self.location = np.where(game.env[2,:] == 1)[0][0]
        # utilities
        self.u_1 = np.zeros(2)
        self.u_3 = np.zeros(2)
        self.u = self.u_1 + self.u_3
        # q values
        self.q_1 = np.zeros(2)
        self.q_3 = np.zeros(2)
        # actions 
        self.a_1 = np.zeros(2)
        self.a_3 = np.zeros(2)
        
    def computue_u_1(self):
        self.u_1[0] = self.rewards[0] + np.sum(self.cost[:self.location + 1])
        self.u_1[1] = self.rewards[-1] + np.sum(self.cost[self.location:])
    
    def compute_u_3(self, price):
        self.u_3[0] = self.rewards[0, 0] - price
        self.u_3[1] = self.rewards[0, 1] - price

    def compute_q_1(self):
        self.q_1 = self.u_1
    
    def compute_q_3(self):
        self.q_3 = self.u_3

    def compute_a_1(self, beta: float = 0.5):
        self.a_1 = beta * np.exp(self.q_1) / (beta * np.exp(self.q_1)).sum()
        
    def compute_a_3(self, beta: float = 0.5):
        self.a_3 = beta * np.exp(self.q_3) / (beta * np.exp(self.q_3)).sum()

    
buyer_tom_neg_one = BuyerToM_NegOne(game)
buyer_tom_neg_one.computue_u_1()
buyer_tom_neg_one.compute_q_1()
buyer_tom_neg_one.compute_a_1()
print(buyer_tom_neg_one.u_1)
print(buyer_tom_neg_one.q_1)
print(buyer_tom_neg_one.a_1)

[-1.37086107  0.37086107]
[-1.37086107  0.37086107]
[0.14909432 0.85090568]


In [None]:
class ToMBuyerOne():
    def __init__(
        self,
        env: 
        ):
    self.env = env

    # def compute_utility(self):
        



# # Stage 2: set price 
# m_i = np.random.uniform(MAX_PRICE)
# m_j = MAX_PRICE - m_i
# environment[4,0], environment[4,-1] = m_i, m_j

