In [None]:
# task:
# buyer prompt stage 1:
    # go and pick object (apple, orange)
    # cost: d(object)
    # reward: r(object)
    # immediate utility: U,B,1 from item i1 in {apple, orange} equals reward - cost (r(i,1) - d(i,1))

    # seller observes the buyer's choice and knows the travel cost

# seller prompt stage 2:
    # use observations from stage one to set price for future purchase of one of the two items m(i,3)
    # this requires inferences over buyer's preferences from observed action (selection of one item) such that prices are set in a way to maximise seller's reward/utility
    # this requires model of buyer's behavior

# buyer prompt stage 3:
    # buyer purchases one of the items for a price m(i,3) and then consumes it
    # again receives reward r(i,3), and utility now is U,B,3 = r(i,3) - m(i,3) 

# discounted accumulated utility is as follows
# buyer: U(i1, i3, d, m) = U,B,1(i1, d(i1)) + U,B,3(i3, m(i3)) 
# seller: U,S(i3) = m(i3)

# d(i1) is set by environment
# m(i3) is set by seller
# preferences sum to 10, walking distances sum to 10, and prices sum to 10

In [None]:
# k=-1 buyer agent
# RL
import numpy as np

# setup
apple_reward = np.random.uniform(0, 10) # Sample a reward for apple
orange_reward = 10 - apple_reward # Orange reward is complementary
apple_dist = np.random.uniform(0, 10) # Sample a distance for apple
orange_dist = 10 - apple_dist # Orange distance is complementary
# Actions
beta = 0.5 # Softmax temperature
apple_q = apple_reward - apple_dist # Q-value for apple
orange_q = orange_reward - orange_dist # Q-value for orange
# Compute softmax probabilities
exp_apple_q = np.exp(beta * apple_q)
exp_orange_q = np.exp(beta * orange_q)
apple_prob = exp_apple_q / (exp_apple_q + exp_orange_q)
orange_prob = exp_orange_q / (exp_apple_q + exp_orange_q)
# print 
print(f"Apple reward: {apple_reward}")
print(f"Orange reward: {orange_reward}")
print(f"Apple distance: {apple_dist}")
print(f"Orange distance: {orange_dist}")
print(f"Apple Q-value: {apple_q}")
print(f"Orange Q-value: {orange_q}")
print(f"Apple probability: {apple_prob}")
print(f"Orange probability: {orange_prob}")

# compute argmax and return apple/orange

if apple_q > orange_q:
    buyer_choice = "apple"
else:
    buyer_choice = "orange"
print(f"Choice: {buyer_choice}")

In [77]:
# k=0 seller agent
# IRL

# Infer posterior over buyer rewards
prior = np.random.uniform(0, 10) # Uniform prior over apple/orange reward
# Likelihood based on buyer policy
likelihood = np.array([apple_prob, orange_prob]) 
posterior = likelihood * prior # Bayes rule
apple_posterior = posterior[0]
orange_posterior = posterior[1]

# Expected profit for each price
apple_prices = np.arange(0, 10, 0.5)
orange_prices = np.arange(0, 10, 0.5)

apple_profits = []
for p in apple_prices:
  prob_purchase = 1 / (1 + np.exp(-apple_posterior + p))
  profit = p * prob_purchase
  apple_profits.append(profit)

orange_profits = []
for p in orange_prices:
  prob_purchase = 1 / (1 + np.exp(-orange_posterior + p))
  profit = p * prob_purchase
  orange_profits.append(profit)
  
# Choose profit maximizing price
apple_price = apple_prices[np.argmax(apple_profits)]
orange_price = orange_prices[np.argmax(orange_profits)]

print("Apple price:", apple_price)
print("Orange price:", orange_price)

Apple price: 4.0
Orange price: 3.5
