In [None]:
import numpy as np
from enum import Enum, auto
from dataclasses import dataclass
from typing import Dict, Tuple, List

In [None]:
# helpful enums

# Kinds of object the robot can be asked to handle: bottle, can, or glass.
class ObjectType(Enum):
    """Object categories; auto() numbers the members 1, 2, 3 in declaration order."""

    BOTTLE = auto()
    CAN = auto()
    GLASS = auto()

# Possible outcomes of a single robot attempt.
class OutcomeType(Enum):
    """Result of one attempt: the robot succeeds, the robot fails, or the human intervenes."""

    # robot succeeds
    SP_SUCCESS = 0
    # robot fails
    SP_FAIL = 1
    # human intervenes
    IT = 2

# Discrete trust levels 1..7 (inclusive) used as the hidden-state grid.
TRUST_LEVELS = list(range(1, 8))

In [None]:
# reward table

# Column order of each payoff row below.
_OUTCOME_COLUMNS = (OutcomeType.SP_SUCCESS, OutcomeType.SP_FAIL, OutcomeType.IT)

# One row per object: (robot succeeds, robot fails, human intervenes).
_PAYOFF_ROWS = (
    (ObjectType.BOTTLE, (1.0, 0.0, 0.0)),
    (ObjectType.CAN, (2.0, -4.0, 0.0)),
    (ObjectType.GLASS, (3.0, -9.0, 0.0)),
)

# Flattened lookup keyed by (object, outcome).
REWARD_TABLE: Dict[Tuple[ObjectType, OutcomeType], float] = {
    (obj, outcome): payoff
    for obj, payoffs in _PAYOFF_ROWS
    for outcome, payoff in zip(_OUTCOME_COLUMNS, payoffs)
}

In [None]:
def sigmoid(x):
  """Logistic function 1 / (1 + e^(-x)) for a scalar or NumPy array x."""
  decay = np.exp(-x)
  return 1 / (1 + decay)

class HumanDecisionModel:
  """Model of whether the human lets the robot act on an object or intervenes.

  Attributes:
    gamma: maps object type to the coefficient that multiplies the trust level
      in success_belief --> must learn this from data.
    eta: maps object type to the additive term in success_belief (the
      'risk penalty') --> must learn this from data.
  """

  def __init__(self, gamma=None, eta=None):
    """Create a model, optionally supplying its per-object parameters.

    Both arguments default to empty mappings, so constructing the model with
    no arguments and assigning `gamma` / `eta` afterwards still works. The
    supplied mappings are copied so later changes to the caller's dicts do not
    leak into the model.

    Args:
      gamma: optional mapping object type -> coefficient on trust.
      eta: optional mapping object type -> additive offset.
    """
    self.gamma = dict(gamma) if gamma is not None else {}
    self.eta = dict(eta) if eta is not None else {}

  # does the human think the robot will succeed?
  def success_belief(self, theta, obj):
    """Return sigmoid(gamma[obj] * theta + eta[obj]).

    Raises:
      KeyError: if `obj` has no entry in `gamma` or `eta`.
    """
    g = self.gamma[obj]
    n = self.eta[obj]

    return sigmoid(g * theta + n)

  # what is the probability the human will stay put / not intervene?
  def prob_stay_put(self, theta, obj):
    """Return the probability that the human does not intervene.

    This is the sigmoid of the expected reward of staying put: the success and
    failure rewards for `obj` from REWARD_TABLE, weighted by success_belief.
    """
    b_tj = self.success_belief(theta, obj)
    rS = REWARD_TABLE[(obj, OutcomeType.SP_SUCCESS)]
    rF = REWARD_TABLE[(obj, OutcomeType.SP_FAIL)]

    return sigmoid(b_tj * rS + (1 - b_tj) * rF)

  # now we can calculate the human action!
  def sample_human_action(self, theta, obj):
    """Sample the human's choice for one attempt.

    Returns:
      None when the human stays put, OutcomeType.IT when the human intervenes.
      Draws one number from NumPy's global random state.
    """
    p_stay = self.prob_stay_put(theta, obj)
    if np.random.rand() < p_stay:
      return None
    return OutcomeType.IT

In [None]:
human_model = HumanDecisionModel()

# Bayesian inference for the probability of each observation
def belief(state_theta, action_obj, observation):
    """Probability of `observation` given trust level `state_theta` and object `action_obj`.

    Reads the module-level `human_model` at call time. SP_SUCCESS maps to the
    stay-put probability; every other observation is treated as an
    intervention (a robot failure is not modelled yet).
    """
    stay_prob = human_model.prob_stay_put(state_theta, action_obj)
    return stay_prob if observation == OutcomeType.SP_SUCCESS else 1 - stay_prob

In [None]:
from scipy.stats import norm

def belief_dist(theta_t, action, observation, theta_next=None, alpha=1, beta=.2, sigma=1):
    """Discretised Gaussian transition model over the next trust level.

    The next trust level is modelled as N(mean, sigma) where
    mean = theta_t + alpha after SP_SUCCESS and theta_t - beta otherwise.
    Each level in TRUST_LEVELS receives the Gaussian mass of the unit-wide bin
    centred on it, and the result is renormalised over the grid.

    Args:
        theta_t: current trust level.
        action: unused; kept so callers can pass the chosen object.
        observation: an OutcomeType; only SP_SUCCESS raises trust.
        theta_next: if given, return only that level's probability.
        alpha: trust gain after a success.
        beta: trust loss after any other observation.
        sigma: standard deviation of the transition noise (must be > 0).

    Returns:
        A dict {trust level: probability}, or a single probability when
        `theta_next` is given.

    Raises:
        ValueError: if `sigma` is not positive.
        KeyError: if `theta_next` is not in TRUST_LEVELS.
    """
    if sigma <= 0:
        raise ValueError(f"sigma must be positive, got {sigma}")

    # if the robot succeeds, trust increases
    if observation == OutcomeType.SP_SUCCESS:
        mean = theta_t + alpha
    else:  # otherwise, trust decreases slightly (by beta)
        mean = theta_t - beta

    # calculate probabilities for all trust levels
    # ex. if the robot succeeds, higher trust levels become more probable
    probs = {}
    for theta in TRUST_LEVELS:
        lower = theta - 0.5
        upper = theta + 0.5
        probs[theta] = norm.cdf(upper, loc=mean, scale=sigma) - norm.cdf(lower, loc=mean, scale=sigma)

    total = sum(probs.values())
    if total > 0:
        probs = {theta: p / total for theta, p in probs.items()}
    else:
        # The Gaussian puts no representable mass on the grid (mean far outside
        # it with a tiny sigma). Dividing by zero would give NaN; the limit of
        # the renormalised distribution is a point mass on the nearest level.
        nearest = min(TRUST_LEVELS, key=lambda theta: abs(theta - mean))
        probs = {theta: (1.0 if theta == nearest else 0.0) for theta in TRUST_LEVELS}

    if theta_next is not None:
        return probs[theta_next]
    return probs

In [None]:
def belief_update(belief, action, observation, human_model):
    """Bayes-filter update of the belief over the human's trust level.

    The human's decision is driven by the trust level *before* the transition:
    simulate_task samples the observation from the current true trust and only
    then moves it, and the planner computes P(obs) from the current belief.
    The likelihood therefore weights the prior, and the weighted mass is then
    pushed through the transition model:

        b'(theta') ∝ sum_theta b(theta) * P(obs | theta, action) * P(theta' | theta, obs)

    Args:
        belief: dict {trust level: probability} for the current step.
        action: the object the robot acted on.
        observation: an OutcomeType; SP_SUCCESS means the human stayed put,
            anything else is treated as an intervention.
        human_model: object exposing prob_stay_put(theta, action).

    Returns:
        A new normalised dict {trust level: probability}.

    Raises:
        ValueError: if the observation has zero probability under `belief`,
            so no posterior can be formed.
    """
    stayed_put = observation == OutcomeType.SP_SUCCESS

    new_belief = {theta_next: 0.0 for theta_next in TRUST_LEVELS}
    for theta_curr in TRUST_LEVELS:
        p_stay = human_model.prob_stay_put(theta_curr, action)
        obs_prob = p_stay if stayed_put else 1 - p_stay
        weight = belief[theta_curr] * obs_prob
        if weight == 0:
            continue

        # The transition row depends only on theta_curr, so compute it once
        # per current level rather than once per (current, next) pair.
        trans_probs = belief_dist(theta_curr, action, observation)
        for theta_next in TRUST_LEVELS:
            new_belief[theta_next] += weight * trans_probs[theta_next]

    total = sum(new_belief.values())
    if not total > 0:
        raise ValueError(
            f"observation {observation!r} has zero probability under the current belief"
        )

    return {theta: p / total for theta, p in new_belief.items()}


In [None]:
from functools import lru_cache

def simple_trust_aware_policy(belief, human_model, available_actions):
    """
    Full-horizon trust-aware planning with memoization.
    Explores all orderings of the remaining objects until none are left.
    Returns (best_action, best_total_expected_reward); best_action is None
    when available_actions is empty.

    belief: dict {trust level: probability} over TRUST_LEVELS.
    human_model: object exposing prob_stay_put(theta, obj).
    available_actions: iterable of ObjectType (duplicates allowed).

    The memoization caches live inside this call, so each call plans from
    scratch.
    """

    # Convert initial belief to tuple (because dicts aren't hashable)
    init_belief_tuple = tuple(belief[theta] for theta in TRUST_LEVELS)

    # Convert available_actions to a sorted tuple of integers (ObjectType values)
    init_actions_tuple = tuple(sorted(obj.value for obj in available_actions))

    # ---- 1. One-step expected reward ----
    def immediate_expected_reward(belief_state, action_obj):
        # The robot is assumed to succeed whenever the human stays put, so
        # only the success and intervention rewards enter the expectation.
        reward_if_stay = REWARD_TABLE[(action_obj, OutcomeType.SP_SUCCESS)]
        reward_if_intervene = REWARD_TABLE[(action_obj, OutcomeType.IT)]

        expected = 0.0
        for theta in TRUST_LEVELS:
            p_theta = belief_state[theta]
            p_stay = human_model.prob_stay_put(theta, action_obj)
            expected += p_theta * (
                p_stay * reward_if_stay +
                (1 - p_stay) * reward_if_intervene
            )
        return expected

    # ---- 2. Cached wrapper for belief_update ----
    @lru_cache(maxsize=None)
    def cached_belief_update(belief_tuple, action_value, obs_value):
        belief_dict = {theta: belief_tuple[i] for i, theta in enumerate(TRUST_LEVELS)}
        action_obj = ObjectType(action_value)
        obs = OutcomeType(obs_value)
        new_b = belief_update(belief_dict, action_obj, obs, human_model)
        return tuple(new_b[theta] for theta in TRUST_LEVELS)

    # ---- 3. Main recursive value function, fully memoized ----
    @lru_cache(maxsize=None)
    def expected_return(belief_tuple, actions_tuple):
        """
        belief_tuple: tuple of floats (belief over TRUST_LEVELS, in order)
        actions_tuple: sorted tuple of remaining object values (ints)
        """

        # Base case: no objects left
        if not actions_tuple:
            return (None, 0.0)

        # Convert tuple → usable dict
        belief_state = {theta: belief_tuple[i] for i, theta in enumerate(TRUST_LEVELS)}

        best_act = None
        best_val = -float('inf')

        # Try each distinct remaining action
        for idx, action_value in enumerate(actions_tuple):
            # actions_tuple is sorted, so copies of the same object are
            # adjacent. A repeat scores exactly like the first copy and can
            # never win under the strict '>' below, so skip it.
            if idx > 0 and actions_tuple[idx - 1] == action_value:
                continue

            action_obj = ObjectType(action_value)

            # 1-step reward
            immediate = immediate_expected_reward(belief_state, action_obj)

            # Remaining objects after this action; dropping one element from a
            # sorted tuple leaves it sorted, and it does not depend on obs.
            next_actions_tuple = actions_tuple[:idx] + actions_tuple[idx + 1:]

            # Stay-put probability per trust level, shared by both observations
            p_stay_by_theta = {
                t: human_model.prob_stay_put(t, action_obj) for t in TRUST_LEVELS
            }

            # Expected future reward
            future = 0.0
            for obs in (OutcomeType.SP_SUCCESS, OutcomeType.IT):
                # Compute P(obs | belief, action)
                if obs == OutcomeType.SP_SUCCESS:
                    p_obs = sum(
                        belief_state[t] * p_stay_by_theta[t]
                        for t in TRUST_LEVELS
                    )
                else:
                    p_obs = sum(
                        belief_state[t] * (1 - p_stay_by_theta[t])
                        for t in TRUST_LEVELS
                    )

                # An impossible observation contributes nothing and has no
                # well-defined posterior, so skip it.
                if p_obs == 0:
                    continue

                # Updated belief (cached!)
                next_b_tuple = cached_belief_update(belief_tuple, action_value, obs.value)

                # Recursive call (cached!)
                _, v_next = expected_return(next_b_tuple, next_actions_tuple)

                future += p_obs * v_next

            total = immediate + future

            if total > best_val:
                best_val = total
                best_act = action_obj

        return (best_act, best_val)

    # ---- Call planner ----
    return expected_return(init_belief_tuple, init_actions_tuple)


In [None]:
def simulate_task(human_model, initial_belief, alpha=0.3, beta=0.0, sigma=0.5, initial_theta=None):
    """
    Simulate a full table-clearing task with DYNAMIC trust.

    true_theta_t evolves according to belief_dist(theta_t, action, observation).

    Args:
        human_model: model used both by the planner and to sample the human's action.
        initial_belief: dict {trust level: weight} over TRUST_LEVELS. It is
            normalised here, so weights that do not sum to 1 are accepted.
        alpha, beta, sigma: NOTE(review): accepted but currently unused --
            belief_dist is called with its own defaults. Confirm whether these
            were meant to drive the true trust dynamics.
        initial_theta: true starting trust level; sampled from the normalised
            prior when None.

    Returns:
        (total_reward, history), where history holds one
        (action, observation, belief_after_update, true_theta_after_transition)
        tuple per step.

    Raises:
        ValueError: if initial_belief has no positive mass on TRUST_LEVELS.
    """
    # Belief over trust (robot's internal state), normalised and ordered by
    # TRUST_LEVELS so it is a valid distribution for planning and sampling.
    prior_mass = sum(initial_belief[theta] for theta in TRUST_LEVELS)
    if not prior_mass > 0:
        raise ValueError("initial_belief must put positive total weight on TRUST_LEVELS")
    belief = {theta: initial_belief[theta] / prior_mass for theta in TRUST_LEVELS}

    total_reward = 0.0
    history = []

    # Hidden true trust state
    if initial_theta is None:
        # sample initial trust from prior belief
        true_theta = np.random.choice(
            TRUST_LEVELS,
            p=[belief[theta] for theta in TRUST_LEVELS]
        )
    else:
        true_theta = initial_theta

    remaining_objects = [
        ObjectType.BOTTLE, ObjectType.BOTTLE, ObjectType.BOTTLE,
        ObjectType.CAN, ObjectType.GLASS
    ]

    while remaining_objects:
        # Robot chooses action from its belief (the planner's value estimate is not needed here)
        action, _ = simple_trust_aware_policy(belief, human_model, remaining_objects)

        # --- ENVIRONMENT STEP (uses true_theta_t) ---
        # Human decides whether to intervene; None means the human stayed put,
        # which is recorded as a robot success.
        observation = human_model.sample_human_action(true_theta, action)
        if observation is None:
            observation = OutcomeType.SP_SUCCESS

        # Reward from outcome
        total_reward += REWARD_TABLE[(action, observation)]

        # --- TRUE TRUST DYNAMICS: theta_{t+1} ---
        trans_probs = belief_dist(true_theta, action, observation)
        true_theta = np.random.choice(
            TRUST_LEVELS,
            p=[trans_probs[theta] for theta in TRUST_LEVELS]
        )

        # --- BELIEF UPDATE (robot's update about trust) ---
        belief = belief_update(belief, action, observation, human_model)

        # Task progress
        remaining_objects.remove(action)

        # record also true_theta for debugging
        history.append((action, observation, belief.copy(), true_theta))

    return total_reward, history


In [None]:
def run_multiple_simulations(human_model, true_theta, initial_belief, n_runs=100):
    """Run simulate_task `n_runs` times, print summary statistics, and return the rewards.

    Each run starts from `initial_belief` with the true trust fixed to
    `true_theta`. Prints the mean ± std reward, how often each object type was
    picked first, and the five most common action sequences.

    Returns:
        List with the total reward of every run, in run order.
    """
    from collections import Counter

    rewards = []
    sequence_tally = Counter()
    first_picks = dict.fromkeys(
        (ObjectType.BOTTLE, ObjectType.CAN, ObjectType.GLASS), 0
    )

    for _ in range(n_runs):
        episode_reward, episode = simulate_task(
            human_model,
            initial_belief,
            initial_theta=true_theta
        )
        rewards.append(episode_reward)

        # Tally the ordered object names handled in this run
        names = tuple(action.name for (action, obs, belief, theta) in episode)
        sequence_tally[names] += 1

        # The first element of the first history entry is the opening action
        first_picks[episode[0][0]] += 1

    print(f"Average reward: {np.mean(rewards):.2f} ± {np.std(rewards):.2f}")

    print(f"\nFirst action distribution (out of {n_runs} runs):")
    for obj in first_picks:
        count = first_picks[obj]
        print(f"  {obj.name}: {count} times ({100*count/n_runs:.1f}%)")

    print("\nMost common action sequences:")
    for seq, count in sequence_tally.most_common(5):
        print(f"  {' → '.join(seq)}: {count} times")

    return rewards


In [None]:
# Initialize human model with hand-picked gamma and eta values
human_model = HumanDecisionModel()

# gamma: coefficient on the trust level inside success_belief (larger = steeper)
human_model.gamma = {
    ObjectType.BOTTLE: 1,     # moderate slope
    ObjectType.CAN:    1.2,   # steeper slope
    ObjectType.GLASS:  1.5,   # steepest slope
}

# eta: additive offset inside success_belief (more negative = more trust needed)
human_model.eta = {
    ObjectType.BOTTLE: 0,     # P(stay put) is above 0.5 at every trust level
    ObjectType.CAN:   -2.5,   # P(stay put) crosses 0.5 between θ=2 and θ=3
    ObjectType.GLASS: -5,     # P(stay put) crosses 0.5 between θ=4 and θ=5
}

# Prior weights over trust levels, peaked at the middle level. As written they
# sum to 1.115, so they are rescaled below into a proper distribution.
initial_belief_weights = {
    1: 0.0625,
    2: 0.125,
    3: 0.1825,
    4: 0.375,
    5: 0.1825,
    6: 0.125,
    7: 0.0625
}

weight_total = sum(initial_belief_weights.values())
initial_belief = {
    theta: weight / weight_total
    for theta, weight in initial_belief_weights.items()
}

run_multiple_simulations(
    human_model,
    true_theta=4,
    initial_belief=initial_belief,
    n_runs=1
)

Average reward: 8.00 ± 0.00

First action distribution (out of 1 runs):
  BOTTLE: 1 times (100.0%)
  CAN: 0 times (0.0%)
  GLASS: 0 times (0.0%)

Most common action sequences:
  BOTTLE → BOTTLE → BOTTLE → CAN → GLASS: 1 times


[8.0]

In [None]:
# Tabulate P(stay put) per trust level; columns are theta, BOTTLE, CAN, GLASS.
for theta in TRUST_LEVELS:
    stay_probs = [
        human_model.prob_stay_put(theta, obj)
        for obj in (ObjectType.BOTTLE, ObjectType.CAN, ObjectType.GLASS)
    ]
    print(theta, *stay_probs)

1 0.6750375273768237 0.06209344479271049 0.0001754022502924929
2 0.7069873680001046 0.2405118927418583 0.0005156508238289689
3 0.7216325609518421 0.6228260157313329 0.011323220046987948
4 0.7275076135036415 0.8104999872916171 0.44341912568990316
5 0.729740651093834 0.8610605518001789 0.889898215476072
6 0.7305721537541839 0.8750011002019935 0.9418141344568051
7 0.7308794173912335 0.8790652768279943 0.9503182066279046
