In [None]:
import numpy as np
from enum import Enum, auto
from dataclasses import dataclass
from typing import Dict, Tuple, List

In [None]:
# helpful enums

# object types - bottle, can, or glass
class ObjectType(Enum):
    """Kinds of object the robot can be asked to clear from the table.

    The integer values are relied on elsewhere in this notebook:
    simple_trust_aware_policy sorts actions by ``.value`` and rebuilds
    members with ``ObjectType(value)``.
    """
    BOTTLE = 1
    CAN = 2
    GLASS = 3

# outcomes
class OutcomeType(Enum):
    """Possible results of the robot attempting one object.

    The ``SP_`` prefix presumably stands for "stay put" (the human does not
    intervene) -- inferred from the comments on HumanDecisionModel.
    """
    SP_SUCCESS = 0 # robot succeeds
    SP_FAIL = 1 # robot fails
    IT = 2 # human intervenes

# Discrete trust levels theta = 1 .. 7 (inclusive), in ascending order.
TRUST_LEVELS = list(range(1, 8))

In [None]:
# reward table

# Reward received for each (object, outcome) pair.
# For every object: success pays a positive reward, an intervention pays 0,
# and a robot failure costs the most for GLASS, less for CAN, nothing for BOTTLE.
REWARD_TABLE: Dict[Tuple[ObjectType, OutcomeType], float] = {
    (ObjectType.BOTTLE, OutcomeType.SP_SUCCESS): 1.0,
    (ObjectType.BOTTLE, OutcomeType.SP_FAIL):    0.0,
    (ObjectType.BOTTLE, OutcomeType.IT):         0.0,

    (ObjectType.CAN, OutcomeType.SP_SUCCESS):    2.0,
    (ObjectType.CAN, OutcomeType.SP_FAIL):      -4.0,
    (ObjectType.CAN, OutcomeType.IT):            0.0,

    (ObjectType.GLASS, OutcomeType.SP_SUCCESS):  3.0,
    (ObjectType.GLASS, OutcomeType.SP_FAIL):    -12.0,
    (ObjectType.GLASS, OutcomeType.IT):          0.0,
}

In [None]:
# Intrinsic robot failure rates for each object type
ROBOT_FAIL_RATE: Dict[ObjectType, float] = {
    # Probability in [0, 1] that the robot's own attempt fails.
    # Only GLASS has a non-zero failure rate.
    ObjectType.BOTTLE: 0.00,
    ObjectType.CAN:    0.00,
    ObjectType.GLASS:  0.25,
}


In [None]:
def sigmoid(x):
  """Logistic function 1 / (1 + exp(-x)); accepts anything np.exp accepts."""
  denominator = 1 + np.exp(-x)
  return 1 / denominator

class HumanDecisionModel:
  """
  Model of whether the human lets the robot handle an object or intervenes.

  Attributes:
    gamma: per-object slope of the success-belief sigmoid.
    eta:   per-object offset of the success-belief sigmoid.

  Both attributes may be passed to the constructor or assigned afterwards;
  every method below requires them to be set for the object it is asked about.
  """

  gamma: Dict[ObjectType, float]
  eta: Dict[ObjectType, float]

  def __init__(self, gamma=None, eta=None):
    """
    Args:
      gamma: optional mapping ObjectType -> slope. Copied, so later changes
        to the caller's dict do not leak into the model.
      eta: optional mapping ObjectType -> offset. Copied likewise.

    Arguments left as None are not bound, which keeps the existing
    "construct, then assign model.gamma / model.eta" workflow unchanged.
    """
    if gamma is not None:
      self.gamma = dict(gamma)
    if eta is not None:
      self.eta = dict(eta)

  # does the human think the robot will succeed?
  def success_belief(self, theta, obj):
    """Human's believed probability of robot success: sigmoid(gamma[obj] * theta + eta[obj])."""
    slope = self.gamma[obj]
    offset = self.eta[obj]

    return sigmoid(slope * theta + offset)

  # what is the probability the human will stay put / not intervene?
  def prob_stay_put(self, theta, obj):
    """
    Probability that the human does not intervene.

    This is the sigmoid of the reward the human expects from staying put,
    i.e. belief * reward(success) + (1 - belief) * reward(failure).
    """
    b_tj = self.success_belief(theta, obj)
    rS = REWARD_TABLE[(obj, OutcomeType.SP_SUCCESS)]
    rF = REWARD_TABLE[(obj, OutcomeType.SP_FAIL)]

    return sigmoid(b_tj * rS + (1 - b_tj) * rF)

  def sample_human_action(self, theta, obj):
    """
    Draw the human's decision for one step.

    Returns:
      None if the human stays put, OutcomeType.IT if the human intervenes.
    """
    p_stay = self.prob_stay_put(theta, obj)
    if np.random.rand() < p_stay:
      return None
    return OutcomeType.IT

In [None]:
# Module-level model read by belief() below. No gamma/eta are assigned here,
# so prob_stay_put cannot be evaluated on this instance as-is; a later cell
# rebinds `human_model` and assigns both attributes.
human_model = HumanDecisionModel()

def belief(state_theta, action_obj, observation):
    """
    Probability of `observation` given trust level `state_theta` and the
    object `action_obj`, using the module-level `human_model`.

    - SP_SUCCESS: P(stay put) * (1 - robot failure rate)
    - SP_FAIL:    P(stay put) * robot failure rate
    - anything else is treated as an intervention: 1 - P(stay put)
    """
    stay = human_model.prob_stay_put(state_theta, action_obj)
    failure_rate = ROBOT_FAIL_RATE[action_obj]

    if observation == OutcomeType.SP_FAIL:
        return stay * failure_rate
    if observation == OutcomeType.SP_SUCCESS:
        return stay * (1 - failure_rate)
    return 1 - stay



In [None]:
from scipy.stats import norm

def belief_dist(theta_t, action, observation, theta_next=None, alpha=1.0, beta=0.2, epsilon=1.0, sigma=1):
    """
    Discretised trust transition P(θ_{t+1} | θ_t, a_t, o_t).

    The next trust level is modelled as a normal distribution with standard
    deviation `sigma`, binned into unit-width intervals centred on each value
    in TRUST_LEVELS and renormalised over those bins. The mean depends on the
    observation:

    - SP_SUCCESS:               θ_t + alpha
    - SP_FAIL:                  θ_t - epsilon
    - anything else (i.e. IT):  θ_t - beta

    Args:
        theta_t: current trust level.
        action: unused by the current dynamics; kept for the
            (state, action, observation) call signature.
        observation: OutcomeType observed at this step.
        theta_next: if given, return only the probability of this level.
        alpha, beta, epsilon: mean shifts described above.
        sigma: standard deviation of the transition noise.

    Returns:
        dict {trust level: probability}, or a single probability when
        `theta_next` is given.
    """
    if observation == OutcomeType.SP_SUCCESS:
        mean = theta_t + alpha
    elif observation == OutcomeType.SP_FAIL:
        mean = theta_t - epsilon
    else:
        mean = theta_t - beta

    probs = {}
    for theta in TRUST_LEVELS:
        lower = theta - 0.5
        upper = theta + 0.5
        probs[theta] = norm.cdf(upper, loc=mean, scale=sigma) - norm.cdf(lower, loc=mean, scale=sigma)

    total = sum(probs.values())
    if total == 0:
        # Every bin underflowed (tiny sigma with the mean outside the grid).
        # The limiting distribution is a point mass on the closest level, so
        # return that instead of dividing by zero and producing NaNs.
        nearest = min(TRUST_LEVELS, key=lambda level: abs(level - mean))
        probs = {theta: (1.0 if theta == nearest else 0.0) for theta in TRUST_LEVELS}
    else:
        probs = {theta: p / total for theta, p in probs.items()}

    if theta_next is not None:
        return probs[theta_next]
    return probs


In [None]:
def belief_update(belief, action, observation, human_model):
    """
    Bayesian update of the belief over trust after one (action, observation).

    The observation is produced while the human is still at the *current*
    trust level (the human decides, the robot acts, and only then does trust
    move), so the likelihood is evaluated at θ_t and the transition is applied
    afterwards:

        b'(θ') ∝ Σ_θ  b(θ) · P(o | θ, a) · P(θ' | θ, a, o)

    Args:
        belief: dict {trust level: probability} over TRUST_LEVELS.
        action: ObjectType the robot acted on.
        observation: OutcomeType that was observed.
        human_model: object exposing prob_stay_put(theta, obj).

    Returns:
        New normalised belief as a dict {trust level: probability}.

    Raises:
        ValueError: if `observation` is not an OutcomeType handled here, or if
            it has zero probability under `belief` (no posterior exists).
    """
    if observation not in (OutcomeType.SP_SUCCESS, OutcomeType.SP_FAIL, OutcomeType.IT):
        raise ValueError("Unknown observation")

    p_fail = ROBOT_FAIL_RATE[action]
    p_succ = 1 - p_fail

    new_belief = {theta_next: 0.0 for theta_next in TRUST_LEVELS}
    for theta_curr in TRUST_LEVELS:
        # Likelihood of the observation at the current trust level.
        p_stay = human_model.prob_stay_put(theta_curr, action)
        if observation == OutcomeType.SP_SUCCESS:
            obs_prob = p_stay * p_succ
        elif observation == OutcomeType.SP_FAIL:
            obs_prob = p_stay * p_fail
        else:
            obs_prob = 1 - p_stay

        weight = belief[theta_curr] * obs_prob
        if weight == 0:
            continue  # contributes nothing; skip the transition computation

        # One transition row per current level (not one per (curr, next) pair).
        transition = belief_dist(theta_curr, action, observation)
        for theta_next in TRUST_LEVELS:
            new_belief[theta_next] += weight * transition[theta_next]

    # normalize
    total = sum(new_belief.values())
    if not total > 0:
        raise ValueError("Observation has zero probability under the current belief")

    return {theta: p / total for theta, p in new_belief.items()}


In [None]:
from functools import lru_cache

def simple_trust_aware_policy(belief, human_model, available_actions):
    """
    Full-horizon trust-aware planning with memoization.

    Explores every order in which the remaining objects can be attempted and
    every outcome of each attempt, propagating the trust belief with
    `belief_update`, until no objects remain.

    Args:
        belief: dict {trust level: weight} over TRUST_LEVELS. Weights that do
            not sum to 1 are rescaled first so that the returned value is a
            true expectation (the chosen action is unaffected by the scale).
        human_model: object exposing prob_stay_put(theta, obj).
        available_actions: iterable of ObjectType still on the table;
            duplicates are allowed.

    Returns:
        (best_action, best_total_expected_reward); (None, 0.0) when
        `available_actions` is empty.
    """
    weights = [belief[theta] for theta in TRUST_LEVELS]
    mass = sum(weights)
    if mass > 0 and abs(mass - 1.0) > 1e-9:
        weights = [w / mass for w in weights]

    init_belief_tuple = tuple(weights)
    init_actions_tuple = tuple(sorted(obj.value for obj in available_actions))

    # Fixed outcome order; outcome_probs() returns probabilities aligned with it.
    outcomes = (OutcomeType.SP_SUCCESS, OutcomeType.SP_FAIL, OutcomeType.IT)

    @lru_cache(maxsize=None)
    def outcome_probs(theta, action_value):
        """P(outcome | theta, action) for each entry of `outcomes`."""
        action_obj = ObjectType(action_value)
        p_stay = human_model.prob_stay_put(theta, action_obj)
        fail_rate = ROBOT_FAIL_RATE[action_obj]
        return (p_stay * (1 - fail_rate), p_stay * fail_rate, 1 - p_stay)

    @lru_cache(maxsize=None)
    def cached_belief_update(belief_tuple, action_value, obs_value):
        """Hashable wrapper around belief_update (tuples in, tuple out)."""
        belief_dict = dict(zip(TRUST_LEVELS, belief_tuple))
        new_b = belief_update(
            belief_dict, ObjectType(action_value), OutcomeType(obs_value), human_model
        )
        return tuple(new_b[theta] for theta in TRUST_LEVELS)

    @lru_cache(maxsize=None)
    def expected_return(belief_tuple, actions_tuple):
        """Best (action, expected total reward) from this belief and action multiset."""
        if not actions_tuple:
            return (None, 0.0)

        best_act = ObjectType(actions_tuple[0])
        best_val = -float('inf')

        # Identical objects lead to identical sub-problems, so evaluate each
        # distinct object type once. dict.fromkeys keeps the sorted order, so
        # ties are still broken in favour of the lowest-valued type.
        for action_value in dict.fromkeys(actions_tuple):
            action_obj = ObjectType(action_value)

            # Removing one element from a sorted tuple leaves it sorted.
            remaining = list(actions_tuple)
            remaining.remove(action_value)
            next_actions_tuple = tuple(remaining)

            total = 0.0
            for k, obs in enumerate(outcomes):
                p_obs = sum(
                    p_theta * outcome_probs(theta, action_value)[k]
                    for theta, p_theta in zip(TRUST_LEVELS, belief_tuple)
                )

                # Impossible outcome: contributes nothing and has no posterior.
                if p_obs == 0.0:
                    continue

                next_b_tuple = cached_belief_update(belief_tuple, action_value, obs.value)
                _, v_next = expected_return(next_b_tuple, next_actions_tuple)

                # Immediate reward plus value-to-go, weighted by P(outcome).
                total += p_obs * (REWARD_TABLE[(action_obj, obs)] + v_next)

            if total > best_val:
                best_val = total
                best_act = action_obj

        return (best_act, best_val)

    return expected_return(init_belief_tuple, init_actions_tuple)


In [None]:
def simulate_task(human_model, initial_belief, initial_theta=None):
    """
    Simulate a full table-clearing task with DYNAMIC trust.
    true_theta_t evolves according to belief_dist(theta_t, action, observation).

    Each step: the robot plans from its belief, the human either intervenes or
    stays put (based on the hidden true trust), and only if the human stays
    put does the robot attempt the object and succeed or fail. This matches
    the outcome model used by the planner and by belief_update:
    P(IT) = 1 - p_stay, P(SP_FAIL) = p_stay * fail_rate,
    P(SP_SUCCESS) = p_stay * (1 - fail_rate).

    Args:
        human_model: object exposing prob_stay_put / sample_human_action.
        initial_belief: dict {trust level: weight} over TRUST_LEVELS; rescaled
            to sum to 1 before use.
        initial_theta: hidden true trust at the start; sampled from the
            (normalised) initial belief when None.

    Returns:
        (total_reward, history) where history holds one
        (action, observation, belief_after_update, true_theta_after_step)
        tuple per step.

    Raises:
        ValueError: if the initial belief has no positive total weight.
    """
    mass = sum(initial_belief[theta] for theta in TRUST_LEVELS)
    if not mass > 0:
        raise ValueError("initial_belief must have positive total weight")

    # Work on a normalised copy keyed in TRUST_LEVELS order.
    belief = {theta: initial_belief[theta] / mass for theta in TRUST_LEVELS}
    total_reward = 0.0
    history = []

    # Hidden true trust state
    if initial_theta is None:
        true_theta = np.random.choice(
            TRUST_LEVELS,
            p=[belief[theta] for theta in TRUST_LEVELS]
        )
    else:
        true_theta = initial_theta

    remaining_objects = [
        ObjectType.BOTTLE, ObjectType.BOTTLE, ObjectType.BOTTLE,
        ObjectType.CAN, ObjectType.GLASS
    ]

    while remaining_objects:
        # 1) Robot chooses action from its *belief*
        action, _ = simple_trust_aware_policy(
            belief, human_model, remaining_objects
        )

        # 2) Environment step: human decides first; the robot's intrinsic
        #    failure can only occur when the human lets it act.
        human_obs = human_model.sample_human_action(true_theta, action)
        if human_obs is not None:
            observation = OutcomeType.IT
        elif np.random.rand() < ROBOT_FAIL_RATE[action]:
            observation = OutcomeType.SP_FAIL
        else:
            observation = OutcomeType.SP_SUCCESS

        # 3) Reward
        total_reward += REWARD_TABLE[(action, observation)]

        # 4) True trust dynamics
        trans_probs = belief_dist(true_theta, action, observation)
        true_theta = np.random.choice(
            TRUST_LEVELS,
            p=[trans_probs[theta] for theta in TRUST_LEVELS]
        )

        # 5) Robot's belief update
        belief = belief_update(belief, action, observation, human_model)

        # 6) Remove object and log
        remaining_objects.remove(action)
        history.append((action, observation, belief.copy(), true_theta))

    return total_reward, history


In [None]:
def run_multiple_simulations(human_model, true_theta, initial_belief, n_runs=100):
    """
    Run `simulate_task` `n_runs` times from the same starting trust and print
    a summary: mean and standard deviation of the total reward, how often each
    object type was attempted first, and the five most frequent action orders.

    Returns:
        list of total rewards, one entry per run.
    """
    from collections import Counter

    rewards = []
    orderings = []
    first_picks = {ObjectType.BOTTLE: 0, ObjectType.CAN: 0, ObjectType.GLASS: 0}

    for _run in range(n_runs):
        episode_reward, episode_log = simulate_task(
            human_model, initial_belief, initial_theta=true_theta
        )
        rewards.append(episode_reward)
        orderings.append(
            tuple(picked.name for (picked, _obs, _belief, _theta) in episode_log)
        )
        first_picks[episode_log[0][0]] += 1

    print(f"Average reward: {np.mean(rewards):.2f} ± {np.std(rewards):.2f}")

    print(f"\nFirst action distribution (out of {n_runs} runs):")
    for obj, count in first_picks.items():
        print(f"  {obj.name}: {count} times ({100*count/n_runs:.1f}%)")

    print(f"\nMost common action sequences:")
    for seq, count in Counter(orderings).most_common(5):
        print(f"  {' → '.join(seq)}: {count} times")

    return rewards


In [None]:
# Initialize the human model. The human's belief that the robot will succeed
# on an object is sigmoid(gamma * theta + eta), so gamma is the slope and
# -eta / gamma is the trust level at which that belief crosses 0.5.
human_model = HumanDecisionModel()

human_model.gamma = {
    ObjectType.BOTTLE: 1,     # gentlest slope
    ObjectType.CAN:    1.2,
    ObjectType.GLASS:  1.5,   # steepest slope
}

human_model.eta = {
    ObjectType.BOTTLE: 0,     # belief = 0.5 at theta = 0
    ObjectType.CAN:   -2.5,   # belief = 0.5 at theta ~ 2.1
    ObjectType.GLASS: -5,     # belief = 0.5 at theta ~ 3.3
}

# Prior weights over trust levels, peaked at theta = 4. As written they add
# up to 1.115, so they are rescaled below into a proper distribution.
initial_belief_weights = {
    1: 0.0625,
    2: 0.125,
    3: 0.1825,
    4: 0.375,
    5: 0.1825,
    6: 0.125,
    7: 0.0625
}

initial_belief_mass = sum(initial_belief_weights.values())
initial_belief = {
    theta: weight / initial_belief_mass
    for theta, weight in initial_belief_weights.items()
}
assert abs(sum(initial_belief.values()) - 1.0) < 1e-9

available = [
    ObjectType.BOTTLE, ObjectType.BOTTLE, ObjectType.BOTTLE,
    ObjectType.CAN, ObjectType.GLASS
]

# best_act, best_val = simple_trust_aware_policy(initial_belief, human_model, available)
# print("Best first action:", best_act, "with value", best_val)

# Seed the global NumPy RNG so the simulated outcomes below are reproducible.
SEED = 0
np.random.seed(SEED)

run_multiple_simulations(
    human_model,
    true_theta=4,
    initial_belief=initial_belief,
    n_runs=1
)

Average reward: 1.00 ± 0.00

First action distribution (out of 1 runs):
  BOTTLE: 0 times (0.0%)
  CAN: 0 times (0.0%)
  GLASS: 1 times (100.0%)

Most common action sequences:
  GLASS → BOTTLE → BOTTLE → BOTTLE → CAN: 1 times


[1.0]

In [None]:
# One row per trust level: theta, then P(stay put) for BOTTLE, CAN and GLASS.
for theta in TRUST_LEVELS:
    print(
        theta,
        *(
            human_model.prob_stay_put(theta, obj)
            for obj in (ObjectType.BOTTLE, ObjectType.CAN, ObjectType.GLASS)
        )
    )

1 0.6750375273768237 0.06209344479271049 9.537052563699948e-06
2 0.7069873680001046 0.2405118927418583 3.67272013022538e-05
3 0.7216325609518421 0.6228260157313329 0.00176667116371655
4 0.7275076135036415 0.8104999872916172 0.26228588073449527
5 0.729740651093834 0.8610605518001789 0.8655447095136148
6 0.7305721537541839 0.8750011002019935 0.9387857253587264
7 0.7308794173912335 0.8790652768279943 0.9497385299392656
