In [7]:
import numpy as np
from functools import lru_cache
from typing import Dict, List, Tuple, Any
import time

# Paste the entire TwoObjectDPSolver class definition here
class TwoObjectDPSolver:
    """
    Exact dynamic-programming solver for the two-object search problem.

    A state is ``(z, theta2)``:

    * ``z`` is a tuple holding the number of looks spent in each cell so far,
      so ``sum(z) == t`` at stage ``t``;
    * ``theta2`` is 0 while O2 has not been detected and ``k + 1`` once O2 has
      been detected in cell ``k``.

    Backward recursion from ``t = T`` down to ``t = 0`` fills ``J_values`` and
    ``Policy``. Every look in cell ``a`` costs ``c[a]`` and earns 1.0 when it
    detects the primary target O1, so the stored values are expected
    detection reward minus expected look costs (they can be negative).
    """

    def __init__(self, n: int, T: int, p0: np.ndarray, gamma1: np.ndarray, gamma2: np.ndarray, c: np.ndarray):
        """
        Initializes the solver with the problem parameters.

        Array arguments may be any array-like; they are converted to float
        numpy arrays.

        Args:
            n (int): The number of cells.
            T (int): The time horizon (number of looks).
            p0 (np.ndarray): The n x n prior joint matrix, indexed
                ``[O1 cell, O2 cell]``. It does not have to be normalised.
            gamma1 (np.ndarray): Per-cell miss-detection rates for O1.
            gamma2 (np.ndarray): Per-cell miss-detection rates for O2.
            c (np.ndarray): Per-cell cost of one look.

        Raises:
            ValueError: If ``n < 1``, ``T < 0``, or an array does not have
                the shape implied by ``n``.
        """
        if n < 1:
            raise ValueError(f"n must be at least 1, got {n}")
        if T < 0:
            raise ValueError(f"T must be non-negative, got {T}")

        self.n = n
        self.T = T
        self.p0 = np.asarray(p0, dtype=float)
        self.gamma1 = np.asarray(gamma1, dtype=float)
        self.gamma2 = np.asarray(gamma2, dtype=float)
        self.c = np.asarray(c, dtype=float)

        if self.p0.shape != (n, n):
            raise ValueError(f"p0 must have shape ({n}, {n}), got {self.p0.shape}")
        for label, vector in (("gamma1", self.gamma1), ("gamma2", self.gamma2), ("c", self.c)):
            if vector.shape != (n,):
                raise ValueError(f"{label} must have shape ({n},), got {vector.shape}")

        # Data structures to store the results
        self.J_values: Dict[int, Dict[Tuple, float]] = {}  # Value function J_t(s_t)
        self.Policy: Dict[int, Dict[Tuple, int]] = {}      # Policy mu_t(s_t)

        # Pre-calculate initial conditional priors for efficiency
        self._p0_conditionals = self._precompute_p0_conditionals()

    def _precompute_p0_conditionals(self) -> List[np.ndarray]:
        """Pre-calculates P(O1=i | O2=k) for all k (one length-n vector per k)."""
        conditionals = []
        for k in range(self.n):
            column = self.p0[:, k]
            marginal_o2 = np.sum(column)
            # If O2 can never be in cell k the conditional is undefined; use zeros.
            conditionals.append(column / marginal_o2 if marginal_o2 > 0 else np.zeros(self.n))
        return conditionals

    @staticmethod
    @lru_cache(maxsize=None)  # Memoization for performance
    def _generate_z_vectors(t: int, n: int) -> List[Tuple[int, ...]]:
        """
        Recursively generates all length-n tuples of non-negative ints summing to t.

        The result is memoised, so the returned list is shared between calls
        and must not be mutated by the caller.

        Raises:
            ValueError: If ``n < 1`` (the recursion would never terminate).
        """
        if n < 1:
            raise ValueError(f"n must be at least 1, got {n}")
        if n == 1:
            return [(t,)]

        vectors = []
        for i in range(t + 1):
            for sub_vector in TwoObjectDPSolver._generate_z_vectors(t - i, n - 1):
                vectors.append((i,) + sub_vector)
        return vectors

    def _calculate_joint_posterior(self, z_vector: np.ndarray) -> np.ndarray:
        """
        Calculates p(O1=i, O2=j | z) while neither object has been detected.

        Each unsuccessful look in a cell multiplies the prior mass of an
        object located there by that object's miss rate. If the total mass
        is zero the un-normalised (all-zero) matrix is returned.
        """
        g1_z = np.power(self.gamma1, z_vector)
        g2_z = np.power(self.gamma2, z_vector)
        numerator = self.p0 * np.outer(g1_z, g2_z)
        norm = np.sum(numerator)
        return numerator / norm if norm > 0 else numerator

    def _calculate_conditional_posterior(self, z_vector: np.ndarray, o2_loc: int) -> np.ndarray:
        """
        Calculates p(O1=i | z, O2=o2_loc).

        If the total mass is zero the un-normalised (all-zero) vector is
        returned.
        """
        numerator = np.power(self.gamma1, z_vector) * self._p0_conditionals[o2_loc]
        norm = np.sum(numerator)
        return numerator / norm if norm > 0 else numerator

    def _value_both_hidden(self, belief: np.ndarray, a_t: int,
                           J_next: Dict[Tuple, float], next_z_tuple: Tuple[int, ...]):
        """Expected value of looking in cell a_t while O2 is still undetected."""
        # Prob of success (finding O1)
        p_success = (1 - self.gamma1[a_t]) * np.sum(belief[a_t, :])

        # Prob of finding O2 only: O2 is in a_t and is detected, while O1 is
        # either elsewhere or also in a_t but missed by this look.
        o1_elsewhere = np.arange(self.n) != a_t
        p_find_o2_only = (1 - self.gamma2[a_t]) * (
            self.gamma1[a_t] * belief[a_t, a_t] + np.sum(belief[o1_elsewhere, a_t])
        )

        # Prob of finding nothing
        p_nothing = 1 - p_success - p_find_o2_only

        return (p_success * 1.0) - self.c[a_t] + \
               (p_nothing * J_next[(next_z_tuple, 0)]) + \
               (p_find_o2_only * J_next[(next_z_tuple, a_t + 1)])

    def _value_o2_found(self, belief: np.ndarray, a_t: int, val_if_fail):
        """Expected value of looking in cell a_t once O2 has been detected."""
        p_success = (1 - self.gamma1[a_t]) * belief[a_t]
        p_fail = 1 - p_success
        return (p_success * 1.0) - self.c[a_t] + (p_fail * val_if_fail)

    def solve(self):
        """
        Executes the backward recursion to solve the DP problem.

        Sets ``J_values[T]`` to 0.0 for every state, then fills
        ``J_values[t]`` and ``Policy[t]`` for ``t = T-1, ..., 0``. At
        ``t = 0`` only the ``theta2 == 0`` state is evaluated. Ties between
        actions go to the lowest cell index.
        """
        n = self.n
        actions = range(n)

        # --- Initialization at T ---
        self.J_values[self.T] = {
            (z_T, theta2): 0.0
            for z_T in self._generate_z_vectors(self.T, n)
            for theta2 in range(n + 1)
        }

        # --- Backward Recursion ---
        for t in range(self.T - 1, -1, -1):
            J_next = self.J_values[t + 1]
            # No look has been made at t = 0, so O2 cannot have been found yet.
            theta2_values = (0,) if t == 0 else range(n + 1)
            J_t = {}
            policy_t = {}

            for z_t_tuple in self._generate_z_vectors(t, n):
                z_t = np.array(z_t_tuple)
                # Look counts after one more look in each cell; they depend
                # only on z, so build them once per z instead of per action.
                successors = [
                    z_t_tuple[:a_t] + (z_t_tuple[a_t] + 1,) + z_t_tuple[a_t + 1:]
                    for a_t in actions
                ]

                for theta2 in theta2_values:
                    if theta2 == 0:
                        # --- Case 1: Both objects hidden ---
                        belief = self._calculate_joint_posterior(z_t)
                        action_values = [
                            self._value_both_hidden(belief, a_t, J_next, successors[a_t])
                            for a_t in actions
                        ]
                    else:
                        # --- Case 2: O2 has been found in cell theta2 - 1 ---
                        belief = self._calculate_conditional_posterior(z_t, theta2 - 1)
                        action_values = [
                            self._value_o2_found(belief, a_t, J_next[(successors[a_t], theta2)])
                            for a_t in actions
                        ]

                    # Find best action and store value/policy
                    best_action = int(np.argmax(action_values))
                    current_state = (z_t_tuple, theta2)
                    J_t[current_state] = action_values[best_action]
                    policy_t[current_state] = best_action

            self.J_values[t] = J_t
            self.Policy[t] = policy_t

    def get_optimal_value(self) -> float:
        """
        Returns the optimal value J(s_0) for the all-zero look vector with O2 hidden.

        ``solve()`` must have been called first; otherwise the lookup raises
        ``KeyError``.
        """
        initial_state = ((0,) * self.n, 0)
        return self.J_values[0][initial_state]


# TwoObjectDPSolver is defined above; the episode simulation below only consumes a precomputed policy table.

# =============================================================================
# 1. Adaptive Re-planning Episode Simulation (Copied from e87hajzv5p50)
# =============================================================================

def run_adaptive_replanning_episode(n: int, T: int, p0_joint: np.ndarray,
                                    gamma1: np.ndarray, gamma2: np.ndarray,
                                    initial_two_obj_policy: Dict, c: np.ndarray, episode_seed: int) -> Tuple[bool, int, float, int]:
    """
    Simulates a single search episode driven by a precomputed two-object policy table.

    The policy is looked up as ``initial_two_obj_policy[t][(z, theta2)]``,
    where ``z`` is the tuple of per-cell look counts and ``theta2`` is 0
    until O2 is detected and ``cell + 1`` afterwards. Nothing is re-solved
    during the episode; adapting to an O2 detection happens purely through
    the ``theta2`` component of the lookup key.

    Args:
        n: Number of cells.
        T: Maximum number of looks.
        p0_joint: n x n array-like prior, indexed ``[O1 cell, O2 cell]``;
            it is normalised before sampling. An all-zero prior means no
            object is present.
        gamma1: Per-cell miss rates for O1.
        gamma2: Per-cell miss rates for O2.
        initial_two_obj_policy: Mapping ``t -> {state: action}``.
        c: Per-cell look costs.
        episode_seed: Seed for the episode's private random generator.

    Returns:
        ``(success, detection_time, accumulated_reward, episode_length)``.
        ``detection_time`` is the 1-based step at which O1 was detected, or
        -1 if it never was. ``episode_length`` counts every step entered,
        including one aborted because the policy had no entry for it.
    """
    # Create a local RNG for this episode for statistical independence
    rng = np.random.default_rng(episode_seed)

    # 1. Secretly determine the true locations
    prior_flat = np.asarray(p0_joint, dtype=float).ravel()
    prior_mass = np.sum(prior_flat)
    if prior_mass == 0:
        true_pos_o1, true_pos_o2 = -1, -1  # No object anywhere
    else:
        drawn_index = rng.choice(n * n, p=prior_flat / prior_mass)
        true_pos_o1, true_pos_o2 = np.unravel_index(drawn_index, (n, n))

    # 2. Initialize state and reward
    look_counts = [0] * n  # plain ints so state keys match the policy's tuples
    theta2 = 0
    accumulated_reward = 0.0
    detection_time = -1
    episode_length = 0

    # 3. Run the search for T steps
    for t in range(T):
        episode_length += 1

        # --- Get action from the policy ---
        state = (tuple(look_counts), theta2)
        stage_policy = initial_two_obj_policy.get(t)
        if stage_policy is None or state not in stage_policy:
            # A complete DP policy covers every reachable state, so this
            # indicates a mismatched policy; stop rather than guess an action.
            print(f"Warning: Policy not found for state {state} at time {t}. Ending episode early.")
            return False, detection_time, accumulated_reward, episode_length

        action = int(stage_policy[state])

        # Deduct cost for the action
        accumulated_reward -= c[action]

        # --- Simulate outcome of the action ---
        # A random number is drawn only when the look is in the object's
        # cell, which keeps the random stream tied to the true positions.
        found_o1 = (action == true_pos_o1) and (rng.random() > gamma1[action])
        if found_o1:
            accumulated_reward += 1.0  # Reward for finding O1
            detection_time = t + 1     # Steps are reported 1-based
            return True, detection_time, accumulated_reward, episode_length

        # --- Check if we found O2 ---
        if theta2 == 0 and action == true_pos_o2 and (rng.random() > gamma2[action]):
            theta2 = action + 1

        # Update state for the next time step
        look_counts[action] += 1

    return False, detection_time, accumulated_reward, episode_length  # O1 not found within T looks


# =============================================================================
# 2. MAIN EXPERIMENT SCRIPT (Modified for multiple seeds)
# =============================================================================

if __name__ == "__main__":
    # ----- Experiment configuration -----
    NUM_CELLS = 5
    TIME_HORIZON = 10
    NUM_SEEDS = 100              # outer repetitions
    NUM_EPISODES_PER_SEED = 100  # episodes simulated inside each repetition

    # ----- Model parameters (shared by every seed) -----
    # Rows index the O1 cell, columns the O2 cell.
    prior = np.array([
        [0.152,  0.0039, 0.003,  0.0108, 0.011],
        [0.0038, 0.0052, 0.117,  0.0162, 0.165],
        [0.0057, 0.195,  0.015,  0.009,  0.011],
        [0.0038, 0.0091, 0.0075, 0.027,  0.011],
        [0.0247, 0.0468, 0.0075, 0.117,  0.022],
    ])
    gammas1 = np.array([0.8, 0.65, 0.82, 0.75, 0.7])
    gammas2 = np.array([0.2, 0.1, 0.25, 0.15, 0.2])
    c = np.array([0.15, 0.2, 0.25, 0.1, 0.2])

    # ----- Accumulators across all seeds -----
    all_success_rates = []     # one entry per seed
    all_detection_times = []   # one entry per successful episode
    all_rewards = []           # one entry per episode
    all_episode_lengths = []   # one entry per episode
    total_experiment_start_time = time.time()

    print("--- Running Empirical Experiment for Adaptive Re-planning Policy over Multiple Seeds ---")
    print(f"Number of seeds: {NUM_SEEDS}")
    print(f"Number of evaluation episodes per seed: {NUM_EPISODES_PER_SEED}")
    print("-" * 20)

    # ----- Solve the DP once; every episode reuses the resulting policy -----
    print("--- Pre-computing the initial Two-Object DP policy ---")
    initial_solver = TwoObjectDPSolver(NUM_CELLS, TIME_HORIZON, prior, gammas1, gammas2, c)
    initial_solver.solve()
    initial_policy = initial_solver.Policy
    print("probility comuted :", initial_solver.J_values[0])
    print("-" * 20)

    for seed in range(NUM_SEEDS):
        print(f"\n--- Running with Seed: {seed} ---")

        rewards_this_seed = []
        episode_lengths_this_seed = []
        detection_times_this_seed = []  # successful episodes only
        start_time_this_seed = time.time()

        # Episode seeds are consecutive integers, unique across the whole
        # experiment, so no two episodes share a random stream.
        first_episode_seed = seed * NUM_EPISODES_PER_SEED
        for i in range(NUM_EPISODES_PER_SEED):
            episode_unique_seed = first_episode_seed + i
            outcome = run_adaptive_replanning_episode(
                NUM_CELLS, TIME_HORIZON, prior, gammas1, gammas2,
                initial_policy, c, episode_unique_seed,
            )
            success, detection_time, reward, episode_length = outcome

            rewards_this_seed.append(reward)
            episode_lengths_this_seed.append(episode_length)
            if success:
                detection_times_this_seed.append(detection_time)

        end_time_this_seed = time.time()

        # Exactly one detection time is recorded per successful episode.
        num_successes = len(detection_times_this_seed)
        success_rate_this_seed = num_successes / NUM_EPISODES_PER_SEED

        all_success_rates.append(success_rate_this_seed)
        all_detection_times.extend(detection_times_this_seed)
        all_rewards.extend(rewards_this_seed)
        all_episode_lengths.extend(episode_lengths_this_seed)

        print(f"--- Seed {seed} Results ---")
        print(f"Success Rate: {success_rate_this_seed:.4f} ({success_rate_this_seed*100:.2f}%)")
        if not detection_times_this_seed:
            print("Average Detection Time (Successful Episodes): N/A (no successes)")
        else:
            print(f"Average Detection Time (Successful Episodes): {np.mean(detection_times_this_seed):.2f}")
        print(f"Average Episode Reward: {np.mean(rewards_this_seed):.4f}")
        print(f"Average Episode Length: {np.mean(episode_lengths_this_seed):.2f}")
        print(f"Time taken for this seed: {end_time_this_seed - start_time_this_seed:.2f} seconds")

    total_experiment_end_time = time.time()

    # ----- Aggregate statistics -----
    # The success-rate spread is taken over per-seed rates; the other
    # statistics pool individual episodes from all seeds.
    mean_success_rate = np.mean(all_success_rates)
    std_success_rate = np.std(all_success_rates)
    if all_detection_times:
        mean_detection_time = np.mean(all_detection_times)
        std_detection_time = np.std(all_detection_times)
    else:
        # Placeholder values when no episode succeeded.
        mean_detection_time = -1
        std_detection_time = -1
    mean_reward = np.mean(all_rewards)
    std_reward = np.std(all_rewards)
    mean_episode_length = np.mean(all_episode_lengths)
    std_episode_length = np.std(all_episode_lengths)

    # ----- Final report -----
    print("\n--- Overall Adaptive Re-planning Policy Results ---")
    print(f"Total experiment time across {NUM_SEEDS} seeds: {total_experiment_end_time - total_experiment_start_time:.2f} seconds")
    print(f"Average Success Rate over {NUM_SEEDS} seeds and {NUM_EPISODES_PER_SEED} episodes each: {mean_success_rate:.4f} ({mean_success_rate*100:.2f}%)")
    print(f"Standard Deviation of Success Rate: {std_success_rate:.4f}")
    if not all_detection_times:
        print("Average Detection Time (Successful Episodes) across all seeds: N/A (no successes)")
    else:
        print(f"Average Detection Time (Successful Episodes) across all seeds: {mean_detection_time:.2f}")
        print(f"Standard Deviation of Detection Time (Successful Episodes) across all seeds: {std_detection_time:.2f}")

    print(f"Average Episode Reward across all seeds: {mean_reward:.4f}")
    print(f"Standard Deviation of Reward: {std_reward:.4f}")
    print(f"Average Episode Length across all seeds: {mean_episode_length:.2f}")
    print(f"Standard Deviation of Episode Length: {std_episode_length:.2f}")


--- Running Empirical Experiment for Adaptive Re-planning Policy over Multiple Seeds ---
Number of seeds: 100
Number of evaluation episodes per seed: 100
--------------------
--- Pre-computing the initial Two-Object DP policy ---
probility comuted : {((0, 0, 0, 0, 0), 0): np.float64(-0.46008379604604555)}
--------------------

--- Running with Seed: 0 ---
--- Seed 0 Results ---
Success Rate: 0.5900 (59.00%)
Average Detection Time (Successful Episodes): 3.92
Average Episode Reward: -0.5120
Average Episode Length: 6.41
Time taken for this seed: 0.02 seconds

--- Running with Seed: 1 ---
--- Seed 1 Results ---
Success Rate: 0.5700 (57.00%)
Average Detection Time (Successful Episodes): 4.54
Average Episode Reward: -0.5820
Average Episode Length: 6.89
Time taken for this seed: 0.01 seconds

--- Running with Seed: 2 ---
--- Seed 2 Results ---
Success Rate: 0.5900 (59.00%)
Average Detection Time (Successful Episodes): 4.03
Average Episode Reward: -0.5215
Average Episode Length: 6.48
Time take