In [1]:
import numpy as np
from typing import Tuple, Any
import time

# Helper function definitions (copied from cell CNgLojO6pzq_ so that this cell is self-contained).
# Note: the helpers are redefined below, so this cell runs on its own and does not
# rely on an earlier cell having been executed first.

def calculate_joint_posterior(z_vector: np.ndarray, initial_prior_joint: np.ndarray,
                              gamma1: np.ndarray, gamma2: np.ndarray) -> np.ndarray:
    """
    Return the joint posterior belief p(O1=i, O2=j | z) built from the initial joint prior.

    Entry (i, j) of the prior is weighted by gamma1[i]**z[i] * gamma2[j]**z[j] and the
    weighted matrix is then normalised by its total. If that total is not strictly
    positive, the unnormalised weighted matrix is returned unchanged.
    """
    o1_factor = np.power(gamma1, z_vector)
    o2_factor = np.power(gamma2, z_vector)
    weighted = initial_prior_joint * np.outer(o1_factor, o2_factor)

    total = np.sum(weighted)
    if total > 0:
        return weighted / total
    # Nothing to normalise by: hand back the raw weights instead of dividing by zero.
    return weighted

def calculate_conditional_posterior(z_vector: np.ndarray, o2_loc: int,
                                    precomputed_conditionals: list, gamma1: np.ndarray) -> np.ndarray:
    """
    Return the conditional posterior belief p(O1=i | z, O2=k) for k = o2_loc.

    The precomputed conditional prior stored at index ``o2_loc`` is weighted
    element-wise by gamma1**z_vector and normalised by its total. If that total is
    not strictly positive, the unnormalised weights are returned unchanged.
    """
    prior_given_o2 = precomputed_conditionals[o2_loc]
    weighted = np.power(gamma1, z_vector) * prior_given_o2

    total = np.sum(weighted)
    if total > 0:
        return weighted / total
    # Nothing to normalise by: hand back the raw weights instead of dividing by zero.
    return weighted

def precompute_p0_conditionals(n: int, p0_joint: np.ndarray) -> list:
    """
    Pre-calculate P(O1=i | O2=k) for every k in range(n) from a joint prior matrix.

    Returns a list of length n whose k-th entry is column k of ``p0_joint`` divided
    by that column's sum, or an all-zero vector of length n when the column sum is
    not strictly positive. Raises ValueError when ``p0_joint`` is not 2-dimensional.
    """
    if p0_joint.ndim != 2: # Should be (n,n) for joint prior
        raise ValueError("p0_joint must be a 2D joint probability matrix for precomputing conditionals.")

    def _conditional_given_o2(k):
        # Column k holds the joint mass of every O1 cell paired with O2 = k.
        column = p0_joint[:, k]
        column_mass = np.sum(column)
        if column_mass > 0:
            return column / column_mass
        return np.zeros(n)

    return [_conditional_given_o2(k) for k in range(n)]


def run_greedy_episode(n: int, T: int, p0_joint: np.ndarray,
                           gamma1: np.ndarray, gamma2: np.ndarray, c: np.ndarray, episode_seed: int) -> Tuple[bool, int, float, int]:
    """
    Simulates a single episode using a purely greedy policy, which always selects the action
    maximizing immediate expected reward based on the current belief state.

    Parameters:
        n: number of cells; ``p0_joint`` is treated as an (n, n) matrix over (O1 cell, O2 cell).
        T: maximum number of search steps in the episode.
        p0_joint: joint prior over the cells of O1 (rows) and O2 (columns); it is normalised
            before the true positions are sampled.
        gamma1: per-cell probability that a search of the cell misses O1 when O1 is there.
        gamma2: per-cell probability that a search of the cell misses O2 when O2 is there.
        c: per-cell search cost, deducted from the reward every time the cell is searched.
        episode_seed: seed of the episode-local random number generator.

    Returns:
        (success, detection_time, accumulated_reward, episode_length) where
        ``detection_time`` is the 1-based step at which O1 was found (or -1 if it was not),
        ``accumulated_reward`` is 1.0 on success minus all search costs paid, and
        ``episode_length`` is the number of steps actually executed.
    """
    # 1. Episode-local random number generator (no global NumPy random state involved)
    rng = np.random.default_rng(episode_seed)

    # 2. Secretly determine the true locations of O1 and O2
    flat_p0 = p0_joint.flatten()
    prior_mass = np.sum(flat_p0)
    if prior_mass == 0: # Prior is all zeros: no target can ever be found
        true_pos_o1, true_pos_o2 = -1, -1
    else:
        choice_index = rng.choice(n * n, p=flat_p0 / prior_mass) # Normalize
        true_pos_o1, true_pos_o2 = np.unravel_index(choice_index, (n, n))

    # 3. Initialize state variables
    z_vector = np.zeros(n, dtype=int) # Number of searches performed so far in each cell
    theta2 = 0 # 0=hidden, >0=found at cell (theta2-1)

    # 4. Precompute the conditional priors from p0_joint (used once O2 is found)
    p0_conditionals = precompute_p0_conditionals(n, p0_joint)

    # 5. Loop-invariant pieces of the immediate expected reward
    detect_prob_o1 = 1 - np.asarray(gamma1)
    search_cost = np.asarray(c)

    accumulated_reward = 0.0
    episode_length = 0

    # 6. Loop for T steps
    for t in range(T):
        episode_length += 1

        # 7. Belief about O1's cell given the searches recorded in z_vector.
        #    It is recomputed from the prior every step, so no belief state is carried over.
        if theta2 == 0: # O2 is hidden: marginalise the joint posterior over O2
            joint_posterior = calculate_joint_posterior(z_vector, p0_joint, gamma1, gamma2)
            p_o1 = np.sum(joint_posterior, axis=1)
        else: # O2 was found at cell theta2 - 1: use the conditional posterior for O1
            p_o1 = calculate_conditional_posterior(z_vector, theta2 - 1, p0_conditionals, gamma1)

        # 8. Select the action maximizing immediate expected reward
        action_rewards = detect_prob_o1 * p_o1 - search_cost
        a_t = np.argmax(action_rewards)

        # 9. Deduct cost for the action
        accumulated_reward -= c[a_t]

        # 10. Simulate the outcome of searching cell a_t.
        #     The random draw is only made when the searched cell holds the object
        #     (short-circuit `and`), which fixes how the random stream is consumed.
        found_o1 = (a_t == true_pos_o1) and (rng.random() > gamma1[a_t])
        if found_o1:
            accumulated_reward += 1.0 # Reward for finding O1
            return True, t + 1, accumulated_reward, episode_length # Mission Success!

        # If O1 not found, check for O2 discovery (only if O2 was previously hidden)
        if theta2 == 0 and (a_t == true_pos_o2) and (rng.random() > gamma2[a_t]):
            theta2 = a_t + 1 # Update theta2 state (O2 found at a_t)

        # 11. Record the search of a_t; the next iteration's belief is derived from it
        z_vector[a_t] += 1

    # 12. Horizon exhausted without finding O1
    return False, -1, accumulated_reward, episode_length


if __name__ == '__main__':
    # Empirical benchmark of the greedy policy on a 5-cell example problem.
    NUM_CELLS = 5
    TIME_HORIZON = 10

    # The experiment runs NUM_SEEDS batches of NUM_EPISODES_PER_SEED episodes each,
    # i.e. NUM_SEEDS * NUM_EPISODES_PER_SEED episodes in total.
    NUM_EPISODES = 100 # Episodes per batch (name and value kept for compatibility)
    NUM_SEEDS = 100
    NUM_EPISODES_PER_SEED = NUM_EPISODES
    TOTAL_EPISODES = NUM_SEEDS * NUM_EPISODES_PER_SEED

    # Problem parameters (same as in previous cells)
    prior = np.array([
        [0.152,  0.0039, 0.003,  0.0108, 0.011],
        [0.0038, 0.0052, 0.117,  0.0162, 0.165],
        [0.0057, 0.195,  0.015,  0.009,  0.011],
        [0.0038, 0.0091, 0.0075, 0.027,  0.011],
        [0.0247, 0.0468, 0.0075, 0.117,  0.022]
    ])

    gammas1 = np.array([0.8, 0.65, 0.82, 0.75, 0.7])
    gammas2 = np.array([0.2, 0.1, 0.25, 0.15, 0.2])
    c = np.array([0.15,0.2,0.25,0.1,0.2])

    print("--- Running Empirical Experiment for Greedy Policy ---")
    # Report the number of episodes that are actually run, not just one batch.
    print(f"Number of episodes: {TOTAL_EPISODES} ({NUM_SEEDS} seeds x {NUM_EPISODES_PER_SEED} episodes each)")
    print("-" * 20)


    all_success_rates = []
    all_detection_times = []
    all_rewards = []
    all_episode_lengths = []
    # perf_counter is a monotonic, high-resolution clock meant for measuring
    # short durations such as the sub-second batches timed here.
    total_experiment_start_time = time.perf_counter()

    for seed in range(NUM_SEEDS):
        num_successes = 0
        detection_times_this_seed = []
        rewards_this_seed = []
        episode_lengths_this_seed = []
        start_time_this_seed = time.perf_counter()

        for i in range(NUM_EPISODES_PER_SEED):
            # Every episode gets its own seed, unique across all batches, which is
            # passed to run_greedy_episode so that each episode is reproducible.
            episode_unique_seed = seed * NUM_EPISODES_PER_SEED + i
            success, detection_time, reward, episode_length = run_greedy_episode(
                NUM_CELLS, TIME_HORIZON, prior, gammas1, gammas2, c, episode_unique_seed
            )
            rewards_this_seed.append(reward)
            episode_lengths_this_seed.append(episode_length)

            # Detection times are only collected for successful episodes.
            if success:
                num_successes += 1
                detection_times_this_seed.append(detection_time)

        end_time_this_seed = time.perf_counter()
        success_rate_this_seed = num_successes / NUM_EPISODES_PER_SEED
        all_success_rates.append(success_rate_this_seed)
        all_detection_times.extend(detection_times_this_seed)
        all_rewards.extend(rewards_this_seed)
        all_episode_lengths.extend(episode_lengths_this_seed)

        print(f"--- Seed {seed} Results ---")
        print(f"Success Rate: {success_rate_this_seed:.4f} ({success_rate_this_seed*100:.2f}%)")
        if detection_times_this_seed:
            print(f"Average Detection Time (Successful Episodes): {np.mean(detection_times_this_seed):.2f}")
        else:
            print("Average Detection Time (Successful Episodes): N/A (no successes)")
        print(f"Average Episode Reward: {np.mean(rewards_this_seed):.4f}")
        print(f"Average Episode Length: {np.mean(episode_lengths_this_seed):.2f}")
        print(f"Time taken for this seed: {end_time_this_seed - start_time_this_seed:.2f} seconds")

    total_experiment_end_time = time.perf_counter()

    # Success-rate statistics are taken over the per-batch rates; the other
    # statistics are taken over the individual episodes of all batches pooled.
    mean_success_rate = np.mean(all_success_rates)
    std_success_rate = np.std(all_success_rates)
    mean_detection_time = np.mean(all_detection_times) if all_detection_times else -1
    std_detection_time = np.std(all_detection_times) if all_detection_times else -1
    mean_reward = np.mean(all_rewards)
    std_reward = np.std(all_rewards)
    mean_episode_length = np.mean(all_episode_lengths)
    std_episode_length = np.std(all_episode_lengths)

    print("\n--- Overall Greedy Policy Results ---")
    print(f"Total experiment time across {NUM_SEEDS} seeds: {total_experiment_end_time - total_experiment_start_time:.2f} seconds")
    print(f"Average Success Rate over {NUM_SEEDS} seeds and {NUM_EPISODES_PER_SEED} episodes each: {mean_success_rate:.4f} ({mean_success_rate*100:.2f}%)")
    print(f"Standard Deviation of Success Rate: {std_success_rate:.4f}")
    if all_detection_times:
        print(f"Average Detection Time (Successful Episodes) across all seeds: {mean_detection_time:.2f}")
        print(f"Standard Deviation of Detection Time: {std_detection_time:.2f}")
    else:
        print("Average Detection Time (Successful Episodes) across all seeds: N/A (no successes)")

    print(f"Average Episode Reward across all seeds: {mean_reward:.4f}")
    print(f"Standard Deviation of Reward: {std_reward:.4f}")
    print(f"Average Episode Length across all seeds: {mean_episode_length:.2f}")
    print(f"Standard Deviation of Episode Length: {std_episode_length:.2f}")

--- Running Empirical Experiment for Greedy Policy ---
Number of episodes: 100
--------------------
--- Seed 0 Results ---
Success Rate: 0.4500 (45.00%)
Average Detection Time (Successful Episodes): 3.76
Average Episode Reward: -0.5565
Average Episode Length: 7.19
Time taken for this seed: 0.04 seconds
--- Seed 1 Results ---
Success Rate: 0.4400 (44.00%)
Average Detection Time (Successful Episodes): 4.55
Average Episode Reward: -0.6160
Average Episode Length: 7.60
Time taken for this seed: 0.04 seconds
--- Seed 2 Results ---
Success Rate: 0.4800 (48.00%)
Average Detection Time (Successful Episodes): 3.67
Average Episode Reward: -0.4640
Average Episode Length: 6.96
Time taken for this seed: 0.06 seconds
--- Seed 3 Results ---
Success Rate: 0.5700 (57.00%)
Average Detection Time (Successful Episodes): 3.95
Average Episode Reward: -0.3525
Average Episode Length: 6.55
Time taken for this seed: 0.07 seconds
--- Seed 4 Results ---
Success Rate: 0.5300 (53.00%)
Average Detection Time (Success