In [2]:
import numpy as np

def generate_problem_parameters(n: int, seed: int = 42) -> tuple:
    """
    Generates correlated joint prior, cost vector, and miss-detection rates
    for a two-object search problem.

    NumPy's global legacy random generator is re-seeded with ``seed`` and the
    draws are always made in the same order (gamma1, gamma2, c, prior noise),
    so the result for a given ``(n, seed)`` pair is reproducible. Note that
    re-seeding changes the global ``np.random`` state as a side effect.

    Args:
        n (int): The number of cells. Must be at least 1.
        seed (int): Seed for reproducibility.

    Returns:
        tuple: (prior_joint, gamma1, gamma2, c) where
            prior_joint is an (n, n) array of joint probabilities that sums to 1
                and decays exponentially with the distance |i - j|,
            gamma1 is a length-n array drawn uniformly between 0.6 and 0.85,
            gamma2 is a length-n array drawn uniformly between 0.15 and 0.25
                (hence always below gamma1),
            c is a length-n cost array drawn uniformly between 0.1 and 0.3.

    Raises:
        ValueError: If ``n`` is smaller than 1.
    """
    if n < 1:
        # Without this guard n == 0 fails later with an obscure IndexError.
        raise ValueError(f"n must be at least 1, got {n}.")

    np.random.seed(seed)

    # 1. Miss-detection rates for O1.
    gamma1 = np.random.uniform(low=0.6, high=0.85, size=n)

    # 2. Miss-detection rates for O2. Every gamma1 value is at least 0.6, so the
    #    whole range [0.15, 0.25] already lies below gamma1 and no per-cell
    #    upper bound or clipping is required. One draw is consumed per cell.
    gamma2 = np.random.uniform(low=0.15, high=0.25, size=n)

    # 3. Cost vector.
    c = np.random.uniform(low=0.1, high=0.3, size=n)

    # 4. Correlated joint prior: weight exp(-strength * |i - j|), i.e. higher
    #    probability when O1 and O2 are in the same or adjacent cells.
    correlation_strength = 2.0  # Adjust this value to control correlation strength
    # Only n distinct distances exist, so evaluate the decay once per distance
    # and look it up instead of recomputing it for each of the n * n cells.
    decay_by_distance = np.array(
        [np.exp(-correlation_strength * distance) for distance in range(n)]
    )
    cells = np.arange(n)
    prior_joint = decay_by_distance[np.abs(cells[:, None] - cells[None, :])]

    # Small positive noise for variety; its lower bound of 0.01 also guarantees
    # that no entry of the prior is zero.
    prior_joint += np.random.uniform(low=0.01, high=0.05, size=(n, n))

    # Normalize the joint prior.
    prior_joint /= np.sum(prior_joint)

    # Absorb the floating-point residual of the normalization into the last
    # element so that the total is as close to 1.0 as possible.
    prior_joint[-1, -1] += (1.0 - np.sum(prior_joint))

    return prior_joint, gamma1, gamma2, c

if __name__ == '__main__':
    # Example usage (currently disabled):
    # The demo below is wrapped in a bare triple-quoted string, so Python evaluates
    # it as a no-op expression and none of the statements inside it are executed.
    """NUM_CELLS = 25
    SEED = 20

    prior_joint, gamma1, gamma2, c = generate_problem_parameters(NUM_CELLS, SEED)

    print(f"Generated Parameters (N={NUM_CELLS}, Seed={SEED}):\n")
    print("Joint Prior (p(O1, O2)):\n", np.round(prior_joint, 4))
    print("\nSum of Joint Prior: ", np.sum(prior_joint))
    print("\nMiss-detection rates for O1 (gamma1):", np.round(gamma1, 4))
    print("Miss-detection rates for O2 (gamma2):", np.round(gamma2, 4))
    print("\nCost vector (c):", np.round(c, 4))

    # Verify gamma1 > gamma2
    print("\nIs gamma1 > gamma2 for all elements?:", np.all(gamma1 > gamma2))"""
    # The only statement that actually runs: prints a single space.
    print(" ")

 


In [4]:
import numpy as np
from typing import Tuple, Any
import time

# Helper function definitions
# Note: the belief-update helpers are defined below; generate_problem_parameters is expected to be available from an earlier cell.

def calculate_joint_posterior(z_vector: np.ndarray, initial_prior_joint: np.ndarray,
                              gamma1: np.ndarray, gamma2: np.ndarray) -> np.ndarray:
    """Return the joint posterior p(O1=i, O2=j | z) derived from the initial joint prior.

    Entry (i, j) of the prior is weighted by gamma1[i]**z[i] * gamma2[j]**z[j]
    and the weighted matrix is renormalized. If the total weight is not
    positive, the unnormalized weighted matrix is returned as is.
    """
    miss_weight_o1 = np.power(gamma1, z_vector)
    miss_weight_o2 = np.power(gamma2, z_vector)
    weighted_prior = initial_prior_joint * np.outer(miss_weight_o1, miss_weight_o2)

    total_weight = np.sum(weighted_prior)
    if total_weight > 0:
        return weighted_prior / total_weight
    return weighted_prior

def calculate_conditional_posterior(z_vector: np.ndarray, o2_loc: int,
                                    precomputed_conditionals: list, gamma1: np.ndarray) -> np.ndarray:
    """Return the conditional posterior p(O1=i | z, O2=k) for k = o2_loc.

    The precomputed conditional prior for ``o2_loc`` is weighted element-wise by
    gamma1[i]**z[i] and renormalized. If the total weight is not positive, the
    unnormalized weighted vector is returned as is.
    """
    weighted_conditional = np.power(gamma1, z_vector) * precomputed_conditionals[o2_loc]

    total_weight = np.sum(weighted_conditional)
    if total_weight > 0:
        return weighted_conditional / total_weight
    return weighted_conditional

def precompute_p0_conditionals(n: int, p0_joint: np.ndarray) -> list:
    """Build the list of conditional priors P(O1=i | O2=k), one array per k in range(n).

    Column k of the joint prior is divided by its own sum. A column whose sum
    is not positive yields a zero vector of length n instead.

    Raises:
        ValueError: If ``p0_joint`` is not two-dimensional.
    """
    # A joint prior is expected to be an (n, n) matrix.
    if p0_joint.ndim != 2:
        raise ValueError("p0_joint must be a 2D joint probability matrix for precomputing conditionals.")

    def _conditional_given_o2(k):
        column = p0_joint[:, k]
        column_mass = np.sum(column)
        if column_mass > 0:
            return column / column_mass
        return np.zeros(n)

    return [_conditional_given_o2(k) for k in range(n)]


def run_greedy_episode(n: int, T: int, p0_joint: np.ndarray,
                           gamma1: np.ndarray, gamma2: np.ndarray, c: np.ndarray, episode_seed: int) -> Tuple[bool, int, float, int]:
    """
    Simulates a single episode using a purely greedy policy, which always selects the action
    maximizing immediate expected reward based on the current belief state.

    At every step the cell ``a`` maximizing ``(1 - gamma1[a]) * P(O1 at a) - c[a]`` is searched.
    While O2 is hidden, ``P(O1 at a)`` is the O1 marginal of the joint posterior; once O2 has
    been found it is the posterior of O1 conditioned on O2's location. Each search costs
    ``c[a]``; finding O1 yields a reward of 1.0 and ends the episode.

    Args:
        n: Number of cells.
        T: Maximum number of search steps.
        p0_joint: (n, n) joint prior; entry [i, j] is used as the weight of O1 at i and O2 at j.
        gamma1: Per-cell miss-detection rates for O1.
        gamma2: Per-cell miss-detection rates for O2.
        c: Per-cell search costs.
        episode_seed: Seed of the episode-local random generator.

    Returns:
        (success, detection_time, accumulated_reward, episode_length), where detection_time is
        the 1-based step at which O1 was found, or -1 if it was not found within T steps.
    """
    # 1. Episode-local generator: episodes never touch the global NumPy RNG state.
    rng = np.random.default_rng(episode_seed)

    # 2. Secretly determine the true locations of O1 and O2.
    flat_p0 = p0_joint.flatten()
    prior_mass = np.sum(flat_p0)
    if prior_mass == 0:
        # An all-zero prior means there is no target; -1 never matches a searched cell.
        true_pos_o1, true_pos_o2 = -1, -1
    else:
        choice_index = rng.choice(n * n, p=flat_p0 / prior_mass)
        true_pos_o1, true_pos_o2 = np.unravel_index(choice_index, (n, n))

    # 3. State: per-cell count of unsuccessful searches and the O2 status flag.
    z_vector = np.zeros(n, dtype=int)
    theta2 = 0  # 0=hidden, >0=found at cell (theta2-1)

    # 4. Conditional priors P(O1 | O2=k), used once O2 has been found.
    p0_conditionals = precompute_p0_conditionals(n, p0_joint)

    # Loop-invariant pieces of the expected-reward formula.
    detection_prob = 1 - np.asarray(gamma1)
    search_cost = np.asarray(c)

    # 5. Reward and time tracking.
    accumulated_reward = 0.0
    detection_time = -1
    episode_length = 0

    # 6. Search for at most T steps.
    for t in range(T):
        episode_length += 1

        # 7. Belief about O1's location. It is recomputed from the prior and the
        #    current z_vector / theta2 at every step, so no belief state is carried over.
        if theta2 == 0:
            # O2 is hidden: marginalize the joint posterior over O2's location.
            joint_posterior = calculate_joint_posterior(z_vector, p0_joint, gamma1, gamma2)
            p_o1 = joint_posterior.sum(axis=1)
        else:
            # O2 was found at cell theta2 - 1: condition on that location.
            p_o1 = calculate_conditional_posterior(z_vector, theta2 - 1, p0_conditionals, gamma1)

        # Immediate expected reward of every action at once.
        action_rewards = detection_prob * p_o1 - search_cost

        # 8. Select the action maximizing immediate expected reward.
        a_t = np.argmax(action_rewards)

        # 9. Deduct cost for the action.
        accumulated_reward -= c[a_t]

        # 10. Simulate the outcome of searching cell a_t. A random number is drawn
        #     only when the searched cell really contains the object.
        found_o1 = (a_t == true_pos_o1) and (rng.random() > gamma1[a_t])
        if found_o1:
            accumulated_reward += 1.0  # Reward for finding O1
            detection_time = t + 1  # Absolute time of detection
            return True, detection_time, accumulated_reward, episode_length  # Mission Success!

        # If O1 not found, check for O2 discovery (only if O2 was previously hidden).
        if theta2 == 0 and (a_t == true_pos_o2) and (rng.random() > gamma2[a_t]):
            theta2 = a_t + 1  # O2 found at a_t

        # 11. Record the unsuccessful search of cell a_t.
        z_vector[a_t] += 1

    # 12. The horizon was exhausted without finding O1.
    return False, detection_time, accumulated_reward, episode_length


if __name__ == '__main__':
    # Benchmark of the greedy policy on one fixed, reproducible problem instance.
    NUM_CELLS = 25
    TIME_HORIZON = 20

    # Episodes are run in NUM_SEEDS batches of NUM_EPISODES_PER_SEED episodes each,
    # so that a success rate can be computed per batch and its spread reported.
    NUM_SEEDS = 10
    NUM_EPISODES_PER_SEED = 10
    # Total number of episodes actually simulated; this is what the banner reports.
    NUM_EPISODES = NUM_SEEDS * NUM_EPISODES_PER_SEED


    print("--- Running Empirical Experiment for Greedy Policy ---")
    print(f"Number of episodes: {NUM_EPISODES}")
    print("-" * 20)


    SEED = 42

    prior, gammas1, gammas2, c = generate_problem_parameters(NUM_CELLS, SEED)

    ## Ensure reproducibility
    # Exact comparisons are intentional: the instance must match bit for bit.
    # The reference values are only valid for NUM_CELLS = 25 and SEED = 42.
    assert (prior[0][0], prior[NUM_CELLS - 1][NUM_CELLS - 1]) == (0.020334334191265974, 0.020140323245187997)
    assert (gammas1[0], gammas1[1]) == (0.6936350297118405, 0.837678576602479)
    assert (gammas2[0], gammas2[1]) == (0.22851759613930137, 0.16996737821583596)
    assert (c[0], c[1]) ==( 0.2939169255529117, 0.2550265646722229)



    all_success_rates = []
    all_detection_times = []
    all_rewards = []
    all_episode_lengths = []
    total_experiment_start_time = time.time()

    for seed in range(NUM_SEEDS):
        # No global seeding is needed here: every episode receives its own unique
        # seed, which run_greedy_episode uses to build a local generator.
        num_successes = 0
        detection_times_this_seed = []
        rewards_this_seed = []
        episode_lengths_this_seed = []
        start_time_this_seed = time.time()

        for i in range(NUM_EPISODES_PER_SEED):
            # Unique across the whole experiment: 0 .. NUM_EPISODES - 1.
            episode_unique_seed = seed * NUM_EPISODES_PER_SEED + i
            success, detection_time, reward, episode_length = run_greedy_episode(
                NUM_CELLS, TIME_HORIZON, prior, gammas1, gammas2, c, episode_unique_seed
            )
            rewards_this_seed.append(reward)
            episode_lengths_this_seed.append(episode_length)

            # Detection times are only meaningful for successful episodes.
            if success:
                num_successes += 1
                detection_times_this_seed.append(detection_time)

        end_time_this_seed = time.time()
        success_rate_this_seed = num_successes / NUM_EPISODES_PER_SEED
        all_success_rates.append(success_rate_this_seed)
        all_detection_times.extend(detection_times_this_seed)
        all_rewards.extend(rewards_this_seed)
        all_episode_lengths.extend(episode_lengths_this_seed)

        print(f"--- Seed {seed} Results ---")
        print(f"Success Rate: {success_rate_this_seed:.4f} ({success_rate_this_seed*100:.2f}%)")
        if detection_times_this_seed:
            print(f"Average Detection Time (Successful Episodes): {np.mean(detection_times_this_seed):.2f}")
        else:
            print("Average Detection Time (Successful Episodes): N/A (no successes)")
        print(f"Average Episode Reward: {np.mean(rewards_this_seed):.4f}")
        print(f"Average Episode Length: {np.mean(episode_lengths_this_seed):.2f}")
        print(f"Time taken for this seed: {end_time_this_seed - start_time_this_seed:.2f} seconds")

    total_experiment_end_time = time.time()

    # Success rate statistics are taken over the per-seed rates; all other
    # statistics are taken over the individual episodes of every seed.
    mean_success_rate = np.mean(all_success_rates)
    std_success_rate = np.std(all_success_rates)
    mean_detection_time = np.mean(all_detection_times) if all_detection_times else -1
    std_detection_time = np.std(all_detection_times) if all_detection_times else -1
    mean_reward = np.mean(all_rewards)
    std_reward = np.std(all_rewards)
    mean_episode_length = np.mean(all_episode_lengths)
    std_episode_length = np.std(all_episode_lengths)

    print("\n--- Overall Greedy Policy Results ---")
    print(f"Total experiment time across {NUM_SEEDS} seeds: {total_experiment_end_time - total_experiment_start_time:.2f} seconds")
    print(f"Average Success Rate over {NUM_SEEDS} seeds and {NUM_EPISODES_PER_SEED} episodes each: {mean_success_rate:.4f} ({mean_success_rate*100:.2f}%)")
    print(f"Standard Deviation of Success Rate: {std_success_rate:.4f}")
    if all_detection_times:
        print(f"Average Detection Time (Successful Episodes) across all seeds: {mean_detection_time:.2f}")
        print(f"Standard Deviation of Detection Time: {std_detection_time:.2f}")
    else:
        print("Average Detection Time (Successful Episodes) across all seeds: N/A (no successes)")

    print(f"Average Episode Reward across all seeds: {mean_reward:.4f}")
    print(f"Standard Deviation of Reward: {std_reward:.4f}")
    print(f"Average Episode Length across all seeds: {mean_episode_length:.2f}")
    print(f"Standard Deviation of Episode Length: {std_episode_length:.2f}")

--- Running Empirical Experiment for Greedy Policy ---
Number of episodes: 10
--------------------
--- Seed 0 Results ---
Success Rate: 0.2000 (20.00%)
Average Detection Time (Successful Episodes): 7.50
Average Episode Reward: -1.5972
Average Episode Length: 17.50
Time taken for this seed: 0.05 seconds
--- Seed 1 Results ---
Success Rate: 0.1000 (10.00%)
Average Detection Time (Successful Episodes): 8.00
Average Episode Reward: -1.8410
Average Episode Length: 18.80
Time taken for this seed: 0.04 seconds
--- Seed 2 Results ---
Success Rate: 0.1000 (10.00%)
Average Detection Time (Successful Episodes): 8.00
Average Episode Reward: -1.8268
Average Episode Length: 18.80
Time taken for this seed: 0.04 seconds
--- Seed 3 Results ---
Success Rate: 0.2000 (20.00%)
Average Detection Time (Successful Episodes): 4.00
Average Episode Reward: -1.5267
Average Episode Length: 16.80
Time taken for this seed: 0.03 seconds
--- Seed 4 Results ---
Success Rate: 0.1000 (10.00%)
Average Detection Time (Succ