In [None]:
import os
# Debugging aid: sets CUDA_LAUNCH_BLOCKING=1 in this process's environment.
# NOTE(review): per the CUDA documentation this makes kernel launches synchronous
# so GPU errors surface at the offending call; it presumably has to be set before
# any CUDA-using library initialises -- confirm. The DQN in this notebook is
# created with device="cpu", so this setting may be unnecessary here.
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

### Script for the large problem

In [1]:
import numpy as np

def generate_problem_parameters(n: int, seed: int = 42) -> tuple:
    """
    Generates correlated joint prior, cost vector, and miss-detection rates
    for a two-object search problem.

    All random draws come from a private ``np.random.RandomState(seed)`` so the
    function does not reseed (and thereby clobber) NumPy's global random state.
    ``RandomState(seed)`` is the same legacy generator that ``np.random.seed``
    drives, so the returned values are unchanged for a given ``seed``.

    Args:
        n (int): The number of cells (must be >= 1).
        seed (int): Seed for reproducibility.

    Returns:
        tuple: (prior_joint, gamma1, gamma2, c) where
            prior_joint is an (n, n) float64 array summing to 1,
            gamma1 is an (n,) array drawn from [0.6, 0.85),
            gamma2 is an (n,) array drawn from [0.15, 0.25] with gamma2 < gamma1,
            c is an (n,) cost array drawn from [0.1, 0.3).

    Raises:
        ValueError: If ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError(f"n must be a positive number of cells, got {n}")

    # Local generator: same stream as np.random.seed(seed), no global side effect.
    rng = np.random.RandomState(seed)

    # 1. Generate gamma1 in a range that allows gamma2 to be smaller
    gamma1 = rng.uniform(low=0.6, high=0.85, size=n)

    # 2. Generate gamma2 such that gamma2 < gamma1, and within its own desired range [0.15, 0.25].
    #    The draws stay in a per-cell loop so the random stream is consumed in a fixed order.
    gamma2 = np.zeros(n)
    min_gamma2_val = 0.15
    max_gamma2_val = 0.25
    for i in range(n):
        # Small 0.02 buffer below gamma1[i] keeps gamma2 strictly smaller than gamma1.
        upper_bound_for_this_gamma2 = min(max_gamma2_val, gamma1[i] - 0.02)
        if upper_bound_for_this_gamma2 < min_gamma2_val:
            # Fallback if the buffer pushes the upper bound below the minimum:
            # force a small valid range just above the minimum.
            gamma2[i] = rng.uniform(low=min_gamma2_val, high=min_gamma2_val + 0.01)
        else:
            gamma2[i] = rng.uniform(low=min_gamma2_val, high=upper_bound_for_this_gamma2)
    gamma2 = np.clip(gamma2, None, max_gamma2_val)  # Ensure max bound

    # 3. Generate cost vector c
    c = rng.uniform(low=0.1, high=0.3, size=n)

    # 4. Generate correlated joint prior (n x n matrix).
    #    Entry (i, j) decays exponentially with |i - j|, so O1 and O2 are more
    #    likely to be in the same or adjacent cells.
    correlation_strength = 2.0  # Adjust this value to control correlation strength

    # The decay only depends on the distance, so evaluate it once per distance
    # (n scalar calls instead of n*n) and look the values up per entry.
    decay_by_distance = np.array([np.exp(-correlation_strength * d) for d in range(n)])
    cell_index = np.arange(n)
    prior_joint = decay_by_distance[np.abs(cell_index[:, None] - cell_index[None, :])]

    # Add some random noise to ensure variety (but keep it small)
    prior_joint += rng.uniform(low=0.01, high=0.05, size=(n, n))
    prior_joint[prior_joint < 0.001] = 0.001  # Ensure no zero probabilities

    # Normalize the joint prior
    prior_joint /= np.sum(prior_joint)

    # Explicitly nudge the last element so the total is as close to 1.0 as
    # floating point allows after normalization.
    prior_joint[-1, -1] += (1.0 - np.sum(prior_joint))

    return prior_joint, gamma1, gamma2, c

if __name__ == '__main__':
    # Example usage (disabled): the demo below is wrapped in a bare triple-quoted
    # string, so it is evaluated as a string expression and discarded, not run.
    """NUM_CELLS = 100
    SEED = 42

    prior_joint, gamma1, gamma2, c = generate_problem_parameters(NUM_CELLS, SEED)

    print(f"Generated Parameters (N={NUM_CELLS}, Seed={SEED}):\n")
    print("Joint Prior (p(O1, O2)):\n", np.round(prior_joint, 4))
    print("\nSum of Joint Prior: ", np.sum(prior_joint))
    print("\nMiss-detection rates for O1 (gamma1):", np.round(gamma1, 4))
    print("Miss-detection rates for O2 (gamma2):", np.round(gamma2, 4))
    print("\nCost vector (c):", np.round(c, 4))

    # Verify gamma1 > gamma2
    print("\nIs gamma1 > gamma2 for all elements?:", np.all(gamma1 > gamma2))"""
    # The only statement that actually executes under this guard: prints a single space.
    print(" ")


 


In [2]:
import numpy as np
import gymnasium as gym
from gymnasium import spaces
from typing import Optional, Dict, Any, Tuple
import os
import time

from stable_baselines3 import DQN
from stable_baselines3.common.callbacks import EvalCallback
from stable_baselines3.common.utils import set_random_seed
from stable_baselines3.common.logger import configure

# =============================================================================
# 1. THE (p0, z, theta) ENVIRONMENT
# =============================================================================

class SearchEnvVec(gym.Env):
    """
    A custom Gymnasium environment for the two-object search problem.

    Each step the agent searches one of ``n`` cells. Object O1 is the target:
    detecting it ends the episode with reward 1.0. Object O2 is the related
    object: detecting it only updates ``theta``. Every search that does not
    find O1 yields ``-cost_vector[action]``.

    This version uses a flat state representation based on:
    1. The search count vector (z_t)            -> n entries
    2. The static prior (p_0), flattened         -> n * n entries
    3. The related object status (theta), one-hot -> n + 1 entries
       (index 0 = O2 not found yet, index k + 1 = O2 found in cell k)
    """

    def __init__(self, p0: np.ndarray, gamma1: np.ndarray, gamma2: np.ndarray, T: int, cost_vector: np.ndarray):
        """
        Args:
            p0: (n, n) joint prior; rows index the O1 cell, columns the O2 cell.
            gamma1: (n,) per-cell miss-detection probability for O1.
            gamma2: (n,) per-cell miss-detection probability for O2.
            T: Time horizon (maximum number of searches per episode).
            cost_vector: (n,) per-cell search cost.
        """
        super().__init__()

        self.n = p0.shape[0]
        self.T = T
        self.p0 = p0.astype(np.float32)
        self.p0_flat = self.p0.flatten()
        self.gamma1 = gamma1.astype(np.float32)
        self.gamma2 = gamma2.astype(np.float32)
        self.cost_vector = cost_vector.astype(np.float32)

        self.action_space = spaces.Discrete(self.n)

        # --- Define the observation space ---
        # 1. z_vector: n elements
        # 2. prior: n*n elements
        # 3. theta: n+1 elements (one-hot)
        obs_size = self.n + (self.n * self.n) + (self.n + 1)

        # A single upper bound T covers every component: search counts are at
        # most T, prior entries and one-hot entries are at most 1.
        self.observation_space = spaces.Box(
            low=0.0,
            high=float(self.T),
            shape=(obs_size,),
            dtype=np.float32
        )

        self._precompute_conditionals()

        # Environment state variables
        self.z_vector = np.zeros(self.n, dtype=np.int32)
        self.theta = 0
        self.current_step = 0
        self.true_pos_o1 = 0
        self.true_pos_o2 = 0

    def _precompute_conditionals(self):
        """
        Stores ``self.conditionals``: each column j of the prior normalised to
        sum to 1 (a uniform column is used when the column sum is not positive).
        """
        self.conditionals = np.zeros_like(self.p0)
        for j in range(self.n):
            col_sum = np.sum(self.p0[:, j])
            if col_sum > 0:
                self.conditionals[:, j] = self.p0[:, j] / col_sum
            else:
                self.conditionals[:, j] = 1.0 / self.n

    def _get_obs(self) -> np.ndarray:
        """
        Constructs the observation vector from z_t, p_0, and theta.
        """
        z_flat = self.z_vector.astype(np.float32)
        theta_one_hot = np.zeros(self.n + 1, dtype=np.float32)
        theta_one_hot[self.theta] = 1.0

        # Concatenate all parts into a single flat vector
        return np.concatenate([z_flat, self.p0_flat, theta_one_hot])

    def reset(self, seed: Optional[int] = None, options: Optional[Dict] = None) -> Tuple[np.ndarray, Dict]:
        """
        Starts a new episode: clears the search counts, marks O2 as hidden and
        samples the true (O1, O2) cell pair from the joint prior.

        Returns:
            (observation, info) with an empty info dict.
        """
        super().reset(seed=seed)

        self.z_vector = np.zeros(self.n, dtype=np.int32)
        self.theta = 0
        self.current_step = 0

        num_pairs = self.n * self.n
        prior_sum = np.sum(self.p0_flat)
        if prior_sum == 0:
            # Degenerate all-zero prior: fall back to a uniform choice.
            true_idx = self.np_random.choice(num_pairs)
        else:
            # Renormalise so the probabilities passed to choice() sum to 1.
            true_idx = self.np_random.choice(num_pairs, p=self.p0_flat / prior_sum)

        self.true_pos_o1, self.true_pos_o2 = np.unravel_index(true_idx, (self.n, self.n))

        info = {}
        return self._get_obs(), info

    def step(self, action: int) -> Tuple[np.ndarray, float, bool, bool, Dict]:
        """
        Searches cell ``action``.

        Returns:
            (observation, reward, terminated, truncated, info). ``info`` has
            'found_o1' (O1 detected this step) and 'found_o2' (O2 detected this
            step). ``truncated`` is always False; reaching the horizon ``T`` is
            reported through ``terminated``.

        Raises:
            ValueError: If ``action`` is not contained in the action space.
        """
        if not self.action_space.contains(action):
            raise ValueError(f"Invalid action: {action}")
        # Policies may hand over a 0-d NumPy integer array; use a plain int so
        # that theta and the index arithmetic below stay ordinary Python ints.
        action = int(action)

        self.current_step += 1

        reward = -float(self.cost_vector[action])
        terminated = False
        found_o1 = False
        found_o2_this_step = False

        # --- Check for O1 (Target) Detection ---
        # O1 can only be detected when the searched cell is its true location;
        # even then the search misses with probability gamma1[action].
        if action == self.true_pos_o1:
            if self.np_random.random() > self.gamma1[action]:
                found_o1 = True

        if found_o1:
            # Success replaces the search cost for this step.
            reward = 1.0
            terminated = True
        else:
            # --- Check for O2 (Related) Detection ---
            # Only relevant while O2 is still hidden (theta == 0).
            if self.theta == 0 and action == self.true_pos_o2:
                if self.np_random.random() > self.gamma2[action]:
                    self.theta = action + 1  # O2 found, update theta
                    found_o2_this_step = True  # report the detection in info

            self.z_vector[action] += 1  # Increment search count for the actioned cell

        if self.current_step >= self.T:
            terminated = True

        info = {'found_o1': found_o1, 'found_o2': found_o2_this_step}
        truncated = False  # We use terminated for end of horizon
        return self._get_obs(), reward, terminated, truncated, info

# =============================================================================
# 2. DQN TRAINING FUNCTION FOR A SINGLE RUN
# =============================================================================

def run_single_experiment(run_id: int, base_log_dir: str, seed: int, total_timesteps: int) -> Dict[str, float]:
    """
    Runs a single DQN training and evaluation experiment.

    The problem instance (prior, miss-detection rates, costs) is always
    generated with the fixed problem seed 42 for 25 cells, so every run trains
    on the same instance. ``seed`` is passed to ``set_random_seed``, to the DQN
    constructor and into the per-episode seeds of the final evaluation.

    Args:
        run_id: Index of this run; used for the log sub-directory and messages.
        base_log_dir: Directory under which ``run_<run_id>/`` is created.
        seed: Random seed for this training run.
        total_timesteps: Number of environment steps to train for.

    Returns:
        Dict[str, float]: 'success_rate', 'avg_detection_time' (-1 when no
        evaluation episode found O1), 'avg_episode_reward' and
        'avg_episode_length' of the best saved model over 100 episodes.
        Empty if no best model file was written during training.

    Raises:
        AssertionError: If the generated problem instance does not match the
            recorded reference values.
    """
    print(f"\n===== STARTING DQN RUN {run_id} with SEED {seed} ====")

    # --- 2.1. Problem and Environment Definition ---
    NUM_CELLS = 25
    TIME_HORIZON = 20

    set_random_seed(seed)

    # Fixed seed for the problem instance (independent of the training seed).
    SEED = 42

    prior, gammas1, gammas2, costs = generate_problem_parameters(NUM_CELLS, SEED)

    # Guard against silently training on a different problem instance. A tight
    # relative tolerance is used instead of exact float equality so the check
    # is not tripped by last-bit differences between NumPy builds; unlike a
    # bare `assert`, it also still runs under `python -O`.
    reference_checks = (
        ("prior", [prior[0][0], prior[NUM_CELLS - 1][NUM_CELLS - 1]],
         [0.020334334191265974, 0.020140323245187997]),
        ("gamma1", [gammas1[0], gammas1[1]],
         [0.6936350297118405, 0.837678576602479]),
        ("gamma2", [gammas2[0], gammas2[1]],
         [0.22851759613930137, 0.16996737821583596]),
        ("cost", [costs[0], costs[1]],
         [0.2939169255529117, 0.2550265646722229]),
    )
    for label, actual, expected in reference_checks:
        np.testing.assert_allclose(
            actual, expected, rtol=1e-9, atol=0.0,
            err_msg=f"Generated {label} values differ from the reference problem instance"
        )

    # --- 2.2. Create training and evaluation environments on the same problem instance ---
    env = SearchEnvVec(p0=prior, gamma1=gammas1, gamma2=gammas2, T=TIME_HORIZON, cost_vector=costs)
    eval_env = SearchEnvVec(p0=prior, gamma1=gammas1, gamma2=gammas2, T=TIME_HORIZON, cost_vector=costs)

    # --- 2.3. Setup Logging and Callbacks for this specific run ---
    run_log_dir = os.path.join(base_log_dir, f"run_{run_id}")
    best_model_dir = os.path.join(run_log_dir, "best_model")
    os.makedirs(run_log_dir, exist_ok=True)
    os.makedirs(best_model_dir, exist_ok=True)

    new_logger = configure(run_log_dir, ["csv", "tensorboard"])

    eval_callback = EvalCallback(eval_env,
                                 best_model_save_path=best_model_dir,
                                 log_path=run_log_dir,  # Save eval results in run dir
                                 eval_freq=10000,  # Evaluate every 10,000 training steps
                                 n_eval_episodes=10,  # 10 evaluation episodes per evaluation point
                                 deterministic=True,
                                 render=False)

    # --- 2.4. DQN Model Training ---
    model = DQN(
        "MlpPolicy",
        env,
        verbose=0,
        buffer_size=50000,
        learning_starts=1000,
        batch_size=64,
        gamma=0.95,  # Discount factor
        train_freq=(10, "step"),
        gradient_steps=10,
        target_update_interval=1000,
        exploration_fraction=0.2,
        exploration_final_eps=0.01,
        seed=seed,
        device="cpu"
    )

    model.set_logger(new_logger)

    model.learn(total_timesteps=total_timesteps, callback=eval_callback)

    # --- 2.5. Final Evaluation of the Best Model ---
    print(f"Evaluating best model for run {run_id}...")
    best_model_path = os.path.join(best_model_dir, "best_model.zip")
    final_metrics = {}

    if os.path.exists(best_model_path):
        best_model = DQN.load(best_model_path, env=eval_env)
        num_final_eval_episodes = 100
        total_reward = 0
        num_successes = 0
        detection_times = []
        episode_lengths = []

        for i_episode in range(num_final_eval_episodes):
            # Unique, reproducible seed per (run seed, episode index) pair.
            episode_seed = seed * num_final_eval_episodes + i_episode
            obs, info = eval_env.reset(seed=episode_seed)
            done = False
            episode_reward = 0
            episode_len = 0

            while not done:
                action, _ = best_model.predict(obs, deterministic=True)
                obs, reward, terminated, truncated, info = eval_env.step(action)
                done = terminated or truncated
                episode_reward += reward
                episode_len += 1

                if info.get('found_o1'):
                    # Finding O1 terminates the episode, so this fires at most once.
                    detection_times.append(episode_len)

            total_reward += episode_reward
            episode_lengths.append(episode_len)
            # `info` is from the last step; 'found_o1' is only true on a terminal success step.
            if info.get('found_o1'):
                num_successes += 1

        mean_reward = total_reward / num_final_eval_episodes
        success_rate = num_successes / num_final_eval_episodes
        # -1 is the sentinel for "no successful episode".
        mean_detection_time = np.mean(detection_times) if detection_times else -1
        mean_episode_length = np.mean(episode_lengths)

        final_metrics = {
            'success_rate': success_rate,
            'avg_detection_time': mean_detection_time,
            'avg_episode_reward': mean_reward,
            'avg_episode_length': mean_episode_length
        }

        print(f"Run {run_id} Final Evaluation Results (Best Model):")
        print(f"  Success Rate: {final_metrics['success_rate']:.4f} ({final_metrics['success_rate']*100:.2f}%)")
        print(f"  Average Detection Time (Successful Episodes): {final_metrics['avg_detection_time']:.2f}")
        print(f"  Average Episode Reward: {final_metrics['avg_episode_reward']:.4f}")
        print(f"  Average Episode Length: {final_metrics['avg_episode_length']:.2f}")
    else:
        print(f"No best model found for run {run_id}.")

    print(f"===== COMPLETED DQN RUN {run_id} ====")
    return final_metrics


if __name__ == '__main__':
    # --- Configuration for Multiple Runs ---
    NUM_RUNS = 10  # Number of independent training runs (one seed each)
    TOTAL_TIMESTEPS = 200_000
    BASE_LOG_DIR = "./dqn_z_vector_logs/"

    # --- Execute All Runs ---
    all_final_success_rates = []
    all_final_detection_times = []
    all_final_rewards = []
    all_final_episode_lengths = []

    total_experiment_start_time = time.time()

    for i in range(NUM_RUNS):
        run_seed = i  # Run i uses seed i, i.e. seeds 0 .. NUM_RUNS - 1
        final_metrics = run_single_experiment(
            run_id=i,
            base_log_dir=BASE_LOG_DIR,
            seed=run_seed,
            total_timesteps=TOTAL_TIMESTEPS
        )

        # Collect metrics from each run; an empty dict means the run produced
        # no best model and therefore no final evaluation.
        if final_metrics:
            all_final_success_rates.append(final_metrics['success_rate'])
            # -1 marks "no successful episode" and must not enter the average.
            if final_metrics['avg_detection_time'] != -1:
                all_final_detection_times.append(final_metrics['avg_detection_time'])
            all_final_rewards.append(final_metrics['avg_episode_reward'])
            all_final_episode_lengths.append(final_metrics['avg_episode_length'])

    total_experiment_end_time = time.time()

    # --- Calculate and Display Overall Metrics ---
    print(f"\nAll {NUM_RUNS} DQN training runs are complete.")
    print(f"Log files are saved in '{BASE_LOG_DIR}'.")
    print(f"Total time for all runs: {total_experiment_end_time - total_experiment_start_time:.2f} seconds")

    print("\n--- Overall DQN Policy Results Across All Seeds ---")
    if not all_final_success_rates:
        # Avoid np.mean/np.std on empty lists (RuntimeWarning and 'nan' output).
        print("No run produced final evaluation metrics; nothing to aggregate.")
    else:
        print(f"Average Success Rate: {np.mean(all_final_success_rates):.4f} ({np.mean(all_final_success_rates)*100:.2f}%)")
        print(f"Standard Deviation of Success Rate: {np.std(all_final_success_rates):.4f}")

        if all_final_detection_times:
            print(f"Average Detection Time (Successful Episodes): {np.mean(all_final_detection_times):.2f}")
            print(f"Standard Deviation of Detection Time: {np.std(all_final_detection_times):.2f}")
        else:
            print("Average Detection Time (Successful Episodes): N/A (no successful episodes across all runs)")

        print(f"Average Episode Reward: {np.mean(all_final_rewards):.4f}")
        print(f"Standard Deviation of Episode Reward: {np.std(all_final_rewards):.4f}")

        print(f"Average Episode Length: {np.mean(all_final_episode_lengths):.2f}")
        print(f"Standard Deviation of Episode Length: {np.std(all_final_episode_lengths):.2f}")


2025-11-08 14:52:27.337050: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-11-08 14:52:27.426187: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.



===== STARTING DQN RUN 0 with SEED 0 ====




Eval num_timesteps=10000, episode_reward=-2.32 +/- 0.97
Episode length: 18.10 +/- 3.96
New best mean reward!
Eval num_timesteps=20000, episode_reward=-2.07 +/- 1.38
Episode length: 16.60 +/- 6.80
New best mean reward!
Eval num_timesteps=30000, episode_reward=-2.57 +/- 1.19
Episode length: 18.10 +/- 5.70
Eval num_timesteps=40000, episode_reward=-1.76 +/- 0.79
Episode length: 18.50 +/- 4.50
New best mean reward!
Eval num_timesteps=50000, episode_reward=-5.10 +/- 0.00
Episode length: 20.00 +/- 0.00
Eval num_timesteps=60000, episode_reward=-4.02 +/- 1.69
Episode length: 18.10 +/- 5.70
Eval num_timesteps=70000, episode_reward=-3.17 +/- 1.39
Episode length: 18.10 +/- 5.70
Eval num_timesteps=80000, episode_reward=-1.95 +/- 0.53
Episode length: 19.40 +/- 1.80
Eval num_timesteps=90000, episode_reward=-2.80 +/- 0.57
Episode length: 19.40 +/- 1.80
Eval num_timesteps=100000, episode_reward=-1.79 +/- 0.90
Episode length: 18.20 +/- 5.40
Eval num_timesteps=110000, episode_reward=-4.32 +/- 0.83
Episod