### Setup and evaluation helpers
- Imports, environment creators, and random seed
- `reward_scalar` to safely convert reward arrays to Python floats
- `epsilon_greedy_action`, `is_terminal`, and `evaluate_policy`


In [None]:
%matplotlib inline
import matplotlib.pyplot as plt

In [None]:
# RL: SARSA and Q-Learning on Grid World
# Uses env.create_standard_grid and env.create_four_room

import numpy as np
from typing import Tuple, List
from env import create_standard_grid, create_four_room

np.random.seed(0)

# Safe reward scalar extractor for 1-element arrays
def reward_scalar(r):
    """Return the first element of ``r`` as a built-in ``float``.

    ``r`` is flattened first, so a scalar, a sequence, or an array of any
    shape is accepted; only the first flattened element is used.
    """
    flattened = np.ravel(r)
    first = flattened[0]
    return float(first)

# epsilon greedy function
# Chose to use correct datatype outputs as it becomes a problem in downstream tasks.

def epsilon_greedy_action(Q: np.ndarray, state: int, epsilon: float) -> int:
    """Choose an action for ``state`` with the epsilon-greedy rule.

    One uniform draw decides between exploring (a uniformly random column
    index of ``Q``) and exploiting (the arg-max of ``Q[state]``).
    """
    explore = np.random.rand() < epsilon
    if not explore:
        return int(np.argmax(Q[state]))
    n_actions = Q.shape[1]
    return np.random.randint(n_actions)

# --- Episode termination check ---
def is_terminal(state: int, goal_states_seq: np.ndarray) -> bool:
    """Return True when ``state`` is one of the goal states.

    ``goal_states_seq`` is converted to an array and flattened, and every
    entry is compared as an ``int``.
    """
    goal_ids = {int(goal) for goal in np.array(goal_states_seq).flatten()}
    return state in goal_ids

# --- Rollouts for evaluation ---
def evaluate_policy(env, Q: np.ndarray, episodes: int = 20, max_steps: int = 100) -> float:
    """Roll out the greedy policy of ``Q`` and return the mean episode return.

    Each rollout starts from ``env.reset()``, always takes the arg-max action
    of the current state's Q-row, and stops once the new state is in
    ``env.goal_states_seq`` or after ``max_steps`` steps. Rewards are summed
    without discounting.
    """
    grand_total = 0.0
    for _episode in range(episodes):
        state = env.reset()
        episode_return = 0.0
        for _step in range(max_steps):
            greedy_action = int(np.argmax(Q[state]))
            next_state, reward = env.step(state, greedy_action)
            episode_return += reward_scalar(reward)
            state = int(next_state)
            # goal_states_seq is re-read every step rather than cached.
            if is_terminal(state, env.goal_states_seq):
                break
        grand_total += episode_return
    return grand_total / episodes



In [None]:
# Beginner-friendly helpers (no type hints)
import numpy as np

def epsilon_greedy_action(Q, state, epsilon):
    """Pick an epsilon-greedy action index (a plain ``int``) for ``state``.

    With probability ``epsilon`` a uniformly random action is returned,
    otherwise the arg-max of ``Q[state]``.
    """
    explore = np.random.rand() < float(epsilon)
    if not explore:
        return int(np.argmax(Q[state]))
    n_actions = Q.shape[1]
    return int(np.random.randint(n_actions))


def is_terminal(state, goal_states_seq):
    """Return True when ``int(state)`` appears among the goal states."""
    # The goal container may be nested, so flatten it and compare as ints.
    goal_ids = {int(goal) for goal in np.array(goal_states_seq).flatten()}
    state_id = int(state)
    return state_id in goal_ids


def evaluate_policy(env, Q, episodes=20, max_steps=100):
    """Average undiscounted return of the greedy policy over several rollouts.

    Every rollout starts from ``env.reset()``, follows the arg-max action of
    ``Q`` and stops when the new state is in ``env.goal_states_seq`` or after
    ``max_steps`` steps.
    """
    n_episodes = int(episodes)
    step_limit = int(max_steps)
    grand_total = 0.0
    for _episode in range(n_episodes):
        state = int(env.reset())
        episode_return = 0.0
        for _step in range(step_limit):
            greedy_action = int(np.argmax(Q[state]))
            next_state, reward = env.step(state, greedy_action)
            episode_return += reward_scalar(reward)
            state = int(next_state)
            # goal_states_seq is re-read every step rather than cached.
            if is_terminal(state, env.goal_states_seq):
                break
        grand_total += episode_return
    return grand_total / float(episodes)



In [None]:
# Beginner-friendly training loops (SARSA and Q-Learning)
import numpy as np


def train_sarsa(env, episodes=2000, alpha=0.1, gamma=0.99,
                epsilon_start=1.0, epsilon_end=0.05, epsilon_decay_episodes=1500,
                max_steps=100):
    """Train a tabular SARSA agent with a linearly decaying epsilon.

    Epsilon is interpolated linearly from ``epsilon_start`` to
    ``epsilon_end`` over the first ``epsilon_decay_episodes`` episodes and
    held at ``epsilon_end`` afterwards; a non-positive decay length uses
    ``epsilon_end`` for every episode. An episode stops when the next state
    is in ``env.goal_states_seq`` or after ``max_steps`` steps.

    Returns ``(Q, returns)``: the ``(num_states, num_actions)`` Q-table and
    the list of undiscounted per-episode returns.
    """
    Q = np.zeros((env.num_states, env.num_actions), dtype=float)

    def epsilon_for(episode_idx):
        if epsilon_decay_episodes <= 0:
            return float(epsilon_end)
        progress = min(1.0, float(episode_idx) / float(epsilon_decay_episodes))
        return float(epsilon_start) + progress * (float(epsilon_end) - float(epsilon_start))

    returns = []
    for episode_idx in range(int(episodes)):
        state = int(env.reset())
        eps = epsilon_for(episode_idx)
        action = epsilon_greedy_action(Q, state, eps)
        episode_return = 0.0
        for _step in range(int(max_steps)):
            next_state, raw_reward = env.step(state, action)
            reward = reward_scalar(raw_reward)
            episode_return += reward
            next_state = int(next_state)

            # goal_states_seq is re-read every step rather than cached.
            reached_goal = is_terminal(next_state, env.goal_states_seq)
            if reached_goal:
                # No bootstrapping from a terminal state.
                td_target = reward
            else:
                # On-policy: bootstrap from the action that will be taken next.
                next_action = epsilon_greedy_action(Q, next_state, eps)
                td_target = reward + gamma * Q[next_state, next_action]
            Q[state, action] += alpha * (td_target - Q[state, action])
            if reached_goal:
                break
            state, action = next_state, next_action
        returns.append(episode_return)
    return Q, returns


def train_q_learning(env, episodes=2000, alpha=0.1, gamma=0.99,
                      epsilon_start=1.0, epsilon_end=0.05, epsilon_decay_episodes=1500,
                      max_steps=100):
    """Train a tabular Q-learning agent with a linearly decaying epsilon.

    Epsilon is interpolated linearly from ``epsilon_start`` to
    ``epsilon_end`` over the first ``epsilon_decay_episodes`` episodes and
    held at ``epsilon_end`` afterwards; a non-positive decay length uses
    ``epsilon_end`` for every episode. An episode stops when the next state
    is in ``env.goal_states_seq`` or after ``max_steps`` steps.

    Returns ``(Q, returns)``: the ``(num_states, num_actions)`` Q-table and
    the list of undiscounted per-episode returns.
    """
    Q = np.zeros((env.num_states, env.num_actions), dtype=float)

    def epsilon_for(episode_idx):
        if epsilon_decay_episodes <= 0:
            return float(epsilon_end)
        progress = min(1.0, float(episode_idx) / float(epsilon_decay_episodes))
        return float(epsilon_start) + progress * (float(epsilon_end) - float(epsilon_start))

    returns = []
    for episode_idx in range(int(episodes)):
        state = int(env.reset())
        eps = epsilon_for(episode_idx)
        episode_return = 0.0
        for _step in range(int(max_steps)):
            action = epsilon_greedy_action(Q, state, eps)
            next_state, raw_reward = env.step(state, action)
            reward = reward_scalar(raw_reward)
            episode_return += reward
            next_state = int(next_state)

            # goal_states_seq is re-read every step rather than cached.
            reached_goal = is_terminal(next_state, env.goal_states_seq)
            if reached_goal:
                # No bootstrapping from a terminal state.
                td_target = reward
            else:
                # Off-policy: bootstrap from the best action value in the next state.
                td_target = reward + gamma * float(np.max(Q[next_state]))
            Q[state, action] += alpha * (td_target - Q[state, action])
            if reached_goal:
                break
            state = next_state
        returns.append(episode_return)
    return Q, returns



In [None]:
import numpy as np


def softmax_action(Q, state, tau):
    """Sample an action from a softmax over ``Q[state]`` with temperature ``tau``.

    A non-positive temperature falls back to the greedy (arg-max) action.
    """
    temperature = float(tau)
    if temperature <= 0:
        return int(np.argmax(Q[state]))
    logits = Q[state] / temperature
    # Shift by the maximum so the largest exponent is exp(0); avoids overflow.
    weights = np.exp(logits - np.max(logits))
    probs = weights / np.sum(weights)
    return int(np.random.choice(len(probs), p=probs))


def select_action(Q, state, strategy, param):
    """Dispatch to the exploration rule named by ``strategy``.

    ``'eps_greedy'`` treats ``param`` as epsilon, ``'softmax'`` treats it as
    the temperature. Any other strategy name raises ``ValueError``.
    """
    if strategy == 'eps_greedy':
        explore = np.random.rand() < float(param)
        if not explore:
            return int(np.argmax(Q[state]))
        n_actions = Q.shape[1]
        return int(np.random.randint(n_actions))
    if strategy == 'softmax':
        temperature = float(param)
        return softmax_action(Q, state, temperature)
    raise ValueError('Unknown exploration strategy: ' + str(strategy))




In [None]:
def train_sarsa_fixed(env, episodes, alpha, gamma, strategy, param, max_steps=200,
                      early_stop=True, early_stop_window=200, early_stop_tol=0.05):
    """Train tabular SARSA with a fixed exploration parameter.

    Actions come from ``select_action(Q, state, strategy, param)``, so
    ``param`` is a constant epsilon or temperature for the whole run. An
    episode stops when the next state is in ``env.goal_states_seq`` or after
    ``max_steps`` steps.

    ``early_stop``, ``early_stop_window`` and ``early_stop_tol`` are accepted
    but never read by this implementation; all ``episodes`` are always run.

    Returns ``(Q, returns)``: the Q-table and the list of undiscounted
    per-episode returns.
    """
    Q = np.zeros((env.num_states, env.num_actions), dtype=float)
    returns = []

    for _episode in range(int(episodes)):
        state = int(env.reset())
        action = select_action(Q, state, strategy, param)
        episode_return = 0.0
        for _step in range(int(max_steps)):
            next_state, raw_reward = env.step(state, action)
            reward = reward_scalar(raw_reward)
            episode_return += reward
            next_state = int(next_state)

            # goal_states_seq is re-read every step rather than cached.
            reached_goal = is_terminal(next_state, env.goal_states_seq)
            if reached_goal:
                # No bootstrapping from a terminal state.
                td_target = reward
            else:
                # On-policy: bootstrap from the action that will be taken next.
                next_action = select_action(Q, next_state, strategy, param)
                td_target = reward + gamma * Q[next_state, next_action]
            Q[state, action] += alpha * (td_target - Q[state, action])
            if reached_goal:
                break
            state, action = next_state, next_action
        returns.append(episode_return)
    return Q, returns


In [None]:


def train_q_learning_fixed(env, episodes, alpha, gamma, strategy, param, max_steps=200):
    """Train tabular Q-learning with a fixed exploration parameter.

    Actions come from ``select_action(Q, state, strategy, param)``, so
    ``param`` is a constant epsilon or temperature for the whole run. An
    episode stops when the next state is in ``env.goal_states_seq`` or after
    ``max_steps`` steps.

    Returns ``(Q, returns)``: the Q-table and the list of undiscounted
    per-episode returns.
    """
    Q = np.zeros((env.num_states, env.num_actions), dtype=float)
    returns = []

    for _episode in range(int(episodes)):
        state = int(env.reset())
        episode_return = 0.0
        for _step in range(int(max_steps)):
            action = select_action(Q, state, strategy, param)
            next_state, raw_reward = env.step(state, action)
            reward = reward_scalar(raw_reward)
            episode_return += reward
            next_state = int(next_state)

            # goal_states_seq is re-read every step rather than cached.
            reached_goal = is_terminal(next_state, env.goal_states_seq)
            if reached_goal:
                # No bootstrapping from a terminal state.
                td_target = reward
            else:
                # Off-policy: bootstrap from the best action value in the next state.
                td_target = reward + gamma * float(np.max(Q[next_state]))
            Q[state, action] += alpha * (td_target - Q[state, action])
            if reached_goal:
                break
            state = next_state
        returns.append(episode_return)
    return Q, returns

### SARSA (epsilon-decay baseline)
- Reference implementation of SARSA with epsilon decay
- Used for quick demos; sweeps use fixed-parameter trainer instead
- Returns Q-table and per-episode returns


In [None]:
def train_sarsa(
    env,
    episodes: int = 2000,
    alpha: float = 0.1,
    gamma: float = 0.99,
    epsilon_start: float = 1.0,
    epsilon_end: float = 0.05,
    epsilon_decay_episodes: int = 1500,
    max_steps: int = 100,
):
    """On-policy TD control (SARSA) with a linearly decaying epsilon.

    Epsilon is interpolated from ``epsilon_start`` to ``epsilon_end`` over
    the first ``epsilon_decay_episodes`` episodes and then held; a
    non-positive decay length uses ``epsilon_end`` throughout. An episode
    stops when the next state is in ``env.goal_states_seq`` or after
    ``max_steps`` steps.

    Returns ``(Q, returns)``: the Q-table and the per-episode undiscounted
    returns.
    """
    Q = np.zeros((env.num_states, env.num_actions), dtype=float)

    def epsilon_for(episode_idx):
        if epsilon_decay_episodes <= 0:
            return epsilon_end
        progress = min(1.0, episode_idx / epsilon_decay_episodes)
        return epsilon_start + progress * (epsilon_end - epsilon_start)

    returns: List[float] = []
    for episode_idx in range(episodes):
        # The reset state is used as returned (no int() cast) for the first step.
        state = env.reset()
        eps = epsilon_for(episode_idx)
        action = epsilon_greedy_action(Q, state, eps)
        episode_return = 0.0

        for _step in range(max_steps):
            raw_next, raw_reward = env.step(state, action)
            reward = reward_scalar(raw_reward)
            episode_return += reward
            next_state = int(raw_next)

            # goal_states_seq is re-read every step rather than cached.
            reached_goal = is_terminal(next_state, env.goal_states_seq)
            if reached_goal:
                # No bootstrapping from a terminal state.
                td_target = reward
            else:
                # On-policy: bootstrap from the action that will be taken next.
                next_action = epsilon_greedy_action(Q, next_state, eps)
                td_target = reward + gamma * Q[next_state, next_action]
            Q[state, action] += alpha * (td_target - Q[state, action])
            if reached_goal:
                break

            state, action = next_state, next_action

        returns.append(episode_return)

    return Q, returns



### Q-learning (epsilon-decay baseline)
- Reference implementation of Q-learning with epsilon decay
- Used for quick demos; sweeps use fixed-parameter trainer instead
- Returns Q-table and per-episode returns


In [None]:
def train_q_learning(
    env,
    episodes: int = 2000,
    alpha: float = 0.1,
    gamma: float = 0.99,
    epsilon_start: float = 1.0,
    epsilon_end: float = 0.05,
    epsilon_decay_episodes: int = 1500,
    max_steps: int = 100,
):
    """Off-policy TD control (Q-Learning) with a linearly decaying epsilon.

    Epsilon is interpolated from ``epsilon_start`` to ``epsilon_end`` over
    the first ``epsilon_decay_episodes`` episodes and then held; a
    non-positive decay length uses ``epsilon_end`` throughout. An episode
    stops when the next state is in ``env.goal_states_seq`` or after
    ``max_steps`` steps.

    Returns ``(Q, returns)``: the Q-table and the per-episode undiscounted
    returns.
    """
    Q = np.zeros((env.num_states, env.num_actions), dtype=float)

    def epsilon_for(episode_idx):
        if epsilon_decay_episodes <= 0:
            return epsilon_end
        progress = min(1.0, episode_idx / epsilon_decay_episodes)
        return epsilon_start + progress * (epsilon_end - epsilon_start)

    returns: List[float] = []
    for episode_idx in range(episodes):
        # The reset state is used as returned (no int() cast) for the first step.
        state = env.reset()
        eps = epsilon_for(episode_idx)
        episode_return = 0.0

        for _step in range(max_steps):
            action = epsilon_greedy_action(Q, state, eps)
            raw_next, raw_reward = env.step(state, action)
            reward = reward_scalar(raw_reward)
            episode_return += reward
            next_state = int(raw_next)

            # goal_states_seq is re-read every step rather than cached.
            reached_goal = is_terminal(next_state, env.goal_states_seq)
            if reached_goal:
                # No bootstrapping from a terminal state.
                td_target = reward
            else:
                # Off-policy: bootstrap from the best action value in the next state.
                td_target = reward + gamma * np.max(Q[next_state])
            Q[state, action] += alpha * (td_target - Q[state, action])
            if reached_goal:
                break

            state = next_state

        returns.append(episode_return)

    return Q, returns



### 10×10 quick training demo
- Demonstration runs of Q-learning and SARSA on the 10×10 grid
- Uses simple epsilon decay (older helpers) to sanity-check training
- Prints average evaluation return


In [None]:
# --- Run: 10x10 Standard Grid (stochastic and wind optional) ---
# Quick sanity run: both algorithms are trained on the SAME environment
# instance, Q-learning first, so they draw from one shared global RNG stream.
# Here transition_prob=0.7 and wind=False; the start state is passed as a
# 1x2 array.
std_env = create_standard_grid(start_state=np.array([[0,4]]), transition_prob=0.7, wind=False)

# Q-Learning on standard grid
# Epsilon goes from 1.0 to 0.05 over the first 2500 of the 3000 episodes.
# ret_q (per-episode returns) is reused by the plotting cell further down.
Q_q, ret_q = train_q_learning(std_env, episodes=3000, alpha=0.1, gamma=0.99,
                              epsilon_start=1.0, epsilon_end=0.05, epsilon_decay_episodes=2500)
# evaluate_policy is called with its defaults (20 greedy episodes, 100 steps max).
print("Standard Grid — Q-Learning eval avg return:", evaluate_policy(std_env, Q_q))

# SARSA on standard grid
# Same hyperparameters as the Q-learning run above; ret_s is reused for plotting.
Q_s, ret_s = train_sarsa(std_env, episodes=3000, alpha=0.1, gamma=0.99,
                         epsilon_start=1.0, epsilon_end=0.05, epsilon_decay_episodes=2500)
print("Standard Grid — SARSA eval avg return:", evaluate_policy(std_env, Q_s))



In [19]:
# Saveable learning curve plots and sweep plotting flags
import os
import matplotlib.pyplot as plt

# Read by sweep_hparams_for_config: when True, one PNG of per-seed learning
# curves is written for every hyperparameter triple.
PLOT_SWEEP = True  # set False to skip saving plots during sweeps
# Directory (relative to the working directory) that receives the sweep PNGs.
PLOT_DIR = 'sweep_plots'
# Created eagerly when this cell runs; exist_ok makes re-running the cell safe.
os.makedirs(PLOT_DIR, exist_ok=True)


def save_learning_curves(returns_list, labels, window=100, title='Episode returns', filepath=None):
    """Plot raw and rolling-mean returns for several runs on one figure.

    ``returns_list`` and ``labels`` are paired with ``zip``. For each run the
    raw returns are drawn faintly and a rolling mean (window clamped to the
    run length, left-padded with NaN) is drawn on top. When ``filepath`` is
    truthy the figure is saved there and closed; otherwise it is shown.
    """
    plt.figure(figsize=(8,4))
    for series, name in zip(returns_list, labels):
        plt.plot(series, alpha=0.3, label=str(name) + ' (raw)')

        # Rolling mean via differences of the cumulative sum.
        values = np.array(series, dtype=float)
        has_data = len(values) > 0
        if has_data:
            w = min(int(window), len(values))
        else:
            w = 1
        running = np.cumsum(np.insert(values, 0, 0.0))
        means = (running[w:] - running[:-w]) / float(w)
        nan_pad = np.full(w-1, np.nan)
        if has_data:
            # Left-pad so the smoothed curve lines up with the episode index.
            smoothed = np.concatenate([nan_pad, means])
        else:
            smoothed = values
        plt.plot(smoothed, linewidth=2, label=str(name) + f' (rolling {window})')

    plt.xlabel('Episode')
    plt.ylabel('Return')
    plt.title(title)
    plt.legend()
    plt.grid(True, alpha=0.3)

    if not filepath:
        plt.show()
        return
    plt.savefig(filepath, bbox_inches='tight', dpi=120)
    plt.close()



### Four-Room quick training demo
- Demonstration runs of Q-learning and SARSA on the Four-Room environment
- Uses simple epsilon decay (older helpers) to sanity-check training
- Prints average evaluation return


In [None]:
# --- Run: Four-Room Grid (deterministic, dynamic goal) ---
# Quick sanity run on the Four-Room environment, built here with
# goal_change=True and transition_prob=1.0. Both algorithms are trained on
# the SAME environment instance, Q-learning first.
fr_env = create_four_room(goal_change=True, transition_prob=1.0)

# Epsilon goes from 1.0 to 0.05 over the first 2500 of the 3000 episodes.
# ret_q_fr (per-episode returns) is reused by the plotting cell further down.
Q_q_fr, ret_q_fr = train_q_learning(fr_env, episodes=3000, alpha=0.1, gamma=0.99,
                                    epsilon_start=1.0, epsilon_end=0.05, epsilon_decay_episodes=2500)
# evaluate_policy is called with its defaults (20 greedy episodes, 100 steps max).
print("Four-Room — Q-Learning eval avg return:", evaluate_policy(fr_env, Q_q_fr))

# Same hyperparameters as the Q-learning run above; ret_s_fr is reused for plotting.
Q_s_fr, ret_s_fr = train_sarsa(fr_env, episodes=3000, alpha=0.1, gamma=0.99,
                               epsilon_start=1.0, epsilon_end=0.05, epsilon_decay_episodes=2500)
print("Four-Room — SARSA eval avg return:", evaluate_policy(fr_env, Q_s_fr))



### Plotting utilities and convergence heuristic
- Rolling mean computation and learning curve plotting
- Simple convergence check based on the mean and standard deviation of the last `window` episode returns
- Example plots for prior quick runs


In [None]:
import matplotlib.pyplot as plt

def rolling_mean(x, window=100):
    """Return the trailing rolling mean of ``x``, left-padded with NaN.

    Parameters:
    - x: sequence of numbers (e.g. per-episode returns)
    - window: averaging window; coerced with ``int()`` and clamped to
      ``len(x)`` when the series is shorter than the window

    Returns:
    - an empty list when ``x`` is empty, otherwise a float array of the same
      length as ``x`` whose first ``w - 1`` entries are NaN (``w`` being the
      effective window)

    Raises:
    - ValueError: if ``window`` is smaller than 1
    """
    if len(x) == 0:
        return []
    # Coerce so a float window (e.g. 100.0) works as a slice bound, matching
    # how save_learning_curves treats its window argument.
    window = int(window)
    if window < 1:
        raise ValueError("window must be >= 1, got " + str(window))
    w = min(window, len(x))
    # Differences of the cumulative sum give each window's total in O(n).
    cumsum = np.cumsum(np.insert(np.array(x, dtype=float), 0, 0.0))
    rm = (cumsum[w:] - cumsum[:-w]) / w
    # Pad on the left so the result lines up with the episode index.
    pad = np.full(w - 1, np.nan)
    return np.concatenate([pad, rm])

def plot_learning_curves(returns_list, labels, window=100, title="Episode returns"):
    """Show raw and rolling-mean returns for several runs on one figure.

    ``returns_list`` and ``labels`` are paired with ``zip``; each run gets a
    faint raw curve and a thicker ``rolling_mean`` curve.
    """
    plt.figure(figsize=(8,4))
    for series, name in zip(returns_list, labels):
        plt.plot(series, alpha=0.3, label=f"{name} (raw)")
        smoothed = rolling_mean(series, window)
        plt.plot(smoothed, linewidth=2, label=f"{name} (rolling {window})")

    plt.xlabel("Episode")
    plt.ylabel("Return")
    plt.title(title)
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.show()

# Heuristic: converged when the last `window` returns are both high on average
# (mean above `min_avg`) and stable (standard deviation below `std_threshold`).
def has_converged(reward_list, window=100, min_avg=-150, std_threshold=10):
    """Return whether the tail of ``reward_list`` looks converged.

    Fewer than ``window`` rewards always yields ``False``.
    """
    if len(reward_list) < window:
        return False
    recent = reward_list[-window:]
    is_high = np.mean(recent) > min_avg
    is_stable = np.std(recent) < std_threshold
    return is_high and is_stable

# If you already ran training above, you can visualize now:
# Each try-block plots one pair of return histories from the quick demo
# cells. A NameError (raised when the ret_* variables are undefined because
# the demo cell was not run) is deliberately ignored so this cell can run
# on its own.
# NOTE(review): `except NameError` also hides a NameError raised inside
# plot_learning_curves / has_converged themselves (e.g. a helper cell that
# was not run).
# The curves are smoothed with window=100 while has_converged looks at the
# last 200 raw returns.
try:
    plot_learning_curves([ret_q, ret_s], ["Q-Learning (standard)", "SARSA (standard)"], window=100,
                         title="Standard Grid: Episode Returns")
    print("Standard Q-Learning converged?", has_converged(ret_q, window=200))
    print("Standard SARSA converged?", has_converged(ret_s, window=200))
except NameError:
    pass

try:
    plot_learning_curves([ret_q_fr, ret_s_fr], ["Q-Learning (four-room)", "SARSA (four-room)"], window=100,
                         title="Four-Room: Episode Returns")
    print("Four-Room Q-Learning converged?", has_converged(ret_q_fr, window=200))
    print("Four-Room SARSA converged?", has_converged(ret_s_fr, window=200))
except NameError:
    pass



In [16]:
# -----------------------
# Experiment configuration
# -----------------------

# EPISODES, MAX_STEPS and EVAL_EPISODES are read as globals by
# run_one_training; run_one_config_by_name can temporarily override
# EPISODES and EVAL_EPISODES.
EPISODES = 3000          # total training episodes per run
MAX_STEPS = 100          # max steps per episode
EVAL_EPISODES = 100      # for post-training evaluation
# Each hyperparameter triple is trained once per seed (np.random.seed(seed)).
SEEDS = [42, 100, 202, 303, 404]

# Hyperparameter ranges (from assignment)
# The sweep takes the full product ALPHAS x GAMMAS x (EPSILONS or TAUS):
# EPSILONS for the 'eps_greedy' strategy, TAUS otherwise.
ALPHAS = [0.001, 0.01, 0.1, 1.0]
GAMMAS = [0.7, 0.8, 0.9, 1.0]
EPSILONS = [0.001, 0.01, 0.05, 0.1]
TAUS = [0.01, 0.1, 1.0, 2.0]

# Convergence and plotting params
# NOTE(review): WINDOW and THRESHOLD are not referenced by the cells shown
# here — confirm whether they are still needed.
WINDOW = 100
THRESHOLD = 5.0
# NOTE(review): the folder is created below, but run_one_config_by_name
# writes its CSVs relative to the working directory — confirm intent.
RESULTS_DIR = "./results"

# Ensure result folder exists
import os
os.makedirs(RESULTS_DIR, exist_ok=True)


In [11]:
import numpy as np
from env import create_standard_grid, create_four_room

# Registry of experiment configurations. Each entry is a dict with:
#   'name'        unique string used by run_one_config_by_name for lookup
#   'alg'         'q_learning' or 'sarsa'
#   'env_builder' factory called as env_builder(**env_kwargs)
#   'env_kwargs'  keyword arguments for the factory
#   'strategy'    'eps_greedy' or 'softmax'
all_configs = []

# -------------------------------
# 10x10 Grid World: Q-learning (8)
# -------------------------------
# 2 transition probabilities x 2 start states x 2 strategies, wind disabled.
# The name embeds the tuple repr of start_state, e.g. "ss(0, 4)" (note the
# space after the comma).
for tp in [0.7, 1.0]:
    for start_state in [(0, 4), (3, 6)]:
        for strat in ['eps_greedy', 'softmax']:
            name = f"std_q_tp{tp}_ss{start_state}_strat{strat}"
            cfg = {
                'name': name,
                'alg': 'q_learning',
                'env_builder': create_standard_grid,
                'env_kwargs': {
                    # The (row, col) tuple is passed as a 1x2 array;
                    # presumably create_standard_grid requires this shape — confirm in env.
                    'start_state': np.array([[start_state[0], start_state[1]]]),
                    'transition_prob': tp,
                    'wind': False
                },
                'strategy': strat
            }
            all_configs.append(cfg)

# -------------------------------
# 10x10 Grid World: SARSA (8)
# -------------------------------
# 2 wind settings x 2 start states x 2 strategies, transition_prob fixed at 1.0.
for wind in [True, False]:
    for start_state in [(0, 4), (3, 6)]:
        for strat in ['eps_greedy', 'softmax']:
            name = f"std_sarsa_wind{wind}_ss{start_state}_strat{strat}"
            cfg = {
                'name': name,
                'alg': 'sarsa',
                'env_builder': create_standard_grid,
                'env_kwargs': {
                    # Same 1x2 array form of the start state as in the Q-learning configs.
                    'start_state': np.array([[start_state[0], start_state[1]]]),
                    'transition_prob': 1.0,
                    'wind': wind
                },
                'strategy': strat
            }
            all_configs.append(cfg)

# -------------------------------
# Four-Room configs
# -------------------------------
# Only goal_change is passed to create_four_room (no start_state), and only
# the eps_greedy strategy is used: 2 Q-learning + 2 SARSA configs.
for goal_change in [True, False]:
    all_configs.append({
        'name': f"four_q_goalchange{goal_change}_strateps_greedy",
        'alg': 'q_learning',
        'env_builder': create_four_room,
        'env_kwargs': {'goal_change': goal_change},
        'strategy': 'eps_greedy'
    })
for goal_change in [True, False]:
    all_configs.append({
        'name': f"four_sarsa_goalchange{goal_change}_strateps_greedy",
        'alg': 'sarsa',
        'env_builder': create_four_room,
        'env_kwargs': {'goal_change': goal_change},
        'strategy': 'eps_greedy'
    })




In [12]:
import matplotlib.pyplot as plt
import numpy as np

def plot_convergence_curve(reward_list, window=100, title="Convergence Plot", config_name=None):
    """
    Plot rewards and their rolling average to visually inspect convergence.

    Parameters:
    - reward_list: non-empty list or array of episode rewards
    - window: size of smoothing window for moving average; clamped to the
      number of episodes when the run is shorter than the window
    - title: custom plot title
    - config_name: optional configuration name to display on plot

    Raises:
    - ValueError: if reward_list is empty or window is smaller than 1
    """
    rewards = np.asarray(reward_list, dtype=float)
    n_episodes = len(rewards)
    if n_episodes == 0:
        raise ValueError("reward_list must contain at least one episode")
    window = int(window)
    if window < 1:
        raise ValueError("window must be >= 1, got " + str(window))
    # With more taps than samples, np.convolve(mode='valid') swaps its inputs
    # and returns a series that no longer matches episodes[window-1:]; clamp
    # the window so both always have n_episodes - window + 1 points.
    window = min(window, n_episodes)
    episodes = np.arange(n_episodes) + 1

    # Rolling average
    rolling = np.convolve(rewards, np.ones(window) / window, mode='valid')

    plt.figure(figsize=(8, 4))
    plt.plot(episodes, rewards, color='lightgray', alpha=0.6, label='Raw Episode Reward')
    plt.plot(episodes[window-1:], rolling, color='blue', linewidth=2.0, label=f'{window}-Episode Moving Average')

    # Annotate convergence visually: mark where the final window starts and
    # label it at the height of the most recent rolling values.
    plt.axvline(n_episodes - window, color='red', linestyle='--', alpha=0.4)
    plt.text(n_episodes - window + 10, np.mean(rolling[-50:]),
             'Recent avg\n(Converged region?)', color='red', fontsize=8)

    plt.xlabel('Episode', fontsize=11)
    plt.ylabel('Reward per Episode', fontsize=11)
    if config_name:
        plt.title(f"{title}\n({config_name})", fontsize=12)
    else:
        plt.title(title, fontsize=12)

    plt.legend()
    plt.grid(alpha=0.3)
    plt.tight_layout()
    plt.show()


### Exploration helpers
- Epsilon-greedy and softmax action selection
- Numerically-stable softmax with temperature τ
- Unified `select_action` dispatcher


In [13]:
# Beginner-friendly sweep helpers with convergence tracking and optional plots
import itertools
import pandas as pd

# Overwrite lightweight runners to also return returns for convergence checks

def run_one_training(alg, env, alpha, gamma, strategy, param):
    """Train one agent with fixed hyperparameters and evaluate it greedily.

    ``alg`` selects ``train_q_learning_fixed`` (``'q_learning'``) or
    ``train_sarsa_fixed`` (``'sarsa'``); anything else raises ``ValueError``.
    Episode counts and the step limit come from the module-level
    ``EPISODES``, ``MAX_STEPS`` and ``EVAL_EPISODES``.

    Returns ``(avg_eval, returns)``: the mean greedy evaluation return as a
    float and the per-episode training returns.
    """
    if alg not in ('q_learning', 'sarsa'):
        raise ValueError("alg must be 'q_learning' or 'sarsa'")
    # Only the chosen trainer's name is looked up.
    trainer = train_q_learning_fixed if alg == 'q_learning' else train_sarsa_fixed
    Q, returns = trainer(env, episodes=EPISODES, alpha=alpha, gamma=gamma,
                         strategy=strategy, param=param, max_steps=MAX_STEPS)
    avg_eval = float(evaluate_policy(env, Q, episodes=EVAL_EPISODES, max_steps=MAX_STEPS))
    return avg_eval, returns


def sweep_hparams_for_config(alg, env_builder, env_kwargs, strategy):
    """Grid-search alpha, gamma and the exploration parameter for one config.

    For every combination in ``ALPHAS x GAMMAS x explore_params`` (where
    ``explore_params`` is ``EPSILONS`` for ``'eps_greedy'`` and ``TAUS`` for
    any other strategy) a fresh environment is built and trained once per
    seed in ``SEEDS``. The greedy evaluation returns are aggregated across
    seeds with ``np.mean`` / ``np.std``.

    Parameters:
    - alg: ``'q_learning'`` or ``'sarsa'`` (forwarded to ``run_one_training``)
    - env_builder: factory called as ``env_builder(**env_kwargs)``
    - env_kwargs: keyword arguments for the factory; also copied into every
      result row under ``cfg_<key>`` columns
    - strategy: exploration strategy name

    Returns:
    - ``(df, best)``: a DataFrame with one row per hyperparameter triple and
      a dict holding the row with the highest ``mean_eval_return``
    """
    records = []
    explore_params = EPSILONS if strategy == 'eps_greedy' else TAUS

    for alpha, gamma, param in itertools.product(ALPHAS, GAMMAS, explore_params):
        evals = []
        per_seed_returns = []
        for seed in SEEDS:
            # Re-seed before building the env so each (triple, seed) run is reproducible.
            np.random.seed(seed)
            env = env_builder(**env_kwargs)
            avg_eval, returns = run_one_training(alg, env, alpha, gamma, strategy, param)
            evals.append(avg_eval)
            # No numerical convergence check here; convergence is judged
            # visually from the saved per-seed curves.
            per_seed_returns.append(returns)

        mean_eval = float(np.mean(evals))
        std_eval = float(np.std(evals))

        # Optional: save one plot per hyperparameter triple with a curve for each seed.
        # NOTE(review): the file name does not include the config, so sweeps of
        # different configs with the same alg/strategy overwrite each other's plots.
        if PLOT_SWEEP:
            title = f"{alg} | {strategy} param={param} | alpha={alpha}, gamma={gamma}"
            fname = f"{alg}_{strategy}_a{alpha}_g{gamma}_p{param}.png".replace(' ', '')
            path = os.path.join(PLOT_DIR, fname)
            labels = [f"seed {s}" for s in SEEDS]
            save_learning_curves(per_seed_returns, labels, window=100, title=title, filepath=path)

        rec = {
            'alg': alg,
            'strategy': strategy,
            'alpha': alpha,
            'gamma': gamma,
            'param': param,
            'mean_eval_return': mean_eval,
            'std_eval_return': std_eval,
            **{f'cfg_{k}': v for k, v in env_kwargs.items()},
        }
        records.append(rec)

    df = pd.DataFrame.from_records(records)
    best_row = df.loc[df['mean_eval_return'].idxmax()]
    return df, dict(best_row)



In [17]:
# Helper to run exactly one configuration by name
import pandas as pd

def run_one_config_by_name(config_name, episodes=None, eval_episodes=None, save_prefix=None):
    """Run the hyperparameter sweep for a single named configuration and save CSVs.

    The module-level ``EPISODES`` / ``EVAL_EPISODES`` are temporarily
    overridden while the sweep runs (presumably they are read by the training
    and evaluation helpers) and are always restored afterwards, even when the
    lookup, the sweep, or the CSV export raises.

    Args:
        config_name: Value of the ``'name'`` key of the entry in ``all_configs``
            to run.
        episodes: Optional temporary override for ``EPISODES``.
        eval_episodes: Optional temporary override for ``EVAL_EPISODES``.
        save_prefix: Optional file-name prefix for the CSV outputs; defaults to
            the configuration name.

    Returns:
        ``(df, best)`` as produced by ``sweep_hparams_for_config``, each with
        an added ``config_name`` entry.

    Raises:
        ValueError: If no entry in ``all_configs`` is named ``config_name``.
    """
    global EPISODES, EVAL_EPISODES
    old_EP, old_EVAL = EPISODES, EVAL_EPISODES
    try:
        # local override of episode counts if provided
        if episodes is not None:
            EPISODES = int(episodes)
        if eval_episodes is not None:
            EVAL_EPISODES = int(eval_episodes)

        cfg = next((c for c in all_configs if c['name'] == config_name), None)
        if cfg is None:
            raise ValueError('Unknown config name: ' + str(config_name))

        print('Running single config:', cfg['name'])
        df, best = sweep_hparams_for_config(
            alg=cfg['alg'],
            env_builder=cfg['env_builder'],
            env_kwargs=cfg['env_kwargs'],
            strategy=cfg['strategy'],
        )
        df['config_name'] = cfg['name']
        best['config_name'] = cfg['name']

        # Save CSVs using a readable prefix
        prefix = save_prefix if save_prefix else cfg['name']
        results_path = f'{prefix}_results.csv'
        best_path = f'{prefix}_best.csv'
        df.to_csv(results_path, index=False)
        pd.DataFrame([best]).to_csv(best_path, index=False)
        print('Saved:', results_path, best_path)
    finally:
        # Restore globals on every exit path so a failed run cannot leak the
        # temporary episode counts into later cells.
        EPISODES, EVAL_EPISODES = old_EP, old_EVAL
    return df, best



### Run exactly one configuration per cell
- Use the cells below; each runs one config end-to-end (hyperparameter sweep + CSV save, plus learning-curve plots when `PLOT_SWEEP` is enabled).
- You can tweak `episodes` and `eval_episodes` when calling `run_one_config_by_name`.
- Files are saved as `<config-name>_results.csv` and `<config-name>_best.csv`.



In [None]:
# Print the name of every entry in all_configs; these are the values that
# run_one_config_by_name accepts as its config_name argument.
for c in all_configs:
    print(c['name'])


In [20]:
# Config 1: Q-Learning, transition_prob=0.7, start state (0,4), ε-greedy exploration
print(" Config 1: std_q_tp0.7_ss(0,4)_strateps_greedy")
df1, best1 = run_one_config_by_name(
    'std_q_tp0.7_ss(0, 4)_strateps_greedy',
    episodes=600,
    eval_episodes=20,
)
print(best1)


 Config 1: std_q_tp0.7_ss(0,4)_strateps_greedy
Running single config: std_q_tp0.7_ss(0, 4)_strateps_greedy
Saved: std_q_tp0.7_ss(0, 4)_strateps_greedy_results.csv std_q_tp0.7_ss(0, 4)_strateps_greedy_best.csv
{'alg': 'q_learning', 'strategy': 'eps_greedy', 'alpha': 0.1, 'gamma': 1.0, 'param': 0.1, 'mean_eval_return': -16.45, 'std_eval_return': 2.9959973297718405, 'cfg_start_state': array([[0, 4]]), 'cfg_transition_prob': 0.7, 'cfg_wind': False, 'config_name': 'std_q_tp0.7_ss(0, 4)_strateps_greedy'}


In [21]:
# Config 2: Q-Learning, transition_prob=0.7, start state (0,4), softmax exploration
print(" Config 2: std_q_tp0.7_ss(0,4)_stratsoftmax")
df2, best2 = run_one_config_by_name(
    'std_q_tp0.7_ss(0, 4)_stratsoftmax',
    episodes=600,
    eval_episodes=20,
)
print(best2)


 Config 2: std_q_tp0.7_ss(0,4)_stratsoftmax
Running single config: std_q_tp0.7_ss(0, 4)_stratsoftmax
Saved: std_q_tp0.7_ss(0, 4)_stratsoftmax_results.csv std_q_tp0.7_ss(0, 4)_stratsoftmax_best.csv
{'alg': 'q_learning', 'strategy': 'softmax', 'alpha': 0.1, 'gamma': 1.0, 'param': 0.01, 'mean_eval_return': -15.26, 'std_eval_return': 0.8151073548925933, 'cfg_start_state': array([[0, 4]]), 'cfg_transition_prob': 0.7, 'cfg_wind': False, 'config_name': 'std_q_tp0.7_ss(0, 4)_stratsoftmax'}


In [22]:
# Config 3: Q-Learning, transition_prob=0.7, start state (3,6), ε-greedy exploration
print(" Config 3: std_q_tp0.7_ss(3,6)_strateps_greedy")
df3, best3 = run_one_config_by_name(
    'std_q_tp0.7_ss(3, 6)_strateps_greedy',
    episodes=600,
    eval_episodes=20,
)
print(best3)


 Config 3: std_q_tp0.7_ss(3,6)_strateps_greedy
Running single config: std_q_tp0.7_ss(3, 6)_strateps_greedy
Saved: std_q_tp0.7_ss(3, 6)_strateps_greedy_results.csv std_q_tp0.7_ss(3, 6)_strateps_greedy_best.csv
{'alg': 'q_learning', 'strategy': 'eps_greedy', 'alpha': 0.1, 'gamma': 0.9, 'param': 0.1, 'mean_eval_return': -17.52, 'std_eval_return': 0.6592419889539806, 'cfg_start_state': array([[3, 6]]), 'cfg_transition_prob': 0.7, 'cfg_wind': False, 'config_name': 'std_q_tp0.7_ss(3, 6)_strateps_greedy'}


In [23]:
# Config 4: Q-Learning, transition_prob=0.7, start state (3,6), softmax exploration
print(" Config 4: std_q_tp0.7_ss(3,6)_stratsoftmax ")
df4, best4 = run_one_config_by_name(
    'std_q_tp0.7_ss(3, 6)_stratsoftmax',
    episodes=600,
    eval_episodes=20,
)
print(best4)


 Config 4: std_q_tp0.7_ss(3,6)_stratsoftmax 
Running single config: std_q_tp0.7_ss(3, 6)_stratsoftmax
Saved: std_q_tp0.7_ss(3, 6)_stratsoftmax_results.csv std_q_tp0.7_ss(3, 6)_stratsoftmax_best.csv
{'alg': 'q_learning', 'strategy': 'softmax', 'alpha': 0.1, 'gamma': 1.0, 'param': 0.1, 'mean_eval_return': -17.759999999999998, 'std_eval_return': 2.5925662961629348, 'cfg_start_state': array([[3, 6]]), 'cfg_transition_prob': 0.7, 'cfg_wind': False, 'config_name': 'std_q_tp0.7_ss(3, 6)_stratsoftmax'}


In [24]:
# Config 5: Q-Learning, transition_prob=1.0, start state (0,4), ε-greedy exploration
print("Config 5: std_q_tp1.0_ss(0,4)_strateps_greedy")
df5, best5 = run_one_config_by_name(
    'std_q_tp1.0_ss(0, 4)_strateps_greedy',
    episodes=600,
    eval_episodes=20,
)
print(best5)


Config 5: std_q_tp1.0_ss(0,4)_strateps_greedy
Running single config: std_q_tp1.0_ss(0, 4)_strateps_greedy
Saved: std_q_tp1.0_ss(0, 4)_strateps_greedy_results.csv std_q_tp1.0_ss(0, 4)_strateps_greedy_best.csv
{'alg': 'q_learning', 'strategy': 'eps_greedy', 'alpha': 0.1, 'gamma': 0.7, 'param': 0.001, 'mean_eval_return': -6.0, 'std_eval_return': 0.0, 'cfg_start_state': array([[0, 4]]), 'cfg_transition_prob': 1.0, 'cfg_wind': False, 'config_name': 'std_q_tp1.0_ss(0, 4)_strateps_greedy'}


In [25]:
# Config 6: Q-Learning, transition_prob=1.0, start state (0,4), softmax exploration
print("Config 6: std_q_tp1.0_ss(0,4)_stratsoftmax")
df6, best6 = run_one_config_by_name(
    'std_q_tp1.0_ss(0, 4)_stratsoftmax',
    episodes=600,
    eval_episodes=20,
)
print(best6)


Config 6: std_q_tp1.0_ss(0,4)_stratsoftmax
Running single config: std_q_tp1.0_ss(0, 4)_stratsoftmax
Saved: std_q_tp1.0_ss(0, 4)_stratsoftmax_results.csv std_q_tp1.0_ss(0, 4)_stratsoftmax_best.csv
{'alg': 'q_learning', 'strategy': 'softmax', 'alpha': 0.1, 'gamma': 0.7, 'param': 0.01, 'mean_eval_return': -6.0, 'std_eval_return': 0.0, 'cfg_start_state': array([[0, 4]]), 'cfg_transition_prob': 1.0, 'cfg_wind': False, 'config_name': 'std_q_tp1.0_ss(0, 4)_stratsoftmax'}


In [26]:
# Config 7: Q-Learning, transition_prob=1.0, start state (3,6), ε-greedy exploration
print("Config 7: std_q_tp1.0_ss(3,6)_strateps_greedy")
df7, best7 = run_one_config_by_name(
    'std_q_tp1.0_ss(3, 6)_strateps_greedy',
    episodes=600,
    eval_episodes=20,
)
print(best7)


Config 7: std_q_tp1.0_ss(3,6)_strateps_greedy
Running single config: std_q_tp1.0_ss(3, 6)_strateps_greedy
Saved: std_q_tp1.0_ss(3, 6)_strateps_greedy_results.csv std_q_tp1.0_ss(3, 6)_strateps_greedy_best.csv
{'alg': 'q_learning', 'strategy': 'eps_greedy', 'alpha': 0.1, 'gamma': 0.7, 'param': 0.001, 'mean_eval_return': -1.0, 'std_eval_return': 0.0, 'cfg_start_state': array([[3, 6]]), 'cfg_transition_prob': 1.0, 'cfg_wind': False, 'config_name': 'std_q_tp1.0_ss(3, 6)_strateps_greedy'}


In [27]:
# Config 8: Q-Learning, transition_prob=1.0, start state (3,6), softmax exploration
print("Config 8: std_q_tp1.0_ss(3,6)_stratsoftmax")
df8, best8 = run_one_config_by_name(
    'std_q_tp1.0_ss(3, 6)_stratsoftmax',
    episodes=600,
    eval_episodes=20,
)
print(best8)


Config 8: std_q_tp1.0_ss(3,6)_stratsoftmax
Running single config: std_q_tp1.0_ss(3, 6)_stratsoftmax
Saved: std_q_tp1.0_ss(3, 6)_stratsoftmax_results.csv std_q_tp1.0_ss(3, 6)_stratsoftmax_best.csv
{'alg': 'q_learning', 'strategy': 'softmax', 'alpha': 0.1, 'gamma': 0.7, 'param': 0.01, 'mean_eval_return': -1.0, 'std_eval_return': 0.0, 'cfg_start_state': array([[3, 6]]), 'cfg_transition_prob': 1.0, 'cfg_wind': False, 'config_name': 'std_q_tp1.0_ss(3, 6)_stratsoftmax'}


In [28]:
# Config 9: SARSA, wind=True, start state (0,4), ε-greedy exploration
print("Config 9: std_sarsa_windTrue_ss(0,4)_strateps_greedy")
df9, best9 = run_one_config_by_name(
    'std_sarsa_windTrue_ss(0, 4)_strateps_greedy',
    episodes=600,
    eval_episodes=20,
)
print(best9)


Config 9: std_sarsa_windTrue_ss(0,4)_strateps_greedy
Running single config: std_sarsa_windTrue_ss(0, 4)_strateps_greedy
Saved: std_sarsa_windTrue_ss(0, 4)_strateps_greedy_results.csv std_sarsa_windTrue_ss(0, 4)_strateps_greedy_best.csv
{'alg': 'sarsa', 'strategy': 'eps_greedy', 'alpha': 0.1, 'gamma': 0.9, 'param': 0.1, 'mean_eval_return': -7.720000000000001, 'std_eval_return': 0.5921148537234988, 'cfg_start_state': array([[0, 4]]), 'cfg_transition_prob': 1.0, 'cfg_wind': True, 'config_name': 'std_sarsa_windTrue_ss(0, 4)_strateps_greedy'}


In [29]:
# Config 10: SARSA, wind=True, start state (0,4), softmax exploration
print("Config 10: std_sarsa_windTrue_ss(0,4)_stratsoftmax")
df10, best10 = run_one_config_by_name(
    'std_sarsa_windTrue_ss(0, 4)_stratsoftmax',
    episodes=600,
    eval_episodes=20,
)
print(best10)


Config 10: std_sarsa_windTrue_ss(0,4)_stratsoftmax
Running single config: std_sarsa_windTrue_ss(0, 4)_stratsoftmax
Saved: std_sarsa_windTrue_ss(0, 4)_stratsoftmax_results.csv std_sarsa_windTrue_ss(0, 4)_stratsoftmax_best.csv
{'alg': 'sarsa', 'strategy': 'softmax', 'alpha': 0.1, 'gamma': 0.9, 'param': 0.01, 'mean_eval_return': -7.590000000000001, 'std_eval_return': 0.5902541825349481, 'cfg_start_state': array([[0, 4]]), 'cfg_transition_prob': 1.0, 'cfg_wind': True, 'config_name': 'std_sarsa_windTrue_ss(0, 4)_stratsoftmax'}


In [30]:
# Config 11: SARSA, wind=True, start state (3,6), ε-greedy exploration
print("Config 11: std_sarsa_windTrue_ss(3,6)_strateps_greedy")
df11, best11 = run_one_config_by_name(
    'std_sarsa_windTrue_ss(3, 6)_strateps_greedy',
    episodes=600,
    eval_episodes=20,
)
print(best11)


Config 11: std_sarsa_windTrue_ss(3,6)_strateps_greedy
Running single config: std_sarsa_windTrue_ss(3, 6)_strateps_greedy
Saved: std_sarsa_windTrue_ss(3, 6)_strateps_greedy_results.csv std_sarsa_windTrue_ss(3, 6)_strateps_greedy_best.csv
{'alg': 'sarsa', 'strategy': 'eps_greedy', 'alpha': 0.1, 'gamma': 0.9, 'param': 0.01, 'mean_eval_return': -4.1, 'std_eval_return': 0.4037325847637268, 'cfg_start_state': array([[3, 6]]), 'cfg_transition_prob': 1.0, 'cfg_wind': True, 'config_name': 'std_sarsa_windTrue_ss(3, 6)_strateps_greedy'}


In [31]:
# Config 12: SARSA, wind=True, start state (3,6), softmax exploration
print("Config 12: std_sarsa_windTrue_ss(3,6)_stratsoftmax")
df12, best12 = run_one_config_by_name(
    'std_sarsa_windTrue_ss(3, 6)_stratsoftmax',
    episodes=600,
    eval_episodes=20,
)
print(best12)


Config 12: std_sarsa_windTrue_ss(3,6)_stratsoftmax
Running single config: std_sarsa_windTrue_ss(3, 6)_stratsoftmax
Saved: std_sarsa_windTrue_ss(3, 6)_stratsoftmax_results.csv std_sarsa_windTrue_ss(3, 6)_stratsoftmax_best.csv
{'alg': 'sarsa', 'strategy': 'softmax', 'alpha': 0.1, 'gamma': 1.0, 'param': 0.1, 'mean_eval_return': -3.6399999999999997, 'std_eval_return': 0.534228415567723, 'cfg_start_state': array([[3, 6]]), 'cfg_transition_prob': 1.0, 'cfg_wind': True, 'config_name': 'std_sarsa_windTrue_ss(3, 6)_stratsoftmax'}


In [32]:
# Config 13: SARSA, wind=False, start state (0,4), ε-greedy exploration
print("Config 13: std_sarsa_windFalse_ss(0,4)_strateps_greedy")
df13, best13 = run_one_config_by_name(
    'std_sarsa_windFalse_ss(0, 4)_strateps_greedy',
    episodes=600,
    eval_episodes=20,
)
print(best13)


Config 13: std_sarsa_windFalse_ss(0,4)_strateps_greedy
Running single config: std_sarsa_windFalse_ss(0, 4)_strateps_greedy
Saved: std_sarsa_windFalse_ss(0, 4)_strateps_greedy_results.csv std_sarsa_windFalse_ss(0, 4)_strateps_greedy_best.csv
{'alg': 'sarsa', 'strategy': 'eps_greedy', 'alpha': 0.1, 'gamma': 0.7, 'param': 0.001, 'mean_eval_return': -6.0, 'std_eval_return': 0.0, 'cfg_start_state': array([[0, 4]]), 'cfg_transition_prob': 1.0, 'cfg_wind': False, 'config_name': 'std_sarsa_windFalse_ss(0, 4)_strateps_greedy'}


In [33]:
# Config 14: SARSA, wind=False, start state (0,4), softmax exploration
print("Config 14: std_sarsa_windFalse_ss(0,4)_stratsoftmax")
df14, best14 = run_one_config_by_name(
    'std_sarsa_windFalse_ss(0, 4)_stratsoftmax',
    episodes=600,
    eval_episodes=20,
)
print(best14)


Config 14: std_sarsa_windFalse_ss(0,4)_stratsoftmax
Running single config: std_sarsa_windFalse_ss(0, 4)_stratsoftmax
Saved: std_sarsa_windFalse_ss(0, 4)_stratsoftmax_results.csv std_sarsa_windFalse_ss(0, 4)_stratsoftmax_best.csv
{'alg': 'sarsa', 'strategy': 'softmax', 'alpha': 0.1, 'gamma': 0.7, 'param': 0.01, 'mean_eval_return': -6.0, 'std_eval_return': 0.0, 'cfg_start_state': array([[0, 4]]), 'cfg_transition_prob': 1.0, 'cfg_wind': False, 'config_name': 'std_sarsa_windFalse_ss(0, 4)_stratsoftmax'}


In [34]:
# Config 15: SARSA, wind=False, start state (3,6), ε-greedy exploration
print("Config 15: std_sarsa_windFalse_ss(3,6)_strateps_greedy")
df15, best15 = run_one_config_by_name(
    'std_sarsa_windFalse_ss(3, 6)_strateps_greedy',
    episodes=600,
    eval_episodes=20,
)
print(best15)


Config 15: std_sarsa_windFalse_ss(3,6)_strateps_greedy
Running single config: std_sarsa_windFalse_ss(3, 6)_strateps_greedy
Saved: std_sarsa_windFalse_ss(3, 6)_strateps_greedy_results.csv std_sarsa_windFalse_ss(3, 6)_strateps_greedy_best.csv
{'alg': 'sarsa', 'strategy': 'eps_greedy', 'alpha': 0.1, 'gamma': 0.7, 'param': 0.001, 'mean_eval_return': -1.0, 'std_eval_return': 0.0, 'cfg_start_state': array([[3, 6]]), 'cfg_transition_prob': 1.0, 'cfg_wind': False, 'config_name': 'std_sarsa_windFalse_ss(3, 6)_strateps_greedy'}


In [35]:
# Config 16: SARSA, wind=False, start state (3,6), softmax exploration
print("Config 16: std_sarsa_windFalse_ss(3,6)_stratsoftmax")
df16, best16 = run_one_config_by_name(
    'std_sarsa_windFalse_ss(3, 6)_stratsoftmax',
    episodes=600,
    eval_episodes=20,
)
print(best16)


Config 16: std_sarsa_windFalse_ss(3,6)_stratsoftmax
Running single config: std_sarsa_windFalse_ss(3, 6)_stratsoftmax
Saved: std_sarsa_windFalse_ss(3, 6)_stratsoftmax_results.csv std_sarsa_windFalse_ss(3, 6)_stratsoftmax_best.csv
{'alg': 'sarsa', 'strategy': 'softmax', 'alpha': 0.1, 'gamma': 0.7, 'param': 0.01, 'mean_eval_return': -1.0, 'std_eval_return': 0.0, 'cfg_start_state': array([[3, 6]]), 'cfg_transition_prob': 1.0, 'cfg_wind': False, 'config_name': 'std_sarsa_windFalse_ss(3, 6)_stratsoftmax'}


In [None]:
# Config 17: Four-Room, Q-Learning, goalChange=True (ε-greedy, per the config name)
print("Config 17: four_q_goalchangeTrue_strateps_greedy ")
df17, best17 = run_one_config_by_name(
    'four_q_goalchangeTrue_strateps_greedy',
    episodes=600,
    eval_episodes=20,
)
print(best17)


In [None]:
# Config 18: Four-Room, Q-Learning, goalChange=False (ε-greedy, per the config name)
# NOTE(review): this cell uses eval_episodes=10 while the other config cells
# use 20 — confirm the difference is intentional.
print("\n===== Config 18: four_q_goalchangeFalse_strateps_greedy =====")
df18, best18 = run_one_config_by_name(
    'four_q_goalchangeFalse_strateps_greedy',
    episodes=600,
    eval_episodes=10,
)
print(best18)
