# PART A

In [1]:
# imports
import os
import time
import glob
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import itertools
import wandb
from env import create_standard_grid, create_four_room

# Optional: configure matplotlib for notebooks
%matplotlib inline


### Create environment configurations

In [3]:
all_configs = []


def _std_grid_config(name, alg, start_state, transition_prob, wind, strat):
    """Assemble one experiment config dict for the standard grid world.

    The returned dict carries the run name, the algorithm label, the
    environment builder with its keyword arguments, and the exploration
    strategy label.
    """
    row, col = start_state
    return {
        'name': name,
        'alg': alg,
        'env_builder': create_standard_grid,
        'env_kwargs': {
            # The start state is handed over as a 1x2 array rather than a tuple.
            'start_state': np.array([[row, col]]),
            'transition_prob': transition_prob,
            'wind': wind
        },
        'strategy': strat
    }


# -------------------------------
# 10x10 Grid World: Q-learning (8)
# transition_prob x start_state x strategy, wind always off
# -------------------------------
for tp, start_state, strat in itertools.product(
        [0.7, 1.0], [(0, 4), (3, 6)], ['eps_greedy', 'softmax']):
    name = f"std_q_tp{tp}_ss{start_state}_strat{strat}"
    cfg = _std_grid_config(name, 'q_learning', start_state, tp, False, strat)
    all_configs.append(cfg)

# -------------------------------
# 10x10 Grid World: SARSA (8)
# wind x start_state x strategy, transition_prob fixed at 1.0
# -------------------------------
for wind, start_state, strat in itertools.product(
        [True, False], [(0, 4), (3, 6)], ['eps_greedy', 'softmax']):
    name = f"std_sarsa_wind{wind}_ss{start_state}_strat{strat}"
    cfg = _std_grid_config(name, 'sarsa', start_state, 1.0, wind, strat)
    all_configs.append(cfg)

# -------------------------------
# Four-Room configs: both algorithms, eps-greedy only,
# with and without goal change (Q-learning entries first, then SARSA)
# -------------------------------
for four_alg, four_prefix in [('q_learning', 'four_q'), ('sarsa', 'four_sarsa')]:
    for goal_change in [True, False]:
        all_configs.append({
            'name': f"{four_prefix}_goalchange{goal_change}_strateps_greedy",
            'alg': four_alg,
            'env_builder': create_four_room,
            'env_kwargs': {'goal_change': goal_change},
            'strategy': 'eps_greedy'
        })

### Functions

In [8]:
# Safe reward scalar extractor for 1-element arrays
def reward_scalar(r):
    """Return the first entry of ``r`` as a built-in ``float``.

    ``r`` may be a scalar or an array-like of any shape; it is flattened and
    only element 0 is used. An empty input raises ``IndexError``.
    """
    flattened = np.ravel(r)
    first = flattened[0]
    return float(first)

# Epsilon-greedy action selection.
# The return value is kept in the expected integer datatype, since mismatched output types caused problems in downstream code.

def epsilon_greedy_action(Q: np.ndarray, state: int, epsilon: float) -> int:
    """Choose an action for ``state`` with an epsilon-greedy rule.

    One uniform sample is drawn; if it falls below ``epsilon`` a random
    column index of ``Q`` is returned, otherwise the argmax of ``Q[state]``.
    """
    explore = np.random.rand() < epsilon
    if not explore:
        greedy = np.argmax(Q[state])
        return int(greedy)
    n_actions = Q.shape[1]
    return np.random.randint(n_actions)

# --- Episode termination check ---
def is_terminal(state: int, goal_states_seq: np.ndarray) -> bool:
    """Report whether ``state`` is one of the goal states.

    ``goal_states_seq`` may have any shape; every entry is converted to
    ``int`` and the membership test is done against that set.
    """
    goal_ids = {int(g) for g in np.asarray(goal_states_seq).ravel()}
    return state in goal_ids

# --- Rollouts for evaluation ---
def evaluate_policy(env, Q: np.ndarray, episodes: int = 20, max_steps: int = 100) -> float:
    """Roll out the greedy policy of ``Q`` and return the mean episode return.

    Args:
        env: environment exposing ``reset()``, ``step(state, action)``
            returning ``(next_state, reward)``, and ``goal_states_seq``.
        Q: action-value table indexed as ``Q[state]``.
        episodes: number of evaluation rollouts to average over.
        max_steps: cap on the number of steps taken in each rollout.

    Returns:
        The undiscounted reward sum per episode, averaged over ``episodes``.

    Raises:
        ZeroDivisionError: if ``episodes`` is 0.
    """
    total = 0.0
    for _ in range(episodes):
        # Cast the reset state to a plain int, as the training loops do, so the
        # first step indexes Q the same way as every later step (s_next is
        # already cast below).
        s = int(env.reset())
        ep_ret = 0.0
        for _ in range(max_steps):
            a = int(np.argmax(Q[s]))  # purely greedy: no exploration during evaluation
            s_next, r = env.step(s, a)
            ep_ret += reward_scalar(r)
            s = int(s_next)
            if is_terminal(s, env.goal_states_seq):
                break
        total += ep_ret
    return total / episodes
    
def softmax_action(Q, state, tau):
    """Sample an action index from a softmax over ``Q[state]``.

    ``tau`` is the temperature dividing the action values. A non-positive
    ``tau`` skips sampling and returns the argmax of ``Q[state]``. The largest
    scaled value is subtracted before exponentiating so the exponentials
    cannot overflow.
    """
    temperature = float(tau)
    action_values = Q[state]
    if temperature <= 0:
        return int(np.argmax(action_values))

    scaled = action_values / temperature
    shifted = scaled - np.max(scaled)
    weights = np.exp(shifted)
    probs = weights / np.sum(weights)
    sampled = np.random.choice(len(probs), p=probs)
    return int(sampled)


def select_action(Q, state, strategy, param):
    """Pick an action for ``state`` under the named exploration strategy.

    ``strategy`` is ``'eps_greedy'`` (``param`` is the exploration
    probability) or ``'softmax'`` (``param`` is the temperature passed to
    ``softmax_action``). Any other value raises ``ValueError``.
    """
    if strategy == 'softmax':
        return softmax_action(Q, state, float(param))
    if strategy != 'eps_greedy':
        raise ValueError('Unknown exploration strategy: ' + str(strategy))

    # eps-greedy: one uniform draw decides between exploring and exploiting
    explore = np.random.rand() < float(param)
    if explore:
        return int(np.random.randint(Q.shape[1]))
    return int(np.argmax(Q[state]))

### Q-learning function

In [6]:
def train_q_learning_fixed(env, episodes, alpha, gamma, strategy, param, max_steps=100, run=None, seed=None):
    """Tabular Q-learning with optional WandB logging.

    Args:
        env: environment exposing ``num_states``, ``num_actions``, ``reset()``,
            ``step(state, action)`` returning ``(next_state, reward)``, and
            ``goal_states_seq``.
        episodes: number of training episodes.
        alpha: learning rate applied to the TD error.
        gamma: discount factor on the bootstrapped next-state value.
        strategy: exploration strategy name forwarded to ``select_action``.
        param: exploration parameter forwarded to ``select_action``.
        max_steps: cap on the number of steps per episode.
        run: when not ``None``, per-episode metrics are sent via ``wandb.log``.
        seed: when not ``None``, seeds NumPy's global RNG before training so the
            run is reproducible; it is also included in the logged metrics.

    Returns:
        ``(Q, returns)``: the learned ``(num_states, num_actions)`` table and
        the list of undiscounted reward sums, one per episode.
    """
    # Previously `seed` was only logged; apply it so equal seeds reproduce the
    # same exploration draws (action selection uses the global np.random state).
    if seed is not None:
        np.random.seed(seed)

    n_states, n_actions = env.num_states, env.num_actions
    Q = np.zeros((n_states, n_actions))
    returns = []

    for ep in range(episodes):
        state = int(env.reset())
        total_reward = 0.0

        for _ in range(max_steps):
            a = select_action(Q, state, strategy, param)
            s_next, r = env.step(state, a)
            r = reward_scalar(r)
            total_reward += r
            s_next = int(s_next)

            if is_terminal(s_next, env.goal_states_seq):
                # Terminal transition: no bootstrap term, the target is just r.
                Q[state, a] += alpha * (r - Q[state, a])
                break

            # Off-policy target: bootstrap from the best action in s_next.
            td_target = r + gamma * np.max(Q[s_next])
            Q[state, a] += alpha * (td_target - Q[state, a])
            state = s_next

        returns.append(total_reward)

        # WandB logging (one record per episode)
        if run is not None:
            wandb.log({
                "episode": ep,
                "episode_return": total_reward,
                "algorithm": "Q-learning",
                "alpha": alpha,
                "gamma": gamma,
                "param": param,
                "strategy": strategy,
                "seed": seed
            })

        # Progress report every 200 episodes: mean return of the last 50.
        if (ep + 1) % 200 == 0:
            print(f"[Q] Episode {ep+1}/{episodes} | Mean(Last50): {np.mean(returns[-50:]):.2f}")

    return Q, returns


### SARSA function

In [7]:
def train_sarsa_fixed(env, episodes, alpha, gamma, strategy, param, max_steps=100, run=None, seed=None):
    """Tabular SARSA with optional WandB logging.

    Args:
        env: environment exposing ``num_states``, ``num_actions``, ``reset()``,
            ``step(state, action)`` returning ``(next_state, reward)``, and
            ``goal_states_seq``.
        episodes: number of training episodes.
        alpha: learning rate applied to the TD error.
        gamma: discount factor on the bootstrapped next state-action value.
        strategy: exploration strategy name forwarded to ``select_action``.
        param: exploration parameter forwarded to ``select_action``.
        max_steps: cap on the number of steps per episode.
        run: when not ``None``, per-episode metrics are sent via ``wandb.log``.
        seed: when not ``None``, seeds NumPy's global RNG before training so the
            run is reproducible; it is also included in the logged metrics.

    Returns:
        ``(Q, returns)``: the learned ``(num_states, num_actions)`` table and
        the list of undiscounted reward sums, one per episode.
    """
    # Previously `seed` was only logged; apply it so equal seeds reproduce the
    # same exploration draws (action selection uses the global np.random state).
    if seed is not None:
        np.random.seed(seed)

    n_states, n_actions = env.num_states, env.num_actions
    Q = np.zeros((n_states, n_actions))
    returns = []

    for ep in range(episodes):
        s = int(env.reset())
        # SARSA picks the first action before entering the step loop.
        a = select_action(Q, s, strategy, param)
        total_reward = 0.0

        for _ in range(max_steps):
            s_next, r = env.step(s, a)
            r = reward_scalar(r)
            total_reward += r
            s_next = int(s_next)

            if is_terminal(s_next, env.goal_states_seq):
                # Terminal transition: no bootstrap term, the target is just r.
                Q[s, a] += alpha * (r - Q[s, a])
                break

            # On-policy target: bootstrap from the action actually chosen next.
            a_next = select_action(Q, s_next, strategy, param)
            td_target = r + gamma * Q[s_next, a_next]
            Q[s, a] += alpha * (td_target - Q[s, a])
            s, a = s_next, a_next

        returns.append(total_reward)

        # WandB logging (one record per episode)
        if run is not None:
            wandb.log({
                "episode": ep,
                "episode_return": total_reward,
                "algorithm": "SARSA",
                "alpha": alpha,
                "gamma": gamma,
                "param": param,
                "strategy": strategy,
                "seed": seed
            })

        # Progress report every 200 episodes: mean return of the last 50.
        if (ep + 1) % 200 == 0:
            print(f"[SARSA] Episode {ep+1}/{episodes} | Mean(Last50): {np.mean(returns[-50:]):.2f}")

    return Q, returns


### Setting up Wandb

In [4]:
# NOTE(review): this starts a run outside the sweep; the sweep's run function
# calls wandb.init() itself, so this call may be unnecessary - confirm.
wandb.init()

# Grid sweep over learning rate, discount factor and exploration parameter for
# Q-learning with eps-greedy exploration on a single environment setting.
# NOTE(review): 'env_name' is only a label here - the sweep run function builds
# its environment from hard-coded arguments - and its "(0,4)" spelling differs
# from the "(0, 4)" produced by the f-string names in all_configs.
sweep_config_1 = dict(
    method='grid',   # try every combination
    metric=dict(
        name='mean_eval_return',
        goal='maximize',
    ),
    parameters=dict(
        alpha=dict(values=[0.001, 0.01, 0.1, 1.0]),
        gamma=dict(values=[0.7, 0.8, 0.9, 1.0]),
        # 'param' is the exploration parameter passed to the training
        # functions (epsilon for eps-greedy)
        param=dict(values=[0.001, 0.01, 0.05, 0.1]),
        algorithm=dict(values=['q_learning']),
        strategy=dict(values=['eps_greedy']),
        env_name=dict(values=['std_q_tp0.7_ss(0,4)_strateps_greedy']),
    ),
)

[34m[1mwandb[0m: Currently logged in as: [33mleenh050418[0m ([33mleenh050418-iit-madras-foundation[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Exception in thread IntMsgThr:
Traceback (most recent call last):
  File "c:\Users\Kutral\AppData\Local\Programs\Python\Python311\Lib\threading.py", line 1038, in _bootstrap_inner
    self.run()
  File "C:\Users\Kutral\AppData\Roaming\Python\Python311\site-packages\ipykernel\ipkernel.py", line 766, in run_closure
    _threading_Thread_run(self)
  File "c:\Users\Kutral\AppData\Local\Programs\Python\Python311\Lib\threading.py", line 975, in run
    self._target(*self._args, **self._kwargs)
  File "c:\Users\Kutral\AppData\Local\Programs\Python\Python311\Lib\site-packages\wandb\sdk\wandb_run.py", line 333, in check_internal_messages
    self._loop_check_status(
  File "c:\Users\Kutral\AppData\Local\Programs\Python\Python311\Lib\site-packages\wandb\sdk\wandb_run.py", line 236, in _loop_check_status
    local_handle = request()
                   ^^^^^^^^^
  File "c:\Users\Kutral\AppData\Local\Programs\Python\Python311\Lib\site-packages\wandb\sdk\interface\interface.py", line 1022, in delive

In [5]:
# Register the sweep defined by sweep_config_1 under the "GPA_2_PART_A" project
# and keep the returned ID; the agent cell needs it to attach to this sweep.
sweep_id = wandb.sweep(sweep_config_1, project="GPA_2_PART_A")
print("Sweep ID:", sweep_id)


Create sweep with ID: umdu1k6v
Sweep URL: https://wandb.ai/leenh050418-iit-madras-foundation/GPA_2_PART_A/sweeps/umdu1k6v
Sweep ID: umdu1k6v


In [9]:
def train_one_sweep_run():
    """Run one sweep trial: train, evaluate greedily, and log summary metrics.

    Hyperparameters (``algorithm``, ``alpha``, ``gamma``, ``strategy``,
    ``param``) are read from ``wandb.config``. The environment is always the
    standard grid with ``transition_prob=0.7`` and start state ``[[0, 4]]``.
    Any ``algorithm`` value other than ``"q_learning"`` is trained with SARSA.
    """
    wandb.init()
    cfg = wandb.config

    # Environment is fixed for this sweep; it is not derived from the config.
    env = create_standard_grid(transition_prob=0.7, start_state=np.array([[0,4]]))

    # Both trainers share a signature, so choose the function and call it once.
    if cfg.algorithm == "q_learning":
        trainer = train_q_learning_fixed
    else:
        trainer = train_sarsa_fixed

    Q, returns = trainer(
        env,
        episodes=1500,
        alpha=cfg.alpha,
        gamma=cfg.gamma,
        strategy=cfg.strategy,
        param=cfg.param,
        max_steps=100,
        run=wandb.run
    )

    # Greedy evaluation of the learned table
    avg_eval = evaluate_policy(env, Q, episodes=20, max_steps=100)

    summary_metrics = {
        'mean_eval_return': avg_eval,
        'final_mean_reward_last50': np.mean(returns[-50:])
    }
    wandb.log(summary_metrics)

    print(f"Run finished | α={cfg.alpha}, γ={cfg.gamma}, param={cfg.param} → avg_eval={avg_eval:.2f}")

In [10]:
# Start a sweep agent in this kernel: it runs train_one_sweep_run once per
# configuration it receives for sweep_id. No run limit is passed (count=None).
wandb.agent(sweep_id, function=train_one_sweep_run, count=None)


[34m[1mwandb[0m: Agent Starting Run: ukomeosi with config:
[34m[1mwandb[0m: 	algorithm: q_learning
[34m[1mwandb[0m: 	alpha: 0.001
[34m[1mwandb[0m: 	env_name: std_q_tp0.7_ss(0,4)_strateps_greedy
[34m[1mwandb[0m: 	gamma: 0.7
[34m[1mwandb[0m: 	param: 0.001
[34m[1mwandb[0m: 	strategy: eps_greedy


[Q] Episode 200/1500 | Mean(Last50): -116.94
[Q] Episode 400/1500 | Mean(Last50): -113.38
[Q] Episode 600/1500 | Mean(Last50): -112.16
[Q] Episode 800/1500 | Mean(Last50): -113.00
[Q] Episode 1000/1500 | Mean(Last50): -107.60
[Q] Episode 1200/1500 | Mean(Last50): -111.10
[Q] Episode 1400/1500 | Mean(Last50): -107.98
Run finished | α=0.001, γ=0.7, param=0.001 → avg_eval=-71.50


0,1
alpha,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
episode,▁▁▁▂▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▄▄▄▅▅▅▅▆▆▆▆▆▇▇▇████
episode_return,▅▇▆▇▁▇▆▇▇▅▇▆▇▆▁█▇▇▇▇▇▇▆▇▆▇▇▇▇▇▇▇▇▇▇▆▆▇▆▁
final_mean_reward_last50,▁
gamma,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
mean_eval_return,▁
param,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
algorithm,Q-learning
alpha,0.001
episode,1499
episode_return,-100
final_mean_reward_last50,-107
gamma,0.7
mean_eval_return,-71.5
param,0.001
strategy,eps_greedy


[34m[1mwandb[0m: Agent Starting Run: eu14tnfb with config:
[34m[1mwandb[0m: 	algorithm: q_learning
[34m[1mwandb[0m: 	alpha: 0.001
[34m[1mwandb[0m: 	env_name: std_q_tp0.7_ss(0,4)_strateps_greedy
[34m[1mwandb[0m: 	gamma: 0.7
[34m[1mwandb[0m: 	param: 0.01
[34m[1mwandb[0m: 	strategy: eps_greedy


[Q] Episode 200/1500 | Mean(Last50): -115.16
[Q] Episode 400/1500 | Mean(Last50): -108.74
[Q] Episode 600/1500 | Mean(Last50): -108.06
[Q] Episode 800/1500 | Mean(Last50): -109.44
[Q] Episode 1000/1500 | Mean(Last50): -111.20
[Q] Episode 1200/1500 | Mean(Last50): -103.92
[Q] Episode 1400/1500 | Mean(Last50): -103.26
Run finished | α=0.001, γ=0.7, param=0.01 → avg_eval=-100.00


0,1
alpha,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
episode,▁▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▅▅▆▆▆▆▆▆▆▆▆▇▇▇▇▇▇▇███
episode_return,▆▆▆▆▅▆▆▅▇▆▅▆▆▆▆▅▆▁▄▆▆▅█▆▆▅▃▇▆▅▄▅▄▆▇▆▅▆▆▆
final_mean_reward_last50,▁
gamma,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
mean_eval_return,▁
param,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
algorithm,Q-learning
alpha,0.001
episode,1499
episode_return,-110
final_mean_reward_last50,-104.08
gamma,0.7
mean_eval_return,-100
param,0.01
strategy,eps_greedy


[34m[1mwandb[0m: Agent Starting Run: 2gpxjvpb with config:
[34m[1mwandb[0m: 	algorithm: q_learning
[34m[1mwandb[0m: 	alpha: 0.001
[34m[1mwandb[0m: 	env_name: std_q_tp0.7_ss(0,4)_strateps_greedy
[34m[1mwandb[0m: 	gamma: 0.7
[34m[1mwandb[0m: 	param: 0.05
[34m[1mwandb[0m: 	strategy: eps_greedy


[Q] Episode 200/1500 | Mean(Last50): -110.48
[Q] Episode 400/1500 | Mean(Last50): -116.08
[Q] Episode 600/1500 | Mean(Last50): -110.40
[Q] Episode 800/1500 | Mean(Last50): -113.02
[Q] Episode 1000/1500 | Mean(Last50): -104.90
[Q] Episode 1200/1500 | Mean(Last50): -104.78
[Q] Episode 1400/1500 | Mean(Last50): -109.96
Run finished | α=0.001, γ=0.7, param=0.05 → avg_eval=-100.00


0,1
alpha,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
episode,▁▁▁▂▂▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▆▆▆▇▇▇▇▇▇▇▇▇▇██
episode_return,▇▄█▄▃▃▄▃▃▃▃▄▃▄▄▄▄▂▄▃▃▂▁▄▄▃▄▄▄▄▄▄▂▃▄▄▄█▄▃
final_mean_reward_last50,▁
gamma,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
mean_eval_return,▁
param,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
algorithm,Q-learning
alpha,0.001
episode,1499
episode_return,-100
final_mean_reward_last50,-104.92
gamma,0.7
mean_eval_return,-100
param,0.05
strategy,eps_greedy


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: g75qaxz5 with config:
[34m[1mwandb[0m: 	algorithm: q_learning
[34m[1mwandb[0m: 	alpha: 0.001
[34m[1mwandb[0m: 	env_name: std_q_tp0.7_ss(0,4)_strateps_greedy
[34m[1mwandb[0m: 	gamma: 0.7
[34m[1mwandb[0m: 	param: 0.1
[34m[1mwandb[0m: 	strategy: eps_greedy


[Q] Episode 200/1500 | Mean(Last50): -117.32
[Q] Episode 400/1500 | Mean(Last50): -121.70
[Q] Episode 600/1500 | Mean(Last50): -110.22
[Q] Episode 800/1500 | Mean(Last50): -118.44
[Q] Episode 1000/1500 | Mean(Last50): -121.08
[Q] Episode 1200/1500 | Mean(Last50): -106.96
[Q] Episode 1400/1500 | Mean(Last50): -115.48
Run finished | α=0.001, γ=0.7, param=0.1 → avg_eval=-100.00


0,1
alpha,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
episode,▁▁▁▁▁▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇█████
episode_return,▅▅▃▅▅▅▅▅▅▅▅▅▅▇▂▅▅▅▂▅▅▅▅▂▅▂▅▅█▅▅▁▅▅▅▅▅█▅▅
final_mean_reward_last50,▁
gamma,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
mean_eval_return,▁
param,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
algorithm,Q-learning
alpha,0.001
episode,1499
episode_return,-100
final_mean_reward_last50,-116.9
gamma,0.7
mean_eval_return,-100
param,0.1
strategy,eps_greedy


[34m[1mwandb[0m: Agent Starting Run: bpk2cv2s with config:
[34m[1mwandb[0m: 	algorithm: q_learning
[34m[1mwandb[0m: 	alpha: 0.001
[34m[1mwandb[0m: 	env_name: std_q_tp0.7_ss(0,4)_strateps_greedy
[34m[1mwandb[0m: 	gamma: 0.8
[34m[1mwandb[0m: 	param: 0.001
[34m[1mwandb[0m: 	strategy: eps_greedy


[Q] Episode 200/1500 | Mean(Last50): -114.40
[Q] Episode 400/1500 | Mean(Last50): -121.26
[Q] Episode 600/1500 | Mean(Last50): -112.36
[Q] Episode 800/1500 | Mean(Last50): -113.30
[Q] Episode 1000/1500 | Mean(Last50): -109.38
[Q] Episode 1200/1500 | Mean(Last50): -105.24
[Q] Episode 1400/1500 | Mean(Last50): -113.02
Run finished | α=0.001, γ=0.8, param=0.001 → avg_eval=-100.25


0,1
alpha,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
episode,▁▁▁▁▁▂▂▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▆▆▆▇▇▇▇▇█████
episode_return,▅▅▅▆▁▃█▅▄▅▅▅▅▅▅▅▅▅▄▃▅▅▅▅▅▁▅▅▅▅▅▁▅▅▅▅▁▅▅▅
final_mean_reward_last50,▁
gamma,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
mean_eval_return,▁
param,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
algorithm,Q-learning
alpha,0.001
episode,1499
episode_return,-100
final_mean_reward_last50,-104.06
gamma,0.8
mean_eval_return,-100.25
param,0.001
strategy,eps_greedy


[34m[1mwandb[0m: Agent Starting Run: hfgy5hqf with config:
[34m[1mwandb[0m: 	algorithm: q_learning
[34m[1mwandb[0m: 	alpha: 0.001
[34m[1mwandb[0m: 	env_name: std_q_tp0.7_ss(0,4)_strateps_greedy
[34m[1mwandb[0m: 	gamma: 0.8
[34m[1mwandb[0m: 	param: 0.01
[34m[1mwandb[0m: 	strategy: eps_greedy


[Q] Episode 200/1500 | Mean(Last50): -119.18
[Q] Episode 400/1500 | Mean(Last50): -115.40
[Q] Episode 600/1500 | Mean(Last50): -111.26
[Q] Episode 800/1500 | Mean(Last50): -110.02
[Q] Episode 1000/1500 | Mean(Last50): -112.46
[Q] Episode 1200/1500 | Mean(Last50): -110.70
[Q] Episode 1400/1500 | Mean(Last50): -114.30
Run finished | α=0.001, γ=0.8, param=0.01 → avg_eval=-100.00


0,1
alpha,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
episode,▁▁▂▂▂▂▃▃▃▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇█
episode_return,▆▇▆▆▄▁▇▄█▁▅▇▆▆▆▆▄▆▆▆▆▆▆▁▆▆▆▅▆▆▆▇▅▇▆▆▅▆▆▅
final_mean_reward_last50,▁
gamma,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
mean_eval_return,▁
param,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
algorithm,Q-learning
alpha,0.001
episode,1499
episode_return,-110
final_mean_reward_last50,-105.4
gamma,0.8
mean_eval_return,-100
param,0.01
strategy,eps_greedy


[34m[1mwandb[0m: Agent Starting Run: usn7dvks with config:
[34m[1mwandb[0m: 	algorithm: q_learning
[34m[1mwandb[0m: 	alpha: 0.001
[34m[1mwandb[0m: 	env_name: std_q_tp0.7_ss(0,4)_strateps_greedy
[34m[1mwandb[0m: 	gamma: 0.8
[34m[1mwandb[0m: 	param: 0.05
[34m[1mwandb[0m: 	strategy: eps_greedy


[Q] Episode 200/1500 | Mean(Last50): -123.48
[Q] Episode 400/1500 | Mean(Last50): -114.56
[Q] Episode 600/1500 | Mean(Last50): -116.14
[Q] Episode 800/1500 | Mean(Last50): -117.94
[Q] Episode 1000/1500 | Mean(Last50): -115.88
[Q] Episode 1200/1500 | Mean(Last50): -111.54
[Q] Episode 1400/1500 | Mean(Last50): -115.52
Run finished | α=0.001, γ=0.8, param=0.05 → avg_eval=-109.00


0,1
alpha,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
episode,▁▁▁▂▂▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▅▅▆▆▆▆▆▆▆▆▇▇▇▇▇▇█
episode_return,▅▅▁▅▅▁▅▄▅▄▅▅▅▇▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅█▅▅▅▅▅▅▅▅▁
final_mean_reward_last50,▁
gamma,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
mean_eval_return,▁
param,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
algorithm,Q-learning
alpha,0.001
episode,1499
episode_return,-100
final_mean_reward_last50,-112.42
gamma,0.8
mean_eval_return,-109
param,0.05
strategy,eps_greedy


[34m[1mwandb[0m: Agent Starting Run: xq7v94gs with config:
[34m[1mwandb[0m: 	algorithm: q_learning
[34m[1mwandb[0m: 	alpha: 0.001
[34m[1mwandb[0m: 	env_name: std_q_tp0.7_ss(0,4)_strateps_greedy
[34m[1mwandb[0m: 	gamma: 0.8
[34m[1mwandb[0m: 	param: 0.1
[34m[1mwandb[0m: 	strategy: eps_greedy


[Q] Episode 200/1500 | Mean(Last50): -119.12
[Q] Episode 400/1500 | Mean(Last50): -127.32
[Q] Episode 600/1500 | Mean(Last50): -115.70
[Q] Episode 800/1500 | Mean(Last50): -116.32
[Q] Episode 1000/1500 | Mean(Last50): -113.04
[Q] Episode 1200/1500 | Mean(Last50): -117.92
[Q] Episode 1400/1500 | Mean(Last50): -106.54
Run finished | α=0.001, γ=0.8, param=0.1 → avg_eval=-100.00


0,1
alpha,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
episode,▁▁▁▁▁▂▂▂▂▂▂▂▂▃▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇▇██
episode_return,▇▇▇▇▇▇▄▇▇▇▇▇▇▇█▃▇▇▇▁▇▇█▇█▇▇▇▄▄▇▇▇▆▇▄▇▇▇█
final_mean_reward_last50,▁
gamma,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
mean_eval_return,▁
param,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
algorithm,Q-learning
alpha,0.001
episode,1499
episode_return,-78
final_mean_reward_last50,-115.68
gamma,0.8
mean_eval_return,-100
param,0.1
strategy,eps_greedy


[34m[1mwandb[0m: Agent Starting Run: ssap5i4b with config:
[34m[1mwandb[0m: 	algorithm: q_learning
[34m[1mwandb[0m: 	alpha: 0.001
[34m[1mwandb[0m: 	env_name: std_q_tp0.7_ss(0,4)_strateps_greedy
[34m[1mwandb[0m: 	gamma: 0.9
[34m[1mwandb[0m: 	param: 0.001
[34m[1mwandb[0m: 	strategy: eps_greedy


[Q] Episode 200/1500 | Mean(Last50): -111.64
[Q] Episode 400/1500 | Mean(Last50): -113.28
[Q] Episode 600/1500 | Mean(Last50): -112.00
[Q] Episode 800/1500 | Mean(Last50): -110.56
[Q] Episode 1000/1500 | Mean(Last50): -110.62
[Q] Episode 1200/1500 | Mean(Last50): -106.80
[Q] Episode 1400/1500 | Mean(Last50): -109.58
Run finished | α=0.001, γ=0.9, param=0.001 → avg_eval=-100.00


0,1
alpha,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
episode,▁▁▁▁▁▂▂▂▂▃▃▃▃▃▃▃▃▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇████
episode_return,▆▇▁▆▆▆▆▂▆▆▇▆▆▅▆▅▆▆█▆▂▆█▆▇▆▆▆▆▆▆▆▆▃▆▆▅▆▆▆
final_mean_reward_last50,▁
gamma,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
mean_eval_return,▁
param,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
algorithm,Q-learning
alpha,0.001
episode,1499
episode_return,-86
final_mean_reward_last50,-105.14
gamma,0.9
mean_eval_return,-100
param,0.001
strategy,eps_greedy


[34m[1mwandb[0m: Agent Starting Run: w29muwt8 with config:
[34m[1mwandb[0m: 	algorithm: q_learning
[34m[1mwandb[0m: 	alpha: 0.001
[34m[1mwandb[0m: 	env_name: std_q_tp0.7_ss(0,4)_strateps_greedy
[34m[1mwandb[0m: 	gamma: 0.9
[34m[1mwandb[0m: 	param: 0.01
[34m[1mwandb[0m: 	strategy: eps_greedy


[Q] Episode 200/1500 | Mean(Last50): -116.04
[Q] Episode 400/1500 | Mean(Last50): -109.22
[Q] Episode 600/1500 | Mean(Last50): -119.64
[Q] Episode 800/1500 | Mean(Last50): -106.48
[Q] Episode 1000/1500 | Mean(Last50): -105.58
[Q] Episode 1200/1500 | Mean(Last50): -111.64
[Q] Episode 1400/1500 | Mean(Last50): -109.70
Run finished | α=0.001, γ=0.9, param=0.01 → avg_eval=-100.25


0,1
alpha,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
episode,▁▁▁▁▁▁▂▂▂▂▃▃▃▃▃▃▃▃▃▃▄▄▄▄▅▆▆▆▆▆▇▇▇▇▇█████
episode_return,▅▆▁▆▆▅▆▄▆▆▅▅█▅▆▆▆▄▆▆▁▆▆▅▆▅▅▆▆▆▆▆▆▆▅▁▆▅▆▆
final_mean_reward_last50,▁
gamma,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
mean_eval_return,▁
param,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
algorithm,Q-learning
alpha,0.001
episode,1499
episode_return,-115
final_mean_reward_last50,-115.44
gamma,0.9
mean_eval_return,-100.25
param,0.01
strategy,eps_greedy


[34m[1mwandb[0m: Agent Starting Run: r0h12pdy with config:
[34m[1mwandb[0m: 	algorithm: q_learning
[34m[1mwandb[0m: 	alpha: 0.001
[34m[1mwandb[0m: 	env_name: std_q_tp0.7_ss(0,4)_strateps_greedy
[34m[1mwandb[0m: 	gamma: 0.9
[34m[1mwandb[0m: 	param: 0.05
[34m[1mwandb[0m: 	strategy: eps_greedy


[Q] Episode 200/1500 | Mean(Last50): -119.22
[Q] Episode 400/1500 | Mean(Last50): -111.42
[Q] Episode 600/1500 | Mean(Last50): -104.12
[Q] Episode 800/1500 | Mean(Last50): -112.72
[Q] Episode 1000/1500 | Mean(Last50): -110.42
[Q] Episode 1200/1500 | Mean(Last50): -112.84
[Q] Episode 1400/1500 | Mean(Last50): -106.74
Run finished | α=0.001, γ=0.9, param=0.05 → avg_eval=-100.00


0,1
alpha,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇█
episode_return,▅▄▄▆▅▅█▆▆▁▅▅▅▆▆▆▆▆▆▆▅▅▆▅▆▆▅▅▆▆▂▁▆▇▆▅▅▅▇▆
final_mean_reward_last50,▁
gamma,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
mean_eval_return,▁
param,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
algorithm,Q-learning
alpha,0.001
episode,1499
episode_return,-100
final_mean_reward_last50,-108.4
gamma,0.9
mean_eval_return,-100
param,0.05
strategy,eps_greedy


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: v5gjfsa2 with config:
[34m[1mwandb[0m: 	algorithm: q_learning
[34m[1mwandb[0m: 	alpha: 0.001
[34m[1mwandb[0m: 	env_name: std_q_tp0.7_ss(0,4)_strateps_greedy
[34m[1mwandb[0m: 	gamma: 0.9
[34m[1mwandb[0m: 	param: 0.1
[34m[1mwandb[0m: 	strategy: eps_greedy


[Q] Episode 200/1500 | Mean(Last50): -109.82
[Q] Episode 400/1500 | Mean(Last50): -120.04
[Q] Episode 600/1500 | Mean(Last50): -111.46
[Q] Episode 800/1500 | Mean(Last50): -112.28
[Q] Episode 1000/1500 | Mean(Last50): -117.66
[Q] Episode 1200/1500 | Mean(Last50): -113.18
[Q] Episode 1400/1500 | Mean(Last50): -111.16
Run finished | α=0.001, γ=0.9, param=0.1 → avg_eval=-100.00


0,1
alpha,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
episode,▁▁▁▁▁▂▂▂▃▃▃▃▃▃▃▄▄▅▅▅▅▅▅▅▆▆▆▆▆▆▆▆▆▆▇▇▇▇▇█
episode_return,▆▁▅▆▆▆▆▁▃▆▆█▆▆▆▅▆▆▃▆▆▆▁▆▆▁▅▆▆▆▄▁▆▆▆▆▇▆▁▆
final_mean_reward_last50,▁
gamma,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
mean_eval_return,▁
param,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
algorithm,Q-learning
alpha,0.001
episode,1499
episode_return,-100
final_mean_reward_last50,-110.08
gamma,0.9
mean_eval_return,-100
param,0.1
strategy,eps_greedy


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: y9tbdqrl with config:
[34m[1mwandb[0m: 	algorithm: q_learning
[34m[1mwandb[0m: 	alpha: 0.001
[34m[1mwandb[0m: 	env_name: std_q_tp0.7_ss(0,4)_strateps_greedy
[34m[1mwandb[0m: 	gamma: 1
[34m[1mwandb[0m: 	param: 0.001
[34m[1mwandb[0m: 	strategy: eps_greedy


[Q] Episode 200/1500 | Mean(Last50): -113.88
[Q] Episode 400/1500 | Mean(Last50): -120.24
[Q] Episode 600/1500 | Mean(Last50): -112.12
[Q] Episode 800/1500 | Mean(Last50): -113.68
[Q] Episode 1000/1500 | Mean(Last50): -107.46
[Q] Episode 1200/1500 | Mean(Last50): -113.38
[Q] Episode 1400/1500 | Mean(Last50): -112.06
Run finished | α=0.001, γ=1, param=0.001 → avg_eval=-111.25


0,1
alpha,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
episode,▁▁▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇▇▇▇█
episode_return,▂▆▂▆▆▆▆▆▆▆█▆▆▆▆▄▆▁▆▆▆▆▆▆▆▆▇▂▆▆▆▆▇▆▆▆▆▅▆▆
final_mean_reward_last50,▁
gamma,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
mean_eval_return,▁
param,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
algorithm,Q-learning
alpha,0.001
episode,1499
episode_return,-100
final_mean_reward_last50,-109.14
gamma,1
mean_eval_return,-111.25
param,0.001
strategy,eps_greedy


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: bu92x3o8 with config:
[34m[1mwandb[0m: 	algorithm: q_learning
[34m[1mwandb[0m: 	alpha: 0.001
[34m[1mwandb[0m: 	env_name: std_q_tp0.7_ss(0,4)_strateps_greedy
[34m[1mwandb[0m: 	gamma: 1
[34m[1mwandb[0m: 	param: 0.01
[34m[1mwandb[0m: 	strategy: eps_greedy


[Q] Episode 200/1500 | Mean(Last50): -107.72
[Q] Episode 400/1500 | Mean(Last50): -111.78
[Q] Episode 600/1500 | Mean(Last50): -113.48
[Q] Episode 800/1500 | Mean(Last50): -110.00
[Q] Episode 1000/1500 | Mean(Last50): -110.20
[Q] Episode 1200/1500 | Mean(Last50): -112.02
[Q] Episode 1400/1500 | Mean(Last50): -114.16
Run finished | α=0.001, γ=1, param=0.01 → avg_eval=-165.75


0,1
alpha,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
episode,▁▁▁▁▁▂▂▃▃▃▃▄▄▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇███
episode_return,▆▆▅▆▆▆▆▆▆▆▆▆▆▁▂▇▆▆▁▆▆▆▆▆▆▆▅▆▆▂▆▅▆▆█▂▅▁▆▆
final_mean_reward_last50,▁
gamma,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
mean_eval_return,▁
param,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
algorithm,Q-learning
alpha,0.001
episode,1499
episode_return,-110
final_mean_reward_last50,-109.22
gamma,1
mean_eval_return,-165.75
param,0.01
strategy,eps_greedy


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: 9pac2d3h with config:
[34m[1mwandb[0m: 	algorithm: q_learning
[34m[1mwandb[0m: 	alpha: 0.001
[34m[1mwandb[0m: 	env_name: std_q_tp0.7_ss(0,4)_strateps_greedy
[34m[1mwandb[0m: 	gamma: 1
[34m[1mwandb[0m: 	param: 0.05
[34m[1mwandb[0m: 	strategy: eps_greedy


[Q] Episode 200/1500 | Mean(Last50): -113.82
[Q] Episode 400/1500 | Mean(Last50): -117.58
[Q] Episode 600/1500 | Mean(Last50): -116.02
[Q] Episode 800/1500 | Mean(Last50): -100.58
[Q] Episode 1000/1500 | Mean(Last50): -111.22
[Q] Episode 1200/1500 | Mean(Last50): -107.24
[Q] Episode 1400/1500 | Mean(Last50): -112.92
Run finished | α=0.001, γ=1, param=0.05 → avg_eval=-107.75


0,1
alpha,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
episode,▁▁▁▁▁▁▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▅▅▅▅▅▅▆▆▇▇▇▇▇▇▇▇███
episode_return,▆▆▆▂▆▆▆▁▆▆▆▆▅▅▆▆▆▆▆▆▅▄▆▆▂▆█▆▆▄▆▆▆▇▆▆▅▆▆▆
final_mean_reward_last50,▁
gamma,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
mean_eval_return,▁
param,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
algorithm,Q-learning
alpha,0.001
episode,1499
episode_return,-100
final_mean_reward_last50,-114.3
gamma,1
mean_eval_return,-107.75
param,0.05
strategy,eps_greedy


[34m[1mwandb[0m: Agent Starting Run: qzu86wgn with config:
[34m[1mwandb[0m: 	algorithm: q_learning
[34m[1mwandb[0m: 	alpha: 0.001
[34m[1mwandb[0m: 	env_name: std_q_tp0.7_ss(0,4)_strateps_greedy
[34m[1mwandb[0m: 	gamma: 1
[34m[1mwandb[0m: 	param: 0.1
[34m[1mwandb[0m: 	strategy: eps_greedy


[Q] Episode 200/1500 | Mean(Last50): -125.82
[Q] Episode 400/1500 | Mean(Last50): -119.34
[Q] Episode 600/1500 | Mean(Last50): -118.82
[Q] Episode 800/1500 | Mean(Last50): -112.14
[Q] Episode 1000/1500 | Mean(Last50): -117.96
[Q] Episode 1200/1500 | Mean(Last50): -124.30
[Q] Episode 1400/1500 | Mean(Last50): -107.52
Run finished | α=0.001, γ=1, param=0.1 → avg_eval=-98.10


0,1
alpha,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
episode,▁▁▁▁▁▂▂▂▂▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▆▇▇▇▇▇█████
episode_return,▆▆▆▆▆▅▆▅▅▅▅▆▆▆▆▆▁▆▆▆▆▆▆█▆▅█▆▆▇▆▆▆▅▆▆▆▆▆▆
final_mean_reward_last50,▁
gamma,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
mean_eval_return,▁
param,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
algorithm,Q-learning
alpha,0.001
episode,1499
episode_return,-100
final_mean_reward_last50,-108.64
gamma,1
mean_eval_return,-98.1
param,0.1
strategy,eps_greedy


[34m[1mwandb[0m: Agent Starting Run: s20usvdn with config:
[34m[1mwandb[0m: 	algorithm: q_learning
[34m[1mwandb[0m: 	alpha: 0.01
[34m[1mwandb[0m: 	env_name: std_q_tp0.7_ss(0,4)_strateps_greedy
[34m[1mwandb[0m: 	gamma: 0.7
[34m[1mwandb[0m: 	param: 0.001
[34m[1mwandb[0m: 	strategy: eps_greedy


[Q] Episode 200/1500 | Mean(Last50): -105.30
[Q] Episode 400/1500 | Mean(Last50): -97.16
[Q] Episode 600/1500 | Mean(Last50): -102.80
[Q] Episode 800/1500 | Mean(Last50): -97.72
[Q] Episode 1000/1500 | Mean(Last50): -96.98
[Q] Episode 1200/1500 | Mean(Last50): -88.36
[Q] Episode 1400/1500 | Mean(Last50): -86.34
Run finished | α=0.01, γ=0.7, param=0.001 → avg_eval=-94.50


0,1
alpha,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
episode,▁▁▂▂▂▂▂▂▂▃▃▃▃▄▄▄▅▅▅▅▅▅▅▅▅▆▆▆▆▆▆▆▇▇▇▇▇▇▇█
episode_return,▁▇▅▅▅▄▅▅▅▅▅▅▅▇▆▆▅▅▆▅▅▅▅▆▅▅▅█▇▆▅▆▅▆▅▇▅▆▅▅
final_mean_reward_last50,▁
gamma,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
mean_eval_return,▁
param,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
algorithm,Q-learning
alpha,0.01
episode,1499
episode_return,-100
final_mean_reward_last50,-85.78
gamma,0.7
mean_eval_return,-94.5
param,0.001
strategy,eps_greedy


[34m[1mwandb[0m: Agent Starting Run: 7nexeyip with config:
[34m[1mwandb[0m: 	algorithm: q_learning
[34m[1mwandb[0m: 	alpha: 0.01
[34m[1mwandb[0m: 	env_name: std_q_tp0.7_ss(0,4)_strateps_greedy
[34m[1mwandb[0m: 	gamma: 0.7
[34m[1mwandb[0m: 	param: 0.01
[34m[1mwandb[0m: 	strategy: eps_greedy


[Q] Episode 200/1500 | Mean(Last50): -106.02
[Q] Episode 400/1500 | Mean(Last50): -103.56
[Q] Episode 600/1500 | Mean(Last50): -96.44
[Q] Episode 800/1500 | Mean(Last50): -92.14
[Q] Episode 1000/1500 | Mean(Last50): -97.14
[Q] Episode 1200/1500 | Mean(Last50): -83.70
[Q] Episode 1400/1500 | Mean(Last50): -91.58
Run finished | α=0.01, γ=0.7, param=0.01 → avg_eval=-100.00


0,1
alpha,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
episode,▁▁▁▁▁▂▃▃▃▃▃▃▄▄▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇▇██
episode_return,▄▄▄▅▇▅▅▄▆▅▅▆▇██▇▅▅▄▅▆▅█▅▁▅▅▅▅▅▅▅▅▅▅▅▅▅▆▆
final_mean_reward_last50,▁
gamma,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
mean_eval_return,▁
param,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
algorithm,Q-learning
alpha,0.01
episode,1499
episode_return,-86
final_mean_reward_last50,-87.34
gamma,0.7
mean_eval_return,-100
param,0.01
strategy,eps_greedy


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: ixud3ggl with config:
[34m[1mwandb[0m: 	algorithm: q_learning
[34m[1mwandb[0m: 	alpha: 0.01
[34m[1mwandb[0m: 	env_name: std_q_tp0.7_ss(0,4)_strateps_greedy
[34m[1mwandb[0m: 	gamma: 0.7
[34m[1mwandb[0m: 	param: 0.05
[34m[1mwandb[0m: 	strategy: eps_greedy


[Q] Episode 200/1500 | Mean(Last50): -112.78
[Q] Episode 400/1500 | Mean(Last50): -97.10
[Q] Episode 600/1500 | Mean(Last50): -98.92
[Q] Episode 800/1500 | Mean(Last50): -96.38
[Q] Episode 1000/1500 | Mean(Last50): -97.66
[Q] Episode 1200/1500 | Mean(Last50): -101.38
[Q] Episode 1400/1500 | Mean(Last50): -104.34
Run finished | α=0.01, γ=0.7, param=0.05 → avg_eval=-100.00


0,1
alpha,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
episode,▁▁▁▁▂▂▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇████
episode_return,▅▅▅▅▅▅▅▅▅▅▇▅▁▅▆▁▅▅▅▅▅▅▆▅▆▅▅▅█▇▅▅▅▅▆▅▁▅▆▅
final_mean_reward_last50,▁
gamma,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
mean_eval_return,▁
param,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
algorithm,Q-learning
alpha,0.01
episode,1499
episode_return,-100
final_mean_reward_last50,-94.12
gamma,0.7
mean_eval_return,-100
param,0.05
strategy,eps_greedy


[34m[1mwandb[0m: Agent Starting Run: 8qt8ect5 with config:
[34m[1mwandb[0m: 	algorithm: q_learning
[34m[1mwandb[0m: 	alpha: 0.01
[34m[1mwandb[0m: 	env_name: std_q_tp0.7_ss(0,4)_strateps_greedy
[34m[1mwandb[0m: 	gamma: 0.7
[34m[1mwandb[0m: 	param: 0.1
[34m[1mwandb[0m: 	strategy: eps_greedy


[Q] Episode 200/1500 | Mean(Last50): -115.04
[Q] Episode 400/1500 | Mean(Last50): -109.52
[Q] Episode 600/1500 | Mean(Last50): -103.86
[Q] Episode 800/1500 | Mean(Last50): -101.02
[Q] Episode 1000/1500 | Mean(Last50): -111.98
[Q] Episode 1200/1500 | Mean(Last50): -121.70
[Q] Episode 1400/1500 | Mean(Last50): -98.20
Run finished | α=0.01, γ=0.7, param=0.1 → avg_eval=-100.00


0,1
alpha,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
episode,▁▁▁▂▂▂▂▃▃▃▃▃▄▄▄▅▅▅▅▅▆▆▆▆▆▆▆▆▆▆▇▇▇▇▇█████
episode_return,▆▂▁▆▆▆▆▁▆▆▅▆▆▆▆▆▆█▇▆▆▇▂▆▆█▆█▆▆▆▆▆▂▇█▆▆▆▆
final_mean_reward_last50,▁
gamma,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
mean_eval_return,▁
param,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
algorithm,Q-learning
alpha,0.01
episode,1499
episode_return,-199
final_mean_reward_last50,-107.1
gamma,0.7
mean_eval_return,-100
param,0.1
strategy,eps_greedy


[34m[1mwandb[0m: Agent Starting Run: p5dfkb4i with config:
[34m[1mwandb[0m: 	algorithm: q_learning
[34m[1mwandb[0m: 	alpha: 0.01
[34m[1mwandb[0m: 	env_name: std_q_tp0.7_ss(0,4)_strateps_greedy
[34m[1mwandb[0m: 	gamma: 0.8
[34m[1mwandb[0m: 	param: 0.001
[34m[1mwandb[0m: 	strategy: eps_greedy


[Q] Episode 200/1500 | Mean(Last50): -101.86
[Q] Episode 400/1500 | Mean(Last50): -102.08
[Q] Episode 600/1500 | Mean(Last50): -95.02
[Q] Episode 800/1500 | Mean(Last50): -95.36
[Q] Episode 1000/1500 | Mean(Last50): -96.28
[Q] Episode 1200/1500 | Mean(Last50): -83.36
[Q] Episode 1400/1500 | Mean(Last50): -86.12
Run finished | α=0.01, γ=0.8, param=0.001 → avg_eval=-100.00


0,1
alpha,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
episode,▁▁▁▂▂▂▂▃▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▅▆▆▆▇▇▇▇▇▇████
episode_return,▅█▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▇▅▅▆▅▇▆▅▅▅▅▅▇▅▇▆▅▅▆▅▁▅
final_mean_reward_last50,▁
gamma,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
mean_eval_return,▁
param,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
algorithm,Q-learning
alpha,0.01
episode,1499
episode_return,-100
final_mean_reward_last50,-82.64
gamma,0.8
mean_eval_return,-100
param,0.001
strategy,eps_greedy


[34m[1mwandb[0m: Agent Starting Run: 4h41rxca with config:
[34m[1mwandb[0m: 	algorithm: q_learning
[34m[1mwandb[0m: 	alpha: 0.01
[34m[1mwandb[0m: 	env_name: std_q_tp0.7_ss(0,4)_strateps_greedy
[34m[1mwandb[0m: 	gamma: 0.8
[34m[1mwandb[0m: 	param: 0.01
[34m[1mwandb[0m: 	strategy: eps_greedy


[Q] Episode 200/1500 | Mean(Last50): -108.88
[Q] Episode 400/1500 | Mean(Last50): -104.04
[Q] Episode 600/1500 | Mean(Last50): -94.14
[Q] Episode 800/1500 | Mean(Last50): -92.40
[Q] Episode 1000/1500 | Mean(Last50): -95.06
[Q] Episode 1200/1500 | Mean(Last50): -91.94
[Q] Episode 1400/1500 | Mean(Last50): -79.50
Run finished | α=0.01, γ=0.8, param=0.01 → avg_eval=-100.00


0,1
alpha,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
episode,▁▁▂▂▂▂▂▂▃▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▆▇▇▇▇▇██
episode_return,▅▄▁▅▆▆▅▇▅▅█▄▅▅▅▆▅▆▅▁▅▅▅▅█▅▅▅▅▅▆▆▄▆▆▆▇▅▅▅
final_mean_reward_last50,▁
gamma,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
mean_eval_return,▁
param,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
algorithm,Q-learning
alpha,0.01
episode,1499
episode_return,-92
final_mean_reward_last50,-88.34
gamma,0.8
mean_eval_return,-100
param,0.01
strategy,eps_greedy


[34m[1mwandb[0m: Agent Starting Run: 1t8wpqe0 with config:
[34m[1mwandb[0m: 	algorithm: q_learning
[34m[1mwandb[0m: 	alpha: 0.01
[34m[1mwandb[0m: 	env_name: std_q_tp0.7_ss(0,4)_strateps_greedy
[34m[1mwandb[0m: 	gamma: 0.8
[34m[1mwandb[0m: 	param: 0.05
[34m[1mwandb[0m: 	strategy: eps_greedy


[Q] Episode 200/1500 | Mean(Last50): -105.40
[Q] Episode 400/1500 | Mean(Last50): -102.44
[Q] Episode 600/1500 | Mean(Last50): -109.04
[Q] Episode 800/1500 | Mean(Last50): -103.06
[Q] Episode 1000/1500 | Mean(Last50): -100.68
[Q] Episode 1200/1500 | Mean(Last50): -96.86
[Q] Episode 1400/1500 | Mean(Last50): -93.72
Run finished | α=0.01, γ=0.8, param=0.05 → avg_eval=-100.00


0,1
alpha,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
episode,▁▁▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▃▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇███
episode_return,▆▄█▆▄▆▆▁▅▇▆▆▇▆█▁▆▇▆▅▆▆▆▆▆▅▆▅▆█▆▆▇▆▆▇▇▆▁▆
final_mean_reward_last50,▁
gamma,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
mean_eval_return,▁
param,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
algorithm,Q-learning
alpha,0.01
episode,1499
episode_return,-65
final_mean_reward_last50,-93.38
gamma,0.8
mean_eval_return,-100
param,0.05
strategy,eps_greedy


[34m[1mwandb[0m: Agent Starting Run: rkt154th with config:
[34m[1mwandb[0m: 	algorithm: q_learning
[34m[1mwandb[0m: 	alpha: 0.01
[34m[1mwandb[0m: 	env_name: std_q_tp0.7_ss(0,4)_strateps_greedy
[34m[1mwandb[0m: 	gamma: 0.8
[34m[1mwandb[0m: 	param: 0.1
[34m[1mwandb[0m: 	strategy: eps_greedy


[Q] Episode 200/1500 | Mean(Last50): -115.20
[Q] Episode 400/1500 | Mean(Last50): -112.78
[Q] Episode 600/1500 | Mean(Last50): -107.04
[Q] Episode 800/1500 | Mean(Last50): -92.56
[Q] Episode 1000/1500 | Mean(Last50): -102.20
[Q] Episode 1200/1500 | Mean(Last50): -111.42
[Q] Episode 1400/1500 | Mean(Last50): -106.40
Run finished | α=0.01, γ=0.8, param=0.1 → avg_eval=-100.00


0,1
alpha,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
episode,▁▁▁▁▂▂▂▂▂▂▂▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▆▆▆▇▇████
episode_return,▆▁▅▅▅▄▅▄█▅▅▅▅▅▅▇▅▁▅▄▇▅▁▅▁█▅▅▅▅▅▅▆▅▆▅▅▆▁▅
final_mean_reward_last50,▁
gamma,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
mean_eval_return,▁
param,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
algorithm,Q-learning
alpha,0.01
episode,1499
episode_return,-100
final_mean_reward_last50,-97.52
gamma,0.8
mean_eval_return,-100
param,0.1
strategy,eps_greedy


[34m[1mwandb[0m: Agent Starting Run: zczxc1ga with config:
[34m[1mwandb[0m: 	algorithm: q_learning
[34m[1mwandb[0m: 	alpha: 0.01
[34m[1mwandb[0m: 	env_name: std_q_tp0.7_ss(0,4)_strateps_greedy
[34m[1mwandb[0m: 	gamma: 0.9
[34m[1mwandb[0m: 	param: 0.001
[34m[1mwandb[0m: 	strategy: eps_greedy


[Q] Episode 200/1500 | Mean(Last50): -107.40
[Q] Episode 400/1500 | Mean(Last50): -103.38
[Q] Episode 600/1500 | Mean(Last50): -100.46
[Q] Episode 800/1500 | Mean(Last50): -93.56
[Q] Episode 1000/1500 | Mean(Last50): -91.82
[Q] Episode 1200/1500 | Mean(Last50): -70.88
[Q] Episode 1400/1500 | Mean(Last50): -72.66
Run finished | α=0.01, γ=0.9, param=0.001 → avg_eval=-106.20


0,1
alpha,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
episode,▁▁▁▂▂▂▂▂▂▂▂▃▃▃▃▃▃▄▅▅▅▅▅▆▆▆▆▇▇▇▇▇▇▇▇▇▇███
episode_return,▄▄▄▄▄▄▄▄▇▄▃▃▄▅▄▇▄▃▄▆▄▄▄▄█▄▄▇▄█▄▁▅▇▄▆▃▅▄▄
final_mean_reward_last50,▁
gamma,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
mean_eval_return,▁
param,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
algorithm,Q-learning
alpha,0.01
episode,1499
episode_return,-28
final_mean_reward_last50,-67.02
gamma,0.9
mean_eval_return,-106.2
param,0.001
strategy,eps_greedy


[34m[1mwandb[0m: Agent Starting Run: out7xnln with config:
[34m[1mwandb[0m: 	algorithm: q_learning
[34m[1mwandb[0m: 	alpha: 0.01
[34m[1mwandb[0m: 	env_name: std_q_tp0.7_ss(0,4)_strateps_greedy
[34m[1mwandb[0m: 	gamma: 0.9
[34m[1mwandb[0m: 	param: 0.01
[34m[1mwandb[0m: 	strategy: eps_greedy


[Q] Episode 200/1500 | Mean(Last50): -111.32
[Q] Episode 400/1500 | Mean(Last50): -105.62
[Q] Episode 600/1500 | Mean(Last50): -98.32
[Q] Episode 800/1500 | Mean(Last50): -95.62
[Q] Episode 1000/1500 | Mean(Last50): -80.12
[Q] Episode 1200/1500 | Mean(Last50): -88.20
[Q] Episode 1400/1500 | Mean(Last50): -68.44
Run finished | α=0.01, γ=0.9, param=0.01 → avg_eval=-61.20


0,1
alpha,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
episode,▁▁▁▁▁▂▂▂▂▂▂▂▃▃▃▃▃▃▃▄▄▄▄▄▅▅▅▆▆▆▇▇▇▇▇█████
episode_return,▁▄▅▁▃▅▅▅▆▅▅▅▃▅▅▅▄▅▅▅▅▅▅▅▅▆▆▆▅▅▅▆▆▆█▆▅▅▅█
final_mean_reward_last50,▁
gamma,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
mean_eval_return,▁
param,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
algorithm,Q-learning
alpha,0.01
episode,1499
episode_return,-75
final_mean_reward_last50,-63
gamma,0.9
mean_eval_return,-61.2
param,0.01
strategy,eps_greedy


[34m[1mwandb[0m: Agent Starting Run: 6zui7lyq with config:
[34m[1mwandb[0m: 	algorithm: q_learning
[34m[1mwandb[0m: 	alpha: 0.01
[34m[1mwandb[0m: 	env_name: std_q_tp0.7_ss(0,4)_strateps_greedy
[34m[1mwandb[0m: 	gamma: 0.9
[34m[1mwandb[0m: 	param: 0.05
[34m[1mwandb[0m: 	strategy: eps_greedy


[Q] Episode 200/1500 | Mean(Last50): -110.86
[Q] Episode 400/1500 | Mean(Last50): -104.10
[Q] Episode 600/1500 | Mean(Last50): -104.62
[Q] Episode 800/1500 | Mean(Last50): -98.84
[Q] Episode 1000/1500 | Mean(Last50): -99.84
[Q] Episode 1200/1500 | Mean(Last50): -92.56
[Q] Episode 1400/1500 | Mean(Last50): -75.80
Run finished | α=0.01, γ=0.9, param=0.05 → avg_eval=-111.50


0,1
alpha,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
episode,▁▁▁▁▂▂▂▂▃▃▃▄▄▄▄▄▄▄▄▄▅▅▆▆▆▆▆▆▆▇▇▇▇▇▇▇████
episode_return,▅▁▆▅▅▆▇▆▆▆▅▆▅▆▆▅▆▆▆▆▂▂██▇▇▇▆▇▆█▇▆▅█▂█▆▇█
final_mean_reward_last50,▁
gamma,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
mean_eval_return,▁
param,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
algorithm,Q-learning
alpha,0.01
episode,1499
episode_return,-100
final_mean_reward_last50,-65
gamma,0.9
mean_eval_return,-111.5
param,0.05
strategy,eps_greedy


[34m[1mwandb[0m: Agent Starting Run: orxv3hbk with config:
[34m[1mwandb[0m: 	algorithm: q_learning
[34m[1mwandb[0m: 	alpha: 0.01
[34m[1mwandb[0m: 	env_name: std_q_tp0.7_ss(0,4)_strateps_greedy
[34m[1mwandb[0m: 	gamma: 0.9
[34m[1mwandb[0m: 	param: 0.1
[34m[1mwandb[0m: 	strategy: eps_greedy


[Q] Episode 200/1500 | Mean(Last50): -114.24
[Q] Episode 400/1500 | Mean(Last50): -101.26
[Q] Episode 600/1500 | Mean(Last50): -107.48
[Q] Episode 800/1500 | Mean(Last50): -96.56
[Q] Episode 1000/1500 | Mean(Last50): -98.88
[Q] Episode 1200/1500 | Mean(Last50): -85.66
[Q] Episode 1400/1500 | Mean(Last50): -72.82
Run finished | α=0.01, γ=0.9, param=0.1 → avg_eval=-81.00


0,1
alpha,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
episode,▁▁▁▂▂▂▂▂▂▂▃▃▃▃▃▃▃▃▄▄▄▅▅▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇██
episode_return,▄▅▄▅▄▅▄▅▇▅▁▅▅▅█▂▆▆▅▆▆▅▇▅▅▇█▅▅▅▅▁█▇▅▇▅▆▆▁
final_mean_reward_last50,▁
gamma,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
mean_eval_return,▁
param,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
algorithm,Q-learning
alpha,0.01
episode,1499
episode_return,-30
final_mean_reward_last50,-70.92
gamma,0.9
mean_eval_return,-81
param,0.1
strategy,eps_greedy


[34m[1mwandb[0m: Agent Starting Run: eonmq9xg with config:
[34m[1mwandb[0m: 	algorithm: q_learning
[34m[1mwandb[0m: 	alpha: 0.01
[34m[1mwandb[0m: 	env_name: std_q_tp0.7_ss(0,4)_strateps_greedy
[34m[1mwandb[0m: 	gamma: 1
[34m[1mwandb[0m: 	param: 0.001
[34m[1mwandb[0m: 	strategy: eps_greedy


[Q] Episode 200/1500 | Mean(Last50): -109.42
[Q] Episode 400/1500 | Mean(Last50): -108.52
[Q] Episode 600/1500 | Mean(Last50): -94.06
[Q] Episode 800/1500 | Mean(Last50): -98.62
[Q] Episode 1000/1500 | Mean(Last50): -87.50
[Q] Episode 1200/1500 | Mean(Last50): -63.12
[Q] Episode 1400/1500 | Mean(Last50): -66.96
Run finished | α=0.01, γ=1, param=0.001 → avg_eval=-12.80


0,1
alpha,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
episode,▁▁▁▁▁▂▂▃▃▃▄▄▄▄▄▄▄▄▅▅▅▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇████
episode_return,▁▄▅▆▅▅▅▅▁▄▅▅▅▇▆▆▅█▅▇▅▇▅▇▆▅█▅▅▇▇▇▃▇▄▃██▅▅
final_mean_reward_last50,▁
gamma,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
mean_eval_return,▁
param,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
algorithm,Q-learning
alpha,0.01
episode,1499
episode_return,-60
final_mean_reward_last50,-53.22
gamma,1
mean_eval_return,-12.8
param,0.001
strategy,eps_greedy


[34m[1mwandb[0m: Agent Starting Run: e6ceruzp with config:
[34m[1mwandb[0m: 	algorithm: q_learning
[34m[1mwandb[0m: 	alpha: 0.01
[34m[1mwandb[0m: 	env_name: std_q_tp0.7_ss(0,4)_strateps_greedy
[34m[1mwandb[0m: 	gamma: 1
[34m[1mwandb[0m: 	param: 0.01
[34m[1mwandb[0m: 	strategy: eps_greedy


[Q] Episode 200/1500 | Mean(Last50): -104.62
[Q] Episode 400/1500 | Mean(Last50): -102.02
[Q] Episode 600/1500 | Mean(Last50): -96.96
[Q] Episode 800/1500 | Mean(Last50): -94.80
[Q] Episode 1000/1500 | Mean(Last50): -75.08
[Q] Episode 1200/1500 | Mean(Last50): -69.60
[Q] Episode 1400/1500 | Mean(Last50): -52.64
Run finished | α=0.01, γ=1, param=0.01 → avg_eval=-19.10


0,1
alpha,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
episode,▁▁▁▂▂▂▃▃▃▃▃▃▃▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇█████
episode_return,▅▅▅▅▅▅▄▄▅▅▄▅▅▄▆▁▅▅▆▄▇▄▅▅▅▇▅▄▄▅▅▇▆▄▇▆▇██▇
final_mean_reward_last50,▁
gamma,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
mean_eval_return,▁
param,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
algorithm,Q-learning
alpha,0.01
episode,1499
episode_return,-79
final_mean_reward_last50,-54.46
gamma,1
mean_eval_return,-19.1
param,0.01
strategy,eps_greedy


[34m[1mwandb[0m: Agent Starting Run: ygluvqbt with config:
[34m[1mwandb[0m: 	algorithm: q_learning
[34m[1mwandb[0m: 	alpha: 0.01
[34m[1mwandb[0m: 	env_name: std_q_tp0.7_ss(0,4)_strateps_greedy
[34m[1mwandb[0m: 	gamma: 1
[34m[1mwandb[0m: 	param: 0.05
[34m[1mwandb[0m: 	strategy: eps_greedy


[Q] Episode 200/1500 | Mean(Last50): -117.18
[Q] Episode 400/1500 | Mean(Last50): -106.74
[Q] Episode 600/1500 | Mean(Last50): -96.06
[Q] Episode 800/1500 | Mean(Last50): -93.58
[Q] Episode 1000/1500 | Mean(Last50): -81.70
[Q] Episode 1200/1500 | Mean(Last50): -69.10
[Q] Episode 1400/1500 | Mean(Last50): -71.80
Run finished | α=0.01, γ=1, param=0.05 → avg_eval=-113.25


0,1
alpha,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
episode,▁▁▂▂▂▂▂▂▃▃▄▄▄▄▄▄▅▅▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇█████
episode_return,▂▂▂▂▂▂▁▂▂▂▂▅▂▂▄▂▅▄▆▁▅▂▄▇▂▂▂▂▅▅▄▂▆▅▇█▆▅▅▇
final_mean_reward_last50,▁
gamma,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
mean_eval_return,▁
param,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
algorithm,Q-learning
alpha,0.01
episode,1499
episode_return,-36
final_mean_reward_last50,-60.88
gamma,1
mean_eval_return,-113.25
param,0.05
strategy,eps_greedy


[34m[1mwandb[0m: Agent Starting Run: 5ulzzv29 with config:
[34m[1mwandb[0m: 	algorithm: q_learning
[34m[1mwandb[0m: 	alpha: 0.01
[34m[1mwandb[0m: 	env_name: std_q_tp0.7_ss(0,4)_strateps_greedy
[34m[1mwandb[0m: 	gamma: 1
[34m[1mwandb[0m: 	param: 0.1
[34m[1mwandb[0m: 	strategy: eps_greedy


[Q] Episode 200/1500 | Mean(Last50): -118.24
[Q] Episode 400/1500 | Mean(Last50): -110.20
[Q] Episode 600/1500 | Mean(Last50): -98.44
[Q] Episode 800/1500 | Mean(Last50): -105.56
[Q] Episode 1000/1500 | Mean(Last50): -91.10
[Q] Episode 1200/1500 | Mean(Last50): -86.70
[Q] Episode 1400/1500 | Mean(Last50): -68.56
Run finished | α=0.01, γ=1, param=0.1 → avg_eval=-23.60


0,1
alpha,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
episode,▁▁▁▁▁▂▂▂▂▂▂▂▃▃▃▃▃▃▃▄▄▄▅▅▅▅▆▆▆▆▆▆▆▆▆▇▇▇▇█
episode_return,▆▆▅▁▃▃▅▆▃▅▅▅▆▅▆▆▆▆█▃▆▆▅█▆▂▆▆▇▆▂█▇▆█▅▇▇▇█
final_mean_reward_last50,▁
gamma,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
mean_eval_return,▁
param,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
algorithm,Q-learning
alpha,0.01
episode,1499
episode_return,-100
final_mean_reward_last50,-64.46
gamma,1
mean_eval_return,-23.6
param,0.1
strategy,eps_greedy


[34m[1mwandb[0m: Agent Starting Run: y4m405ge with config:
[34m[1mwandb[0m: 	algorithm: q_learning
[34m[1mwandb[0m: 	alpha: 0.1
[34m[1mwandb[0m: 	env_name: std_q_tp0.7_ss(0,4)_strateps_greedy
[34m[1mwandb[0m: 	gamma: 0.7
[34m[1mwandb[0m: 	param: 0.001
[34m[1mwandb[0m: 	strategy: eps_greedy


[Q] Episode 200/1500 | Mean(Last50): -93.36
[Q] Episode 400/1500 | Mean(Last50): -70.44
[Q] Episode 600/1500 | Mean(Last50): -27.58
[Q] Episode 800/1500 | Mean(Last50): -27.56
[Q] Episode 1000/1500 | Mean(Last50): -24.34
[Q] Episode 1200/1500 | Mean(Last50): -27.24
[Q] Episode 1400/1500 | Mean(Last50): -29.46
Run finished | α=0.1, γ=0.7, param=0.001 → avg_eval=-27.05


0,1
alpha,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
episode,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▃▃▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
episode_return,▅▁▅▅▅▅▇▅▅▆▆█▇██▇▇██▇▇██▇▇██▇█████▇█▇█▆██
final_mean_reward_last50,▁
gamma,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
mean_eval_return,▁
param,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
algorithm,Q-learning
alpha,0.1
episode,1499
episode_return,-30
final_mean_reward_last50,-31.34
gamma,0.7
mean_eval_return,-27.05
param,0.001
strategy,eps_greedy


[34m[1mwandb[0m: Agent Starting Run: jwmhnvci with config:
[34m[1mwandb[0m: 	algorithm: q_learning
[34m[1mwandb[0m: 	alpha: 0.1
[34m[1mwandb[0m: 	env_name: std_q_tp0.7_ss(0,4)_strateps_greedy
[34m[1mwandb[0m: 	gamma: 0.7
[34m[1mwandb[0m: 	param: 0.01
[34m[1mwandb[0m: 	strategy: eps_greedy


[Q] Episode 200/1500 | Mean(Last50): -88.06
[Q] Episode 400/1500 | Mean(Last50): -63.48
[Q] Episode 600/1500 | Mean(Last50): -35.24
[Q] Episode 800/1500 | Mean(Last50): -23.82
[Q] Episode 1000/1500 | Mean(Last50): -30.62
[Q] Episode 1200/1500 | Mean(Last50): -24.18
[Q] Episode 1400/1500 | Mean(Last50): -23.98
Run finished | α=0.1, γ=0.7, param=0.01 → avg_eval=-25.55


0,1
alpha,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
episode,▁▁▁▂▂▂▂▂▂▂▃▃▃▃▃▃▃▃▄▄▅▅▅▅▅▅▅▅▆▆▆▇▇▇▇▇████
episode_return,▄▄▅▁▄▅▅▆▇▅▇█▅███▇███▇▇█▇▇███▃▇██▇▇▇▇██▇▇
final_mean_reward_last50,▁
gamma,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
mean_eval_return,▁
param,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
algorithm,Q-learning
alpha,0.1
episode,1499
episode_return,-23
final_mean_reward_last50,-32
gamma,0.7
mean_eval_return,-25.55
param,0.01
strategy,eps_greedy


[34m[1mwandb[0m: Agent Starting Run: 59gaupbu with config:
[34m[1mwandb[0m: 	algorithm: q_learning
[34m[1mwandb[0m: 	alpha: 0.1
[34m[1mwandb[0m: 	env_name: std_q_tp0.7_ss(0,4)_strateps_greedy
[34m[1mwandb[0m: 	gamma: 0.7
[34m[1mwandb[0m: 	param: 0.05
[34m[1mwandb[0m: 	strategy: eps_greedy


[Q] Episode 200/1500 | Mean(Last50): -100.18
[Q] Episode 400/1500 | Mean(Last50): -62.38
[Q] Episode 600/1500 | Mean(Last50): -44.22
[Q] Episode 800/1500 | Mean(Last50): -41.56
[Q] Episode 1000/1500 | Mean(Last50): -36.72
[Q] Episode 1200/1500 | Mean(Last50): -47.78
[Q] Episode 1400/1500 | Mean(Last50): -31.80
Run finished | α=0.1, γ=0.7, param=0.05 → avg_eval=-22.80


0,1
alpha,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
episode,▁▁▁▁▁▂▂▂▂▂▂▂▂▃▃▃▃▃▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇▇▇█
episode_return,▅▆▅▆▆▆▃▆█▆▆▇▇█▇▇▇▇▇▇█▄▇▄▆█▄▇███▇█▁▇█▇▇▇█
final_mean_reward_last50,▁
gamma,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
mean_eval_return,▁
param,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
algorithm,Q-learning
alpha,0.1
episode,1499
episode_return,-22
final_mean_reward_last50,-39.02
gamma,0.7
mean_eval_return,-22.8
param,0.05
strategy,eps_greedy


[34m[1mwandb[0m: Agent Starting Run: oddlwpij with config:
[34m[1mwandb[0m: 	algorithm: q_learning
[34m[1mwandb[0m: 	alpha: 0.1
[34m[1mwandb[0m: 	env_name: std_q_tp0.7_ss(0,4)_strateps_greedy
[34m[1mwandb[0m: 	gamma: 0.7
[34m[1mwandb[0m: 	param: 0.1
[34m[1mwandb[0m: 	strategy: eps_greedy


[Q] Episode 200/1500 | Mean(Last50): -107.54
[Q] Episode 400/1500 | Mean(Last50): -74.02
[Q] Episode 600/1500 | Mean(Last50): -57.72
[Q] Episode 800/1500 | Mean(Last50): -42.36
[Q] Episode 1000/1500 | Mean(Last50): -53.02
[Q] Episode 1200/1500 | Mean(Last50): -61.34
[Q] Episode 1400/1500 | Mean(Last50): -45.04
Run finished | α=0.1, γ=0.7, param=0.1 → avg_eval=-25.90


0,1
alpha,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
episode,▁▁▁▁▂▂▂▂▂▂▂▃▃▃▃▃▃▃▄▄▄▄▄▄▄▄▅▅▅▅▆▆▆▆▆▇▇▇▇█
episode_return,▆▆▆▆▆▆▆▇▁▄▇▆▆▇▅█▆▄████▅█▄█████▅█▇▄███▇▅▇
final_mean_reward_last50,▁
gamma,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
mean_eval_return,▁
param,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
algorithm,Q-learning
alpha,0.1
episode,1499
episode_return,-39
final_mean_reward_last50,-53.72
gamma,0.7
mean_eval_return,-25.9
param,0.1
strategy,eps_greedy


[34m[1mwandb[0m: Agent Starting Run: jy0u0q8t with config:
[34m[1mwandb[0m: 	algorithm: q_learning
[34m[1mwandb[0m: 	alpha: 0.1
[34m[1mwandb[0m: 	env_name: std_q_tp0.7_ss(0,4)_strateps_greedy
[34m[1mwandb[0m: 	gamma: 0.8
[34m[1mwandb[0m: 	param: 0.001
[34m[1mwandb[0m: 	strategy: eps_greedy


[Q] Episode 200/1500 | Mean(Last50): -70.48
[Q] Episode 400/1500 | Mean(Last50): -21.72
[Q] Episode 600/1500 | Mean(Last50): -32.34
[Q] Episode 800/1500 | Mean(Last50): -15.54
[Q] Episode 1000/1500 | Mean(Last50): -15.56
[Q] Episode 1200/1500 | Mean(Last50): -53.98
[Q] Episode 1400/1500 | Mean(Last50): -29.26
Run finished | α=0.1, γ=0.8, param=0.001 → avg_eval=-33.85


0,1
alpha,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
episode,▁▁▂▂▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▇▇▇▇▇▇█
episode_return,▂▂▁▂▆▂▂▇▄▂▆▇██▂▂█▇▇▄▂██▇▇▇█▄▆▆▅▆▆▆█▇▇█▆▄
final_mean_reward_last50,▁
gamma,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
mean_eval_return,▁
param,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
algorithm,Q-learning
alpha,0.1
episode,1499
episode_return,-27
final_mean_reward_last50,-27.88
gamma,0.8
mean_eval_return,-33.85
param,0.001
strategy,eps_greedy


[34m[1mwandb[0m: Agent Starting Run: iz4sapeu with config:
[34m[1mwandb[0m: 	algorithm: q_learning
[34m[1mwandb[0m: 	alpha: 0.1
[34m[1mwandb[0m: 	env_name: std_q_tp0.7_ss(0,4)_strateps_greedy
[34m[1mwandb[0m: 	gamma: 0.8
[34m[1mwandb[0m: 	param: 0.01
[34m[1mwandb[0m: 	strategy: eps_greedy


[Q] Episode 200/1500 | Mean(Last50): -87.24
[Q] Episode 400/1500 | Mean(Last50): -37.32
[Q] Episode 600/1500 | Mean(Last50): -14.36
[Q] Episode 800/1500 | Mean(Last50): -14.78
[Q] Episode 1000/1500 | Mean(Last50): -14.38
[Q] Episode 1200/1500 | Mean(Last50): -23.94
[Q] Episode 1400/1500 | Mean(Last50): -19.76
Run finished | α=0.1, γ=0.8, param=0.01 → avg_eval=-21.50


0,1
alpha,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
episode,▁▁▁▁▂▂▂▃▃▃▃▃▃▃▃▄▄▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▇▇▇▇▇██
episode_return,▁▁▆▅▆█▇█▅█████▇▇██▇▅██▇▇██▇▇▇▇▆█▇█▇▇█▇█▇
final_mean_reward_last50,▁
gamma,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
mean_eval_return,▁
param,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
algorithm,Q-learning
alpha,0.1
episode,1499
episode_return,-8
final_mean_reward_last50,-20
gamma,0.8
mean_eval_return,-21.5
param,0.01
strategy,eps_greedy


[34m[1mwandb[0m: Agent Starting Run: j2u6qpam with config:
[34m[1mwandb[0m: 	algorithm: q_learning
[34m[1mwandb[0m: 	alpha: 0.1
[34m[1mwandb[0m: 	env_name: std_q_tp0.7_ss(0,4)_strateps_greedy
[34m[1mwandb[0m: 	gamma: 0.8
[34m[1mwandb[0m: 	param: 0.05
[34m[1mwandb[0m: 	strategy: eps_greedy


[Q] Episode 200/1500 | Mean(Last50): -79.58
[Q] Episode 400/1500 | Mean(Last50): -49.44
[Q] Episode 600/1500 | Mean(Last50): -15.50
[Q] Episode 800/1500 | Mean(Last50): -29.86
[Q] Episode 1000/1500 | Mean(Last50): -40.22
[Q] Episode 1200/1500 | Mean(Last50): -15.60
[Q] Episode 1400/1500 | Mean(Last50): -21.02
Run finished | α=0.1, γ=0.8, param=0.05 → avg_eval=-17.30


0,1
alpha,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
episode,▁▁▁▁▁▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▆▆▆▆▆▆▆▆▇▇▇▇▇███
episode_return,▂▁▅▅█▆█▂█▇██████████▇██▇█▆▅▇███▇███▇██▇█
final_mean_reward_last50,▁
gamma,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
mean_eval_return,▁
param,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
algorithm,Q-learning
alpha,0.1
episode,1499
episode_return,-22
final_mean_reward_last50,-16.5
gamma,0.8
mean_eval_return,-17.3
param,0.05
strategy,eps_greedy


[34m[1mwandb[0m: Agent Starting Run: jzvwbd1q with config:
[34m[1mwandb[0m: 	algorithm: q_learning
[34m[1mwandb[0m: 	alpha: 0.1
[34m[1mwandb[0m: 	env_name: std_q_tp0.7_ss(0,4)_strateps_greedy
[34m[1mwandb[0m: 	gamma: 0.8
[34m[1mwandb[0m: 	param: 0.1
[34m[1mwandb[0m: 	strategy: eps_greedy


[Q] Episode 200/1500 | Mean(Last50): -97.42
[Q] Episode 400/1500 | Mean(Last50): -44.96
[Q] Episode 600/1500 | Mean(Last50): -25.68
[Q] Episode 800/1500 | Mean(Last50): -19.74
[Q] Episode 1000/1500 | Mean(Last50): -21.34
[Q] Episode 1200/1500 | Mean(Last50): -26.58
[Q] Episode 1400/1500 | Mean(Last50): -21.06
Run finished | α=0.1, γ=0.8, param=0.1 → avg_eval=-15.20


0,1
alpha,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
episode,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▅▅▅▅▅▆▆▆▆▇▇▇▇▇▇▇▇███
episode_return,▁▆▆▇▃▇▇▆█▇▁▇▇▅████████▇▇████████████████
final_mean_reward_last50,▁
gamma,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
mean_eval_return,▁
param,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
algorithm,Q-learning
alpha,0.1
episode,1499
episode_return,-14
final_mean_reward_last50,-19.8
gamma,0.8
mean_eval_return,-15.2
param,0.1
strategy,eps_greedy


[34m[1mwandb[0m: Agent Starting Run: xiryiypx with config:
[34m[1mwandb[0m: 	algorithm: q_learning
[34m[1mwandb[0m: 	alpha: 0.1
[34m[1mwandb[0m: 	env_name: std_q_tp0.7_ss(0,4)_strateps_greedy
[34m[1mwandb[0m: 	gamma: 0.9
[34m[1mwandb[0m: 	param: 0.001
[34m[1mwandb[0m: 	strategy: eps_greedy


[Q] Episode 200/1500 | Mean(Last50): -49.78
[Q] Episode 400/1500 | Mean(Last50): -21.76
[Q] Episode 600/1500 | Mean(Last50): -15.72
[Q] Episode 800/1500 | Mean(Last50): -25.04
[Q] Episode 1000/1500 | Mean(Last50): -15.28
[Q] Episode 1200/1500 | Mean(Last50): -18.30
[Q] Episode 1400/1500 | Mean(Last50): -16.58
Run finished | α=0.1, γ=0.9, param=0.001 → avg_eval=-13.05


0,1
alpha,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
episode,▁▁▁▂▂▂▂▂▃▃▃▄▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇▇███
episode_return,▁▄▂▇▅█▇▇▆█▇▆█▇▇▇█▇██▇▇▇▇▂█▇▇███▆█▇██▇▇▇█
final_mean_reward_last50,▁
gamma,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
mean_eval_return,▁
param,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
algorithm,Q-learning
alpha,0.1
episode,1499
episode_return,-24
final_mean_reward_last50,-13.7
gamma,0.9
mean_eval_return,-13.05
param,0.001
strategy,eps_greedy


[34m[1mwandb[0m: Agent Starting Run: o6sruhsq with config:
[34m[1mwandb[0m: 	algorithm: q_learning
[34m[1mwandb[0m: 	alpha: 0.1
[34m[1mwandb[0m: 	env_name: std_q_tp0.7_ss(0,4)_strateps_greedy
[34m[1mwandb[0m: 	gamma: 0.9
[34m[1mwandb[0m: 	param: 0.01
[34m[1mwandb[0m: 	strategy: eps_greedy


[Q] Episode 200/1500 | Mean(Last50): -49.98
[Q] Episode 400/1500 | Mean(Last50): -30.02
[Q] Episode 600/1500 | Mean(Last50): -15.38
[Q] Episode 800/1500 | Mean(Last50): -19.06
[Q] Episode 1000/1500 | Mean(Last50): -15.74
[Q] Episode 1200/1500 | Mean(Last50): -16.64
[Q] Episode 1400/1500 | Mean(Last50): -17.82
Run finished | α=0.1, γ=0.9, param=0.01 → avg_eval=-12.85


0,1
alpha,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
episode,▁▁▂▂▂▂▂▂▂▃▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▅▅▆▆▆▇▇▇▇▇███
episode_return,▁▅▆█▇█▇███████████████▆▇█████████▇████▇█
final_mean_reward_last50,▁
gamma,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
mean_eval_return,▁
param,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
algorithm,Q-learning
alpha,0.1
episode,1499
episode_return,-54
final_mean_reward_last50,-17.18
gamma,0.9
mean_eval_return,-12.85
param,0.01
strategy,eps_greedy


[34m[1mwandb[0m: Agent Starting Run: ebyoy2l3 with config:
[34m[1mwandb[0m: 	algorithm: q_learning
[34m[1mwandb[0m: 	alpha: 0.1
[34m[1mwandb[0m: 	env_name: std_q_tp0.7_ss(0,4)_strateps_greedy
[34m[1mwandb[0m: 	gamma: 0.9
[34m[1mwandb[0m: 	param: 0.05
[34m[1mwandb[0m: 	strategy: eps_greedy


[Q] Episode 200/1500 | Mean(Last50): -58.50
[Q] Episode 400/1500 | Mean(Last50): -18.26
[Q] Episode 600/1500 | Mean(Last50): -21.04
[Q] Episode 800/1500 | Mean(Last50): -17.60
[Q] Episode 1000/1500 | Mean(Last50): -16.12
[Q] Episode 1200/1500 | Mean(Last50): -28.12
[Q] Episode 1400/1500 | Mean(Last50): -17.84
Run finished | α=0.1, γ=0.9, param=0.05 → avg_eval=-16.80


0,1
alpha,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
episode,▁▁▂▂▂▂▂▂▃▃▃▃▃▃▃▄▄▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇█
episode_return,▅▁▆▅▅██████▇▇▇▇████████▇█▇▇███████████▇▇
final_mean_reward_last50,▁
gamma,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
mean_eval_return,▁
param,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
algorithm,Q-learning
alpha,0.1
episode,1499
episode_return,-20
final_mean_reward_last50,-19.72
gamma,0.9
mean_eval_return,-16.8
param,0.05
strategy,eps_greedy


[34m[1mwandb[0m: Agent Starting Run: 1sidcvlr with config:
[34m[1mwandb[0m: 	algorithm: q_learning
[34m[1mwandb[0m: 	alpha: 0.1
[34m[1mwandb[0m: 	env_name: std_q_tp0.7_ss(0,4)_strateps_greedy
[34m[1mwandb[0m: 	gamma: 0.9
[34m[1mwandb[0m: 	param: 0.1
[34m[1mwandb[0m: 	strategy: eps_greedy


[Q] Episode 200/1500 | Mean(Last50): -76.62
[Q] Episode 400/1500 | Mean(Last50): -22.52
[Q] Episode 600/1500 | Mean(Last50): -16.98
[Q] Episode 800/1500 | Mean(Last50): -16.84
[Q] Episode 1000/1500 | Mean(Last50): -18.10
[Q] Episode 1200/1500 | Mean(Last50): -19.16
[Q] Episode 1400/1500 | Mean(Last50): -19.40
Run finished | α=0.1, γ=0.9, param=0.1 → avg_eval=-14.55


0,1
alpha,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
episode,▁▁▁▁▁▂▂▂▂▂▃▃▃▄▄▄▄▄▄▅▅▅▆▆▆▆▆▆▆▆▇▇▇▇▇▇████
episode_return,▄▄▃▁▇▂█▇███████▇█▇█▇███▇███▇██▇███▇███▇█
final_mean_reward_last50,▁
gamma,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
mean_eval_return,▁
param,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
algorithm,Q-learning
alpha,0.1
episode,1499
episode_return,-126
final_mean_reward_last50,-20.48
gamma,0.9
mean_eval_return,-14.55
param,0.1
strategy,eps_greedy


[34m[1mwandb[0m: Agent Starting Run: wc7w97km with config:
[34m[1mwandb[0m: 	algorithm: q_learning
[34m[1mwandb[0m: 	alpha: 0.1
[34m[1mwandb[0m: 	env_name: std_q_tp0.7_ss(0,4)_strateps_greedy
[34m[1mwandb[0m: 	gamma: 1
[34m[1mwandb[0m: 	param: 0.001
[34m[1mwandb[0m: 	strategy: eps_greedy


[Q] Episode 200/1500 | Mean(Last50): -57.44
[Q] Episode 400/1500 | Mean(Last50): -16.40
[Q] Episode 600/1500 | Mean(Last50): -14.14
[Q] Episode 800/1500 | Mean(Last50): -14.88
[Q] Episode 1000/1500 | Mean(Last50): -15.40
[Q] Episode 1200/1500 | Mean(Last50): -17.36
[Q] Episode 1400/1500 | Mean(Last50): -15.06
Run finished | α=0.1, γ=1, param=0.001 → avg_eval=-15.40


0,1
alpha,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
episode,▁▁▁▂▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▅▅▅▅▅▆▆▆▆▆▆▆▆▆▇█████
episode_return,▂▁▁▂▂▄▄██▁▇▆▇████▇▇█▇▇▇█▇███▆██▇█▇█▇▇██▇
final_mean_reward_last50,▁
gamma,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
mean_eval_return,▁
param,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
algorithm,Q-learning
alpha,0.1
episode,1499
episode_return,-21
final_mean_reward_last50,-16.18
gamma,1
mean_eval_return,-15.4
param,0.001
strategy,eps_greedy


[34m[1mwandb[0m: Agent Starting Run: 7zyr514e with config:
[34m[1mwandb[0m: 	algorithm: q_learning
[34m[1mwandb[0m: 	alpha: 0.1
[34m[1mwandb[0m: 	env_name: std_q_tp0.7_ss(0,4)_strateps_greedy
[34m[1mwandb[0m: 	gamma: 1
[34m[1mwandb[0m: 	param: 0.01
[34m[1mwandb[0m: 	strategy: eps_greedy


[Q] Episode 200/1500 | Mean(Last50): -60.70
[Q] Episode 400/1500 | Mean(Last50): -17.80
[Q] Episode 600/1500 | Mean(Last50): -16.68
[Q] Episode 800/1500 | Mean(Last50): -18.34
[Q] Episode 1000/1500 | Mean(Last50): -15.88
[Q] Episode 1200/1500 | Mean(Last50): -14.66
[Q] Episode 1400/1500 | Mean(Last50): -16.74
Run finished | α=0.1, γ=1, param=0.01 → avg_eval=-16.85


0,1
alpha,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
episode,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▄▄▅▅▅▅▅▅▅▆▆▆▆▆▇▇▇██
episode_return,▁▅▄▅▇▇▇█████▇▇██████████▇██▇█▇██████▇███
final_mean_reward_last50,▁
gamma,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
mean_eval_return,▁
param,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
algorithm,Q-learning
alpha,0.1
episode,1499
episode_return,-8
final_mean_reward_last50,-15.64
gamma,1
mean_eval_return,-16.85
param,0.01
strategy,eps_greedy


[34m[1mwandb[0m: Agent Starting Run: tdizod29 with config:
[34m[1mwandb[0m: 	algorithm: q_learning
[34m[1mwandb[0m: 	alpha: 0.1
[34m[1mwandb[0m: 	env_name: std_q_tp0.7_ss(0,4)_strateps_greedy
[34m[1mwandb[0m: 	gamma: 1
[34m[1mwandb[0m: 	param: 0.05
[34m[1mwandb[0m: 	strategy: eps_greedy


[Q] Episode 200/1500 | Mean(Last50): -45.04
[Q] Episode 400/1500 | Mean(Last50): -21.34
[Q] Episode 600/1500 | Mean(Last50): -19.62
[Q] Episode 800/1500 | Mean(Last50): -20.12
[Q] Episode 1000/1500 | Mean(Last50): -19.44
[Q] Episode 1200/1500 | Mean(Last50): -17.86
[Q] Episode 1400/1500 | Mean(Last50): -14.72
Run finished | α=0.1, γ=1, param=0.05 → avg_eval=-17.95


0,1
alpha,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
episode,▁▁▁▁▁▂▂▃▃▃▃▃▃▄▄▄▄▄▄▄▄▄▄▄▅▅▅▅▆▆▆▆▆▇▇▇▇▇██
episode_return,▁▇█▂▇▇███▇▇▇██▇███▇▇█▇█▇▇█▇██▇██▇▇█▇█▇█▇
final_mean_reward_last50,▁
gamma,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
mean_eval_return,▁
param,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
algorithm,Q-learning
alpha,0.1
episode,1499
episode_return,-25
final_mean_reward_last50,-17.04
gamma,1
mean_eval_return,-17.95
param,0.05
strategy,eps_greedy


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: g5ly0b0k with config:
[34m[1mwandb[0m: 	algorithm: q_learning
[34m[1mwandb[0m: 	alpha: 0.1
[34m[1mwandb[0m: 	env_name: std_q_tp0.7_ss(0,4)_strateps_greedy
[34m[1mwandb[0m: 	gamma: 1
[34m[1mwandb[0m: 	param: 0.1
[34m[1mwandb[0m: 	strategy: eps_greedy


[Q] Episode 200/1500 | Mean(Last50): -58.10
[Q] Episode 400/1500 | Mean(Last50): -19.52
[Q] Episode 600/1500 | Mean(Last50): -21.52
[Q] Episode 800/1500 | Mean(Last50): -16.88
[Q] Episode 1000/1500 | Mean(Last50): -19.80
[Q] Episode 1200/1500 | Mean(Last50): -18.16
[Q] Episode 1400/1500 | Mean(Last50): -16.88
Run finished | α=0.1, γ=1, param=0.1 → avg_eval=-18.95


0,1
alpha,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
episode,▁▁▂▂▂▂▂▂▂▂▃▃▃▄▄▅▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇███
episode_return,▁▁▆▅▇▇▅█▇█▆▇█▇███████████▇█████▇▆███████
final_mean_reward_last50,▁
gamma,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
mean_eval_return,▁
param,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
algorithm,Q-learning
alpha,0.1
episode,1499
episode_return,-23
final_mean_reward_last50,-17.3
gamma,1
mean_eval_return,-18.95
param,0.1
strategy,eps_greedy


[34m[1mwandb[0m: Agent Starting Run: ii0t551h with config:
[34m[1mwandb[0m: 	algorithm: q_learning
[34m[1mwandb[0m: 	alpha: 1
[34m[1mwandb[0m: 	env_name: std_q_tp0.7_ss(0,4)_strateps_greedy
[34m[1mwandb[0m: 	gamma: 0.7
[34m[1mwandb[0m: 	param: 0.001
[34m[1mwandb[0m: 	strategy: eps_greedy


[Q] Episode 200/1500 | Mean(Last50): -79.76
[Q] Episode 400/1500 | Mean(Last50): -83.88
[Q] Episode 600/1500 | Mean(Last50): -100.00
[Q] Episode 800/1500 | Mean(Last50): -100.00
[Q] Episode 1000/1500 | Mean(Last50): -100.00
[Q] Episode 1200/1500 | Mean(Last50): -100.00
[Q] Episode 1400/1500 | Mean(Last50): -100.00
Run finished | α=1, γ=0.7, param=0.001 → avg_eval=-100.00


0,1
alpha,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
episode,▁▁▁▁▁▂▂▂▂▂▂▂▂▃▃▃▃▄▄▄▄▄▄▄▄▅▅▅▅▆▆▆▆▆▆▇▇▇██
episode_return,▁▅█▇▅█▇▅▅█▆▇▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅
final_mean_reward_last50,▁
gamma,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
mean_eval_return,▁
param,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
algorithm,Q-learning
alpha,1
episode,1499
episode_return,-100
final_mean_reward_last50,-100
gamma,0.7
mean_eval_return,-100
param,0.001
strategy,eps_greedy


[34m[1mwandb[0m: Agent Starting Run: h0han8ul with config:
[34m[1mwandb[0m: 	algorithm: q_learning
[34m[1mwandb[0m: 	alpha: 1
[34m[1mwandb[0m: 	env_name: std_q_tp0.7_ss(0,4)_strateps_greedy
[34m[1mwandb[0m: 	gamma: 0.7
[34m[1mwandb[0m: 	param: 0.01
[34m[1mwandb[0m: 	strategy: eps_greedy


[Q] Episode 200/1500 | Mean(Last50): -82.78
[Q] Episode 400/1500 | Mean(Last50): -87.90
[Q] Episode 600/1500 | Mean(Last50): -47.84
[Q] Episode 800/1500 | Mean(Last50): -54.78
[Q] Episode 1000/1500 | Mean(Last50): -76.70
[Q] Episode 1200/1500 | Mean(Last50): -83.62
[Q] Episode 1400/1500 | Mean(Last50): -95.20
Run finished | α=1, γ=0.7, param=0.01 → avg_eval=-50.25


0,1
alpha,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
episode,▁▁▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▆▇▇▇▇▇███
episode_return,▇▅▅▅▅▅▅▁▁▅▅▅▅▅▇▆▅▃█▇▆▅▅▅▇▇▇▆▆██▅▅▆▂▇▆▇▅█
final_mean_reward_last50,▁
gamma,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
mean_eval_return,▁
param,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
algorithm,Q-learning
alpha,1
episode,1499
episode_return,-85
final_mean_reward_last50,-98.7
gamma,0.7
mean_eval_return,-50.25
param,0.01
strategy,eps_greedy


[34m[1mwandb[0m: Agent Starting Run: dvmvkycs with config:
[34m[1mwandb[0m: 	algorithm: q_learning
[34m[1mwandb[0m: 	alpha: 1
[34m[1mwandb[0m: 	env_name: std_q_tp0.7_ss(0,4)_strateps_greedy
[34m[1mwandb[0m: 	gamma: 0.7
[34m[1mwandb[0m: 	param: 0.05
[34m[1mwandb[0m: 	strategy: eps_greedy


[Q] Episode 200/1500 | Mean(Last50): -96.00
[Q] Episode 400/1500 | Mean(Last50): -107.00
[Q] Episode 600/1500 | Mean(Last50): -121.26
[Q] Episode 800/1500 | Mean(Last50): -120.28
[Q] Episode 1000/1500 | Mean(Last50): -92.18
[Q] Episode 1200/1500 | Mean(Last50): -124.52
[Q] Episode 1400/1500 | Mean(Last50): -101.66
Run finished | α=1, γ=0.7, param=0.05 → avg_eval=-154.45


0,1
alpha,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
episode,▁▁▁▁▁▂▂▂▂▂▂▂▃▃▃▃▃▃▃▄▄▄▄▅▅▅▅▅▅▆▆▇▇▇▇▇▇███
episode_return,▇▆▇▆▇▇▆▆▃▄█▃▄▆▆▄▆▆▇▆▄█▄▆▄▄▆▆▆▁▆█▆▇▆▆▇▅█▆
final_mean_reward_last50,▁
gamma,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
mean_eval_return,▁
param,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
algorithm,Q-learning
alpha,1
episode,1499
episode_return,-100
final_mean_reward_last50,-129.08
gamma,0.7
mean_eval_return,-154.45
param,0.05
strategy,eps_greedy


[34m[1mwandb[0m: Agent Starting Run: lkvi0qwp with config:
[34m[1mwandb[0m: 	algorithm: q_learning
[34m[1mwandb[0m: 	alpha: 1
[34m[1mwandb[0m: 	env_name: std_q_tp0.7_ss(0,4)_strateps_greedy
[34m[1mwandb[0m: 	gamma: 0.7
[34m[1mwandb[0m: 	param: 0.1
[34m[1mwandb[0m: 	strategy: eps_greedy


[Q] Episode 200/1500 | Mean(Last50): -100.86
[Q] Episode 400/1500 | Mean(Last50): -127.24
[Q] Episode 600/1500 | Mean(Last50): -111.10
[Q] Episode 800/1500 | Mean(Last50): -151.10
[Q] Episode 1000/1500 | Mean(Last50): -156.34
[Q] Episode 1200/1500 | Mean(Last50): -125.14
[Q] Episode 1400/1500 | Mean(Last50): -129.42
Run finished | α=1, γ=0.7, param=0.1 → avg_eval=-218.80


0,1
alpha,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
episode,▁▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▅▆▆▇▇▇▇▇▇████
episode_return,▃▁▃▆▆█▇▆▆▆▆▃▆▃▆▃▆▆▆▄█▁▇▆▁▆▆▃▆▃▆▃▇▃▆▂█▄▆▆
final_mean_reward_last50,▁
gamma,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
mean_eval_return,▁
param,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
algorithm,Q-learning
alpha,1
episode,1499
episode_return,-397
final_mean_reward_last50,-145.56
gamma,0.7
mean_eval_return,-218.8
param,0.1
strategy,eps_greedy


[34m[1mwandb[0m: Agent Starting Run: q2z0fjm4 with config:
[34m[1mwandb[0m: 	algorithm: q_learning
[34m[1mwandb[0m: 	alpha: 1
[34m[1mwandb[0m: 	env_name: std_q_tp0.7_ss(0,4)_strateps_greedy
[34m[1mwandb[0m: 	gamma: 0.8
[34m[1mwandb[0m: 	param: 0.001
[34m[1mwandb[0m: 	strategy: eps_greedy


[Q] Episode 200/1500 | Mean(Last50): -62.36
[Q] Episode 400/1500 | Mean(Last50): -53.14
[Q] Episode 600/1500 | Mean(Last50): -54.36
[Q] Episode 800/1500 | Mean(Last50): -71.06
[Q] Episode 1000/1500 | Mean(Last50): -56.80
[Q] Episode 1200/1500 | Mean(Last50): -59.80
[Q] Episode 1400/1500 | Mean(Last50): -77.10
Run finished | α=1, γ=0.8, param=0.001 → avg_eval=-60.65


0,1
alpha,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
episode,▁▁▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▆▆▇▇▇▇██
episode_return,▆▅▁▆▆█▅▅▅██▃▅▆█▆▇▇▅▇▇█▇█▇▇█▆▇█▆▇▅▆▅▅▅▅█▇
final_mean_reward_last50,▁
gamma,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
mean_eval_return,▁
param,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
algorithm,Q-learning
alpha,1
episode,1499
episode_return,-43
final_mean_reward_last50,-60.54
gamma,0.8
mean_eval_return,-60.65
param,0.001
strategy,eps_greedy


[34m[1mwandb[0m: Agent Starting Run: fq9ivs38 with config:
[34m[1mwandb[0m: 	algorithm: q_learning
[34m[1mwandb[0m: 	alpha: 1
[34m[1mwandb[0m: 	env_name: std_q_tp0.7_ss(0,4)_strateps_greedy
[34m[1mwandb[0m: 	gamma: 0.8
[34m[1mwandb[0m: 	param: 0.01
[34m[1mwandb[0m: 	strategy: eps_greedy


[Q] Episode 200/1500 | Mean(Last50): -87.44
[Q] Episode 400/1500 | Mean(Last50): -65.90
[Q] Episode 600/1500 | Mean(Last50): -74.34
[Q] Episode 800/1500 | Mean(Last50): -61.86
[Q] Episode 1000/1500 | Mean(Last50): -47.18
[Q] Episode 1200/1500 | Mean(Last50): -68.12
[Q] Episode 1400/1500 | Mean(Last50): -88.84
Run finished | α=1, γ=0.8, param=0.01 → avg_eval=-64.65


0,1
alpha,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
episode,▁▁▁▂▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▅▅▅▅▅▅▅▅▅▆▇▇▇▇▇▇▇███
episode_return,▆▆▇▆▆▆▆▆▇▇▇▆███▆▇▁▆▇█▇█▇█▇▇█▇▆▃▇█▅▆▆▆▇▇▇
final_mean_reward_last50,▁
gamma,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
mean_eval_return,▁
param,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
algorithm,Q-learning
alpha,1
episode,1499
episode_return,-69
final_mean_reward_last50,-70.32
gamma,0.8
mean_eval_return,-64.65
param,0.01
strategy,eps_greedy


[34m[1mwandb[0m: Agent Starting Run: hf34qm2i with config:
[34m[1mwandb[0m: 	algorithm: q_learning
[34m[1mwandb[0m: 	alpha: 1
[34m[1mwandb[0m: 	env_name: std_q_tp0.7_ss(0,4)_strateps_greedy
[34m[1mwandb[0m: 	gamma: 0.8
[34m[1mwandb[0m: 	param: 0.05
[34m[1mwandb[0m: 	strategy: eps_greedy


[Q] Episode 200/1500 | Mean(Last50): -89.84
[Q] Episode 400/1500 | Mean(Last50): -111.66
[Q] Episode 600/1500 | Mean(Last50): -127.10
[Q] Episode 800/1500 | Mean(Last50): -113.66
[Q] Episode 1000/1500 | Mean(Last50): -139.06
[Q] Episode 1200/1500 | Mean(Last50): -114.04
[Q] Episode 1400/1500 | Mean(Last50): -123.12
Run finished | α=1, γ=0.8, param=0.05 → avg_eval=-101.30


0,1
alpha,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
episode,▁▁▁▁▂▂▂▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▆▆▆▆▆▇▇▇▇█
episode_return,▅▅▇█▅▇▅▅▇▇▅█▂▅▅▅▆█▅▅▁▁▆▇▅▇▅▆█▇▆▁▅▅▁██▅█▅
final_mean_reward_last50,▁
gamma,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
mean_eval_return,▁
param,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
algorithm,Q-learning
alpha,1
episode,1499
episode_return,-69
final_mean_reward_last50,-97.7
gamma,0.8
mean_eval_return,-101.3
param,0.05
strategy,eps_greedy


[34m[1mwandb[0m: Agent Starting Run: lur7kc08 with config:
[34m[1mwandb[0m: 	algorithm: q_learning
[34m[1mwandb[0m: 	alpha: 1
[34m[1mwandb[0m: 	env_name: std_q_tp0.7_ss(0,4)_strateps_greedy
[34m[1mwandb[0m: 	gamma: 0.8
[34m[1mwandb[0m: 	param: 0.1
[34m[1mwandb[0m: 	strategy: eps_greedy


[Q] Episode 200/1500 | Mean(Last50): -146.64
[Q] Episode 400/1500 | Mean(Last50): -136.12
[Q] Episode 600/1500 | Mean(Last50): -116.74
[Q] Episode 800/1500 | Mean(Last50): -102.88
[Q] Episode 1000/1500 | Mean(Last50): -140.36
[Q] Episode 1200/1500 | Mean(Last50): -164.72
[Q] Episode 1400/1500 | Mean(Last50): -119.22
Run finished | α=1, γ=0.8, param=0.1 → avg_eval=-90.05


0,1
alpha,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
episode,▁▁▁▁▁▂▂▂▂▃▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇██
episode_return,▆▄▆▁▇▇███▄▄▆▄▁▆▅▇▇▆▆▁▆█▄▇▄▇▄▄▇█▇▄▄██▇█▄▆
final_mean_reward_last50,▁
gamma,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
mean_eval_return,▁
param,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
algorithm,Q-learning
alpha,1
episode,1499
episode_return,-298
final_mean_reward_last50,-114.32
gamma,0.8
mean_eval_return,-90.05
param,0.1
strategy,eps_greedy


[34m[1mwandb[0m: Agent Starting Run: supptp6w with config:
[34m[1mwandb[0m: 	algorithm: q_learning
[34m[1mwandb[0m: 	alpha: 1
[34m[1mwandb[0m: 	env_name: std_q_tp0.7_ss(0,4)_strateps_greedy
[34m[1mwandb[0m: 	gamma: 0.9
[34m[1mwandb[0m: 	param: 0.001
[34m[1mwandb[0m: 	strategy: eps_greedy


[Q] Episode 200/1500 | Mean(Last50): -40.80
[Q] Episode 400/1500 | Mean(Last50): -70.40
[Q] Episode 600/1500 | Mean(Last50): -74.66
[Q] Episode 800/1500 | Mean(Last50): -57.10
[Q] Episode 1000/1500 | Mean(Last50): -45.62
[Q] Episode 1200/1500 | Mean(Last50): -51.68
[Q] Episode 1400/1500 | Mean(Last50): -59.40
Run finished | α=1, γ=0.9, param=0.001 → avg_eval=-76.10


0,1
alpha,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
episode,▁▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▄▄▅▅▆▆▆▆▆▆▇▇▇▇████
episode_return,▁▅▇▅▅▇██▇▇██▆▅▅█▅▆▆▅▆▇▇▅█▆▆▇▅▅▅███▇█▅▇▅▇
final_mean_reward_last50,▁
gamma,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
mean_eval_return,▁
param,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
algorithm,Q-learning
alpha,1
episode,1499
episode_return,-100
final_mean_reward_last50,-57.88
gamma,0.9
mean_eval_return,-76.1
param,0.001
strategy,eps_greedy


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: gmccvx5o with config:
[34m[1mwandb[0m: 	algorithm: q_learning
[34m[1mwandb[0m: 	alpha: 1
[34m[1mwandb[0m: 	env_name: std_q_tp0.7_ss(0,4)_strateps_greedy
[34m[1mwandb[0m: 	gamma: 0.9
[34m[1mwandb[0m: 	param: 0.01
[34m[1mwandb[0m: 	strategy: eps_greedy


[Q] Episode 200/1500 | Mean(Last50): -88.52
[Q] Episode 400/1500 | Mean(Last50): -85.94
[Q] Episode 600/1500 | Mean(Last50): -66.86
[Q] Episode 800/1500 | Mean(Last50): -79.50
[Q] Episode 1000/1500 | Mean(Last50): -49.32
[Q] Episode 1200/1500 | Mean(Last50): -68.00
[Q] Episode 1400/1500 | Mean(Last50): -65.66
Run finished | α=1, γ=0.9, param=0.01 → avg_eval=-92.75


0,1
alpha,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
episode,▁▁▁▁▂▂▂▂▂▂▂▂▃▃▃▃▄▄▅▅▅▅▅▅▅▅▆▆▆▆▆▆▆▇▇█████
episode_return,▃▁▃▆▆▆▇▆▆█▆▆▆▆▆██▆▆▆▇▇▆▆▇███▇██▅▇▇█▇▇▇█▇
final_mean_reward_last50,▁
gamma,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
mean_eval_return,▁
param,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
algorithm,Q-learning
alpha,1
episode,1499
episode_return,-34
final_mean_reward_last50,-79.36
gamma,0.9
mean_eval_return,-92.75
param,0.01
strategy,eps_greedy


[34m[1mwandb[0m: Agent Starting Run: k1jlpmn6 with config:
[34m[1mwandb[0m: 	algorithm: q_learning
[34m[1mwandb[0m: 	alpha: 1
[34m[1mwandb[0m: 	env_name: std_q_tp0.7_ss(0,4)_strateps_greedy
[34m[1mwandb[0m: 	gamma: 0.9
[34m[1mwandb[0m: 	param: 0.05
[34m[1mwandb[0m: 	strategy: eps_greedy


[Q] Episode 200/1500 | Mean(Last50): -107.48
[Q] Episode 400/1500 | Mean(Last50): -61.20
[Q] Episode 600/1500 | Mean(Last50): -84.76
[Q] Episode 800/1500 | Mean(Last50): -102.72
[Q] Episode 1000/1500 | Mean(Last50): -75.12
[Q] Episode 1200/1500 | Mean(Last50): -96.06
[Q] Episode 1400/1500 | Mean(Last50): -100.62
Run finished | α=1, γ=0.9, param=0.05 → avg_eval=-121.30


0,1
alpha,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
episode,▁▁▂▂▂▂▃▃▃▃▄▄▄▄▄▄▄▄▅▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇███
episode_return,▆▂▆▆▄▆▆▆▇▅█▅▇▄▆▇▇▅▇█▇▆▇▆▁█▆▄█▆▆▆██▇▆▇▆█▆
final_mean_reward_last50,▁
gamma,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
mean_eval_return,▁
param,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
algorithm,Q-learning
alpha,1
episode,1499
episode_return,-58
final_mean_reward_last50,-124.16
gamma,0.9
mean_eval_return,-121.3
param,0.05
strategy,eps_greedy


[34m[1mwandb[0m: Agent Starting Run: nx0zbm0n with config:
[34m[1mwandb[0m: 	algorithm: q_learning
[34m[1mwandb[0m: 	alpha: 1
[34m[1mwandb[0m: 	env_name: std_q_tp0.7_ss(0,4)_strateps_greedy
[34m[1mwandb[0m: 	gamma: 0.9
[34m[1mwandb[0m: 	param: 0.1
[34m[1mwandb[0m: 	strategy: eps_greedy


[Q] Episode 200/1500 | Mean(Last50): -133.96
[Q] Episode 400/1500 | Mean(Last50): -69.62
[Q] Episode 600/1500 | Mean(Last50): -71.48
[Q] Episode 800/1500 | Mean(Last50): -119.84
[Q] Episode 1000/1500 | Mean(Last50): -121.02
[Q] Episode 1200/1500 | Mean(Last50): -126.00
[Q] Episode 1400/1500 | Mean(Last50): -128.04
Run finished | α=1, γ=0.9, param=0.1 → avg_eval=-113.00


0,1
alpha,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
episode,▁▁▁▁▁▂▂▂▃▃▃▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▅▆▆▆▆▇▇▇███
episode_return,▅█▆▅▇█▇▆▇▆▇█▆▇█▆▇▆▇████▁▇▆▇██▅▇█▁▇▆▃▅▇▆█
final_mean_reward_last50,▁
gamma,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
mean_eval_return,▁
param,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
algorithm,Q-learning
alpha,1
episode,1499
episode_return,-76
final_mean_reward_last50,-116.2
gamma,0.9
mean_eval_return,-113
param,0.1
strategy,eps_greedy


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: 9zya26km with config:
[34m[1mwandb[0m: 	algorithm: q_learning
[34m[1mwandb[0m: 	alpha: 1
[34m[1mwandb[0m: 	env_name: std_q_tp0.7_ss(0,4)_strateps_greedy
[34m[1mwandb[0m: 	gamma: 1
[34m[1mwandb[0m: 	param: 0.001
[34m[1mwandb[0m: 	strategy: eps_greedy


[Q] Episode 200/1500 | Mean(Last50): -35.08
[Q] Episode 400/1500 | Mean(Last50): -23.96
[Q] Episode 600/1500 | Mean(Last50): -41.76
[Q] Episode 800/1500 | Mean(Last50): -32.02
[Q] Episode 1000/1500 | Mean(Last50): -28.18
[Q] Episode 1200/1500 | Mean(Last50): -39.20
[Q] Episode 1400/1500 | Mean(Last50): -47.58
Run finished | α=1, γ=1, param=0.001 → avg_eval=-33.15


0,1
alpha,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
episode,▁▁▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▅▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇█████
episode_return,▁▃▆▇▆██▆▇▅█▇▅▇▇█▆▇▆▄▇▇▆▆██▇▅▇▇▅▇▅▆▆▆▆▆▇▆
final_mean_reward_last50,▁
gamma,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
mean_eval_return,▁
param,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
algorithm,Q-learning
alpha,1
episode,1499
episode_return,-28
final_mean_reward_last50,-28.32
gamma,1
mean_eval_return,-33.15
param,0.001
strategy,eps_greedy


[34m[1mwandb[0m: Agent Starting Run: ogsu3qd8 with config:
[34m[1mwandb[0m: 	algorithm: q_learning
[34m[1mwandb[0m: 	alpha: 1
[34m[1mwandb[0m: 	env_name: std_q_tp0.7_ss(0,4)_strateps_greedy
[34m[1mwandb[0m: 	gamma: 1
[34m[1mwandb[0m: 	param: 0.01
[34m[1mwandb[0m: 	strategy: eps_greedy


[Q] Episode 200/1500 | Mean(Last50): -39.00
[Q] Episode 400/1500 | Mean(Last50): -50.34
[Q] Episode 600/1500 | Mean(Last50): -45.60
[Q] Episode 800/1500 | Mean(Last50): -35.10
[Q] Episode 1000/1500 | Mean(Last50): -33.10
[Q] Episode 1200/1500 | Mean(Last50): -34.54
[Q] Episode 1400/1500 | Mean(Last50): -31.26
Run finished | α=1, γ=1, param=0.01 → avg_eval=-54.05


0,1
alpha,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
episode,▁▁▁▁▂▂▃▃▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇█████
episode_return,▁██▇███▅██▆▆█▇█▇▆███▇██▇█▇█▇▇█▇█▅▄███▆▇█
final_mean_reward_last50,▁
gamma,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
mean_eval_return,▁
param,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
algorithm,Q-learning
alpha,1
episode,1499
episode_return,-115
final_mean_reward_last50,-55.5
gamma,1
mean_eval_return,-54.05
param,0.01
strategy,eps_greedy


[34m[1mwandb[0m: Agent Starting Run: 64lbrgsf with config:
[34m[1mwandb[0m: 	algorithm: q_learning
[34m[1mwandb[0m: 	alpha: 1
[34m[1mwandb[0m: 	env_name: std_q_tp0.7_ss(0,4)_strateps_greedy
[34m[1mwandb[0m: 	gamma: 1
[34m[1mwandb[0m: 	param: 0.05
[34m[1mwandb[0m: 	strategy: eps_greedy


[Q] Episode 200/1500 | Mean(Last50): -74.52
[Q] Episode 400/1500 | Mean(Last50): -53.64
[Q] Episode 600/1500 | Mean(Last50): -62.18
[Q] Episode 800/1500 | Mean(Last50): -47.78
[Q] Episode 1000/1500 | Mean(Last50): -56.32
[Q] Episode 1200/1500 | Mean(Last50): -38.84
[Q] Episode 1400/1500 | Mean(Last50): -37.96
Run finished | α=1, γ=1, param=0.05 → avg_eval=-33.80


0,1
alpha,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
episode,▁▁▁▂▂▂▂▂▂▂▃▃▃▃▄▅▅▅▅▅▅▅▅▅▆▆▆▆▆▆▆▆▆▇▇▇▇███
episode_return,██▅▇███▇▆█▇█▆█▇▁▇██▇▇█▆▅▇██▇█▅▇█████▇▇▆▇
final_mean_reward_last50,▁
gamma,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
mean_eval_return,▁
param,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
algorithm,Q-learning
alpha,1
episode,1499
episode_return,-31
final_mean_reward_last50,-38.16
gamma,1
mean_eval_return,-33.8
param,0.05
strategy,eps_greedy


[34m[1mwandb[0m: Agent Starting Run: gb0d4seq with config:
[34m[1mwandb[0m: 	algorithm: q_learning
[34m[1mwandb[0m: 	alpha: 1
[34m[1mwandb[0m: 	env_name: std_q_tp0.7_ss(0,4)_strateps_greedy
[34m[1mwandb[0m: 	gamma: 1
[34m[1mwandb[0m: 	param: 0.1
[34m[1mwandb[0m: 	strategy: eps_greedy


[Q] Episode 200/1500 | Mean(Last50): -70.48
[Q] Episode 400/1500 | Mean(Last50): -67.44
[Q] Episode 600/1500 | Mean(Last50): -56.10
[Q] Episode 800/1500 | Mean(Last50): -68.96
[Q] Episode 1000/1500 | Mean(Last50): -84.98
[Q] Episode 1200/1500 | Mean(Last50): -62.56
[Q] Episode 1400/1500 | Mean(Last50): -52.06
Run finished | α=1, γ=1, param=0.1 → avg_eval=-77.85


0,1
alpha,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
episode,▁▁▁▁▁▂▂▂▃▃▃▃▃▃▃▃▃▃▄▄▄▄▄▅▅▅▆▆▆▆▇▇▇▇▇▇▇███
episode_return,▇▁▃▆▆▇▆▆█▇█▇▆▃▇▆▆▇▄█▆▇▆▇▇█▆▆▇▇█▆▇█▇▇██▅▇
final_mean_reward_last50,▁
gamma,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
mean_eval_return,▁
param,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
algorithm,Q-learning
alpha,1
episode,1499
episode_return,-59
final_mean_reward_last50,-58.58
gamma,1
mean_eval_return,-77.85
param,0.1
strategy,eps_greedy


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Sweep Agent: Exiting.


Error in callback <bound method _WandbInit._post_run_cell_hook of <wandb.sdk.wandb_init._WandbInit object at 0x00000296B1874890>> (for post_run_cell), with arguments args (<ExecutionResult object at 296b188cc10, execution_count=10 error_before_exec=None error_in_exec=None info=<ExecutionInfo object at 296b186c490, raw_cell="wandb.agent(sweep_id, function=train_one_sweep_run.." store_history=True silent=False shell_futures=True cell_id=vscode-notebook-cell:/d%3A/...SEM5/RL/GPA_2_new/Grid-World-Environment-main/PART_A.ipynb#X22sZmlsZQ%3D%3D> result=None>,),kwargs {}:


AlreadyJoinedError: 