# PART A

In [1]:
# imports
import os
import time
import glob
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import itertools
import wandb
from env import create_standard_grid, create_four_room

# Optional: configure matplotlib for notebooks
%matplotlib inline


### Create environment

In [2]:
all_configs = []

# -------------------------------
# 10x10 Grid World: Q-learning (8)
# -------------------------------
for tp in [0.7, 1.0]:
    for start_state in [(0, 4), (3, 6)]:
        for strat in ['eps_greedy', 'softmax']:
            name = f"std_q_tp{tp}_ss{start_state}_strat{strat}"
            cfg = {
                'name': name,
                'alg': 'q_learning',
                'env_builder': create_standard_grid,
                'env_kwargs': {
                    # ✅ FIX: convert tuple to 2D numpy array
                    'start_state': np.array([[start_state[0], start_state[1]]]),
                    'transition_prob': tp,
                    'wind': False
                },
                'strategy': strat
            }
            all_configs.append(cfg)

# -------------------------------
# 10x10 Grid World: SARSA (8)
# -------------------------------
for wind in [True, False]:
    for start_state in [(0, 4), (3, 6)]:
        for strat in ['eps_greedy', 'softmax']:
            name = f"std_sarsa_wind{wind}_ss{start_state}_strat{strat}"
            cfg = {
                'name': name,
                'alg': 'sarsa',
                'env_builder': create_standard_grid,
                'env_kwargs': {
                    # ✅ FIX here too
                    'start_state': np.array([[start_state[0], start_state[1]]]),
                    'transition_prob': 1.0,
                    'wind': wind
                },
                'strategy': strat
            }
            all_configs.append(cfg)

# -------------------------------
# Four-Room configs (no fix needed)
# -------------------------------
for goal_change in [True, False]:
    all_configs.append({
        'name': f"four_q_goalchange{goal_change}_strateps_greedy",
        'alg': 'q_learning',
        'env_builder': create_four_room,
        'env_kwargs': {'goal_change': goal_change},
        'strategy': 'eps_greedy'
    })
for goal_change in [True, False]:
    all_configs.append({
        'name': f"four_sarsa_goalchange{goal_change}_strateps_greedy",
        'alg': 'sarsa',
        'env_builder': create_four_room,
        'env_kwargs': {'goal_change': goal_change},
        'strategy': 'eps_greedy'
    })

### Functions

In [3]:
# Safe reward scalar extractor for 1-element arrays
def reward_scalar(r):
    return float(np.ravel(r)[0])

# epsilon greedy function
# Chose to use correct datatype outputs as it becomes a problem in downstream tasks.

def epsilon_greedy_action(Q: np.ndarray, state: int, epsilon: float) -> int:
    """Pick epsilon-greedy action from Q[state]."""
    if np.random.rand() < epsilon:
        return np.random.randint(Q.shape[1])
    return int(np.argmax(Q[state]))

# --- Episode termination check ---
def is_terminal(state: int, goal_states_seq: np.ndarray) -> bool:
    return state in set(map(int, np.array(goal_states_seq).flatten()))

# --- Rollouts for evaluation ---
def evaluate_policy(env, Q: np.ndarray, episodes: int = 20, max_steps: int = 100) -> float:
    total = 0.0
    for _ in range(episodes):
        s = env.reset()
        ep_ret = 0.0
        for _ in range(max_steps):
            a = int(np.argmax(Q[s]))
            s_next, r = env.step(s, a)
            ep_ret += reward_scalar(r)
            s = int(s_next)
            if is_terminal(s, env.goal_states_seq):
                break
        total += ep_ret
    return total / episodes
    
def softmax_action(Q, state, tau):
    tau = float(tau)
    if tau <= 0:
        return int(np.argmax(Q[state]))
    z = Q[state] / tau
    z = z - np.max(z)
    p = np.exp(z)
    p = p / np.sum(p)
    return int(np.random.choice(len(p), p=p))


def select_action(Q, state, strategy, param):
    if strategy == 'eps_greedy':
        if np.random.rand() < float(param):
            return int(np.random.randint(Q.shape[1]))
        return int(np.argmax(Q[state]))
    elif strategy == 'softmax':
        return softmax_action(Q, state, float(param))
    else:
        raise ValueError('Unknown exploration strategy: ' + str(strategy))

### Q-learning function

In [4]:
def train_q_learning_fixed(env, episodes, alpha, gamma, strategy, param, max_steps=100, run=None, seed=None):
    """
    Q-Learning with optional WandB logging.
    """
    n_states, n_actions = env.num_states, env.num_actions
    Q = np.zeros((n_states, n_actions))
    returns = []

    for ep in range(episodes):
        state = int(env.reset())
        total_reward = 0.0

        for _ in range(max_steps):
            a = select_action(Q, state, strategy, param)
            s_next, r = env.step(state, a)
            r = reward_scalar(r)
            total_reward += r
            s_next = int(s_next)

            if is_terminal(s_next, env.goal_states_seq):
                Q[state, a] += alpha * (r - Q[state, a])
                break

            td_target = r + gamma * np.max(Q[s_next])
            Q[state, a] += alpha * (td_target - Q[state, a])
            state = s_next

        returns.append(total_reward)

        # ✅ WandB logging
        if run is not None:
            wandb.log({
                "episode": ep,
                "episode_return": total_reward,
                "algorithm": "Q-learning",
                "alpha": alpha,
                "gamma": gamma,
                "param": param,
                "strategy": strategy,
                "seed": seed
            })

        if (ep + 1) % 200 == 0:
            print(f"[Q] Episode {ep+1}/{episodes} | Mean(Last50): {np.mean(returns[-50:]):.2f}")

    return Q, returns


### SARSA function

In [5]:
def train_sarsa_fixed(env, episodes, alpha, gamma, strategy, param, max_steps=100, run=None, seed=None):
    """
    SARSA with optional WandB logging.
    """
    n_states, n_actions = env.num_states, env.num_actions
    Q = np.zeros((n_states, n_actions))
    returns = []

    for ep in range(episodes):
        s = int(env.reset())
        a = select_action(Q, s, strategy, param)
        total_reward = 0.0

        for _ in range(max_steps):
            s_next, r = env.step(s, a)
            r = reward_scalar(r)
            total_reward += r
            s_next = int(s_next)

            if is_terminal(s_next, env.goal_states_seq):
                Q[s, a] += alpha * (r - Q[s, a])
                break

            a_next = select_action(Q, s_next, strategy, param)
            td_target = r + gamma * Q[s_next, a_next]
            Q[s, a] += alpha * (td_target - Q[s, a])
            s, a = s_next, a_next

        returns.append(total_reward)

        # ✅ WandB logging
        if run is not None:
            wandb.log({
                "episode": ep,
                "episode_return": total_reward,
                "algorithm": "SARSA",
                "alpha": alpha,
                "gamma": gamma,
                "param": param,
                "strategy": strategy,
                "seed": seed
            })

        if (ep + 1) % 200 == 0:
            print(f"[SARSA] Episode {ep+1}/{episodes} | Mean(Last50): {np.mean(returns[-50:]):.2f}")

    return Q, returns


### Setting up Wandb

In [6]:
'''wandb.init()

# 
sweep_config_1 = {
    'method': 'grid',   # try every combination
    'metric': {
        'name': 'mean_eval_return',
        'goal': 'maximize'
    },
    'parameters': {
        'alpha': {'values': [0.001, 0.01, 0.1, 1.0]},
        'gamma': {'values': [0.7, 0.8, 0.9, 1.0]},
        'param': {'values': [0.001, 0.01, 0.05, 0.1]},  # Use param if your code expects epsilon as "param"
        'algorithm': {'values': ['q_learning']},
        'strategy': {'values': ['eps_greedy']},
        'env_name': {'values': ['std_q_tp0.7_ss(0,4)_strateps_greedy']}
    }
}'''

'wandb.init()\n\n# \nsweep_config_1 = {\n    \'method\': \'grid\',   # try every combination\n    \'metric\': {\n        \'name\': \'mean_eval_return\',\n        \'goal\': \'maximize\'\n    },\n    \'parameters\': {\n        \'alpha\': {\'values\': [0.001, 0.01, 0.1, 1.0]},\n        \'gamma\': {\'values\': [0.7, 0.8, 0.9, 1.0]},\n        \'param\': {\'values\': [0.001, 0.01, 0.05, 0.1]},  # Use param if your code expects epsilon as "param"\n        \'algorithm\': {\'values\': [\'q_learning\']},\n        \'strategy\': {\'values\': [\'eps_greedy\']},\n        \'env_name\': {\'values\': [\'std_q_tp0.7_ss(0,4)_strateps_greedy\']}\n    }\n}'

In [8]:
'''sweep_id = wandb.sweep(sweep_config_1, project="GPA_2_PART_A")
print("Sweep ID:", sweep_id)'''


'sweep_id = wandb.sweep(sweep_config_1, project="GPA_2_PART_A")\nprint("Sweep ID:", sweep_id)'

In [11]:
import numpy as np
import wandb
import random

def train_one_sweep_run():
    wandb.init()
    cfg = wandb.config

    n_seeds = 5  # number of seeds
    eval_scores = []
    final_mean_rewards = []

    for seed in range(n_seeds):
        np.random.seed(seed)
        random.seed(seed)

        # setup environment (make sure env creation uses np.random properly)
        env = create_standard_grid(transition_prob=0.7, start_state=np.array([[0, 4]]))

        # choose algorithm
        if cfg.algorithm == "q_learning":
            Q, returns = train_q_learning_fixed(
                env,
                episodes=1500,
                alpha=cfg.alpha,
                gamma=cfg.gamma,
                strategy=cfg.strategy,
                param=cfg.param,
                max_steps=100,
                run=None  # avoid spamming W&B with logs per seed
            )
        else:
            Q, returns = train_sarsa_fixed(
                env,
                episodes=1500,
                alpha=cfg.alpha,
                gamma=cfg.gamma,
                strategy=cfg.strategy,
                param=cfg.param,
                max_steps=100,
                run=None
            )

        # evaluate this seed
        avg_eval = evaluate_policy(env, Q, episodes=20, max_steps=100)

        eval_scores.append(avg_eval)
        final_mean_rewards.append(np.mean(returns[-50:]))

        wandb.log({
            'seed': seed,
            'seed_eval_return': avg_eval,
            'seed_final_mean_reward': np.mean(returns[-50:])
        })

    # aggregate over all seeds
    mean_eval = np.mean(eval_scores)
    std_eval = np.std(eval_scores)
    mean_final_reward = np.mean(final_mean_rewards)
    std_final_reward = np.std(final_mean_rewards)

    # log aggregated statistics
    wandb.log({
        'mean_eval_return': mean_eval,
        'std_eval_return': std_eval,
        'mean_final_mean_reward_last50': mean_final_reward,
        'std_final_mean_reward_last50': std_final_reward
    })

    print(
        f"Run finished | α={cfg.alpha}, γ={cfg.gamma}, param={cfg.param} "
        f"→ avg_eval={mean_eval:.2f} ± {std_eval:.2f}"
    )
    wandb.finish()



In [None]:
'''def train_one_sweep_run_for_seeds():
    wandb.init()
    seeds = [1, 2, 3, 4, 5]
    eval_returns = []
    returns_list = []

    cfg = wandb.config

    for seed in seeds:
        set_seed(seed)
        # DO NOT call wandb.init() here!

        env = create_standard_grid(transition_prob=0.7, start_state=np.array([[0, 4]]))

        # ...train...

        avg_eval = evaluate_policy(env, Q, episodes=20, max_steps=100)
        eval_returns.append(avg_eval)
        returns_list.append(np.mean(returns[-50:]))

        wandb.log({
            'mean_eval_return': avg_eval,
            'final_mean_reward_last50': np.mean(returns[-50:]),
            'seed': seed
        })
        print(f"Seed {seed}... avg_eval={avg_eval:.2f}")

    # Log the mean across all seeds for the sweep run
    mean_eval = np.mean(eval_returns)
    mean_last_50 = np.mean(returns_list)
    wandb.log({
        'mean_eval_return_5seeds': mean_eval,
        'final_mean_reward_last50_5seeds': mean_last_50
    })
    wandb.finish()'''


In [None]:
#wandb.agent(sweep_id, function=train_one_sweep_run, count=None)


In [12]:
sweep_config_2 = {
    'method': 'grid',   # try every combination
    'metric': {
        'name': 'mean_eval_return',
        'goal': 'maximize'
    },
    'parameters': {
        'alpha': {'values': [0.001, 0.01, 0.1, 1.0]},
        'gamma': {'values': [0.7, 0.8, 0.9, 1.0]},
        'param': {'values': [0.001, 0.01, 0.05, 0.1]},
        'algorithm': {'values': ['q_learning']},
        'strategy': {'values': ['eps_greedy']},
        'env_name': {'values': ['std_q_tp1.0_ss(0,4)_strateps_greedy']}
    }
}


In [13]:
sweep_id_2 = wandb.sweep(sweep_config_2, project="GPA_2_PART_A")
print("Sweep ID:", sweep_id_2)


Create sweep with ID: dn2upq25
Sweep URL: https://wandb.ai/leenh050418-iit-madras-foundation/GPA_2_PART_A/sweeps/dn2upq25
Sweep ID: dn2upq25


In [14]:
wandb.agent(sweep_id_2, function=train_one_sweep_run, count=None)

[34m[1mwandb[0m: Agent Starting Run: ds8x8ucm with config:
[34m[1mwandb[0m: 	algorithm: q_learning
[34m[1mwandb[0m: 	alpha: 0.001
[34m[1mwandb[0m: 	env_name: std_q_tp1.0_ss(0,4)_strateps_greedy
[34m[1mwandb[0m: 	gamma: 0.7
[34m[1mwandb[0m: 	param: 0.001
[34m[1mwandb[0m: 	strategy: eps_greedy
[34m[1mwandb[0m: Currently logged in as: [33mleenh050418[0m ([33mleenh050418-iit-madras-foundation[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


[Q] Episode 200/1500 | Mean(Last50): -117.18
[Q] Episode 400/1500 | Mean(Last50): -111.70
[Q] Episode 600/1500 | Mean(Last50): -107.54
[Q] Episode 800/1500 | Mean(Last50): -106.52
[Q] Episode 1000/1500 | Mean(Last50): -107.24
[Q] Episode 1200/1500 | Mean(Last50): -106.56
[Q] Episode 1400/1500 | Mean(Last50): -112.98
[Q] Episode 200/1500 | Mean(Last50): -111.56
[Q] Episode 400/1500 | Mean(Last50): -108.46
[Q] Episode 600/1500 | Mean(Last50): -110.16
[Q] Episode 800/1500 | Mean(Last50): -111.22
[Q] Episode 1000/1500 | Mean(Last50): -105.06
[Q] Episode 1200/1500 | Mean(Last50): -108.20
[Q] Episode 1400/1500 | Mean(Last50): -107.04
[Q] Episode 200/1500 | Mean(Last50): -113.84
[Q] Episode 400/1500 | Mean(Last50): -117.84
[Q] Episode 600/1500 | Mean(Last50): -112.90
[Q] Episode 800/1500 | Mean(Last50): -115.48
[Q] Episode 1000/1500 | Mean(Last50): -111.96
[Q] Episode 1200/1500 | Mean(Last50): -107.74
[Q] Episode 1400/1500 | Mean(Last50): -104.94
[Q] Episode 200/1500 | Mean(Last50): -114.36
[

[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


Run finished | α=0.001, γ=0.7, param=0.001 → avg_eval=-133.22 ± 54.59


0,1
mean_eval_return,▁
mean_final_mean_reward_last50,▁
seed,▁▃▅▆█
seed_eval_return,█▇█▁█
seed_final_mean_reward,▂█▄▆▁
std_eval_return,▁
std_final_mean_reward_last50,▁

0,1
mean_eval_return,-133.22
mean_final_mean_reward_last50,-106.38
seed,4.0
seed_eval_return,-105.5
seed_final_mean_reward,-110.2
std_eval_return,54.5884
std_final_mean_reward_last50,2.99145


[34m[1mwandb[0m: Agent Starting Run: 8st3j7vk with config:
[34m[1mwandb[0m: 	algorithm: q_learning
[34m[1mwandb[0m: 	alpha: 0.001
[34m[1mwandb[0m: 	env_name: std_q_tp1.0_ss(0,4)_strateps_greedy
[34m[1mwandb[0m: 	gamma: 0.7
[34m[1mwandb[0m: 	param: 0.01
[34m[1mwandb[0m: 	strategy: eps_greedy


[Q] Episode 200/1500 | Mean(Last50): -115.14
[Q] Episode 400/1500 | Mean(Last50): -110.16
[Q] Episode 600/1500 | Mean(Last50): -108.42
[Q] Episode 800/1500 | Mean(Last50): -113.42
[Q] Episode 1000/1500 | Mean(Last50): -109.90
[Q] Episode 1200/1500 | Mean(Last50): -109.18
[Q] Episode 1400/1500 | Mean(Last50): -112.20
[Q] Episode 200/1500 | Mean(Last50): -111.18
[Q] Episode 400/1500 | Mean(Last50): -113.48
[Q] Episode 600/1500 | Mean(Last50): -117.42
[Q] Episode 800/1500 | Mean(Last50): -115.70
[Q] Episode 1000/1500 | Mean(Last50): -112.84
[Q] Episode 1200/1500 | Mean(Last50): -113.40
[Q] Episode 1400/1500 | Mean(Last50): -106.94
[Q] Episode 200/1500 | Mean(Last50): -111.92
[Q] Episode 400/1500 | Mean(Last50): -114.94
[Q] Episode 600/1500 | Mean(Last50): -111.06
[Q] Episode 800/1500 | Mean(Last50): -106.14
[Q] Episode 1000/1500 | Mean(Last50): -112.40
[Q] Episode 1200/1500 | Mean(Last50): -109.08
[Q] Episode 1400/1500 | Mean(Last50): -105.40
[Q] Episode 200/1500 | Mean(Last50): -111.48
[

[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


Run finished | α=0.001, γ=0.7, param=0.01 → avg_eval=-108.38 ± 11.33


0,1
mean_eval_return,▁
mean_final_mean_reward_last50,▁
seed,▁▃▅▆█
seed_eval_return,██▁▅█
seed_final_mean_reward,▅▁█▃▃
std_eval_return,▁
std_final_mean_reward_last50,▁

0,1
mean_eval_return,-108.38
mean_final_mean_reward_last50,-106.78
seed,4.0
seed_eval_return,-100.0
seed_final_mean_reward,-108.6
std_eval_return,11.33215
std_final_mean_reward_last50,3.70744


[34m[1mwandb[0m: Agent Starting Run: xuugc7qr with config:
[34m[1mwandb[0m: 	algorithm: q_learning
[34m[1mwandb[0m: 	alpha: 0.001
[34m[1mwandb[0m: 	env_name: std_q_tp1.0_ss(0,4)_strateps_greedy
[34m[1mwandb[0m: 	gamma: 0.7
[34m[1mwandb[0m: 	param: 0.05
[34m[1mwandb[0m: 	strategy: eps_greedy


[Q] Episode 200/1500 | Mean(Last50): -113.26
[Q] Episode 400/1500 | Mean(Last50): -121.46
[Q] Episode 600/1500 | Mean(Last50): -107.42
[Q] Episode 800/1500 | Mean(Last50): -108.62
[Q] Episode 1000/1500 | Mean(Last50): -110.68
[Q] Episode 1200/1500 | Mean(Last50): -107.30
[Q] Episode 1400/1500 | Mean(Last50): -116.56
[Q] Episode 200/1500 | Mean(Last50): -110.18
[Q] Episode 400/1500 | Mean(Last50): -115.52
[Q] Episode 600/1500 | Mean(Last50): -113.10
[Q] Episode 800/1500 | Mean(Last50): -116.76
[Q] Episode 1000/1500 | Mean(Last50): -113.30
[Q] Episode 1200/1500 | Mean(Last50): -110.46
[Q] Episode 1400/1500 | Mean(Last50): -114.44
[Q] Episode 200/1500 | Mean(Last50): -106.98
[Q] Episode 400/1500 | Mean(Last50): -112.04
[Q] Episode 600/1500 | Mean(Last50): -115.18
[Q] Episode 800/1500 | Mean(Last50): -111.58
[Q] Episode 1000/1500 | Mean(Last50): -114.70
[Q] Episode 1200/1500 | Mean(Last50): -116.92
[Q] Episode 1400/1500 | Mean(Last50): -114.26
[Q] Episode 200/1500 | Mean(Last50): -121.70
[

[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


Run finished | α=0.001, γ=0.7, param=0.05 → avg_eval=-104.79 ± 6.66


0,1
mean_eval_return,▁
mean_final_mean_reward_last50,▁
seed,▁▃▅▆█
seed_eval_return,███▅▁
seed_final_mean_reward,▆▁█▇█
std_eval_return,▁
std_final_mean_reward_last50,▁

0,1
mean_eval_return,-104.79
mean_final_mean_reward_last50,-112.82
seed,4.0
seed_eval_return,-116.95
seed_final_mean_reward,-109.18
std_eval_return,6.65706
std_final_mean_reward_last50,4.6958


[34m[1mwandb[0m: Agent Starting Run: b4wyhq73 with config:
[34m[1mwandb[0m: 	algorithm: q_learning
[34m[1mwandb[0m: 	alpha: 0.001
[34m[1mwandb[0m: 	env_name: std_q_tp1.0_ss(0,4)_strateps_greedy
[34m[1mwandb[0m: 	gamma: 0.7
[34m[1mwandb[0m: 	param: 0.1
[34m[1mwandb[0m: 	strategy: eps_greedy


[Q] Episode 200/1500 | Mean(Last50): -118.96
[Q] Episode 400/1500 | Mean(Last50): -115.24
[Q] Episode 600/1500 | Mean(Last50): -113.26
[Q] Episode 800/1500 | Mean(Last50): -120.68
[Q] Episode 1000/1500 | Mean(Last50): -115.62
[Q] Episode 1200/1500 | Mean(Last50): -115.36
[Q] Episode 1400/1500 | Mean(Last50): -120.02
[Q] Episode 200/1500 | Mean(Last50): -116.98
[Q] Episode 400/1500 | Mean(Last50): -118.24
[Q] Episode 600/1500 | Mean(Last50): -112.86
[Q] Episode 800/1500 | Mean(Last50): -111.80
[Q] Episode 1000/1500 | Mean(Last50): -107.56
[Q] Episode 1200/1500 | Mean(Last50): -111.06
[Q] Episode 1400/1500 | Mean(Last50): -112.08
[Q] Episode 200/1500 | Mean(Last50): -122.78
[Q] Episode 400/1500 | Mean(Last50): -120.24
[Q] Episode 600/1500 | Mean(Last50): -120.56
[Q] Episode 800/1500 | Mean(Last50): -122.40
[Q] Episode 1000/1500 | Mean(Last50): -115.62
[Q] Episode 1200/1500 | Mean(Last50): -119.04
[Q] Episode 1400/1500 | Mean(Last50): -110.66
[Q] Episode 200/1500 | Mean(Last50): -114.86
[

[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


Run finished | α=0.001, γ=0.7, param=0.1 → avg_eval=-104.68 ± 4.15


0,1
mean_eval_return,▁
mean_final_mean_reward_last50,▁
seed,▁▃▅▆█
seed_eval_return,▁█▃▅█
seed_final_mean_reward,▁▃▃██
std_eval_return,▁
std_final_mean_reward_last50,▁

0,1
mean_eval_return,-104.68
mean_final_mean_reward_last50,-114.044
seed,4.0
seed_eval_return,-100.65
seed_final_mean_reward,-110.22
std_eval_return,4.15266
std_final_mean_reward_last50,3.20351


[34m[1mwandb[0m: Agent Starting Run: vvtl7i3g with config:
[34m[1mwandb[0m: 	algorithm: q_learning
[34m[1mwandb[0m: 	alpha: 0.001
[34m[1mwandb[0m: 	env_name: std_q_tp1.0_ss(0,4)_strateps_greedy
[34m[1mwandb[0m: 	gamma: 0.8
[34m[1mwandb[0m: 	param: 0.001
[34m[1mwandb[0m: 	strategy: eps_greedy


[Q] Episode 200/1500 | Mean(Last50): -119.26
[Q] Episode 400/1500 | Mean(Last50): -115.26
[Q] Episode 600/1500 | Mean(Last50): -113.04
[Q] Episode 800/1500 | Mean(Last50): -112.08
[Q] Episode 1000/1500 | Mean(Last50): -102.38
[Q] Episode 1200/1500 | Mean(Last50): -107.84
[Q] Episode 1400/1500 | Mean(Last50): -107.30
[Q] Episode 200/1500 | Mean(Last50): -117.56
[Q] Episode 400/1500 | Mean(Last50): -115.18
[Q] Episode 600/1500 | Mean(Last50): -108.42
[Q] Episode 800/1500 | Mean(Last50): -109.94
[Q] Episode 1000/1500 | Mean(Last50): -110.52
[Q] Episode 1200/1500 | Mean(Last50): -114.38
[Q] Episode 1400/1500 | Mean(Last50): -108.96
[Q] Episode 200/1500 | Mean(Last50): -117.02
[Q] Episode 400/1500 | Mean(Last50): -113.96
[Q] Episode 600/1500 | Mean(Last50): -107.18
[Q] Episode 800/1500 | Mean(Last50): -104.88
[Q] Episode 1000/1500 | Mean(Last50): -110.18
[Q] Episode 1200/1500 | Mean(Last50): -104.62
[Q] Episode 1400/1500 | Mean(Last50): -106.12
[Q] Episode 200/1500 | Mean(Last50): -110.60
[

[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


Run finished | α=0.001, γ=0.8, param=0.001 → avg_eval=-106.00 ± 12.63


0,1
mean_eval_return,▁
mean_final_mean_reward_last50,▁
seed,▁▃▅▆█
seed_eval_return,█▁███
seed_final_mean_reward,▁▄▃▂█
std_eval_return,▁
std_final_mean_reward_last50,▁

0,1
mean_eval_return,-106.0
mean_final_mean_reward_last50,-106.788
seed,4.0
seed_eval_return,-100.0
seed_final_mean_reward,-99.68
std_eval_return,12.63428
std_final_mean_reward_last50,3.87033


[34m[1mwandb[0m: Agent Starting Run: 3uspultf with config:
[34m[1mwandb[0m: 	algorithm: q_learning
[34m[1mwandb[0m: 	alpha: 0.001
[34m[1mwandb[0m: 	env_name: std_q_tp1.0_ss(0,4)_strateps_greedy
[34m[1mwandb[0m: 	gamma: 0.8
[34m[1mwandb[0m: 	param: 0.01
[34m[1mwandb[0m: 	strategy: eps_greedy


[Q] Episode 200/1500 | Mean(Last50): -113.02
[Q] Episode 400/1500 | Mean(Last50): -113.32
[Q] Episode 600/1500 | Mean(Last50): -112.20
[Q] Episode 800/1500 | Mean(Last50): -104.32
[Q] Episode 1000/1500 | Mean(Last50): -107.78
[Q] Episode 1200/1500 | Mean(Last50): -107.12
[Q] Episode 1400/1500 | Mean(Last50): -111.74
[Q] Episode 200/1500 | Mean(Last50): -116.86
[Q] Episode 400/1500 | Mean(Last50): -110.54
[Q] Episode 600/1500 | Mean(Last50): -109.48
[Q] Episode 800/1500 | Mean(Last50): -111.96
[Q] Episode 1000/1500 | Mean(Last50): -111.58
[Q] Episode 1200/1500 | Mean(Last50): -114.70
[Q] Episode 1400/1500 | Mean(Last50): -106.74
[Q] Episode 200/1500 | Mean(Last50): -115.28
[Q] Episode 400/1500 | Mean(Last50): -116.52
[Q] Episode 600/1500 | Mean(Last50): -111.48
[Q] Episode 800/1500 | Mean(Last50): -113.16
[Q] Episode 1000/1500 | Mean(Last50): -112.02
[Q] Episode 1200/1500 | Mean(Last50): -107.96
[Q] Episode 1400/1500 | Mean(Last50): -111.58
[Q] Episode 200/1500 | Mean(Last50): -114.36
[

[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


Run finished | α=0.001, γ=0.8, param=0.01 → avg_eval=-108.98 ± 16.74


0,1
mean_eval_return,▁
mean_final_mean_reward_last50,▁
seed,▁▃▅▆█
seed_eval_return,█▁███
seed_final_mean_reward,█▂▅▁▃
std_eval_return,▁
std_final_mean_reward_last50,▁

0,1
mean_eval_return,-108.98
mean_final_mean_reward_last50,-107.132
seed,4.0
seed_eval_return,-102.5
seed_final_mean_reward,-107.92
std_eval_return,16.73803
std_final_mean_reward_last50,3.50265


[34m[1mwandb[0m: Agent Starting Run: vwm080y1 with config:
[34m[1mwandb[0m: 	algorithm: q_learning
[34m[1mwandb[0m: 	alpha: 0.001
[34m[1mwandb[0m: 	env_name: std_q_tp1.0_ss(0,4)_strateps_greedy
[34m[1mwandb[0m: 	gamma: 0.8
[34m[1mwandb[0m: 	param: 0.05
[34m[1mwandb[0m: 	strategy: eps_greedy


[Q] Episode 200/1500 | Mean(Last50): -114.32
[Q] Episode 400/1500 | Mean(Last50): -115.64
[Q] Episode 600/1500 | Mean(Last50): -106.18
[Q] Episode 800/1500 | Mean(Last50): -110.52
[Q] Episode 1000/1500 | Mean(Last50): -105.56
[Q] Episode 1200/1500 | Mean(Last50): -113.80
[Q] Episode 1400/1500 | Mean(Last50): -112.22
[Q] Episode 200/1500 | Mean(Last50): -109.60
[Q] Episode 400/1500 | Mean(Last50): -115.54
[Q] Episode 600/1500 | Mean(Last50): -120.80
[Q] Episode 800/1500 | Mean(Last50): -113.80
[Q] Episode 1000/1500 | Mean(Last50): -115.22
[Q] Episode 1200/1500 | Mean(Last50): -112.76
[Q] Episode 1400/1500 | Mean(Last50): -105.68
[Q] Episode 200/1500 | Mean(Last50): -121.98
[Q] Episode 400/1500 | Mean(Last50): -110.56
[Q] Episode 600/1500 | Mean(Last50): -111.94
[Q] Episode 800/1500 | Mean(Last50): -106.40
[Q] Episode 1000/1500 | Mean(Last50): -110.20
[Q] Episode 1200/1500 | Mean(Last50): -105.82
[Q] Episode 1400/1500 | Mean(Last50): -113.58
[Q] Episode 200/1500 | Mean(Last50): -114.62
[

[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


Run finished | α=0.001, γ=0.8, param=0.05 → avg_eval=-100.10 ± 0.20


0,1
mean_eval_return,▁
mean_final_mean_reward_last50,▁
seed,▁▃▅▆█
seed_eval_return,██▁██
seed_final_mean_reward,▃▆▁▄█
std_eval_return,▁
std_final_mean_reward_last50,▁

0,1
mean_eval_return,-100.1
mean_final_mean_reward_last50,-111.24
seed,4.0
seed_eval_return,-100.0
seed_final_mean_reward,-101.74
std_eval_return,0.2
std_final_mean_reward_last50,6.20985


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: o6tpdnhk with config:
[34m[1mwandb[0m: 	algorithm: q_learning
[34m[1mwandb[0m: 	alpha: 0.001
[34m[1mwandb[0m: 	env_name: std_q_tp1.0_ss(0,4)_strateps_greedy
[34m[1mwandb[0m: 	gamma: 0.8
[34m[1mwandb[0m: 	param: 0.1
[34m[1mwandb[0m: 	strategy: eps_greedy


[Q] Episode 200/1500 | Mean(Last50): -118.06
[Q] Episode 400/1500 | Mean(Last50): -114.14
[Q] Episode 600/1500 | Mean(Last50): -128.70
[Q] Episode 800/1500 | Mean(Last50): -120.64
[Q] Episode 1000/1500 | Mean(Last50): -116.20
[Q] Episode 1200/1500 | Mean(Last50): -116.34
[Q] Episode 1400/1500 | Mean(Last50): -108.40
[Q] Episode 200/1500 | Mean(Last50): -129.98
[Q] Episode 400/1500 | Mean(Last50): -111.68
[Q] Episode 600/1500 | Mean(Last50): -114.18
[Q] Episode 800/1500 | Mean(Last50): -116.92
[Q] Episode 1000/1500 | Mean(Last50): -118.68
[Q] Episode 1200/1500 | Mean(Last50): -117.86
[Q] Episode 1400/1500 | Mean(Last50): -106.92
[Q] Episode 200/1500 | Mean(Last50): -120.52
[Q] Episode 400/1500 | Mean(Last50): -109.28
[Q] Episode 600/1500 | Mean(Last50): -120.56
[Q] Episode 800/1500 | Mean(Last50): -121.46
[Q] Episode 1000/1500 | Mean(Last50): -112.16
[Q] Episode 1200/1500 | Mean(Last50): -114.56
[Q] Episode 1400/1500 | Mean(Last50): -108.26
[Q] Episode 200/1500 | Mean(Last50): -117.72
[

[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


Run finished | α=0.001, γ=0.8, param=0.1 → avg_eval=-103.40 ± 8.82


0,1
mean_eval_return,▁
mean_final_mean_reward_last50,▁
seed,▁▃▅▆█
seed_eval_return,█▇▇▇▁
seed_final_mean_reward,▁▃█▆▅
std_eval_return,▁
std_final_mean_reward_last50,▁

0,1
mean_eval_return,-103.4
mean_final_mean_reward_last50,-118.632
seed,4.0
seed_eval_return,-120.8
seed_final_mean_reward,-117.84
std_eval_return,8.8236
std_final_mean_reward_last50,4.22144


[34m[1mwandb[0m: Agent Starting Run: uoy62ohm with config:
[34m[1mwandb[0m: 	algorithm: q_learning
[34m[1mwandb[0m: 	alpha: 0.001
[34m[1mwandb[0m: 	env_name: std_q_tp1.0_ss(0,4)_strateps_greedy
[34m[1mwandb[0m: 	gamma: 0.9
[34m[1mwandb[0m: 	param: 0.001
[34m[1mwandb[0m: 	strategy: eps_greedy


[Q] Episode 200/1500 | Mean(Last50): -112.30
[Q] Episode 400/1500 | Mean(Last50): -112.86
[Q] Episode 600/1500 | Mean(Last50): -111.30
[Q] Episode 800/1500 | Mean(Last50): -110.66
[Q] Episode 1000/1500 | Mean(Last50): -109.68
[Q] Episode 1200/1500 | Mean(Last50): -111.48
[Q] Episode 1400/1500 | Mean(Last50): -111.20
[Q] Episode 200/1500 | Mean(Last50): -114.34
[Q] Episode 400/1500 | Mean(Last50): -108.48
[Q] Episode 600/1500 | Mean(Last50): -111.02
[Q] Episode 800/1500 | Mean(Last50): -116.64
[Q] Episode 1000/1500 | Mean(Last50): -103.44
[Q] Episode 1200/1500 | Mean(Last50): -112.66
[Q] Episode 1400/1500 | Mean(Last50): -107.88
[Q] Episode 200/1500 | Mean(Last50): -116.36
[Q] Episode 400/1500 | Mean(Last50): -119.48
[Q] Episode 600/1500 | Mean(Last50): -107.12
[Q] Episode 800/1500 | Mean(Last50): -113.62
[Q] Episode 1000/1500 | Mean(Last50): -109.12
[Q] Episode 1200/1500 | Mean(Last50): -108.80
[Q] Episode 1400/1500 | Mean(Last50): -112.78
[Q] Episode 200/1500 | Mean(Last50): -112.58
[

[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


Run finished | α=0.001, γ=0.9, param=0.001 → avg_eval=-112.45 ± 21.83


0,1
mean_eval_return,▁
mean_final_mean_reward_last50,▁
seed,▁▃▅▆█
seed_eval_return,██▁█▇
seed_final_mean_reward,█▁▂█▅
std_eval_return,▁
std_final_mean_reward_last50,▁

0,1
mean_eval_return,-112.45
mean_final_mean_reward_last50,-109.252
seed,4.0
seed_eval_return,-104.25
seed_final_mean_reward,-108.74
std_eval_return,21.8314
std_final_mean_reward_last50,3.77608


[34m[1mwandb[0m: Agent Starting Run: ie7orcgt with config:
[34m[1mwandb[0m: 	algorithm: q_learning
[34m[1mwandb[0m: 	alpha: 0.001
[34m[1mwandb[0m: 	env_name: std_q_tp1.0_ss(0,4)_strateps_greedy
[34m[1mwandb[0m: 	gamma: 0.9
[34m[1mwandb[0m: 	param: 0.01
[34m[1mwandb[0m: 	strategy: eps_greedy


[Q] Episode 200/1500 | Mean(Last50): -114.56
[Q] Episode 400/1500 | Mean(Last50): -111.44
[Q] Episode 600/1500 | Mean(Last50): -106.94
[Q] Episode 800/1500 | Mean(Last50): -106.46
[Q] Episode 1000/1500 | Mean(Last50): -106.80
[Q] Episode 1200/1500 | Mean(Last50): -112.24
[Q] Episode 1400/1500 | Mean(Last50): -105.22
[Q] Episode 200/1500 | Mean(Last50): -114.42
[Q] Episode 400/1500 | Mean(Last50): -111.92
[Q] Episode 600/1500 | Mean(Last50): -106.50
[Q] Episode 800/1500 | Mean(Last50): -108.18
[Q] Episode 1000/1500 | Mean(Last50): -111.46
[Q] Episode 1200/1500 | Mean(Last50): -108.54
[Q] Episode 1400/1500 | Mean(Last50): -110.72
[Q] Episode 200/1500 | Mean(Last50): -113.22
[Q] Episode 400/1500 | Mean(Last50): -116.36
[Q] Episode 600/1500 | Mean(Last50): -109.12
[Q] Episode 800/1500 | Mean(Last50): -110.92
[Q] Episode 1000/1500 | Mean(Last50): -109.58
[Q] Episode 1200/1500 | Mean(Last50): -110.46
[Q] Episode 1400/1500 | Mean(Last50): -104.88
[Q] Episode 200/1500 | Mean(Last50): -112.52
[

[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


Run finished | α=0.001, γ=0.9, param=0.01 → avg_eval=-122.35 ± 26.22


0,1
mean_eval_return,▁
mean_final_mean_reward_last50,▁
seed,▁▃▅▆█
seed_eval_return,▇▄██▁
seed_final_mean_reward,█▁▅▇▂
std_eval_return,▁
std_final_mean_reward_last50,▁

0,1
mean_eval_return,-122.35
mean_final_mean_reward_last50,-108.808
seed,4.0
seed_eval_return,-165.25
seed_final_mean_reward,-111.12
std_eval_return,26.21526
std_final_mean_reward_last50,2.26421


[34m[1mwandb[0m: Agent Starting Run: 2963y4fn with config:
[34m[1mwandb[0m: 	algorithm: q_learning
[34m[1mwandb[0m: 	alpha: 0.001
[34m[1mwandb[0m: 	env_name: std_q_tp1.0_ss(0,4)_strateps_greedy
[34m[1mwandb[0m: 	gamma: 0.9
[34m[1mwandb[0m: 	param: 0.05
[34m[1mwandb[0m: 	strategy: eps_greedy


[Q] Episode 200/1500 | Mean(Last50): -112.46
[Q] Episode 400/1500 | Mean(Last50): -116.44
[Q] Episode 600/1500 | Mean(Last50): -107.68
[Q] Episode 800/1500 | Mean(Last50): -119.14
[Q] Episode 1000/1500 | Mean(Last50): -107.28
[Q] Episode 1200/1500 | Mean(Last50): -110.12
[Q] Episode 1400/1500 | Mean(Last50): -115.30
[Q] Episode 200/1500 | Mean(Last50): -115.28
[Q] Episode 400/1500 | Mean(Last50): -113.98
[Q] Episode 600/1500 | Mean(Last50): -113.78
[Q] Episode 800/1500 | Mean(Last50): -106.60
[Q] Episode 1000/1500 | Mean(Last50): -108.56
[Q] Episode 1200/1500 | Mean(Last50): -110.70
[Q] Episode 1400/1500 | Mean(Last50): -118.86
[Q] Episode 200/1500 | Mean(Last50): -117.22
[Q] Episode 400/1500 | Mean(Last50): -119.84
[Q] Episode 600/1500 | Mean(Last50): -115.00
[Q] Episode 800/1500 | Mean(Last50): -107.60
[Q] Episode 1000/1500 | Mean(Last50): -114.62
[Q] Episode 1200/1500 | Mean(Last50): -120.72
[Q] Episode 1400/1500 | Mean(Last50): -111.72
[Q] Episode 200/1500 | Mean(Last50): -117.24
[

[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


Run finished | α=0.001, γ=0.9, param=0.05 → avg_eval=-96.42 ± 9.29


0,1
mean_eval_return,▁
mean_final_mean_reward_last50,▁
seed,▁▃▅▆█
seed_eval_return,▁█▂▂▂
seed_final_mean_reward,▇█▇▁▄
std_eval_return,▁
std_final_mean_reward_last50,▁

0,1
mean_eval_return,-96.42
mean_final_mean_reward_last50,-111.052
seed,4.0
seed_eval_return,-100.0
seed_final_mean_reward,-112.74
std_eval_return,9.29008
std_final_mean_reward_last50,2.81074


[34m[1mwandb[0m: Agent Starting Run: vy8xsg14 with config:
[34m[1mwandb[0m: 	algorithm: q_learning
[34m[1mwandb[0m: 	alpha: 0.001
[34m[1mwandb[0m: 	env_name: std_q_tp1.0_ss(0,4)_strateps_greedy
[34m[1mwandb[0m: 	gamma: 0.9
[34m[1mwandb[0m: 	param: 0.1
[34m[1mwandb[0m: 	strategy: eps_greedy


[Q] Episode 200/1500 | Mean(Last50): -119.04
[Q] Episode 400/1500 | Mean(Last50): -114.26
[Q] Episode 600/1500 | Mean(Last50): -124.32
[Q] Episode 800/1500 | Mean(Last50): -117.94
[Q] Episode 1000/1500 | Mean(Last50): -117.98
[Q] Episode 1200/1500 | Mean(Last50): -129.58
[Q] Episode 1400/1500 | Mean(Last50): -116.34
[Q] Episode 200/1500 | Mean(Last50): -120.30
[Q] Episode 400/1500 | Mean(Last50): -120.46
[Q] Episode 600/1500 | Mean(Last50): -117.06
[Q] Episode 800/1500 | Mean(Last50): -118.44
[Q] Episode 1000/1500 | Mean(Last50): -114.62
[Q] Episode 1200/1500 | Mean(Last50): -113.72
[Q] Episode 1400/1500 | Mean(Last50): -110.94
[Q] Episode 200/1500 | Mean(Last50): -117.72
[Q] Episode 400/1500 | Mean(Last50): -118.60
[Q] Episode 600/1500 | Mean(Last50): -118.80
[Q] Episode 800/1500 | Mean(Last50): -115.58
[Q] Episode 1000/1500 | Mean(Last50): -115.08
[Q] Episode 1200/1500 | Mean(Last50): -113.36
[Q] Episode 1400/1500 | Mean(Last50): -106.20
[Q] Episode 200/1500 | Mean(Last50): -114.80
[

[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


Run finished | α=0.001, γ=0.9, param=0.1 → avg_eval=-100.36 ± 0.96


0,1
mean_eval_return,▁
mean_final_mean_reward_last50,▁
seed,▁▃▅▆█
seed_eval_return,▇▇█▇▁
seed_final_mean_reward,▅▁▁█▃
std_eval_return,▁
std_final_mean_reward_last50,▁

0,1
mean_eval_return,-100.36
mean_final_mean_reward_last50,-114.728
seed,4.0
seed_eval_return,-102.25
seed_final_mean_reward,-116.04
std_eval_return,0.96094
std_final_mean_reward_last50,5.04597


[34m[1mwandb[0m: Agent Starting Run: 6ry5tvz6 with config:
[34m[1mwandb[0m: 	algorithm: q_learning
[34m[1mwandb[0m: 	alpha: 0.001
[34m[1mwandb[0m: 	env_name: std_q_tp1.0_ss(0,4)_strateps_greedy
[34m[1mwandb[0m: 	gamma: 1
[34m[1mwandb[0m: 	param: 0.001
[34m[1mwandb[0m: 	strategy: eps_greedy


[Q] Episode 200/1500 | Mean(Last50): -111.58
[Q] Episode 400/1500 | Mean(Last50): -103.46
[Q] Episode 600/1500 | Mean(Last50): -117.20
[Q] Episode 800/1500 | Mean(Last50): -115.44
[Q] Episode 1000/1500 | Mean(Last50): -108.06
[Q] Episode 1200/1500 | Mean(Last50): -115.76
[Q] Episode 1400/1500 | Mean(Last50): -107.12
[Q] Episode 200/1500 | Mean(Last50): -116.30
[Q] Episode 400/1500 | Mean(Last50): -115.22
[Q] Episode 600/1500 | Mean(Last50): -112.42
[Q] Episode 800/1500 | Mean(Last50): -112.30
[Q] Episode 1000/1500 | Mean(Last50): -112.04
[Q] Episode 1200/1500 | Mean(Last50): -110.48
[Q] Episode 1400/1500 | Mean(Last50): -112.56
[Q] Episode 200/1500 | Mean(Last50): -115.68
[Q] Episode 400/1500 | Mean(Last50): -111.06
[Q] Episode 600/1500 | Mean(Last50): -108.28
[Q] Episode 800/1500 | Mean(Last50): -113.06
[Q] Episode 1000/1500 | Mean(Last50): -117.08
[Q] Episode 1200/1500 | Mean(Last50): -115.82
[Q] Episode 1400/1500 | Mean(Last50): -112.96
[Q] Episode 200/1500 | Mean(Last50): -112.34
[

[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


Run finished | α=0.001, γ=1, param=0.001 → avg_eval=-129.05 ± 55.93


0,1
mean_eval_return,▁
mean_final_mean_reward_last50,▁
seed,▁▃▅▆█
seed_eval_return,█▁█▇█
seed_final_mean_reward,▁█▆▅█
std_eval_return,▁
std_final_mean_reward_last50,▁

0,1
mean_eval_return,-129.05
mean_final_mean_reward_last50,-109.288
seed,4.0
seed_eval_return,-97.15
seed_final_mean_reward,-106.42
std_eval_return,55.93279
std_final_mean_reward_last50,2.91121


[34m[1mwandb[0m: Agent Starting Run: ef4mozja with config:
[34m[1mwandb[0m: 	algorithm: q_learning
[34m[1mwandb[0m: 	alpha: 0.001
[34m[1mwandb[0m: 	env_name: std_q_tp1.0_ss(0,4)_strateps_greedy
[34m[1mwandb[0m: 	gamma: 1
[34m[1mwandb[0m: 	param: 0.01
[34m[1mwandb[0m: 	strategy: eps_greedy


[Q] Episode 200/1500 | Mean(Last50): -109.10
[Q] Episode 400/1500 | Mean(Last50): -116.96
[Q] Episode 600/1500 | Mean(Last50): -111.68
[Q] Episode 800/1500 | Mean(Last50): -108.00
[Q] Episode 1000/1500 | Mean(Last50): -109.02
[Q] Episode 1200/1500 | Mean(Last50): -115.64
[Q] Episode 1400/1500 | Mean(Last50): -111.06
[Q] Episode 200/1500 | Mean(Last50): -114.58
[Q] Episode 400/1500 | Mean(Last50): -113.24
[Q] Episode 600/1500 | Mean(Last50): -109.50
[Q] Episode 800/1500 | Mean(Last50): -114.36
[Q] Episode 1000/1500 | Mean(Last50): -111.42
[Q] Episode 1200/1500 | Mean(Last50): -105.38
[Q] Episode 1400/1500 | Mean(Last50): -106.62
[Q] Episode 200/1500 | Mean(Last50): -112.76
[Q] Episode 400/1500 | Mean(Last50): -115.20
[Q] Episode 600/1500 | Mean(Last50): -108.78
[Q] Episode 800/1500 | Mean(Last50): -111.74
[Q] Episode 1000/1500 | Mean(Last50): -107.88
[Q] Episode 1200/1500 | Mean(Last50): -111.06
[Q] Episode 1400/1500 | Mean(Last50): -112.00
[Q] Episode 200/1500 | Mean(Last50): -116.52
[

[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


Run finished | α=0.001, γ=1, param=0.01 → avg_eval=-106.76 ± 13.30


0,1
mean_eval_return,▁
mean_final_mean_reward_last50,▁
seed,▁▃▅▆█
seed_eval_return,▁▇▇▆█
seed_final_mean_reward,▅▁██▇
std_eval_return,▁
std_final_mean_reward_last50,▁

0,1
mean_eval_return,-106.76
mean_final_mean_reward_last50,-110.888
seed,4.0
seed_eval_return,-96.05
seed_final_mean_reward,-109.18
std_eval_return,13.30171
std_final_mean_reward_last50,4.12442


[34m[1mwandb[0m: Agent Starting Run: y31k7364 with config:
[34m[1mwandb[0m: 	algorithm: q_learning
[34m[1mwandb[0m: 	alpha: 0.001
[34m[1mwandb[0m: 	env_name: std_q_tp1.0_ss(0,4)_strateps_greedy
[34m[1mwandb[0m: 	gamma: 1
[34m[1mwandb[0m: 	param: 0.05
[34m[1mwandb[0m: 	strategy: eps_greedy


[Q] Episode 200/1500 | Mean(Last50): -114.18
[Q] Episode 400/1500 | Mean(Last50): -114.14
[Q] Episode 600/1500 | Mean(Last50): -118.74
[Q] Episode 800/1500 | Mean(Last50): -109.26
[Q] Episode 1000/1500 | Mean(Last50): -116.60
[Q] Episode 1200/1500 | Mean(Last50): -112.56
[Q] Episode 1400/1500 | Mean(Last50): -103.28
[Q] Episode 200/1500 | Mean(Last50): -120.46
[Q] Episode 400/1500 | Mean(Last50): -118.10
[Q] Episode 600/1500 | Mean(Last50): -112.32
[Q] Episode 800/1500 | Mean(Last50): -111.66
[Q] Episode 1000/1500 | Mean(Last50): -107.78
[Q] Episode 1200/1500 | Mean(Last50): -110.84
[Q] Episode 1400/1500 | Mean(Last50): -106.36
[Q] Episode 200/1500 | Mean(Last50): -113.36
[Q] Episode 400/1500 | Mean(Last50): -108.86
[Q] Episode 600/1500 | Mean(Last50): -110.32
[Q] Episode 800/1500 | Mean(Last50): -104.62
[Q] Episode 1000/1500 | Mean(Last50): -109.36
[Q] Episode 1200/1500 | Mean(Last50): -115.70
[Q] Episode 1400/1500 | Mean(Last50): -117.72
[Q] Episode 200/1500 | Mean(Last50): -115.02
[

[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


Run finished | α=0.001, γ=1, param=0.05 → avg_eval=-102.68 ± 5.41


0,1
mean_eval_return,▁
mean_final_mean_reward_last50,▁
seed,▁▃▅▆█
seed_eval_return,██▁██
seed_final_mean_reward,▅▇█▁▇
std_eval_return,▁
std_final_mean_reward_last50,▁

0,1
mean_eval_return,-102.68
mean_final_mean_reward_last50,-110.0
seed,4.0
seed_eval_return,-100.0
seed_final_mean_reward,-108.86
std_eval_return,5.41014
std_final_mean_reward_last50,2.54555


[34m[1mwandb[0m: Agent Starting Run: t60526jr with config:
[34m[1mwandb[0m: 	algorithm: q_learning
[34m[1mwandb[0m: 	alpha: 0.001
[34m[1mwandb[0m: 	env_name: std_q_tp1.0_ss(0,4)_strateps_greedy
[34m[1mwandb[0m: 	gamma: 1
[34m[1mwandb[0m: 	param: 0.1
[34m[1mwandb[0m: 	strategy: eps_greedy


[Q] Episode 200/1500 | Mean(Last50): -130.44
[Q] Episode 400/1500 | Mean(Last50): -110.88
[Q] Episode 600/1500 | Mean(Last50): -110.12
[Q] Episode 800/1500 | Mean(Last50): -117.12
[Q] Episode 1000/1500 | Mean(Last50): -124.64
[Q] Episode 1200/1500 | Mean(Last50): -119.62
[Q] Episode 1400/1500 | Mean(Last50): -117.62
[Q] Episode 200/1500 | Mean(Last50): -124.92
[Q] Episode 400/1500 | Mean(Last50): -123.00
[Q] Episode 600/1500 | Mean(Last50): -123.18
[Q] Episode 800/1500 | Mean(Last50): -114.48
[Q] Episode 1000/1500 | Mean(Last50): -109.46
[Q] Episode 1200/1500 | Mean(Last50): -115.10
[Q] Episode 1400/1500 | Mean(Last50): -111.94
[Q] Episode 200/1500 | Mean(Last50): -112.58
[Q] Episode 400/1500 | Mean(Last50): -112.34
[Q] Episode 600/1500 | Mean(Last50): -108.14
[Q] Episode 800/1500 | Mean(Last50): -112.50
[Q] Episode 1000/1500 | Mean(Last50): -122.34
[Q] Episode 1200/1500 | Mean(Last50): -115.20
[Q] Episode 1400/1500 | Mean(Last50): -109.80
[Q] Episode 200/1500 | Mean(Last50): -128.70
[

[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


Run finished | α=0.001, γ=1, param=0.1 → avg_eval=-99.50 ± 1.00


0,1
mean_eval_return,▁
mean_final_mean_reward_last50,▁
seed,▁▃▅▆█
seed_eval_return,▁▁█▁▁
seed_final_mean_reward,▇█▁▇▇
std_eval_return,▁
std_final_mean_reward_last50,▁

0,1
mean_eval_return,-99.5
mean_final_mean_reward_last50,-114.412
seed,4.0
seed_eval_return,-100.0
seed_final_mean_reward,-112.62
std_eval_return,1.0
std_final_mean_reward_last50,4.49741


[34m[1mwandb[0m: Agent Starting Run: w1f4qbpl with config:
[34m[1mwandb[0m: 	algorithm: q_learning
[34m[1mwandb[0m: 	alpha: 0.01
[34m[1mwandb[0m: 	env_name: std_q_tp1.0_ss(0,4)_strateps_greedy
[34m[1mwandb[0m: 	gamma: 0.7
[34m[1mwandb[0m: 	param: 0.001
[34m[1mwandb[0m: 	strategy: eps_greedy


[Q] Episode 200/1500 | Mean(Last50): -102.88
[Q] Episode 400/1500 | Mean(Last50): -101.50
[Q] Episode 600/1500 | Mean(Last50): -101.88
[Q] Episode 800/1500 | Mean(Last50): -102.40
[Q] Episode 1000/1500 | Mean(Last50): -93.26
[Q] Episode 1200/1500 | Mean(Last50): -90.00
[Q] Episode 1400/1500 | Mean(Last50): -94.12
[Q] Episode 200/1500 | Mean(Last50): -108.30
[Q] Episode 400/1500 | Mean(Last50): -100.92
[Q] Episode 600/1500 | Mean(Last50): -103.52
[Q] Episode 800/1500 | Mean(Last50): -86.08
[Q] Episode 1000/1500 | Mean(Last50): -97.64
[Q] Episode 1200/1500 | Mean(Last50): -93.68
[Q] Episode 1400/1500 | Mean(Last50): -87.78
[Q] Episode 200/1500 | Mean(Last50): -107.08
[Q] Episode 400/1500 | Mean(Last50): -98.44
[Q] Episode 600/1500 | Mean(Last50): -99.28
[Q] Episode 800/1500 | Mean(Last50): -96.60
[Q] Episode 1000/1500 | Mean(Last50): -90.96
[Q] Episode 1200/1500 | Mean(Last50): -87.54
[Q] Episode 1400/1500 | Mean(Last50): -89.40
[Q] Episode 200/1500 | Mean(Last50): -103.50
[Q] Episode 40

[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


[Q] Episode 1400/1500 | Mean(Last50): -90.44
Run finished | α=0.01, γ=0.7, param=0.001 → avg_eval=-104.95 ± 9.26


0,1
mean_eval_return,▁
mean_final_mean_reward_last50,▁
seed,▁▃▅▆█
seed_eval_return,███▁█
seed_final_mean_reward,▁█▇▆▆
std_eval_return,▁
std_final_mean_reward_last50,▁

0,1
mean_eval_return,-104.95
mean_final_mean_reward_last50,-88.328
seed,4.0
seed_eval_return,-100.5
seed_final_mean_reward,-88.12
std_eval_return,9.25505
std_final_mean_reward_last50,3.04916


[34m[1mwandb[0m: Agent Starting Run: 7m7v07fl with config:
[34m[1mwandb[0m: 	algorithm: q_learning
[34m[1mwandb[0m: 	alpha: 0.01
[34m[1mwandb[0m: 	env_name: std_q_tp1.0_ss(0,4)_strateps_greedy
[34m[1mwandb[0m: 	gamma: 0.7
[34m[1mwandb[0m: 	param: 0.01
[34m[1mwandb[0m: 	strategy: eps_greedy


[Q] Episode 200/1500 | Mean(Last50): -108.24
[Q] Episode 400/1500 | Mean(Last50): -102.66
[Q] Episode 600/1500 | Mean(Last50): -99.84
[Q] Episode 800/1500 | Mean(Last50): -103.40
[Q] Episode 1000/1500 | Mean(Last50): -103.26
[Q] Episode 1200/1500 | Mean(Last50): -90.22
[Q] Episode 1400/1500 | Mean(Last50): -86.64
[Q] Episode 200/1500 | Mean(Last50): -106.74
[Q] Episode 400/1500 | Mean(Last50): -101.70
[Q] Episode 600/1500 | Mean(Last50): -100.92
[Q] Episode 800/1500 | Mean(Last50): -97.00
[Q] Episode 1000/1500 | Mean(Last50): -100.60
[Q] Episode 1200/1500 | Mean(Last50): -92.30
[Q] Episode 1400/1500 | Mean(Last50): -88.74
[Q] Episode 200/1500 | Mean(Last50): -108.70
[Q] Episode 400/1500 | Mean(Last50): -100.88
[Q] Episode 600/1500 | Mean(Last50): -102.88
[Q] Episode 800/1500 | Mean(Last50): -95.22
[Q] Episode 1000/1500 | Mean(Last50): -93.80
[Q] Episode 1200/1500 | Mean(Last50): -97.74
[Q] Episode 1400/1500 | Mean(Last50): -91.50
[Q] Episode 200/1500 | Mean(Last50): -100.64
[Q] Episode

[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


Run finished | α=0.01, γ=0.7, param=0.01 → avg_eval=-99.00 ± 2.00


0,1
mean_eval_return,▁
mean_final_mean_reward_last50,▁
seed,▁▃▅▆█
seed_eval_return,▁▁▁█▁
seed_final_mean_reward,▅▁▁▄█
std_eval_return,▁
std_final_mean_reward_last50,▁

0,1
mean_eval_return,-99.0
mean_final_mean_reward_last50,-92.972
seed,4.0
seed_eval_return,-100.0
seed_final_mean_reward,-87.06
std_eval_return,2.0
std_final_mean_reward_last50,3.64384


[34m[1mwandb[0m: Agent Starting Run: ih9snhd7 with config:
[34m[1mwandb[0m: 	algorithm: q_learning
[34m[1mwandb[0m: 	alpha: 0.01
[34m[1mwandb[0m: 	env_name: std_q_tp1.0_ss(0,4)_strateps_greedy
[34m[1mwandb[0m: 	gamma: 0.7
[34m[1mwandb[0m: 	param: 0.05
[34m[1mwandb[0m: 	strategy: eps_greedy


[Q] Episode 200/1500 | Mean(Last50): -110.36
[Q] Episode 400/1500 | Mean(Last50): -108.46
[Q] Episode 600/1500 | Mean(Last50): -97.02
[Q] Episode 800/1500 | Mean(Last50): -98.90
[Q] Episode 1000/1500 | Mean(Last50): -107.34
[Q] Episode 1200/1500 | Mean(Last50): -96.20
[Q] Episode 1400/1500 | Mean(Last50): -99.10
[Q] Episode 200/1500 | Mean(Last50): -102.22
[Q] Episode 400/1500 | Mean(Last50): -109.84
[Q] Episode 600/1500 | Mean(Last50): -107.26
[Q] Episode 800/1500 | Mean(Last50): -106.42
[Q] Episode 1000/1500 | Mean(Last50): -94.22
[Q] Episode 1200/1500 | Mean(Last50): -102.54
[Q] Episode 1400/1500 | Mean(Last50): -86.70
[Q] Episode 200/1500 | Mean(Last50): -104.48
[Q] Episode 400/1500 | Mean(Last50): -106.32
[Q] Episode 600/1500 | Mean(Last50): -107.88
[Q] Episode 800/1500 | Mean(Last50): -99.84
[Q] Episode 1000/1500 | Mean(Last50): -107.16
[Q] Episode 1200/1500 | Mean(Last50): -97.50
[Q] Episode 1400/1500 | Mean(Last50): -92.90
[Q] Episode 200/1500 | Mean(Last50): -112.76
[Q] Episod

[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


Run finished | α=0.01, γ=0.7, param=0.05 → avg_eval=-96.38 ± 4.91


0,1
mean_eval_return,▁
mean_final_mean_reward_last50,▁
seed,▁▃▅▆█
seed_eval_return,▃▂▅█▁
seed_final_mean_reward,▂▁█▁▂
std_eval_return,▁
std_final_mean_reward_last50,▁

0,1
mean_eval_return,-96.38
mean_final_mean_reward_last50,-95.264
seed,4.0
seed_eval_return,-101.5
seed_final_mean_reward,-97.1
std_eval_return,4.91229
std_final_mean_reward_last50,5.04443


[34m[1mwandb[0m: Agent Starting Run: wznkc1mw with config:
[34m[1mwandb[0m: 	algorithm: q_learning
[34m[1mwandb[0m: 	alpha: 0.01
[34m[1mwandb[0m: 	env_name: std_q_tp1.0_ss(0,4)_strateps_greedy
[34m[1mwandb[0m: 	gamma: 0.7
[34m[1mwandb[0m: 	param: 0.1
[34m[1mwandb[0m: 	strategy: eps_greedy


[Q] Episode 200/1500 | Mean(Last50): -110.20
[Q] Episode 400/1500 | Mean(Last50): -110.16
[Q] Episode 600/1500 | Mean(Last50): -102.88
[Q] Episode 800/1500 | Mean(Last50): -110.88
[Q] Episode 1000/1500 | Mean(Last50): -103.34
[Q] Episode 1200/1500 | Mean(Last50): -110.66
[Q] Episode 1400/1500 | Mean(Last50): -99.14
[Q] Episode 200/1500 | Mean(Last50): -110.64
[Q] Episode 400/1500 | Mean(Last50): -115.18
[Q] Episode 600/1500 | Mean(Last50): -103.34
[Q] Episode 800/1500 | Mean(Last50): -99.88
[Q] Episode 1000/1500 | Mean(Last50): -104.20
[Q] Episode 1200/1500 | Mean(Last50): -104.64
[Q] Episode 1400/1500 | Mean(Last50): -107.54
[Q] Episode 200/1500 | Mean(Last50): -121.94
[Q] Episode 400/1500 | Mean(Last50): -108.10
[Q] Episode 600/1500 | Mean(Last50): -116.60
[Q] Episode 800/1500 | Mean(Last50): -96.92
[Q] Episode 1000/1500 | Mean(Last50): -110.72
[Q] Episode 1200/1500 | Mean(Last50): -107.44
[Q] Episode 1400/1500 | Mean(Last50): -104.28
[Q] Episode 200/1500 | Mean(Last50): -115.94
[Q] 

[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


Run finished | α=0.01, γ=0.7, param=0.1 → avg_eval=-98.39 ± 2.90


0,1
mean_eval_return,▁
mean_final_mean_reward_last50,▁
seed,▁▃▅▆█
seed_eval_return,▁█▂▂▅
seed_final_mean_reward,▃▁▃▄█
std_eval_return,▁
std_final_mean_reward_last50,▁

0,1
mean_eval_return,-98.39
mean_final_mean_reward_last50,-103.064
seed,4.0
seed_eval_return,-97.1
seed_final_mean_reward,-96.4
std_eval_return,2.89558
std_final_mean_reward_last50,3.69028


[34m[1mwandb[0m: Agent Starting Run: i4hjdeij with config:
[34m[1mwandb[0m: 	algorithm: q_learning
[34m[1mwandb[0m: 	alpha: 0.01
[34m[1mwandb[0m: 	env_name: std_q_tp1.0_ss(0,4)_strateps_greedy
[34m[1mwandb[0m: 	gamma: 0.8
[34m[1mwandb[0m: 	param: 0.001
[34m[1mwandb[0m: 	strategy: eps_greedy


[Q] Episode 200/1500 | Mean(Last50): -110.18
[Q] Episode 400/1500 | Mean(Last50): -100.36
[Q] Episode 600/1500 | Mean(Last50): -98.10
[Q] Episode 800/1500 | Mean(Last50): -96.66
[Q] Episode 1000/1500 | Mean(Last50): -91.88
[Q] Episode 1200/1500 | Mean(Last50): -85.72
[Q] Episode 1400/1500 | Mean(Last50): -90.40
[Q] Episode 200/1500 | Mean(Last50): -103.72
[Q] Episode 400/1500 | Mean(Last50): -101.88
[Q] Episode 600/1500 | Mean(Last50): -93.26
[Q] Episode 800/1500 | Mean(Last50): -96.16
[Q] Episode 1000/1500 | Mean(Last50): -87.66
[Q] Episode 1200/1500 | Mean(Last50): -90.78
[Q] Episode 1400/1500 | Mean(Last50): -79.80
[Q] Episode 200/1500 | Mean(Last50): -106.10
[Q] Episode 400/1500 | Mean(Last50): -102.24
[Q] Episode 600/1500 | Mean(Last50): -104.16
[Q] Episode 800/1500 | Mean(Last50): -96.12
[Q] Episode 1000/1500 | Mean(Last50): -88.50
[Q] Episode 1200/1500 | Mean(Last50): -89.58
[Q] Episode 1400/1500 | Mean(Last50): -84.10
[Q] Episode 200/1500 | Mean(Last50): -103.32
[Q] Episode 400

[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


[Q] Episode 1400/1500 | Mean(Last50): -84.58
Run finished | α=0.01, γ=0.8, param=0.001 → avg_eval=-98.86 ± 4.85


0,1
mean_eval_return,▁
mean_final_mean_reward_last50,▁
seed,▁▃▅▆█
seed_eval_return,▃█▃▁▃
seed_final_mean_reward,▅▁▆█▇
std_eval_return,▁
std_final_mean_reward_last50,▁

0,1
mean_eval_return,-98.86
mean_final_mean_reward_last50,-84.676
seed,4.0
seed_eval_return,-100.0
seed_final_mean_reward,-82.84
std_eval_return,4.8537
std_final_mean_reward_last50,3.09496


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: yf5x2imh with config:
[34m[1mwandb[0m: 	algorithm: q_learning
[34m[1mwandb[0m: 	alpha: 0.01
[34m[1mwandb[0m: 	env_name: std_q_tp1.0_ss(0,4)_strateps_greedy
[34m[1mwandb[0m: 	gamma: 0.8
[34m[1mwandb[0m: 	param: 0.01
[34m[1mwandb[0m: 	strategy: eps_greedy


[Q] Episode 200/1500 | Mean(Last50): -111.24
[Q] Episode 400/1500 | Mean(Last50): -104.64
[Q] Episode 600/1500 | Mean(Last50): -99.84
[Q] Episode 800/1500 | Mean(Last50): -93.96
[Q] Episode 1000/1500 | Mean(Last50): -92.86
[Q] Episode 1200/1500 | Mean(Last50): -94.22
[Q] Episode 1400/1500 | Mean(Last50): -83.22
[Q] Episode 200/1500 | Mean(Last50): -111.12
[Q] Episode 400/1500 | Mean(Last50): -102.26
[Q] Episode 600/1500 | Mean(Last50): -106.06
[Q] Episode 800/1500 | Mean(Last50): -93.42
[Q] Episode 1000/1500 | Mean(Last50): -96.42
[Q] Episode 1200/1500 | Mean(Last50): -87.84
[Q] Episode 1400/1500 | Mean(Last50): -87.88
[Q] Episode 200/1500 | Mean(Last50): -111.14
[Q] Episode 400/1500 | Mean(Last50): -111.20
[Q] Episode 600/1500 | Mean(Last50): -97.60
[Q] Episode 800/1500 | Mean(Last50): -93.58
[Q] Episode 1000/1500 | Mean(Last50): -94.70
[Q] Episode 1200/1500 | Mean(Last50): -86.62
[Q] Episode 1400/1500 | Mean(Last50): -85.32
[Q] Episode 200/1500 | Mean(Last50): -101.90
[Q] Episode 400

[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


[Q] Episode 1400/1500 | Mean(Last50): -79.60
Run finished | α=0.01, γ=0.8, param=0.01 → avg_eval=-95.51 ± 9.36


0,1
mean_eval_return,▁
mean_final_mean_reward_last50,▁
seed,▁▃▅▆█
seed_eval_return,▁▁▁▁█
seed_final_mean_reward,▆▁▇▇█
std_eval_return,▁
std_final_mean_reward_last50,▁

0,1
mean_eval_return,-95.51
mean_final_mean_reward_last50,-80.824
seed,4.0
seed_eval_return,-76.8
seed_final_mean_reward,-75.88
std_eval_return,9.35951
std_final_mean_reward_last50,5.74146


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: fbnisbux with config:
[34m[1mwandb[0m: 	algorithm: q_learning
[34m[1mwandb[0m: 	alpha: 0.01
[34m[1mwandb[0m: 	env_name: std_q_tp1.0_ss(0,4)_strateps_greedy
[34m[1mwandb[0m: 	gamma: 0.8
[34m[1mwandb[0m: 	param: 0.05
[34m[1mwandb[0m: 	strategy: eps_greedy


[Q] Episode 200/1500 | Mean(Last50): -109.36
[Q] Episode 400/1500 | Mean(Last50): -104.10
[Q] Episode 600/1500 | Mean(Last50): -93.84
[Q] Episode 800/1500 | Mean(Last50): -94.12
[Q] Episode 1000/1500 | Mean(Last50): -104.40
[Q] Episode 1200/1500 | Mean(Last50): -89.40
[Q] Episode 1400/1500 | Mean(Last50): -88.98
[Q] Episode 200/1500 | Mean(Last50): -105.18
[Q] Episode 400/1500 | Mean(Last50): -108.44
[Q] Episode 600/1500 | Mean(Last50): -112.28
[Q] Episode 800/1500 | Mean(Last50): -94.42
[Q] Episode 1000/1500 | Mean(Last50): -93.88
[Q] Episode 1200/1500 | Mean(Last50): -99.86
[Q] Episode 1400/1500 | Mean(Last50): -100.24
[Q] Episode 200/1500 | Mean(Last50): -111.26
[Q] Episode 400/1500 | Mean(Last50): -108.00
[Q] Episode 600/1500 | Mean(Last50): -102.66
[Q] Episode 800/1500 | Mean(Last50): -95.92
[Q] Episode 1000/1500 | Mean(Last50): -99.76
[Q] Episode 1200/1500 | Mean(Last50): -95.00
[Q] Episode 1400/1500 | Mean(Last50): -84.94
[Q] Episode 200/1500 | Mean(Last50): -107.26
[Q] Episode 

[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


Run finished | α=0.01, γ=0.8, param=0.05 → avg_eval=-97.82 ± 2.67


0,1
mean_eval_return,▁
mean_final_mean_reward_last50,▁
seed,▁▃▅▆█
seed_eval_return,▁▁▁██
seed_final_mean_reward,▆▅▂▁█
std_eval_return,▁
std_final_mean_reward_last50,▁

0,1
mean_eval_return,-97.82
mean_final_mean_reward_last50,-86.912
seed,4.0
seed_eval_return,-94.6
seed_final_mean_reward,-81.1
std_eval_return,2.67013
std_final_mean_reward_last50,4.45681


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: 1w3gwu9j with config:
[34m[1mwandb[0m: 	algorithm: q_learning
[34m[1mwandb[0m: 	alpha: 0.01
[34m[1mwandb[0m: 	env_name: std_q_tp1.0_ss(0,4)_strateps_greedy
[34m[1mwandb[0m: 	gamma: 0.8
[34m[1mwandb[0m: 	param: 0.1
[34m[1mwandb[0m: 	strategy: eps_greedy


[Q] Episode 200/1500 | Mean(Last50): -106.74
[Q] Episode 400/1500 | Mean(Last50): -113.48
[Q] Episode 600/1500 | Mean(Last50): -107.18
[Q] Episode 800/1500 | Mean(Last50): -106.80
[Q] Episode 1000/1500 | Mean(Last50): -96.52
[Q] Episode 1200/1500 | Mean(Last50): -106.88
[Q] Episode 1400/1500 | Mean(Last50): -96.66
[Q] Episode 200/1500 | Mean(Last50): -114.40
[Q] Episode 400/1500 | Mean(Last50): -117.44
[Q] Episode 600/1500 | Mean(Last50): -104.80
[Q] Episode 800/1500 | Mean(Last50): -101.34
[Q] Episode 1000/1500 | Mean(Last50): -97.64
[Q] Episode 1200/1500 | Mean(Last50): -111.90
[Q] Episode 1400/1500 | Mean(Last50): -88.62
[Q] Episode 200/1500 | Mean(Last50): -107.20
[Q] Episode 400/1500 | Mean(Last50): -110.50
[Q] Episode 600/1500 | Mean(Last50): -107.76
[Q] Episode 800/1500 | Mean(Last50): -100.24
[Q] Episode 1000/1500 | Mean(Last50): -100.74
[Q] Episode 1200/1500 | Mean(Last50): -97.48
[Q] Episode 1400/1500 | Mean(Last50): -93.52
[Q] Episode 200/1500 | Mean(Last50): -119.12
[Q] Epi

[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


Run finished | α=0.01, γ=0.8, param=0.1 → avg_eval=-88.99 ± 21.62


0,1
mean_eval_return,▁
mean_final_mean_reward_last50,▁
seed,▁▃▅▆█
seed_eval_return,▁▁▁▁█
seed_final_mean_reward,█▂▁▅▆
std_eval_return,▁
std_final_mean_reward_last50,▁

0,1
mean_eval_return,-88.99
mean_final_mean_reward_last50,-95.448
seed,4.0
seed_eval_return,-45.75
seed_final_mean_reward,-91.54
std_eval_return,21.62074
std_final_mean_reward_last50,7.07974


[34m[1mwandb[0m: Agent Starting Run: f2p9ji9s with config:
[34m[1mwandb[0m: 	algorithm: q_learning
[34m[1mwandb[0m: 	alpha: 0.01
[34m[1mwandb[0m: 	env_name: std_q_tp1.0_ss(0,4)_strateps_greedy
[34m[1mwandb[0m: 	gamma: 0.9
[34m[1mwandb[0m: 	param: 0.001
[34m[1mwandb[0m: 	strategy: eps_greedy


[Q] Episode 200/1500 | Mean(Last50): -104.68
[Q] Episode 400/1500 | Mean(Last50): -108.62
[Q] Episode 600/1500 | Mean(Last50): -94.58
[Q] Episode 800/1500 | Mean(Last50): -96.50
[Q] Episode 1000/1500 | Mean(Last50): -80.44
[Q] Episode 1200/1500 | Mean(Last50): -83.54
[Q] Episode 1400/1500 | Mean(Last50): -64.76
[Q] Episode 200/1500 | Mean(Last50): -107.46
[Q] Episode 400/1500 | Mean(Last50): -104.34
[Q] Episode 600/1500 | Mean(Last50): -88.20
[Q] Episode 800/1500 | Mean(Last50): -92.78
[Q] Episode 1000/1500 | Mean(Last50): -93.48
[Q] Episode 1200/1500 | Mean(Last50): -83.08
[Q] Episode 1400/1500 | Mean(Last50): -69.16
[Q] Episode 200/1500 | Mean(Last50): -111.24
[Q] Episode 400/1500 | Mean(Last50): -113.36
[Q] Episode 600/1500 | Mean(Last50): -96.98
[Q] Episode 800/1500 | Mean(Last50): -96.52
[Q] Episode 1000/1500 | Mean(Last50): -90.86
[Q] Episode 1200/1500 | Mean(Last50): -86.34
[Q] Episode 1400/1500 | Mean(Last50): -65.32
[Q] Episode 200/1500 | Mean(Last50): -104.40
[Q] Episode 400/

[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


Run finished | α=0.01, γ=0.9, param=0.001 → avg_eval=-88.56 ± 25.38


0,1
mean_eval_return,▁
mean_final_mean_reward_last50,▁
seed,▁▃▅▆█
seed_eval_return,▁▃█▂▂
seed_final_mean_reward,▄▁█▇▇
std_eval_return,▁
std_final_mean_reward_last50,▁

0,1
mean_eval_return,-88.56
mean_final_mean_reward_last50,-61.176
seed,4.0
seed_eval_return,-100.0
seed_final_mean_reward,-58.74
std_eval_return,25.38329
std_final_mean_reward_last50,3.32264


[34m[1mwandb[0m: Agent Starting Run: 33dhhci5 with config:
[34m[1mwandb[0m: 	algorithm: q_learning
[34m[1mwandb[0m: 	alpha: 0.01
[34m[1mwandb[0m: 	env_name: std_q_tp1.0_ss(0,4)_strateps_greedy
[34m[1mwandb[0m: 	gamma: 0.9
[34m[1mwandb[0m: 	param: 0.01
[34m[1mwandb[0m: 	strategy: eps_greedy


[Q] Episode 200/1500 | Mean(Last50): -109.04
[Q] Episode 400/1500 | Mean(Last50): -103.90
[Q] Episode 600/1500 | Mean(Last50): -101.12
[Q] Episode 800/1500 | Mean(Last50): -98.76
[Q] Episode 1000/1500 | Mean(Last50): -90.22
[Q] Episode 1200/1500 | Mean(Last50): -85.44
[Q] Episode 1400/1500 | Mean(Last50): -73.08
[Q] Episode 200/1500 | Mean(Last50): -108.02
[Q] Episode 400/1500 | Mean(Last50): -101.68
[Q] Episode 600/1500 | Mean(Last50): -100.22
[Q] Episode 800/1500 | Mean(Last50): -94.96
[Q] Episode 1000/1500 | Mean(Last50): -86.90
[Q] Episode 1200/1500 | Mean(Last50): -80.68
[Q] Episode 1400/1500 | Mean(Last50): -67.04
[Q] Episode 200/1500 | Mean(Last50): -111.66
[Q] Episode 400/1500 | Mean(Last50): -108.20
[Q] Episode 600/1500 | Mean(Last50): -93.62
[Q] Episode 800/1500 | Mean(Last50): -88.68
[Q] Episode 1000/1500 | Mean(Last50): -88.56
[Q] Episode 1200/1500 | Mean(Last50): -81.54
[Q] Episode 1400/1500 | Mean(Last50): -65.26
[Q] Episode 200/1500 | Mean(Last50): -111.88
[Q] Episode 40

[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


Run finished | α=0.01, γ=0.9, param=0.01 → avg_eval=-76.79 ± 23.95


0,1
mean_eval_return,▁
mean_final_mean_reward_last50,▁
seed,▁▃▅▆█
seed_eval_return,▃▁█▆▁
seed_final_mean_reward,█▁▅▆▅
std_eval_return,▁
std_final_mean_reward_last50,▁

0,1
mean_eval_return,-76.79
mean_final_mean_reward_last50,-67.408
seed,4.0
seed_eval_return,-100.0
seed_final_mean_reward,-66.56
std_eval_return,23.9534
std_final_mean_reward_last50,5.71385


[34m[1mwandb[0m: Agent Starting Run: kyo8osdj with config:
[34m[1mwandb[0m: 	algorithm: q_learning
[34m[1mwandb[0m: 	alpha: 0.01
[34m[1mwandb[0m: 	env_name: std_q_tp1.0_ss(0,4)_strateps_greedy
[34m[1mwandb[0m: 	gamma: 0.9
[34m[1mwandb[0m: 	param: 0.05
[34m[1mwandb[0m: 	strategy: eps_greedy


[Q] Episode 200/1500 | Mean(Last50): -108.80
[Q] Episode 400/1500 | Mean(Last50): -102.58
[Q] Episode 600/1500 | Mean(Last50): -93.76
[Q] Episode 800/1500 | Mean(Last50): -85.00
[Q] Episode 1000/1500 | Mean(Last50): -100.32
[Q] Episode 1200/1500 | Mean(Last50): -87.78
[Q] Episode 1400/1500 | Mean(Last50): -58.38
[Q] Episode 200/1500 | Mean(Last50): -108.86
[Q] Episode 400/1500 | Mean(Last50): -117.02
[Q] Episode 600/1500 | Mean(Last50): -102.54
[Q] Episode 800/1500 | Mean(Last50): -97.98
[Q] Episode 1000/1500 | Mean(Last50): -91.30
[Q] Episode 1200/1500 | Mean(Last50): -87.42
[Q] Episode 1400/1500 | Mean(Last50): -68.36
[Q] Episode 200/1500 | Mean(Last50): -112.92
[Q] Episode 400/1500 | Mean(Last50): -109.02
[Q] Episode 600/1500 | Mean(Last50): -101.12
[Q] Episode 800/1500 | Mean(Last50): -96.56
[Q] Episode 1000/1500 | Mean(Last50): -89.18
[Q] Episode 1200/1500 | Mean(Last50): -83.60
[Q] Episode 1400/1500 | Mean(Last50): -69.38
[Q] Episode 200/1500 | Mean(Last50): -123.60
[Q] Episode 4

[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


Run finished | α=0.01, γ=0.9, param=0.05 → avg_eval=-71.69 ± 36.44


0,1
mean_eval_return,▁
mean_final_mean_reward_last50,▁
seed,▁▃▅▆█
seed_eval_return,▁▄▁▆█
seed_final_mean_reward,▆▂▇▁█
std_eval_return,▁
std_final_mean_reward_last50,▁

0,1
mean_eval_return,-71.69
mean_final_mean_reward_last50,-70.908
seed,4.0
seed_eval_return,-18.2
seed_final_mean_reward,-64.9
std_eval_return,36.44362
std_final_mean_reward_last50,5.29582


[34m[1mwandb[0m: Agent Starting Run: kdrfddxc with config:
[34m[1mwandb[0m: 	algorithm: q_learning
[34m[1mwandb[0m: 	alpha: 0.01
[34m[1mwandb[0m: 	env_name: std_q_tp1.0_ss(0,4)_strateps_greedy
[34m[1mwandb[0m: 	gamma: 0.9
[34m[1mwandb[0m: 	param: 0.1
[34m[1mwandb[0m: 	strategy: eps_greedy


[Q] Episode 200/1500 | Mean(Last50): -109.42
[Q] Episode 400/1500 | Mean(Last50): -110.90
[Q] Episode 600/1500 | Mean(Last50): -112.44
[Q] Episode 800/1500 | Mean(Last50): -102.70
[Q] Episode 1000/1500 | Mean(Last50): -100.34
[Q] Episode 1200/1500 | Mean(Last50): -93.08
[Q] Episode 1400/1500 | Mean(Last50): -72.28
[Q] Episode 200/1500 | Mean(Last50): -104.52
[Q] Episode 400/1500 | Mean(Last50): -107.82
[Q] Episode 600/1500 | Mean(Last50): -105.04
[Q] Episode 800/1500 | Mean(Last50): -99.96
[Q] Episode 1000/1500 | Mean(Last50): -96.70
[Q] Episode 1200/1500 | Mean(Last50): -82.88
[Q] Episode 1400/1500 | Mean(Last50): -77.68
[Q] Episode 200/1500 | Mean(Last50): -119.54
[Q] Episode 400/1500 | Mean(Last50): -119.08
[Q] Episode 600/1500 | Mean(Last50): -101.94
[Q] Episode 800/1500 | Mean(Last50): -98.96
[Q] Episode 1000/1500 | Mean(Last50): -93.12
[Q] Episode 1200/1500 | Mean(Last50): -86.12
[Q] Episode 1400/1500 | Mean(Last50): -73.00
[Q] Episode 200/1500 | Mean(Last50): -115.36
[Q] Episode

[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


Run finished | α=0.01, γ=0.9, param=0.1 → avg_eval=-51.41 ± 24.64


0,1
mean_eval_return,▁
mean_final_mean_reward_last50,▁
seed,▁▃▅▆█
seed_eval_return,█▁▆▅▇
seed_final_mean_reward,▇▁█▅▃
std_eval_return,▁
std_final_mean_reward_last50,▁

0,1
mean_eval_return,-51.41
mean_final_mean_reward_last50,-77.236
seed,4.0
seed_eval_return,-36.1
seed_final_mean_reward,-79.76
std_eval_return,24.64409
std_final_mean_reward_last50,3.49659


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: e936v03e with config:
[34m[1mwandb[0m: 	algorithm: q_learning
[34m[1mwandb[0m: 	alpha: 0.01
[34m[1mwandb[0m: 	env_name: std_q_tp1.0_ss(0,4)_strateps_greedy
[34m[1mwandb[0m: 	gamma: 1
[34m[1mwandb[0m: 	param: 0.001
[34m[1mwandb[0m: 	strategy: eps_greedy


[Q] Episode 200/1500 | Mean(Last50): -110.30
[Q] Episode 400/1500 | Mean(Last50): -106.44
[Q] Episode 600/1500 | Mean(Last50): -99.46
[Q] Episode 800/1500 | Mean(Last50): -89.58
[Q] Episode 1000/1500 | Mean(Last50): -85.58
[Q] Episode 1200/1500 | Mean(Last50): -62.06
[Q] Episode 1400/1500 | Mean(Last50): -59.46
[Q] Episode 200/1500 | Mean(Last50): -106.26
[Q] Episode 400/1500 | Mean(Last50): -108.54
[Q] Episode 600/1500 | Mean(Last50): -94.88
[Q] Episode 800/1500 | Mean(Last50): -94.02
[Q] Episode 1000/1500 | Mean(Last50): -83.48
[Q] Episode 1200/1500 | Mean(Last50): -64.64
[Q] Episode 1400/1500 | Mean(Last50): -59.78
[Q] Episode 200/1500 | Mean(Last50): -105.20
[Q] Episode 400/1500 | Mean(Last50): -104.16
[Q] Episode 600/1500 | Mean(Last50): -99.82
[Q] Episode 800/1500 | Mean(Last50): -95.44
[Q] Episode 1000/1500 | Mean(Last50): -88.46
[Q] Episode 1200/1500 | Mean(Last50): -67.88
[Q] Episode 1400/1500 | Mean(Last50): -52.18
[Q] Episode 200/1500 | Mean(Last50): -107.52
[Q] Episode 400/

[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


Run finished | α=0.01, γ=1, param=0.001 → avg_eval=-109.78 ± 11.58


0,1
mean_eval_return,▁
mean_final_mean_reward_last50,▁
seed,▁▃▅▆█
seed_eval_return,▅▇█▃▁
seed_final_mean_reward,▁▇█▃▁
std_eval_return,▁
std_final_mean_reward_last50,▁

0,1
mean_eval_return,-109.78
mean_final_mean_reward_last50,-55.632
seed,4.0
seed_eval_return,-127.25
seed_final_mean_reward,-60.96
std_eval_return,11.57595
std_final_mean_reward_last50,5.4338


[34m[1mwandb[0m: Agent Starting Run: 8uo4ubfc with config:
[34m[1mwandb[0m: 	algorithm: q_learning
[34m[1mwandb[0m: 	alpha: 0.01
[34m[1mwandb[0m: 	env_name: std_q_tp1.0_ss(0,4)_strateps_greedy
[34m[1mwandb[0m: 	gamma: 1
[34m[1mwandb[0m: 	param: 0.01
[34m[1mwandb[0m: 	strategy: eps_greedy


[Q] Episode 200/1500 | Mean(Last50): -102.68
[Q] Episode 400/1500 | Mean(Last50): -109.90
[Q] Episode 600/1500 | Mean(Last50): -98.08
[Q] Episode 800/1500 | Mean(Last50): -82.28
[Q] Episode 1000/1500 | Mean(Last50): -91.74
[Q] Episode 1200/1500 | Mean(Last50): -73.64
[Q] Episode 1400/1500 | Mean(Last50): -57.60
[Q] Episode 200/1500 | Mean(Last50): -110.64
[Q] Episode 400/1500 | Mean(Last50): -102.44
[Q] Episode 600/1500 | Mean(Last50): -98.20
[Q] Episode 800/1500 | Mean(Last50): -88.48
[Q] Episode 1000/1500 | Mean(Last50): -84.32
[Q] Episode 1200/1500 | Mean(Last50): -67.34
[Q] Episode 1400/1500 | Mean(Last50): -65.18
[Q] Episode 200/1500 | Mean(Last50): -105.68
[Q] Episode 400/1500 | Mean(Last50): -111.56
[Q] Episode 600/1500 | Mean(Last50): -103.58
[Q] Episode 800/1500 | Mean(Last50): -95.26
[Q] Episode 1000/1500 | Mean(Last50): -84.48
[Q] Episode 1200/1500 | Mean(Last50): -70.44
[Q] Episode 1400/1500 | Mean(Last50): -52.58
[Q] Episode 200/1500 | Mean(Last50): -112.98
[Q] Episode 400

[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


Run finished | α=0.01, γ=1, param=0.01 → avg_eval=-63.85 ± 19.35


0,1
mean_eval_return,▁
mean_final_mean_reward_last50,▁
seed,▁▃▅▆█
seed_eval_return,▇▅▁█▄
seed_final_mean_reward,▃▃█▁▃
std_eval_return,▁
std_final_mean_reward_last50,▁

0,1
mean_eval_return,-63.85
mean_final_mean_reward_last50,-54.504
seed,4.0
seed_eval_return,-71.55
seed_final_mean_reward,-55.44
std_eval_return,19.35187
std_final_mean_reward_last50,3.66234


[34m[1mwandb[0m: Agent Starting Run: o1jykxra with config:
[34m[1mwandb[0m: 	algorithm: q_learning
[34m[1mwandb[0m: 	alpha: 0.01
[34m[1mwandb[0m: 	env_name: std_q_tp1.0_ss(0,4)_strateps_greedy
[34m[1mwandb[0m: 	gamma: 1
[34m[1mwandb[0m: 	param: 0.05
[34m[1mwandb[0m: 	strategy: eps_greedy


[Q] Episode 200/1500 | Mean(Last50): -110.60
[Q] Episode 400/1500 | Mean(Last50): -113.64
[Q] Episode 600/1500 | Mean(Last50): -92.10
[Q] Episode 800/1500 | Mean(Last50): -91.54
[Q] Episode 1000/1500 | Mean(Last50): -89.04
[Q] Episode 1200/1500 | Mean(Last50): -78.04
[Q] Episode 1400/1500 | Mean(Last50): -83.50
[Q] Episode 200/1500 | Mean(Last50): -106.50
[Q] Episode 400/1500 | Mean(Last50): -99.86
[Q] Episode 600/1500 | Mean(Last50): -92.48
[Q] Episode 800/1500 | Mean(Last50): -95.40
[Q] Episode 1000/1500 | Mean(Last50): -88.18
[Q] Episode 1200/1500 | Mean(Last50): -72.14
[Q] Episode 1400/1500 | Mean(Last50): -64.56
[Q] Episode 200/1500 | Mean(Last50): -107.48
[Q] Episode 400/1500 | Mean(Last50): -108.66
[Q] Episode 600/1500 | Mean(Last50): -96.14
[Q] Episode 800/1500 | Mean(Last50): -95.28
[Q] Episode 1000/1500 | Mean(Last50): -92.68
[Q] Episode 1200/1500 | Mean(Last50): -68.26
[Q] Episode 1400/1500 | Mean(Last50): -54.78
[Q] Episode 200/1500 | Mean(Last50): -111.42
[Q] Episode 400/1

[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


Run finished | α=0.01, γ=1, param=0.05 → avg_eval=-56.83 ± 15.63


0,1
mean_eval_return,▁
mean_final_mean_reward_last50,▁
seed,▁▃▅▆█
seed_eval_return,▁▆▂█▅
seed_final_mean_reward,▁▂▁█▇
std_eval_return,▁
std_final_mean_reward_last50,▁

0,1
mean_eval_return,-56.83
mean_final_mean_reward_last50,-56.288
seed,4.0
seed_eval_return,-53.6
seed_final_mean_reward,-53.28
std_eval_return,15.62778
std_final_mean_reward_last50,2.67117


[34m[1mwandb[0m: Agent Starting Run: 84jh3fk4 with config:
[34m[1mwandb[0m: 	algorithm: q_learning
[34m[1mwandb[0m: 	alpha: 0.01
[34m[1mwandb[0m: 	env_name: std_q_tp1.0_ss(0,4)_strateps_greedy
[34m[1mwandb[0m: 	gamma: 1
[34m[1mwandb[0m: 	param: 0.1
[34m[1mwandb[0m: 	strategy: eps_greedy


[Q] Episode 200/1500 | Mean(Last50): -114.08
[Q] Episode 400/1500 | Mean(Last50): -104.06
[Q] Episode 600/1500 | Mean(Last50): -111.86
[Q] Episode 800/1500 | Mean(Last50): -89.32
[Q] Episode 1000/1500 | Mean(Last50): -89.02
[Q] Episode 1200/1500 | Mean(Last50): -68.94
[Q] Episode 1400/1500 | Mean(Last50): -60.86
[Q] Episode 200/1500 | Mean(Last50): -104.76
[Q] Episode 400/1500 | Mean(Last50): -112.64
[Q] Episode 600/1500 | Mean(Last50): -107.18
[Q] Episode 800/1500 | Mean(Last50): -101.52
[Q] Episode 1000/1500 | Mean(Last50): -87.14
[Q] Episode 1200/1500 | Mean(Last50): -72.74
[Q] Episode 1400/1500 | Mean(Last50): -62.86
[Q] Episode 200/1500 | Mean(Last50): -114.66
[Q] Episode 400/1500 | Mean(Last50): -95.68
[Q] Episode 600/1500 | Mean(Last50): -97.02
[Q] Episode 800/1500 | Mean(Last50): -103.42
[Q] Episode 1000/1500 | Mean(Last50): -100.44
[Q] Episode 1200/1500 | Mean(Last50): -64.64
[Q] Episode 1400/1500 | Mean(Last50): -53.68
[Q] Episode 200/1500 | Mean(Last50): -114.86
[Q] Episode 

[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


[Q] Episode 1400/1500 | Mean(Last50): -57.76
Run finished | α=0.01, γ=1, param=0.1 → avg_eval=-63.85 ± 25.09


0,1
mean_eval_return,▁
mean_final_mean_reward_last50,▁
seed,▁▃▅▆█
seed_eval_return,▄▁█▁▆
seed_final_mean_reward,▅▁▆█▃
std_eval_return,▁
std_final_mean_reward_last50,▁

0,1
mean_eval_return,-63.85
mean_final_mean_reward_last50,-61.852
seed,4.0
seed_eval_return,-43.95
seed_final_mean_reward,-65.26
std_eval_return,25.08904
std_final_mean_reward_last50,4.87593


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: ra4znxh2 with config:
[34m[1mwandb[0m: 	algorithm: q_learning
[34m[1mwandb[0m: 	alpha: 0.1
[34m[1mwandb[0m: 	env_name: std_q_tp1.0_ss(0,4)_strateps_greedy
[34m[1mwandb[0m: 	gamma: 0.7
[34m[1mwandb[0m: 	param: 0.001
[34m[1mwandb[0m: 	strategy: eps_greedy


[Q] Episode 200/1500 | Mean(Last50): -90.60
[Q] Episode 400/1500 | Mean(Last50): -59.60
[Q] Episode 600/1500 | Mean(Last50): -39.52
[Q] Episode 800/1500 | Mean(Last50): -29.96
[Q] Episode 1000/1500 | Mean(Last50): -32.46
[Q] Episode 1200/1500 | Mean(Last50): -28.56
[Q] Episode 1400/1500 | Mean(Last50): -27.50
[Q] Episode 200/1500 | Mean(Last50): -81.20
[Q] Episode 400/1500 | Mean(Last50): -62.14
[Q] Episode 600/1500 | Mean(Last50): -31.34
[Q] Episode 800/1500 | Mean(Last50): -32.00
[Q] Episode 1000/1500 | Mean(Last50): -26.34
[Q] Episode 1200/1500 | Mean(Last50): -32.68
[Q] Episode 1400/1500 | Mean(Last50): -34.98
[Q] Episode 200/1500 | Mean(Last50): -96.56
[Q] Episode 400/1500 | Mean(Last50): -54.64
[Q] Episode 600/1500 | Mean(Last50): -40.14
[Q] Episode 800/1500 | Mean(Last50): -33.50
[Q] Episode 1000/1500 | Mean(Last50): -27.74
[Q] Episode 1200/1500 | Mean(Last50): -28.34
[Q] Episode 1400/1500 | Mean(Last50): -27.36
[Q] Episode 200/1500 | Mean(Last50): -90.26
[Q] Episode 400/1500 | 

[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


0,1
mean_eval_return,▁
mean_final_mean_reward_last50,▁
seed,▁▃▅▆█
seed_eval_return,▄▅▁█▄
seed_final_mean_reward,▂▁▄█▅
std_eval_return,▁
std_final_mean_reward_last50,▁

0,1
mean_eval_return,-29.01
mean_final_mean_reward_last50,-30.5
seed,4.0
seed_eval_return,-29.15
seed_final_mean_reward,-29.8
std_eval_return,1.59606
std_final_mean_reward_last50,3.60489


[34m[1mwandb[0m: Agent Starting Run: 6szqfg37 with config:
[34m[1mwandb[0m: 	algorithm: q_learning
[34m[1mwandb[0m: 	alpha: 0.1
[34m[1mwandb[0m: 	env_name: std_q_tp1.0_ss(0,4)_strateps_greedy
[34m[1mwandb[0m: 	gamma: 0.7
[34m[1mwandb[0m: 	param: 0.01
[34m[1mwandb[0m: 	strategy: eps_greedy


[Q] Episode 200/1500 | Mean(Last50): -89.76
[Q] Episode 400/1500 | Mean(Last50): -74.44
[Q] Episode 600/1500 | Mean(Last50): -43.48
[Q] Episode 800/1500 | Mean(Last50): -29.46
[Q] Episode 1000/1500 | Mean(Last50): -31.34
[Q] Episode 1200/1500 | Mean(Last50): -26.82
[Q] Episode 1400/1500 | Mean(Last50): -32.34
[Q] Episode 200/1500 | Mean(Last50): -85.98
[Q] Episode 400/1500 | Mean(Last50): -50.50
[Q] Episode 600/1500 | Mean(Last50): -32.58
[Q] Episode 800/1500 | Mean(Last50): -30.56
[Q] Episode 1000/1500 | Mean(Last50): -26.60
[Q] Episode 1200/1500 | Mean(Last50): -28.36
[Q] Episode 1400/1500 | Mean(Last50): -28.60
[Q] Episode 200/1500 | Mean(Last50): -97.06
[Q] Episode 400/1500 | Mean(Last50): -56.48
[Q] Episode 600/1500 | Mean(Last50): -34.22
[Q] Episode 800/1500 | Mean(Last50): -32.48
[Q] Episode 1000/1500 | Mean(Last50): -27.94
[Q] Episode 1200/1500 | Mean(Last50): -26.90
[Q] Episode 1400/1500 | Mean(Last50): -27.04
[Q] Episode 200/1500 | Mean(Last50): -91.58
[Q] Episode 400/1500 | 

[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


[Q] Episode 1400/1500 | Mean(Last50): -31.38
Run finished | α=0.1, γ=0.7, param=0.01 → avg_eval=-26.85 ± 3.11


0,1
mean_eval_return,▁
mean_final_mean_reward_last50,▁
seed,▁▃▅▆█
seed_eval_return,▅█▄▁▄
seed_final_mean_reward,▁█▅▆▃
std_eval_return,▁
std_final_mean_reward_last50,▁

0,1
mean_eval_return,-26.85
mean_final_mean_reward_last50,-29.204
seed,4.0
seed_eval_return,-27.25
seed_final_mean_reward,-31.9
std_eval_return,3.11079
std_final_mean_reward_last50,3.30736


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: 9mip18aj with config:
[34m[1mwandb[0m: 	algorithm: q_learning
[34m[1mwandb[0m: 	alpha: 0.1
[34m[1mwandb[0m: 	env_name: std_q_tp1.0_ss(0,4)_strateps_greedy
[34m[1mwandb[0m: 	gamma: 0.7
[34m[1mwandb[0m: 	param: 0.05
[34m[1mwandb[0m: 	strategy: eps_greedy


[Q] Episode 200/1500 | Mean(Last50): -112.32
[Q] Episode 400/1500 | Mean(Last50): -52.90
[Q] Episode 600/1500 | Mean(Last50): -37.54
[Q] Episode 800/1500 | Mean(Last50): -38.56
[Q] Episode 1000/1500 | Mean(Last50): -43.64
[Q] Episode 1200/1500 | Mean(Last50): -44.74
[Q] Episode 1400/1500 | Mean(Last50): -38.06
[Q] Episode 200/1500 | Mean(Last50): -95.42
[Q] Episode 400/1500 | Mean(Last50): -69.06
[Q] Episode 600/1500 | Mean(Last50): -50.82
[Q] Episode 800/1500 | Mean(Last50): -56.54
[Q] Episode 1000/1500 | Mean(Last50): -38.16
[Q] Episode 1200/1500 | Mean(Last50): -43.02
[Q] Episode 1400/1500 | Mean(Last50): -54.00
[Q] Episode 200/1500 | Mean(Last50): -105.28
[Q] Episode 400/1500 | Mean(Last50): -78.60
[Q] Episode 600/1500 | Mean(Last50): -46.38
[Q] Episode 800/1500 | Mean(Last50): -44.32
[Q] Episode 1000/1500 | Mean(Last50): -33.90
[Q] Episode 1200/1500 | Mean(Last50): -28.38
[Q] Episode 1400/1500 | Mean(Last50): -47.76
[Q] Episode 200/1500 | Mean(Last50): -101.00
[Q] Episode 400/1500

[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


Run finished | α=0.1, γ=0.7, param=0.05 → avg_eval=-26.05 ± 2.78


0,1
mean_eval_return,▁
mean_final_mean_reward_last50,▁
seed,▁▃▅▆█
seed_eval_return,▃▁▇▅█
seed_final_mean_reward,▆▁▂▅█
std_eval_return,▁
std_final_mean_reward_last50,▁

0,1
mean_eval_return,-26.05
mean_final_mean_reward_last50,-43.972
seed,4.0
seed_eval_return,-22.75
seed_final_mean_reward,-33.48
std_eval_return,2.78478
std_final_mean_reward_last50,7.28753


[34m[1mwandb[0m: Agent Starting Run: 1eazf3os with config:
[34m[1mwandb[0m: 	algorithm: q_learning
[34m[1mwandb[0m: 	alpha: 0.1
[34m[1mwandb[0m: 	env_name: std_q_tp1.0_ss(0,4)_strateps_greedy
[34m[1mwandb[0m: 	gamma: 0.7
[34m[1mwandb[0m: 	param: 0.1
[34m[1mwandb[0m: 	strategy: eps_greedy


[Q] Episode 200/1500 | Mean(Last50): -107.62
[Q] Episode 400/1500 | Mean(Last50): -80.74
[Q] Episode 600/1500 | Mean(Last50): -69.14
[Q] Episode 800/1500 | Mean(Last50): -47.72
[Q] Episode 1000/1500 | Mean(Last50): -42.98
[Q] Episode 1200/1500 | Mean(Last50): -50.30
[Q] Episode 1400/1500 | Mean(Last50): -48.18
[Q] Episode 200/1500 | Mean(Last50): -99.86
[Q] Episode 400/1500 | Mean(Last50): -84.10
[Q] Episode 600/1500 | Mean(Last50): -64.00
[Q] Episode 800/1500 | Mean(Last50): -58.14
[Q] Episode 1000/1500 | Mean(Last50): -50.04
[Q] Episode 1200/1500 | Mean(Last50): -49.54
[Q] Episode 1400/1500 | Mean(Last50): -43.50
[Q] Episode 200/1500 | Mean(Last50): -110.78
[Q] Episode 400/1500 | Mean(Last50): -80.00
[Q] Episode 600/1500 | Mean(Last50): -72.02
[Q] Episode 800/1500 | Mean(Last50): -49.50
[Q] Episode 1000/1500 | Mean(Last50): -59.42
[Q] Episode 1200/1500 | Mean(Last50): -57.06
[Q] Episode 1400/1500 | Mean(Last50): -48.50
[Q] Episode 200/1500 | Mean(Last50): -114.56
[Q] Episode 400/1500

[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


[Q] Episode 1400/1500 | Mean(Last50): -45.78
Run finished | α=0.1, γ=0.7, param=0.1 → avg_eval=-25.73 ± 3.16


0,1
mean_eval_return,▁
mean_final_mean_reward_last50,▁
seed,▁▃▅▆█
seed_eval_return,▅▃▃█▁
seed_final_mean_reward,█▁▅▁▂
std_eval_return,▁
std_final_mean_reward_last50,▁

0,1
mean_eval_return,-25.73
mean_final_mean_reward_last50,-56.424
seed,4.0
seed_eval_return,-29.7
seed_final_mean_reward,-61.46
std_eval_return,3.15668
std_final_mean_reward_last50,8.63295


[34m[1mwandb[0m: Agent Starting Run: fdgjbkfu with config:
[34m[1mwandb[0m: 	algorithm: q_learning
[34m[1mwandb[0m: 	alpha: 0.1
[34m[1mwandb[0m: 	env_name: std_q_tp1.0_ss(0,4)_strateps_greedy
[34m[1mwandb[0m: 	gamma: 0.8
[34m[1mwandb[0m: 	param: 0.001
[34m[1mwandb[0m: 	strategy: eps_greedy


[Q] Episode 200/1500 | Mean(Last50): -61.42
[Q] Episode 400/1500 | Mean(Last50): -25.72
[Q] Episode 600/1500 | Mean(Last50): -40.58
[Q] Episode 800/1500 | Mean(Last50): -14.48
[Q] Episode 1000/1500 | Mean(Last50): -35.84
[Q] Episode 1200/1500 | Mean(Last50): -26.20
[Q] Episode 1400/1500 | Mean(Last50): -27.18
[Q] Episode 200/1500 | Mean(Last50): -69.84
[Q] Episode 400/1500 | Mean(Last50): -18.22
[Q] Episode 600/1500 | Mean(Last50): -34.92
[Q] Episode 800/1500 | Mean(Last50): -32.64
[Q] Episode 1000/1500 | Mean(Last50): -18.82
[Q] Episode 1200/1500 | Mean(Last50): -25.70
[Q] Episode 1400/1500 | Mean(Last50): -27.42
[Q] Episode 200/1500 | Mean(Last50): -79.56
[Q] Episode 400/1500 | Mean(Last50): -35.86
[Q] Episode 600/1500 | Mean(Last50): -30.88
[Q] Episode 800/1500 | Mean(Last50): -18.92
[Q] Episode 1000/1500 | Mean(Last50): -16.38
[Q] Episode 1200/1500 | Mean(Last50): -15.80
[Q] Episode 1400/1500 | Mean(Last50): -15.08
[Q] Episode 200/1500 | Mean(Last50): -75.60
[Q] Episode 400/1500 | 

[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


Run finished | α=0.1, γ=0.8, param=0.001 → avg_eval=-21.90 ± 5.01


0,1
mean_eval_return,▁
mean_final_mean_reward_last50,▁
seed,▁▃▅▆█
seed_eval_return,▃▁█▅▁
seed_final_mean_reward,▃▁█▆▁
std_eval_return,▁
std_final_mean_reward_last50,▁

0,1
mean_eval_return,-21.9
mean_final_mean_reward_last50,-23.012
seed,4.0
seed_eval_return,-26.15
seed_final_mean_reward,-28.8
std_eval_return,5.01338
std_final_mean_reward_last50,5.54712


[34m[1mwandb[0m: Agent Starting Run: 3o3tb7ji with config:
[34m[1mwandb[0m: 	algorithm: q_learning
[34m[1mwandb[0m: 	alpha: 0.1
[34m[1mwandb[0m: 	env_name: std_q_tp1.0_ss(0,4)_strateps_greedy
[34m[1mwandb[0m: 	gamma: 0.8
[34m[1mwandb[0m: 	param: 0.01
[34m[1mwandb[0m: 	strategy: eps_greedy


[Q] Episode 200/1500 | Mean(Last50): -71.06
[Q] Episode 400/1500 | Mean(Last50): -35.24
[Q] Episode 600/1500 | Mean(Last50): -14.88
[Q] Episode 800/1500 | Mean(Last50): -16.18
[Q] Episode 1000/1500 | Mean(Last50): -20.18
[Q] Episode 1200/1500 | Mean(Last50): -26.80
[Q] Episode 1400/1500 | Mean(Last50): -14.94
[Q] Episode 200/1500 | Mean(Last50): -76.64
[Q] Episode 400/1500 | Mean(Last50): -54.02
[Q] Episode 600/1500 | Mean(Last50): -40.16
[Q] Episode 800/1500 | Mean(Last50): -15.64
[Q] Episode 1000/1500 | Mean(Last50): -15.14
[Q] Episode 1200/1500 | Mean(Last50): -15.82
[Q] Episode 1400/1500 | Mean(Last50): -31.02
[Q] Episode 200/1500 | Mean(Last50): -48.56
[Q] Episode 400/1500 | Mean(Last50): -33.90
[Q] Episode 600/1500 | Mean(Last50): -20.98
[Q] Episode 800/1500 | Mean(Last50): -19.04
[Q] Episode 1000/1500 | Mean(Last50): -15.00
[Q] Episode 1200/1500 | Mean(Last50): -25.08
[Q] Episode 1400/1500 | Mean(Last50): -35.60
[Q] Episode 200/1500 | Mean(Last50): -88.92
[Q] Episode 400/1500 | 

[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


[Q] Episode 1400/1500 | Mean(Last50): -19.94
Run finished | α=0.1, γ=0.8, param=0.01 → avg_eval=-23.44 ± 8.21


0,1
mean_eval_return,▁
mean_final_mean_reward_last50,▁
seed,▁▃▅▆█
seed_eval_return,█▁▃█▇
seed_final_mean_reward,█▂▁▂▄
std_eval_return,▁
std_final_mean_reward_last50,▁

0,1
mean_eval_return,-23.44
mean_final_mean_reward_last50,-23.112
seed,4.0
seed_eval_return,-18.7
seed_final_mean_reward,-21.86
std_eval_return,8.20527
std_final_mean_reward_last50,3.69409


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: ymvk3m95 with config:
[34m[1mwandb[0m: 	algorithm: q_learning
[34m[1mwandb[0m: 	alpha: 0.1
[34m[1mwandb[0m: 	env_name: std_q_tp1.0_ss(0,4)_strateps_greedy
[34m[1mwandb[0m: 	gamma: 0.8
[34m[1mwandb[0m: 	param: 0.05
[34m[1mwandb[0m: 	strategy: eps_greedy


[Q] Episode 200/1500 | Mean(Last50): -99.68
[Q] Episode 400/1500 | Mean(Last50): -30.42
[Q] Episode 600/1500 | Mean(Last50): -15.64
[Q] Episode 800/1500 | Mean(Last50): -17.32
[Q] Episode 1000/1500 | Mean(Last50): -27.42
[Q] Episode 1200/1500 | Mean(Last50): -16.64
[Q] Episode 1400/1500 | Mean(Last50): -28.20
[Q] Episode 200/1500 | Mean(Last50): -87.80
[Q] Episode 400/1500 | Mean(Last50): -37.66
[Q] Episode 600/1500 | Mean(Last50): -20.18
[Q] Episode 800/1500 | Mean(Last50): -20.92
[Q] Episode 1000/1500 | Mean(Last50): -19.68
[Q] Episode 1200/1500 | Mean(Last50): -18.74
[Q] Episode 1400/1500 | Mean(Last50): -18.92
[Q] Episode 200/1500 | Mean(Last50): -94.84
[Q] Episode 400/1500 | Mean(Last50): -23.20
[Q] Episode 600/1500 | Mean(Last50): -40.56
[Q] Episode 800/1500 | Mean(Last50): -18.46
[Q] Episode 1000/1500 | Mean(Last50): -35.76
[Q] Episode 1200/1500 | Mean(Last50): -16.42
[Q] Episode 1400/1500 | Mean(Last50): -15.48
[Q] Episode 200/1500 | Mean(Last50): -79.12
[Q] Episode 400/1500 | 

[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


Run finished | α=0.1, γ=0.8, param=0.05 → avg_eval=-20.36 ± 5.81


0,1
mean_eval_return,▁
mean_final_mean_reward_last50,▁
seed,▁▃▅▆█
seed_eval_return,▁▇▆█▅
seed_final_mean_reward,▅▃█▁▇
std_eval_return,▁
std_final_mean_reward_last50,▁

0,1
mean_eval_return,-20.36
mean_final_mean_reward_last50,-24.792
seed,4.0
seed_eval_return,-21.3
seed_final_mean_reward,-19.52
std_eval_return,5.80942
std_final_mean_reward_last50,5.9196


[34m[1mwandb[0m: Agent Starting Run: cjx0qojp with config:
[34m[1mwandb[0m: 	algorithm: q_learning
[34m[1mwandb[0m: 	alpha: 0.1
[34m[1mwandb[0m: 	env_name: std_q_tp1.0_ss(0,4)_strateps_greedy
[34m[1mwandb[0m: 	gamma: 0.8
[34m[1mwandb[0m: 	param: 0.1
[34m[1mwandb[0m: 	strategy: eps_greedy


[Q] Episode 200/1500 | Mean(Last50): -118.30
[Q] Episode 400/1500 | Mean(Last50): -35.02
[Q] Episode 600/1500 | Mean(Last50): -19.98
[Q] Episode 800/1500 | Mean(Last50): -27.44
[Q] Episode 1000/1500 | Mean(Last50): -63.28
[Q] Episode 1200/1500 | Mean(Last50): -23.62
[Q] Episode 1400/1500 | Mean(Last50): -30.66
[Q] Episode 200/1500 | Mean(Last50): -85.32
[Q] Episode 400/1500 | Mean(Last50): -24.10
[Q] Episode 600/1500 | Mean(Last50): -26.02
[Q] Episode 800/1500 | Mean(Last50): -34.60
[Q] Episode 1000/1500 | Mean(Last50): -25.00
[Q] Episode 1200/1500 | Mean(Last50): -21.30
[Q] Episode 1400/1500 | Mean(Last50): -38.88
[Q] Episode 200/1500 | Mean(Last50): -74.88
[Q] Episode 400/1500 | Mean(Last50): -36.96
[Q] Episode 600/1500 | Mean(Last50): -44.50
[Q] Episode 800/1500 | Mean(Last50): -21.18
[Q] Episode 1000/1500 | Mean(Last50): -36.70
[Q] Episode 1200/1500 | Mean(Last50): -21.88
[Q] Episode 1400/1500 | Mean(Last50): -64.82
[Q] Episode 200/1500 | Mean(Last50): -107.36
[Q] Episode 400/1500 

[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


Run finished | α=0.1, γ=0.8, param=0.1 → avg_eval=-22.19 ± 4.07


0,1
mean_eval_return,▁
mean_final_mean_reward_last50,▁
seed,▁▃▅▆█
seed_eval_return,██▄▁▂
seed_final_mean_reward,██▂▁▁
std_eval_return,▁
std_final_mean_reward_last50,▁

0,1
mean_eval_return,-22.19
mean_final_mean_reward_last50,-39.528
seed,4.0
seed_eval_return,-25.6
seed_final_mean_reward,-54.78
std_eval_return,4.07362
std_final_mean_reward_last50,15.83623


[34m[1mwandb[0m: Agent Starting Run: 6f0175uv with config:
[34m[1mwandb[0m: 	algorithm: q_learning
[34m[1mwandb[0m: 	alpha: 0.1
[34m[1mwandb[0m: 	env_name: std_q_tp1.0_ss(0,4)_strateps_greedy
[34m[1mwandb[0m: 	gamma: 0.9
[34m[1mwandb[0m: 	param: 0.001
[34m[1mwandb[0m: 	strategy: eps_greedy


[Q] Episode 200/1500 | Mean(Last50): -39.60
[Q] Episode 400/1500 | Mean(Last50): -17.02
[Q] Episode 600/1500 | Mean(Last50): -18.22
[Q] Episode 800/1500 | Mean(Last50): -15.80
[Q] Episode 1000/1500 | Mean(Last50): -17.54
[Q] Episode 1200/1500 | Mean(Last50): -13.38
[Q] Episode 1400/1500 | Mean(Last50): -15.52
[Q] Episode 200/1500 | Mean(Last50): -53.00
[Q] Episode 400/1500 | Mean(Last50): -21.66
[Q] Episode 600/1500 | Mean(Last50): -15.04
[Q] Episode 800/1500 | Mean(Last50): -13.42
[Q] Episode 1000/1500 | Mean(Last50): -16.42
[Q] Episode 1200/1500 | Mean(Last50): -15.76
[Q] Episode 1400/1500 | Mean(Last50): -16.60
[Q] Episode 200/1500 | Mean(Last50): -69.02
[Q] Episode 400/1500 | Mean(Last50): -19.22
[Q] Episode 600/1500 | Mean(Last50): -17.06
[Q] Episode 800/1500 | Mean(Last50): -16.64
[Q] Episode 1000/1500 | Mean(Last50): -15.62
[Q] Episode 1200/1500 | Mean(Last50): -14.62
[Q] Episode 1400/1500 | Mean(Last50): -16.60
[Q] Episode 200/1500 | Mean(Last50): -71.64
[Q] Episode 400/1500 | 

[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


[Q] Episode 1200/1500 | Mean(Last50): -16.46
[Q] Episode 1400/1500 | Mean(Last50): -14.24
Run finished | α=0.1, γ=0.9, param=0.001 → avg_eval=-14.77 ± 1.20


0,1
mean_eval_return,▁
mean_final_mean_reward_last50,▁
seed,▁▃▅▆█
seed_eval_return,▁▅█▁▆
seed_final_mean_reward,▁█▅▄▃
std_eval_return,▁
std_final_mean_reward_last50,▁

0,1
mean_eval_return,-14.77
mean_final_mean_reward_last50,-15.02
seed,4.0
seed_eval_return,-14.1
seed_final_mean_reward,-15.58
std_eval_return,1.20192
std_final_mean_reward_last50,1.05292


[34m[1mwandb[0m: Agent Starting Run: mlf5lhom with config:
[34m[1mwandb[0m: 	algorithm: q_learning
[34m[1mwandb[0m: 	alpha: 0.1
[34m[1mwandb[0m: 	env_name: std_q_tp1.0_ss(0,4)_strateps_greedy
[34m[1mwandb[0m: 	gamma: 0.9
[34m[1mwandb[0m: 	param: 0.01
[34m[1mwandb[0m: 	strategy: eps_greedy


[Q] Episode 200/1500 | Mean(Last50): -57.52
[Q] Episode 400/1500 | Mean(Last50): -18.10
[Q] Episode 600/1500 | Mean(Last50): -18.16
[Q] Episode 800/1500 | Mean(Last50): -18.74
[Q] Episode 1000/1500 | Mean(Last50): -15.94
[Q] Episode 1200/1500 | Mean(Last50): -17.20
[Q] Episode 1400/1500 | Mean(Last50): -14.82
[Q] Episode 200/1500 | Mean(Last50): -67.10
[Q] Episode 400/1500 | Mean(Last50): -14.64
[Q] Episode 600/1500 | Mean(Last50): -15.62
[Q] Episode 800/1500 | Mean(Last50): -14.68
[Q] Episode 1000/1500 | Mean(Last50): -15.98
[Q] Episode 1200/1500 | Mean(Last50): -15.48
[Q] Episode 1400/1500 | Mean(Last50): -14.86
[Q] Episode 200/1500 | Mean(Last50): -61.38
[Q] Episode 400/1500 | Mean(Last50): -18.16
[Q] Episode 600/1500 | Mean(Last50): -15.24
[Q] Episode 800/1500 | Mean(Last50): -16.82
[Q] Episode 1000/1500 | Mean(Last50): -14.14
[Q] Episode 1200/1500 | Mean(Last50): -16.92
[Q] Episode 1400/1500 | Mean(Last50): -16.16
[Q] Episode 200/1500 | Mean(Last50): -49.34
[Q] Episode 400/1500 | 

[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


0,1
mean_eval_return,▁
mean_final_mean_reward_last50,▁
seed,▁▃▅▆█
seed_eval_return,▃█▆▇▁
seed_final_mean_reward,▇█▁▁▃
std_eval_return,▁
std_final_mean_reward_last50,▁

0,1
mean_eval_return,-14.61
mean_final_mean_reward_last50,-15.68
seed,4.0
seed_eval_return,-16.3
seed_final_mean_reward,-16.12
std_eval_return,1.09243
std_final_mean_reward_last50,1.03135


[34m[1mwandb[0m: Agent Starting Run: 5gg7h60a with config:
[34m[1mwandb[0m: 	algorithm: q_learning
[34m[1mwandb[0m: 	alpha: 0.1
[34m[1mwandb[0m: 	env_name: std_q_tp1.0_ss(0,4)_strateps_greedy
[34m[1mwandb[0m: 	gamma: 0.9
[34m[1mwandb[0m: 	param: 0.05
[34m[1mwandb[0m: 	strategy: eps_greedy


[Q] Episode 200/1500 | Mean(Last50): -65.72
[Q] Episode 400/1500 | Mean(Last50): -19.42
[Q] Episode 600/1500 | Mean(Last50): -17.16
[Q] Episode 800/1500 | Mean(Last50): -18.06
[Q] Episode 1000/1500 | Mean(Last50): -15.40
[Q] Episode 1200/1500 | Mean(Last50): -16.92
[Q] Episode 1400/1500 | Mean(Last50): -16.46
[Q] Episode 200/1500 | Mean(Last50): -58.38
[Q] Episode 400/1500 | Mean(Last50): -18.78
[Q] Episode 600/1500 | Mean(Last50): -22.34
[Q] Episode 800/1500 | Mean(Last50): -16.80
[Q] Episode 1000/1500 | Mean(Last50): -18.54
[Q] Episode 1200/1500 | Mean(Last50): -17.38
[Q] Episode 1400/1500 | Mean(Last50): -15.98
[Q] Episode 200/1500 | Mean(Last50): -62.78
[Q] Episode 400/1500 | Mean(Last50): -22.96
[Q] Episode 600/1500 | Mean(Last50): -19.98
[Q] Episode 800/1500 | Mean(Last50): -16.44
[Q] Episode 1000/1500 | Mean(Last50): -16.66
[Q] Episode 1200/1500 | Mean(Last50): -16.94
[Q] Episode 1400/1500 | Mean(Last50): -18.04
[Q] Episode 200/1500 | Mean(Last50): -55.68
[Q] Episode 400/1500 | 

[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


0,1
mean_eval_return,▁
mean_final_mean_reward_last50,▁
seed,▁▃▅▆█
seed_eval_return,█▅▆▁▁
seed_final_mean_reward,▂██▆▁
std_eval_return,▁
std_final_mean_reward_last50,▁

0,1
mean_eval_return,-15.55
mean_final_mean_reward_last50,-18.172
seed,4.0
seed_eval_return,-17.55
seed_final_mean_reward,-20.3
std_eval_return,1.84472
std_final_mean_reward_last50,1.58233


[34m[1mwandb[0m: Agent Starting Run: osx73c0c with config:
[34m[1mwandb[0m: 	algorithm: q_learning
[34m[1mwandb[0m: 	alpha: 0.1
[34m[1mwandb[0m: 	env_name: std_q_tp1.0_ss(0,4)_strateps_greedy
[34m[1mwandb[0m: 	gamma: 0.9
[34m[1mwandb[0m: 	param: 0.1
[34m[1mwandb[0m: 	strategy: eps_greedy


[Q] Episode 200/1500 | Mean(Last50): -86.10
[Q] Episode 400/1500 | Mean(Last50): -19.40
[Q] Episode 600/1500 | Mean(Last50): -19.08
[Q] Episode 800/1500 | Mean(Last50): -18.56
[Q] Episode 1000/1500 | Mean(Last50): -17.64
[Q] Episode 1200/1500 | Mean(Last50): -20.50
[Q] Episode 1400/1500 | Mean(Last50): -17.26
[Q] Episode 200/1500 | Mean(Last50): -73.02
[Q] Episode 400/1500 | Mean(Last50): -22.48
[Q] Episode 600/1500 | Mean(Last50): -24.72
[Q] Episode 800/1500 | Mean(Last50): -22.62
[Q] Episode 1000/1500 | Mean(Last50): -18.26
[Q] Episode 1200/1500 | Mean(Last50): -18.62
[Q] Episode 1400/1500 | Mean(Last50): -18.50
[Q] Episode 200/1500 | Mean(Last50): -50.42
[Q] Episode 400/1500 | Mean(Last50): -33.52
[Q] Episode 600/1500 | Mean(Last50): -23.16
[Q] Episode 800/1500 | Mean(Last50): -18.88
[Q] Episode 1000/1500 | Mean(Last50): -17.48
[Q] Episode 1200/1500 | Mean(Last50): -22.38
[Q] Episode 1400/1500 | Mean(Last50): -21.44
[Q] Episode 200/1500 | Mean(Last50): -60.20
[Q] Episode 400/1500 | 

[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


[Q] Episode 1400/1500 | Mean(Last50): -24.66
Run finished | α=0.1, γ=0.9, param=0.1 → avg_eval=-16.04 ± 1.07


0,1
mean_eval_return,▁
mean_final_mean_reward_last50,▁
seed,▁▃▅▆█
seed_eval_return,▁▁▇█▃
seed_final_mean_reward,▄▅█▂▁
std_eval_return,▁
std_final_mean_reward_last50,▁

0,1
mean_eval_return,-16.04
mean_final_mean_reward_last50,-20.168
seed,4.0
seed_eval_return,-16.55
seed_final_mean_reward,-21.64
std_eval_return,1.07257
std_final_mean_reward_last50,1.14155


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: 2nms7m98 with config:
[34m[1mwandb[0m: 	algorithm: q_learning
[34m[1mwandb[0m: 	alpha: 0.1
[34m[1mwandb[0m: 	env_name: std_q_tp1.0_ss(0,4)_strateps_greedy
[34m[1mwandb[0m: 	gamma: 1
[34m[1mwandb[0m: 	param: 0.001
[34m[1mwandb[0m: 	strategy: eps_greedy


[Q] Episode 200/1500 | Mean(Last50): -64.00
[Q] Episode 400/1500 | Mean(Last50): -16.48
[Q] Episode 600/1500 | Mean(Last50): -16.76
[Q] Episode 800/1500 | Mean(Last50): -14.28
[Q] Episode 1000/1500 | Mean(Last50): -14.82
[Q] Episode 1200/1500 | Mean(Last50): -13.18
[Q] Episode 1400/1500 | Mean(Last50): -16.20
[Q] Episode 200/1500 | Mean(Last50): -54.66
[Q] Episode 400/1500 | Mean(Last50): -20.32
[Q] Episode 600/1500 | Mean(Last50): -15.42
[Q] Episode 800/1500 | Mean(Last50): -16.08
[Q] Episode 1000/1500 | Mean(Last50): -15.88
[Q] Episode 1200/1500 | Mean(Last50): -18.28
[Q] Episode 1400/1500 | Mean(Last50): -15.88
[Q] Episode 200/1500 | Mean(Last50): -48.72
[Q] Episode 400/1500 | Mean(Last50): -17.54
[Q] Episode 600/1500 | Mean(Last50): -26.12
[Q] Episode 800/1500 | Mean(Last50): -15.18
[Q] Episode 1000/1500 | Mean(Last50): -20.80
[Q] Episode 1200/1500 | Mean(Last50): -17.64
[Q] Episode 1400/1500 | Mean(Last50): -14.00
[Q] Episode 200/1500 | Mean(Last50): -63.22
[Q] Episode 400/1500 | 

[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


[Q] Episode 1400/1500 | Mean(Last50): -14.00
Run finished | α=0.1, γ=1, param=0.001 → avg_eval=-16.94 ± 2.60


0,1
mean_eval_return,▁
mean_final_mean_reward_last50,▁
seed,▁▃▅▆█
seed_eval_return,▁▆▅▅█
seed_final_mean_reward,▁▅▄▄█
std_eval_return,▁
std_final_mean_reward_last50,▁

0,1
mean_eval_return,-16.94
mean_final_mean_reward_last50,-16.204
seed,4.0
seed_eval_return,-13.65
seed_final_mean_reward,-14.8
std_eval_return,2.60027
std_final_mean_reward_last50,0.86083


[34m[1mwandb[0m: Agent Starting Run: oy8xkes5 with config:
[34m[1mwandb[0m: 	algorithm: q_learning
[34m[1mwandb[0m: 	alpha: 0.1
[34m[1mwandb[0m: 	env_name: std_q_tp1.0_ss(0,4)_strateps_greedy
[34m[1mwandb[0m: 	gamma: 1
[34m[1mwandb[0m: 	param: 0.01
[34m[1mwandb[0m: 	strategy: eps_greedy


[Q] Episode 200/1500 | Mean(Last50): -46.82
[Q] Episode 400/1500 | Mean(Last50): -14.90
[Q] Episode 600/1500 | Mean(Last50): -14.68
[Q] Episode 800/1500 | Mean(Last50): -18.56
[Q] Episode 1000/1500 | Mean(Last50): -15.64
[Q] Episode 1200/1500 | Mean(Last50): -14.50
[Q] Episode 1400/1500 | Mean(Last50): -15.90
[Q] Episode 200/1500 | Mean(Last50): -49.44
[Q] Episode 400/1500 | Mean(Last50): -20.96
[Q] Episode 600/1500 | Mean(Last50): -14.78
[Q] Episode 800/1500 | Mean(Last50): -14.26
[Q] Episode 1000/1500 | Mean(Last50): -16.00
[Q] Episode 1200/1500 | Mean(Last50): -14.58
[Q] Episode 1400/1500 | Mean(Last50): -15.32
[Q] Episode 200/1500 | Mean(Last50): -61.06
[Q] Episode 400/1500 | Mean(Last50): -17.28
[Q] Episode 600/1500 | Mean(Last50): -18.90
[Q] Episode 800/1500 | Mean(Last50): -13.92
[Q] Episode 1000/1500 | Mean(Last50): -14.78
[Q] Episode 1200/1500 | Mean(Last50): -20.18
[Q] Episode 1400/1500 | Mean(Last50): -14.84
[Q] Episode 200/1500 | Mean(Last50): -59.50
[Q] Episode 400/1500 | 

[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


[Q] Episode 1400/1500 | Mean(Last50): -16.02
Run finished | α=0.1, γ=1, param=0.01 → avg_eval=-16.92 ± 3.54


0,1
mean_eval_return,▁
mean_final_mean_reward_last50,▁
seed,▁▃▅▆█
seed_eval_return,▁▇▅▇█
seed_final_mean_reward,▁▅█▁▆
std_eval_return,▁
std_final_mean_reward_last50,▁

0,1
mean_eval_return,-16.92
mean_final_mean_reward_last50,-16.36
seed,4.0
seed_eval_return,-13.45
seed_final_mean_reward,-15.82
std_eval_return,3.53505
std_final_mean_reward_last50,0.89003


[34m[1mwandb[0m: Agent Starting Run: 0sn23io6 with config:
[34m[1mwandb[0m: 	algorithm: q_learning
[34m[1mwandb[0m: 	alpha: 0.1
[34m[1mwandb[0m: 	env_name: std_q_tp1.0_ss(0,4)_strateps_greedy
[34m[1mwandb[0m: 	gamma: 1
[34m[1mwandb[0m: 	param: 0.05
[34m[1mwandb[0m: 	strategy: eps_greedy


[Q] Episode 200/1500 | Mean(Last50): -56.72
[Q] Episode 400/1500 | Mean(Last50): -19.62
[Q] Episode 600/1500 | Mean(Last50): -18.50
[Q] Episode 800/1500 | Mean(Last50): -16.76
[Q] Episode 1000/1500 | Mean(Last50): -16.42
[Q] Episode 1200/1500 | Mean(Last50): -16.26
[Q] Episode 1400/1500 | Mean(Last50): -18.06
[Q] Episode 200/1500 | Mean(Last50): -53.44
[Q] Episode 400/1500 | Mean(Last50): -18.10
[Q] Episode 600/1500 | Mean(Last50): -18.84
[Q] Episode 800/1500 | Mean(Last50): -16.94
[Q] Episode 1000/1500 | Mean(Last50): -18.54
[Q] Episode 1200/1500 | Mean(Last50): -19.72
[Q] Episode 1400/1500 | Mean(Last50): -17.36
[Q] Episode 200/1500 | Mean(Last50): -47.54
[Q] Episode 400/1500 | Mean(Last50): -26.26
[Q] Episode 600/1500 | Mean(Last50): -18.64
[Q] Episode 800/1500 | Mean(Last50): -16.68
[Q] Episode 1000/1500 | Mean(Last50): -15.18
[Q] Episode 1200/1500 | Mean(Last50): -16.30
[Q] Episode 1400/1500 | Mean(Last50): -17.24
[Q] Episode 200/1500 | Mean(Last50): -53.62
[Q] Episode 400/1500 | 

[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


[Q] Episode 1400/1500 | Mean(Last50): -16.24
Run finished | α=0.1, γ=1, param=0.05 → avg_eval=-14.86 ± 1.90


0,1
mean_eval_return,▁
mean_final_mean_reward_last50,▁
seed,▁▃▅▆█
seed_eval_return,▁▂█▇▁
seed_final_mean_reward,▁█▇█▇
std_eval_return,▁
std_final_mean_reward_last50,▁

0,1
mean_eval_return,-14.86
mean_final_mean_reward_last50,-18.38
seed,4.0
seed_eval_return,-16.7
seed_final_mean_reward,-17.86
std_eval_return,1.89615
std_final_mean_reward_last50,1.51963


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: t8vu4pqj with config:
[34m[1mwandb[0m: 	algorithm: q_learning
[34m[1mwandb[0m: 	alpha: 0.1
[34m[1mwandb[0m: 	env_name: std_q_tp1.0_ss(0,4)_strateps_greedy
[34m[1mwandb[0m: 	gamma: 1
[34m[1mwandb[0m: 	param: 0.1
[34m[1mwandb[0m: 	strategy: eps_greedy


[Q] Episode 200/1500 | Mean(Last50): -67.98
[Q] Episode 400/1500 | Mean(Last50): -23.74
[Q] Episode 600/1500 | Mean(Last50): -18.20
[Q] Episode 800/1500 | Mean(Last50): -18.58
[Q] Episode 1000/1500 | Mean(Last50): -19.74
[Q] Episode 1200/1500 | Mean(Last50): -23.00
[Q] Episode 1400/1500 | Mean(Last50): -20.34
[Q] Episode 200/1500 | Mean(Last50): -65.24
[Q] Episode 400/1500 | Mean(Last50): -22.04
[Q] Episode 600/1500 | Mean(Last50): -18.74
[Q] Episode 800/1500 | Mean(Last50): -16.12
[Q] Episode 1000/1500 | Mean(Last50): -19.62
[Q] Episode 1200/1500 | Mean(Last50): -22.52
[Q] Episode 1400/1500 | Mean(Last50): -20.06
[Q] Episode 200/1500 | Mean(Last50): -52.24
[Q] Episode 400/1500 | Mean(Last50): -19.80
[Q] Episode 600/1500 | Mean(Last50): -19.84
[Q] Episode 800/1500 | Mean(Last50): -23.72
[Q] Episode 1000/1500 | Mean(Last50): -18.62
[Q] Episode 1200/1500 | Mean(Last50): -19.64
[Q] Episode 1400/1500 | Mean(Last50): -19.48
[Q] Episode 200/1500 | Mean(Last50): -46.98
[Q] Episode 400/1500 | 

[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


[Q] Episode 1000/1500 | Mean(Last50): -18.36
[Q] Episode 1200/1500 | Mean(Last50): -18.78
[Q] Episode 1400/1500 | Mean(Last50): -17.80
Run finished | α=0.1, γ=1, param=0.1 → avg_eval=-15.46 ± 1.27


0,1
mean_eval_return,▁
mean_final_mean_reward_last50,▁
seed,▁▃▅▆█
seed_eval_return,▃▅▁▅█
seed_final_mean_reward,▅▄▆▁█
std_eval_return,▁
std_final_mean_reward_last50,▁

0,1
mean_eval_return,-15.46
mean_final_mean_reward_last50,-19.04
seed,4.0
seed_eval_return,-13.55
seed_final_mean_reward,-17.54
std_eval_return,1.26783
std_final_mean_reward_last50,1.15329


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: 6ahobgvv with config:
[34m[1mwandb[0m: 	algorithm: q_learning
[34m[1mwandb[0m: 	alpha: 1
[34m[1mwandb[0m: 	env_name: std_q_tp1.0_ss(0,4)_strateps_greedy
[34m[1mwandb[0m: 	gamma: 0.7
[34m[1mwandb[0m: 	param: 0.001
[34m[1mwandb[0m: 	strategy: eps_greedy
[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin


[Q] Episode 200/1500 | Mean(Last50): -92.52
[Q] Episode 400/1500 | Mean(Last50): -55.98
[Q] Episode 600/1500 | Mean(Last50): -78.12
[Q] Episode 800/1500 | Mean(Last50): -100.00
[Q] Episode 1000/1500 | Mean(Last50): -100.00
[Q] Episode 1200/1500 | Mean(Last50): -100.00
[Q] Episode 1400/1500 | Mean(Last50): -100.00
[Q] Episode 200/1500 | Mean(Last50): -72.64
[Q] Episode 400/1500 | Mean(Last50): -54.76
[Q] Episode 600/1500 | Mean(Last50): -45.88
[Q] Episode 800/1500 | Mean(Last50): -59.28
[Q] Episode 1000/1500 | Mean(Last50): -40.00
[Q] Episode 1200/1500 | Mean(Last50): -62.04
[Q] Episode 1400/1500 | Mean(Last50): -100.00
[Q] Episode 200/1500 | Mean(Last50): -71.42
[Q] Episode 400/1500 | Mean(Last50): -74.26
[Q] Episode 600/1500 | Mean(Last50): -47.34
[Q] Episode 800/1500 | Mean(Last50): -45.92
[Q] Episode 1000/1500 | Mean(Last50): -65.00
[Q] Episode 1200/1500 | Mean(Last50): -100.00
[Q] Episode 1400/1500 | Mean(Last50): -100.00
[Q] Episode 200/1500 | Mean(Last50): -87.46
[Q] Episode 400/

[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


Run finished | α=1, γ=0.7, param=0.001 → avg_eval=-100.00 ± 0.00


0,1
mean_eval_return,▁
mean_final_mean_reward_last50,▁
seed,▁▃▅▆█
seed_eval_return,▁▁▁▁▁
seed_final_mean_reward,▁▁▁▁▁
std_eval_return,▁
std_final_mean_reward_last50,▁

0,1
mean_eval_return,-100
mean_final_mean_reward_last50,-100
seed,4
seed_eval_return,-100
seed_final_mean_reward,-100
std_eval_return,0
std_final_mean_reward_last50,0


[34m[1mwandb[0m: Agent Starting Run: xesc8zjg with config:
[34m[1mwandb[0m: 	algorithm: q_learning
[34m[1mwandb[0m: 	alpha: 1
[34m[1mwandb[0m: 	env_name: std_q_tp1.0_ss(0,4)_strateps_greedy
[34m[1mwandb[0m: 	gamma: 0.7
[34m[1mwandb[0m: 	param: 0.01
[34m[1mwandb[0m: 	strategy: eps_greedy


[Q] Episode 200/1500 | Mean(Last50): -76.20
[Q] Episode 400/1500 | Mean(Last50): -88.50
[Q] Episode 600/1500 | Mean(Last50): -70.66
[Q] Episode 800/1500 | Mean(Last50): -85.00
[Q] Episode 1000/1500 | Mean(Last50): -87.70
[Q] Episode 1200/1500 | Mean(Last50): -86.48
[Q] Episode 1400/1500 | Mean(Last50): -70.94
[Q] Episode 200/1500 | Mean(Last50): -94.46
[Q] Episode 400/1500 | Mean(Last50): -81.16
[Q] Episode 600/1500 | Mean(Last50): -77.52
[Q] Episode 800/1500 | Mean(Last50): -69.32
[Q] Episode 1000/1500 | Mean(Last50): -80.42
[Q] Episode 1200/1500 | Mean(Last50): -86.02
[Q] Episode 1400/1500 | Mean(Last50): -101.22
[Q] Episode 200/1500 | Mean(Last50): -94.02
[Q] Episode 400/1500 | Mean(Last50): -82.36
[Q] Episode 600/1500 | Mean(Last50): -86.98
[Q] Episode 800/1500 | Mean(Last50): -88.72
[Q] Episode 1000/1500 | Mean(Last50): -71.06
[Q] Episode 1200/1500 | Mean(Last50): -73.90
[Q] Episode 1400/1500 | Mean(Last50): -56.84
[Q] Episode 200/1500 | Mean(Last50): -62.40
[Q] Episode 400/1500 |

[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


Run finished | α=1, γ=0.7, param=0.01 → avg_eval=-77.12 ± 29.28


0,1
mean_eval_return,▁
mean_final_mean_reward_last50,▁
seed,▁▃▅▆█
seed_eval_return,█▁▅▁▁
seed_final_mean_reward,█▁▃▁▁
std_eval_return,▁
std_final_mean_reward_last50,▁

0,1
mean_eval_return,-77.12
mean_final_mean_reward_last50,-88.976
seed,4.0
seed_eval_return,-100.0
seed_final_mean_reward,-100.0
std_eval_return,29.27568
std_final_mean_reward_last50,17.2124


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: uk8hz2dc with config:
[34m[1mwandb[0m: 	algorithm: q_learning
[34m[1mwandb[0m: 	alpha: 1
[34m[1mwandb[0m: 	env_name: std_q_tp1.0_ss(0,4)_strateps_greedy
[34m[1mwandb[0m: 	gamma: 0.7
[34m[1mwandb[0m: 	param: 0.05
[34m[1mwandb[0m: 	strategy: eps_greedy


[Q] Episode 200/1500 | Mean(Last50): -109.04
[Q] Episode 400/1500 | Mean(Last50): -104.48
[Q] Episode 600/1500 | Mean(Last50): -110.64
[Q] Episode 800/1500 | Mean(Last50): -95.48
[Q] Episode 1000/1500 | Mean(Last50): -101.94
[Q] Episode 1200/1500 | Mean(Last50): -94.24
[Q] Episode 1400/1500 | Mean(Last50): -96.04
[Q] Episode 200/1500 | Mean(Last50): -91.36
[Q] Episode 400/1500 | Mean(Last50): -107.12
[Q] Episode 600/1500 | Mean(Last50): -133.64
[Q] Episode 800/1500 | Mean(Last50): -81.18
[Q] Episode 1000/1500 | Mean(Last50): -93.02
[Q] Episode 1200/1500 | Mean(Last50): -98.60
[Q] Episode 1400/1500 | Mean(Last50): -153.06
[Q] Episode 200/1500 | Mean(Last50): -96.96
[Q] Episode 400/1500 | Mean(Last50): -123.02
[Q] Episode 600/1500 | Mean(Last50): -113.62
[Q] Episode 800/1500 | Mean(Last50): -132.88
[Q] Episode 1000/1500 | Mean(Last50): -114.18
[Q] Episode 1200/1500 | Mean(Last50): -150.18
[Q] Episode 1400/1500 | Mean(Last50): -107.00
[Q] Episode 200/1500 | Mean(Last50): -99.34
[Q] Episod

[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


Run finished | α=1, γ=0.7, param=0.05 → avg_eval=-96.53 ± 15.50


0,1
mean_eval_return,▁
mean_final_mean_reward_last50,▁
seed,▁▃▅▆█
seed_eval_return,▁▄▅▄█
seed_final_mean_reward,▄▁▁▃█
std_eval_return,▁
std_final_mean_reward_last50,▁

0,1
mean_eval_return,-96.53
mean_final_mean_reward_last50,-125.628
seed,4.0
seed_eval_return,-71.95
seed_final_mean_reward,-102.66
std_eval_return,15.50128
std_final_mean_reward_last50,12.73432


[34m[1mwandb[0m: Agent Starting Run: cpbe4105 with config:
[34m[1mwandb[0m: 	algorithm: q_learning
[34m[1mwandb[0m: 	alpha: 1
[34m[1mwandb[0m: 	env_name: std_q_tp1.0_ss(0,4)_strateps_greedy
[34m[1mwandb[0m: 	gamma: 0.7
[34m[1mwandb[0m: 	param: 0.1
[34m[1mwandb[0m: 	strategy: eps_greedy


[Q] Episode 200/1500 | Mean(Last50): -107.08
[Q] Episode 400/1500 | Mean(Last50): -154.00
[Q] Episode 600/1500 | Mean(Last50): -137.88
[Q] Episode 800/1500 | Mean(Last50): -83.40
[Q] Episode 1000/1500 | Mean(Last50): -99.24
[Q] Episode 1200/1500 | Mean(Last50): -118.16
[Q] Episode 1400/1500 | Mean(Last50): -134.40
[Q] Episode 200/1500 | Mean(Last50): -150.48
[Q] Episode 400/1500 | Mean(Last50): -162.30
[Q] Episode 600/1500 | Mean(Last50): -127.30
[Q] Episode 800/1500 | Mean(Last50): -121.30
[Q] Episode 1000/1500 | Mean(Last50): -119.74
[Q] Episode 1200/1500 | Mean(Last50): -145.78
[Q] Episode 1400/1500 | Mean(Last50): -126.04
[Q] Episode 200/1500 | Mean(Last50): -130.44
[Q] Episode 400/1500 | Mean(Last50): -125.10
[Q] Episode 600/1500 | Mean(Last50): -102.98
[Q] Episode 800/1500 | Mean(Last50): -142.70
[Q] Episode 1000/1500 | Mean(Last50): -148.68
[Q] Episode 1200/1500 | Mean(Last50): -126.52
[Q] Episode 1400/1500 | Mean(Last50): -114.36
[Q] Episode 200/1500 | Mean(Last50): -111.88
[Q]

[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


Run finished | α=1, γ=0.7, param=0.1 → avg_eval=-117.80 ± 19.47


0,1
mean_eval_return,▁
mean_final_mean_reward_last50,▁
seed,▁▃▅▆█
seed_eval_return,▄▇▁▄█
seed_final_mean_reward,▁▁██▁
std_eval_return,▁
std_final_mean_reward_last50,▁

0,1
mean_eval_return,-117.8
mean_final_mean_reward_last50,-133.428
seed,4.0
seed_eval_return,-94.65
seed_final_mean_reward,-150.18
std_eval_return,19.46918
std_final_mean_reward_last50,19.06768


[34m[1mwandb[0m: Agent Starting Run: 3x2o4ql1 with config:
[34m[1mwandb[0m: 	algorithm: q_learning
[34m[1mwandb[0m: 	alpha: 1
[34m[1mwandb[0m: 	env_name: std_q_tp1.0_ss(0,4)_strateps_greedy
[34m[1mwandb[0m: 	gamma: 0.8
[34m[1mwandb[0m: 	param: 0.001
[34m[1mwandb[0m: 	strategy: eps_greedy


[Q] Episode 200/1500 | Mean(Last50): -69.60
[Q] Episode 400/1500 | Mean(Last50): -54.98
[Q] Episode 600/1500 | Mean(Last50): -43.00
[Q] Episode 800/1500 | Mean(Last50): -59.04
[Q] Episode 1000/1500 | Mean(Last50): -69.20
[Q] Episode 1200/1500 | Mean(Last50): -58.52
[Q] Episode 1400/1500 | Mean(Last50): -47.56
[Q] Episode 200/1500 | Mean(Last50): -73.40
[Q] Episode 400/1500 | Mean(Last50): -79.08
[Q] Episode 600/1500 | Mean(Last50): -61.26
[Q] Episode 800/1500 | Mean(Last50): -63.38
[Q] Episode 1000/1500 | Mean(Last50): -62.82
[Q] Episode 1200/1500 | Mean(Last50): -78.66
[Q] Episode 1400/1500 | Mean(Last50): -54.92
[Q] Episode 200/1500 | Mean(Last50): -76.46
[Q] Episode 400/1500 | Mean(Last50): -69.78
[Q] Episode 600/1500 | Mean(Last50): -68.26
[Q] Episode 800/1500 | Mean(Last50): -64.48
[Q] Episode 1000/1500 | Mean(Last50): -42.68
[Q] Episode 1200/1500 | Mean(Last50): -37.22
[Q] Episode 1400/1500 | Mean(Last50): -42.54
[Q] Episode 200/1500 | Mean(Last50): -77.66
[Q] Episode 400/1500 | 

[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


Run finished | α=1, γ=0.8, param=0.001 → avg_eval=-45.86 ± 13.91


0,1
mean_eval_return,▁
mean_final_mean_reward_last50,▁
seed,▁▃▅▆█
seed_eval_return,▃▁█▇▇
seed_final_mean_reward,▄▁█▆▇
std_eval_return,▁
std_final_mean_reward_last50,▁

0,1
mean_eval_return,-45.86
mean_final_mean_reward_last50,-51.184
seed,4.0
seed_eval_return,-35.75
seed_final_mean_reward,-37.88
std_eval_return,13.90537
std_final_mean_reward_last50,16.16171


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: 4xdpy58o with config:
[34m[1mwandb[0m: 	algorithm: q_learning
[34m[1mwandb[0m: 	alpha: 1
[34m[1mwandb[0m: 	env_name: std_q_tp1.0_ss(0,4)_strateps_greedy
[34m[1mwandb[0m: 	gamma: 0.8
[34m[1mwandb[0m: 	param: 0.01
[34m[1mwandb[0m: 	strategy: eps_greedy


[Q] Episode 200/1500 | Mean(Last50): -97.94
[Q] Episode 400/1500 | Mean(Last50): -78.54
[Q] Episode 600/1500 | Mean(Last50): -62.08
[Q] Episode 800/1500 | Mean(Last50): -81.20
[Q] Episode 1000/1500 | Mean(Last50): -58.14
[Q] Episode 1200/1500 | Mean(Last50): -73.72
[Q] Episode 1400/1500 | Mean(Last50): -49.76
[Q] Episode 200/1500 | Mean(Last50): -95.82
[Q] Episode 400/1500 | Mean(Last50): -78.46
[Q] Episode 600/1500 | Mean(Last50): -62.46
[Q] Episode 800/1500 | Mean(Last50): -62.58
[Q] Episode 1000/1500 | Mean(Last50): -76.38
[Q] Episode 1200/1500 | Mean(Last50): -90.30
[Q] Episode 1400/1500 | Mean(Last50): -100.46
[Q] Episode 200/1500 | Mean(Last50): -89.30
[Q] Episode 400/1500 | Mean(Last50): -81.74
[Q] Episode 600/1500 | Mean(Last50): -61.26
[Q] Episode 800/1500 | Mean(Last50): -85.62
[Q] Episode 1000/1500 | Mean(Last50): -54.92
[Q] Episode 1200/1500 | Mean(Last50): -76.48
[Q] Episode 1400/1500 | Mean(Last50): -56.98
[Q] Episode 200/1500 | Mean(Last50): -90.36
[Q] Episode 400/1500 |

[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


Run finished | α=1, γ=0.8, param=0.01 → avg_eval=-127.11 ± 66.14


0,1
mean_eval_return,▁
mean_final_mean_reward_last50,▁
seed,▁▃▅▆█
seed_eval_return,▁▇▇▇█
seed_final_mean_reward,▆▁█▇▅
std_eval_return,▁
std_final_mean_reward_last50,▁

0,1
mean_eval_return,-127.11
mean_final_mean_reward_last50,-75.184
seed,4.0
seed_eval_return,-78.75
seed_final_mean_reward,-75.46
std_eval_return,66.13611
std_final_mean_reward_last50,3.89756


[34m[1mwandb[0m: Agent Starting Run: 51q4prdt with config:
[34m[1mwandb[0m: 	algorithm: q_learning
[34m[1mwandb[0m: 	alpha: 1
[34m[1mwandb[0m: 	env_name: std_q_tp1.0_ss(0,4)_strateps_greedy
[34m[1mwandb[0m: 	gamma: 0.8
[34m[1mwandb[0m: 	param: 0.05
[34m[1mwandb[0m: 	strategy: eps_greedy


[Q] Episode 200/1500 | Mean(Last50): -76.78
[Q] Episode 400/1500 | Mean(Last50): -107.34
[Q] Episode 600/1500 | Mean(Last50): -86.98
[Q] Episode 800/1500 | Mean(Last50): -120.10
[Q] Episode 1000/1500 | Mean(Last50): -108.44
[Q] Episode 1200/1500 | Mean(Last50): -107.04
[Q] Episode 1400/1500 | Mean(Last50): -117.54
[Q] Episode 200/1500 | Mean(Last50): -100.18
[Q] Episode 400/1500 | Mean(Last50): -111.96
[Q] Episode 600/1500 | Mean(Last50): -122.02
[Q] Episode 800/1500 | Mean(Last50): -130.78
[Q] Episode 1000/1500 | Mean(Last50): -107.20
[Q] Episode 1200/1500 | Mean(Last50): -111.82
[Q] Episode 1400/1500 | Mean(Last50): -117.20
[Q] Episode 200/1500 | Mean(Last50): -85.74
[Q] Episode 400/1500 | Mean(Last50): -91.32
[Q] Episode 600/1500 | Mean(Last50): -101.10
[Q] Episode 800/1500 | Mean(Last50): -110.72
[Q] Episode 1000/1500 | Mean(Last50): -91.06
[Q] Episode 1200/1500 | Mean(Last50): -110.40
[Q] Episode 1400/1500 | Mean(Last50): -93.86
[Q] Episode 200/1500 | Mean(Last50): -84.80
[Q] Epis

[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


Run finished | α=1, γ=0.8, param=0.05 → avg_eval=-84.60 ± 21.93


0,1
mean_eval_return,▁
mean_final_mean_reward_last50,▁
seed,▁▃▅▆█
seed_eval_return,▁▁▁█▄
seed_final_mean_reward,▄▄▅▁█
std_eval_return,▁
std_final_mean_reward_last50,▁

0,1
mean_eval_return,-84.6
mean_final_mean_reward_last50,-108.336
seed,4.0
seed_eval_return,-75.95
seed_final_mean_reward,-87.42
std_eval_return,21.9305
std_final_mean_reward_last50,13.35786


[34m[1mwandb[0m: Agent Starting Run: 22ntg8nd with config:
[34m[1mwandb[0m: 	algorithm: q_learning
[34m[1mwandb[0m: 	alpha: 1
[34m[1mwandb[0m: 	env_name: std_q_tp1.0_ss(0,4)_strateps_greedy
[34m[1mwandb[0m: 	gamma: 0.8
[34m[1mwandb[0m: 	param: 0.1
[34m[1mwandb[0m: 	strategy: eps_greedy


[Q] Episode 200/1500 | Mean(Last50): -91.24
[Q] Episode 400/1500 | Mean(Last50): -121.64
[Q] Episode 600/1500 | Mean(Last50): -142.78
[Q] Episode 800/1500 | Mean(Last50): -111.72
[Q] Episode 1000/1500 | Mean(Last50): -111.10
[Q] Episode 1200/1500 | Mean(Last50): -146.14
[Q] Episode 1400/1500 | Mean(Last50): -121.46
[Q] Episode 200/1500 | Mean(Last50): -145.38
[Q] Episode 400/1500 | Mean(Last50): -152.12
[Q] Episode 600/1500 | Mean(Last50): -145.60
[Q] Episode 800/1500 | Mean(Last50): -130.60
[Q] Episode 1000/1500 | Mean(Last50): -142.90
[Q] Episode 1200/1500 | Mean(Last50): -115.94
[Q] Episode 1400/1500 | Mean(Last50): -135.96
[Q] Episode 200/1500 | Mean(Last50): -90.62
[Q] Episode 400/1500 | Mean(Last50): -126.56
[Q] Episode 600/1500 | Mean(Last50): -112.10
[Q] Episode 800/1500 | Mean(Last50): -129.70
[Q] Episode 1000/1500 | Mean(Last50): -118.04
[Q] Episode 1200/1500 | Mean(Last50): -92.06
[Q] Episode 1400/1500 | Mean(Last50): -138.48
[Q] Episode 200/1500 | Mean(Last50): -143.28
[Q] 

[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


Run finished | α=1, γ=0.8, param=0.1 → avg_eval=-107.53 ± 19.59


0,1
mean_eval_return,▁
mean_final_mean_reward_last50,▁
seed,▁▃▅▆█
seed_eval_return,▅▆█▆▁
seed_final_mean_reward,██▁▃▃
std_eval_return,▁
std_final_mean_reward_last50,▁

0,1
mean_eval_return,-107.53
mean_final_mean_reward_last50,-125.868
seed,4.0
seed_eval_return,-143.2
seed_final_mean_reward,-127.8
std_eval_return,19.5931
std_final_mean_reward_last50,3.73624


[34m[1mwandb[0m: Agent Starting Run: 8tnx8f0w with config:
[34m[1mwandb[0m: 	algorithm: q_learning
[34m[1mwandb[0m: 	alpha: 1
[34m[1mwandb[0m: 	env_name: std_q_tp1.0_ss(0,4)_strateps_greedy
[34m[1mwandb[0m: 	gamma: 0.9
[34m[1mwandb[0m: 	param: 0.001
[34m[1mwandb[0m: 	strategy: eps_greedy


[Q] Episode 200/1500 | Mean(Last50): -58.96
[Q] Episode 400/1500 | Mean(Last50): -67.28
[Q] Episode 600/1500 | Mean(Last50): -59.84
[Q] Episode 800/1500 | Mean(Last50): -75.94
[Q] Episode 1000/1500 | Mean(Last50): -55.14
[Q] Episode 1200/1500 | Mean(Last50): -58.76
[Q] Episode 1400/1500 | Mean(Last50): -85.50
[Q] Episode 200/1500 | Mean(Last50): -58.00
[Q] Episode 400/1500 | Mean(Last50): -52.30
[Q] Episode 600/1500 | Mean(Last50): -62.36
[Q] Episode 800/1500 | Mean(Last50): -82.64
[Q] Episode 1000/1500 | Mean(Last50): -47.70
[Q] Episode 1200/1500 | Mean(Last50): -96.26
[Q] Episode 1400/1500 | Mean(Last50): -62.24
[Q] Episode 200/1500 | Mean(Last50): -80.50
[Q] Episode 400/1500 | Mean(Last50): -42.30
[Q] Episode 600/1500 | Mean(Last50): -63.36
[Q] Episode 800/1500 | Mean(Last50): -62.66
[Q] Episode 1000/1500 | Mean(Last50): -74.80
[Q] Episode 1200/1500 | Mean(Last50): -72.26
[Q] Episode 1400/1500 | Mean(Last50): -61.90
[Q] Episode 200/1500 | Mean(Last50): -84.88
[Q] Episode 400/1500 | 

[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


Run finished | α=1, γ=0.9, param=0.001 → avg_eval=-70.10 ± 21.99


0,1
mean_eval_return,▁
mean_final_mean_reward_last50,▁
seed,▁▃▅▆█
seed_eval_return,█▆▇▁▂
seed_final_mean_reward,▅▃█▅▁
std_eval_return,▁
std_final_mean_reward_last50,▁

0,1
mean_eval_return,-70.1
mean_final_mean_reward_last50,-58.276
seed,4.0
seed_eval_return,-92.1
seed_final_mean_reward,-69.0
std_eval_return,21.99484
std_final_mean_reward_last50,7.43757


[34m[1mwandb[0m: Agent Starting Run: 1pmjvojh with config:
[34m[1mwandb[0m: 	algorithm: q_learning
[34m[1mwandb[0m: 	alpha: 1
[34m[1mwandb[0m: 	env_name: std_q_tp1.0_ss(0,4)_strateps_greedy
[34m[1mwandb[0m: 	gamma: 0.9
[34m[1mwandb[0m: 	param: 0.01
[34m[1mwandb[0m: 	strategy: eps_greedy


[Q] Episode 200/1500 | Mean(Last50): -74.34
[Q] Episode 400/1500 | Mean(Last50): -57.92
[Q] Episode 600/1500 | Mean(Last50): -41.94
[Q] Episode 800/1500 | Mean(Last50): -88.16
[Q] Episode 1000/1500 | Mean(Last50): -63.58
[Q] Episode 1200/1500 | Mean(Last50): -73.10
[Q] Episode 1400/1500 | Mean(Last50): -80.04
[Q] Episode 200/1500 | Mean(Last50): -68.36
[Q] Episode 400/1500 | Mean(Last50): -63.36
[Q] Episode 600/1500 | Mean(Last50): -58.06
[Q] Episode 800/1500 | Mean(Last50): -74.20
[Q] Episode 1000/1500 | Mean(Last50): -90.50
[Q] Episode 1200/1500 | Mean(Last50): -53.98
[Q] Episode 1400/1500 | Mean(Last50): -72.24
[Q] Episode 200/1500 | Mean(Last50): -83.04
[Q] Episode 400/1500 | Mean(Last50): -46.56
[Q] Episode 600/1500 | Mean(Last50): -57.74
[Q] Episode 800/1500 | Mean(Last50): -96.16
[Q] Episode 1000/1500 | Mean(Last50): -56.78
[Q] Episode 1200/1500 | Mean(Last50): -74.90
[Q] Episode 1400/1500 | Mean(Last50): -56.80
[Q] Episode 200/1500 | Mean(Last50): -65.62
[Q] Episode 400/1500 | 

[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


Run finished | α=1, γ=0.9, param=0.01 → avg_eval=-63.92 ± 18.83


0,1
mean_eval_return,▁
mean_final_mean_reward_last50,▁
seed,▁▃▅▆█
seed_eval_return,█▇▆▁█
seed_final_mean_reward,▁██▂▅
std_eval_return,▁
std_final_mean_reward_last50,▁

0,1
mean_eval_return,-63.92
mean_final_mean_reward_last50,-71.436
seed,4.0
seed_eval_return,-51.3
seed_final_mean_reward,-71.36
std_eval_return,18.83397
std_final_mean_reward_last50,2.42105


[34m[1mwandb[0m: Agent Starting Run: gncirllq with config:
[34m[1mwandb[0m: 	algorithm: q_learning
[34m[1mwandb[0m: 	alpha: 1
[34m[1mwandb[0m: 	env_name: std_q_tp1.0_ss(0,4)_strateps_greedy
[34m[1mwandb[0m: 	gamma: 0.9
[34m[1mwandb[0m: 	param: 0.05
[34m[1mwandb[0m: 	strategy: eps_greedy


[Q] Episode 200/1500 | Mean(Last50): -72.78
[Q] Episode 400/1500 | Mean(Last50): -108.56
[Q] Episode 600/1500 | Mean(Last50): -80.56
[Q] Episode 800/1500 | Mean(Last50): -94.42
[Q] Episode 1000/1500 | Mean(Last50): -107.04
[Q] Episode 1200/1500 | Mean(Last50): -91.92
[Q] Episode 1400/1500 | Mean(Last50): -110.26
[Q] Episode 200/1500 | Mean(Last50): -101.38
[Q] Episode 400/1500 | Mean(Last50): -99.90
[Q] Episode 600/1500 | Mean(Last50): -81.64
[Q] Episode 800/1500 | Mean(Last50): -109.78
[Q] Episode 1000/1500 | Mean(Last50): -106.44
[Q] Episode 1200/1500 | Mean(Last50): -97.42
[Q] Episode 1400/1500 | Mean(Last50): -132.54
[Q] Episode 200/1500 | Mean(Last50): -89.98
[Q] Episode 400/1500 | Mean(Last50): -112.78
[Q] Episode 600/1500 | Mean(Last50): -116.94
[Q] Episode 800/1500 | Mean(Last50): -100.22
[Q] Episode 1000/1500 | Mean(Last50): -85.44
[Q] Episode 1200/1500 | Mean(Last50): -135.90
[Q] Episode 1400/1500 | Mean(Last50): -101.28
[Q] Episode 200/1500 | Mean(Last50): -61.84
[Q] Episode

[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


Run finished | α=1, γ=0.9, param=0.05 → avg_eval=-91.26 ± 32.14


0,1
mean_eval_return,▁
mean_final_mean_reward_last50,▁
seed,▁▃▅▆█
seed_eval_return,▃▃█▁▄
seed_final_mean_reward,▂▃█▆▁
std_eval_return,▁
std_final_mean_reward_last50,▁

0,1
mean_eval_return,-91.26
mean_final_mean_reward_last50,-100.464
seed,4.0
seed_eval_return,-84.3
seed_final_mean_reward,-111.84
std_eval_return,32.14011
std_final_mean_reward_last50,9.76794


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: 9llwq9gb with config:
[34m[1mwandb[0m: 	algorithm: q_learning
[34m[1mwandb[0m: 	alpha: 1
[34m[1mwandb[0m: 	env_name: std_q_tp1.0_ss(0,4)_strateps_greedy
[34m[1mwandb[0m: 	gamma: 0.9
[34m[1mwandb[0m: 	param: 0.1
[34m[1mwandb[0m: 	strategy: eps_greedy


[Q] Episode 200/1500 | Mean(Last50): -129.08
[Q] Episode 400/1500 | Mean(Last50): -119.06
[Q] Episode 600/1500 | Mean(Last50): -126.02
[Q] Episode 800/1500 | Mean(Last50): -97.70
[Q] Episode 1000/1500 | Mean(Last50): -135.04
[Q] Episode 1200/1500 | Mean(Last50): -110.02
[Q] Episode 1400/1500 | Mean(Last50): -109.58
[Q] Episode 200/1500 | Mean(Last50): -107.56
[Q] Episode 400/1500 | Mean(Last50): -112.48
[Q] Episode 600/1500 | Mean(Last50): -130.62
[Q] Episode 800/1500 | Mean(Last50): -129.90
[Q] Episode 1000/1500 | Mean(Last50): -123.90
[Q] Episode 1200/1500 | Mean(Last50): -107.22
[Q] Episode 1400/1500 | Mean(Last50): -129.90
[Q] Episode 200/1500 | Mean(Last50): -54.86
[Q] Episode 400/1500 | Mean(Last50): -134.66
[Q] Episode 600/1500 | Mean(Last50): -117.00
[Q] Episode 800/1500 | Mean(Last50): -135.16
[Q] Episode 1000/1500 | Mean(Last50): -94.58
[Q] Episode 1200/1500 | Mean(Last50): -117.72
[Q] Episode 1400/1500 | Mean(Last50): -115.74
[Q] Episode 200/1500 | Mean(Last50): -79.98
[Q] E

[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


Run finished | α=1, γ=0.9, param=0.1 → avg_eval=-144.73 ± 129.81


0,1
mean_eval_return,▁
mean_final_mean_reward_last50,▁
seed,▁▃▅▆█
seed_eval_return,█▁▇█▇
seed_final_mean_reward,▆█▆▄▁
std_eval_return,▁
std_final_mean_reward_last50,▁

0,1
mean_eval_return,-144.73
mean_final_mean_reward_last50,-110.032
seed,4.0
seed_eval_return,-100.0
seed_final_mean_reward,-126.44
std_eval_return,129.80794
std_final_mean_reward_last50,9.74453


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: f9ps0kij with config:
[34m[1mwandb[0m: 	algorithm: q_learning
[34m[1mwandb[0m: 	alpha: 1
[34m[1mwandb[0m: 	env_name: std_q_tp1.0_ss(0,4)_strateps_greedy
[34m[1mwandb[0m: 	gamma: 1
[34m[1mwandb[0m: 	param: 0.001
[34m[1mwandb[0m: 	strategy: eps_greedy


[Q] Episode 200/1500 | Mean(Last50): -34.42
[Q] Episode 400/1500 | Mean(Last50): -19.04
[Q] Episode 600/1500 | Mean(Last50): -30.40
[Q] Episode 800/1500 | Mean(Last50): -25.46
[Q] Episode 1000/1500 | Mean(Last50): -23.04
[Q] Episode 1200/1500 | Mean(Last50): -28.68
[Q] Episode 1400/1500 | Mean(Last50): -47.06
[Q] Episode 200/1500 | Mean(Last50): -20.64
[Q] Episode 400/1500 | Mean(Last50): -42.46
[Q] Episode 600/1500 | Mean(Last50): -40.44
[Q] Episode 800/1500 | Mean(Last50): -60.42
[Q] Episode 1000/1500 | Mean(Last50): -21.50
[Q] Episode 1200/1500 | Mean(Last50): -42.38
[Q] Episode 1400/1500 | Mean(Last50): -31.74
[Q] Episode 200/1500 | Mean(Last50): -32.32
[Q] Episode 400/1500 | Mean(Last50): -26.68
[Q] Episode 600/1500 | Mean(Last50): -27.98
[Q] Episode 800/1500 | Mean(Last50): -26.70
[Q] Episode 1000/1500 | Mean(Last50): -38.64
[Q] Episode 1200/1500 | Mean(Last50): -39.74
[Q] Episode 1400/1500 | Mean(Last50): -34.34
[Q] Episode 200/1500 | Mean(Last50): -36.34
[Q] Episode 400/1500 | 

[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


Run finished | α=1, γ=1, param=0.001 → avg_eval=-34.45 ± 5.93


0,1
mean_eval_return,▁
mean_final_mean_reward_last50,▁
seed,▁▃▅▆█
seed_eval_return,█▅▁▇█
seed_final_mean_reward,▁▄▃█▆
std_eval_return,▁
std_final_mean_reward_last50,▁

0,1
mean_eval_return,-34.45
mean_final_mean_reward_last50,-35.732
seed,4.0
seed_eval_return,-30.0
seed_final_mean_reward,-31.12
std_eval_return,5.93161
std_final_mean_reward_last50,7.09766


[34m[1mwandb[0m: Agent Starting Run: x2wwz8fp with config:
[34m[1mwandb[0m: 	algorithm: q_learning
[34m[1mwandb[0m: 	alpha: 1
[34m[1mwandb[0m: 	env_name: std_q_tp1.0_ss(0,4)_strateps_greedy
[34m[1mwandb[0m: 	gamma: 1
[34m[1mwandb[0m: 	param: 0.01
[34m[1mwandb[0m: 	strategy: eps_greedy


[Q] Episode 200/1500 | Mean(Last50): -47.48
[Q] Episode 400/1500 | Mean(Last50): -48.64
[Q] Episode 600/1500 | Mean(Last50): -41.38
[Q] Episode 800/1500 | Mean(Last50): -53.76
[Q] Episode 1000/1500 | Mean(Last50): -41.30
[Q] Episode 1200/1500 | Mean(Last50): -25.84
[Q] Episode 1400/1500 | Mean(Last50): -36.20
[Q] Episode 200/1500 | Mean(Last50): -42.94
[Q] Episode 400/1500 | Mean(Last50): -30.14
[Q] Episode 600/1500 | Mean(Last50): -22.62
[Q] Episode 800/1500 | Mean(Last50): -40.70
[Q] Episode 1000/1500 | Mean(Last50): -33.58
[Q] Episode 1200/1500 | Mean(Last50): -36.48
[Q] Episode 1400/1500 | Mean(Last50): -56.70
[Q] Episode 200/1500 | Mean(Last50): -47.76
[Q] Episode 400/1500 | Mean(Last50): -39.98
[Q] Episode 600/1500 | Mean(Last50): -49.78
[Q] Episode 800/1500 | Mean(Last50): -39.10
[Q] Episode 1000/1500 | Mean(Last50): -26.46
[Q] Episode 1200/1500 | Mean(Last50): -25.56
[Q] Episode 1400/1500 | Mean(Last50): -21.34
[Q] Episode 200/1500 | Mean(Last50): -47.46
[Q] Episode 400/1500 | 

[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


[Q] Episode 1400/1500 | Mean(Last50): -29.62
Run finished | α=1, γ=1, param=0.01 → avg_eval=-40.40 ± 17.70


0,1
mean_eval_return,▁
mean_final_mean_reward_last50,▁
seed,▁▃▅▆█
seed_eval_return,▂▁█▇▇
seed_final_mean_reward,▄▁█▇█
std_eval_return,▁
std_final_mean_reward_last50,▁

0,1
mean_eval_return,-40.4
mean_final_mean_reward_last50,-39.948
seed,4.0
seed_eval_return,-27.2
seed_final_mean_reward,-30.8
std_eval_return,17.69511
std_final_mean_reward_last50,11.16376


[34m[1mwandb[0m: Agent Starting Run: 6qxta5cv with config:
[34m[1mwandb[0m: 	algorithm: q_learning
[34m[1mwandb[0m: 	alpha: 1
[34m[1mwandb[0m: 	env_name: std_q_tp1.0_ss(0,4)_strateps_greedy
[34m[1mwandb[0m: 	gamma: 1
[34m[1mwandb[0m: 	param: 0.05
[34m[1mwandb[0m: 	strategy: eps_greedy


[Q] Episode 200/1500 | Mean(Last50): -30.20
[Q] Episode 400/1500 | Mean(Last50): -50.68
[Q] Episode 600/1500 | Mean(Last50): -35.60
[Q] Episode 800/1500 | Mean(Last50): -72.54
[Q] Episode 1000/1500 | Mean(Last50): -44.82
[Q] Episode 1200/1500 | Mean(Last50): -54.54
[Q] Episode 1400/1500 | Mean(Last50): -38.76
[Q] Episode 200/1500 | Mean(Last50): -38.68
[Q] Episode 400/1500 | Mean(Last50): -37.72
[Q] Episode 600/1500 | Mean(Last50): -44.72
[Q] Episode 800/1500 | Mean(Last50): -59.90
[Q] Episode 1000/1500 | Mean(Last50): -39.88
[Q] Episode 1200/1500 | Mean(Last50): -34.52
[Q] Episode 1400/1500 | Mean(Last50): -62.66
[Q] Episode 200/1500 | Mean(Last50): -75.72
[Q] Episode 400/1500 | Mean(Last50): -47.18
[Q] Episode 600/1500 | Mean(Last50): -43.86
[Q] Episode 800/1500 | Mean(Last50): -44.44
[Q] Episode 1000/1500 | Mean(Last50): -89.98
[Q] Episode 1200/1500 | Mean(Last50): -59.52
[Q] Episode 1400/1500 | Mean(Last50): -65.36
[Q] Episode 200/1500 | Mean(Last50): -52.88
[Q] Episode 400/1500 | 

[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


[Q] Episode 1400/1500 | Mean(Last50): -46.54
Run finished | α=1, γ=1, param=0.05 → avg_eval=-56.29 ± 21.67


0,1
mean_eval_return,▁
mean_final_mean_reward_last50,▁
seed,▁▃▅▆█
seed_eval_return,▁██▇▄
seed_final_mean_reward,▁█▄▂█
std_eval_return,▁
std_final_mean_reward_last50,▁

0,1
mean_eval_return,-56.29
mean_final_mean_reward_last50,-69.464
seed,4.0
seed_eval_return,-67.7
seed_final_mean_reward,-45.72
std_eval_return,21.66618
std_final_mean_reward_last50,19.49429


[34m[1mwandb[0m: Agent Starting Run: cjblaeww with config:
[34m[1mwandb[0m: 	algorithm: q_learning
[34m[1mwandb[0m: 	alpha: 1
[34m[1mwandb[0m: 	env_name: std_q_tp1.0_ss(0,4)_strateps_greedy
[34m[1mwandb[0m: 	gamma: 1
[34m[1mwandb[0m: 	param: 0.1
[34m[1mwandb[0m: 	strategy: eps_greedy


[Q] Episode 200/1500 | Mean(Last50): -72.44
[Q] Episode 400/1500 | Mean(Last50): -43.62
[Q] Episode 600/1500 | Mean(Last50): -74.24
[Q] Episode 800/1500 | Mean(Last50): -51.80
[Q] Episode 1000/1500 | Mean(Last50): -42.86
[Q] Episode 1200/1500 | Mean(Last50): -61.88
[Q] Episode 1400/1500 | Mean(Last50): -36.72
[Q] Episode 200/1500 | Mean(Last50): -79.02
[Q] Episode 400/1500 | Mean(Last50): -63.30
[Q] Episode 600/1500 | Mean(Last50): -65.58
[Q] Episode 800/1500 | Mean(Last50): -38.78
[Q] Episode 1000/1500 | Mean(Last50): -49.84
[Q] Episode 1200/1500 | Mean(Last50): -62.94
[Q] Episode 1400/1500 | Mean(Last50): -37.28
[Q] Episode 200/1500 | Mean(Last50): -58.58
[Q] Episode 400/1500 | Mean(Last50): -65.52
[Q] Episode 600/1500 | Mean(Last50): -64.52
[Q] Episode 800/1500 | Mean(Last50): -95.24
[Q] Episode 1000/1500 | Mean(Last50): -80.46
[Q] Episode 1200/1500 | Mean(Last50): -75.98
[Q] Episode 1400/1500 | Mean(Last50): -91.42
[Q] Episode 200/1500 | Mean(Last50): -97.68
[Q] Episode 400/1500 | 

[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


0,1
mean_eval_return,▁
mean_final_mean_reward_last50,▁
seed,▁▃▅▆█
seed_eval_return,▂█▂▇▁
seed_final_mean_reward,▅▄▁█▆
std_eval_return,▁
std_final_mean_reward_last50,▁

0,1
mean_eval_return,-75.76
mean_final_mean_reward_last50,-75.12
seed,4.0
seed_eval_return,-118.25
seed_final_mean_reward,-65.64
std_eval_return,37.8535
std_final_mean_reward_last50,14.71395


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Sweep Agent: Exiting.
