In [1]:
import gymnasium as gym
import numpy as np
import time
from q_learning import QL
from tqdm import tqdm

import torch
import torch.nn.functional as F
from pclib.nn.models import FCClassifier

In [2]:
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [3]:
# Baseline agent: train the project-local QL class on the deterministic
# FrozenLake map (is_slippery=False). Later cells read its table via `ql.Q`
# and query it with `ql.act`.
env = gym.make('FrozenLake-v1', is_slippery=False)
# NOTE(review): the positional 0.001 and 0.9 are presumably the learning rate
# and discount factor — confirm against QL's signature.
ql = QL(env, 0.001, 0.9, epsilon=0.1)
# NOTE(review): 10000 is presumably the number of training episodes — confirm.
ql.train(10000)

                                                                           

In [10]:
# Seed torch's global RNG before the model is constructed, so anything the
# constructor draws from it (presumably the weight initialisation) is repeatable.
seed = 42
torch.manual_seed(seed)

# Classifier from pclib with no hidden layers, moved to `device`.
# input_size=16 matches the one-hot width produced by format_obs below;
# num_classes=4 presumably corresponds to the four FrozenLake actions, since
# the training loop takes an argmax over the model output to pick an action.
# NOTE(review): the meaning of symmetric / precision_weighted / steps / gamma
# is defined by pclib and not visible here — confirm against FCClassifier.
model = FCClassifier(
    input_size=16,
    num_classes=4,
    hidden_sizes=[],
    bias=True,
    symmetric=True,
    precision_weighted=False,
    actv_fn=F.tanh,
    steps=60,
    gamma=0.1,
).to(device)

def format_obs(obs):
    """Encode discrete state indices as softened one-hot vectors.

    Args:
        obs: Integer state index, or array of indices, each in ``[0, 16)``;
            anything ``torch.tensor`` accepts as integer data.

    Returns:
        Float tensor on ``device`` with a trailing dimension of 16. The entry
        at the observed state is 0.03 + 0.94 (about 0.97); every other entry
        is 0.03.
    """
    encoded = F.one_hot(torch.tensor(obs), num_classes=16)
    encoded = encoded.float().to(device)
    # Constant floor so no input is exactly 0; the hot entry is scaled so it
    # stays just below 1 once the floor is added.
    floor = torch.ones_like(encoded) * 0.03
    return floor + (encoded * 0.94)

# AdamW over every model parameter, with a small step size and strong
# decoupled weight decay.
optimiser = torch.optim.AdamW(
    params=model.parameters(),
    weight_decay=1.0,
    lr=1e-4,
)

In [11]:
# Number of FrozenLake copies stepped together by the training loop.
num_envs = 64
# Build the synchronous vector env explicitly. `gym.vector.make` is deprecated
# (it emits a gym.logger warning when called); constructing SyncVectorEnv from
# per-env factories is the equivalent of `asynchronous=False` without relying
# on the deprecated helper.
envs = gym.vector.SyncVectorEnv(
    [lambda: gym.make('FrozenLake-v1', is_slippery=False) for _ in range(num_envs)]
)

  gym.logger.warn(


In [12]:
# Train `model` on batched FrozenLake rollouts. Each epoch resets the vector
# env and steps it until every sub-env has terminated or been truncated at
# least once. After every step the model's per-env VFE is weighted by the
# change in the Q-table's best state value (new minus previous) and that
# product is minimised.
num_epochs = 1000
final_rewards = []  # one (num_envs,) array of accumulated shaped reward per epoch
vfes = []           # mean of the per-step VFE means, one entry per epoch
loop = tqdm(range(num_epochs), leave=False, total=num_epochs)
for epoch in loop:
    if epoch > 0:
        loop.set_description(f"Epoch {epoch}/{num_epochs}, Reward: {final_rewards[-1].mean():.3f}, max: {final_rewards[-1].max():.3f}, vfe: {vfes[-1]:.3g}")
    state = envs.reset()[0]
    # Per-env accumulators are arrays from the start: the termination penalty
    # below indexes `total_reward` per env, which a plain float cannot support.
    total_reward = np.zeros(num_envs)
    done = np.zeros(num_envs, dtype=bool)
    # Per-env best Q-value of the start state, reduced over the action axis
    # exactly like the in-loop `state_value`.
    prev_state_value = torch.tensor(ql.Q[state].max(1)).to(device)
    epoch_vfes = []
    while not done.all():
        obs = format_obs(state)
        actions, s = model(obs)
        # Greedy action per env from the model output.
        action = actions.argmax(dim=1).cpu().numpy()

        state, reward, terminated, truncated, _ = envs.step(action)
        # Reward shaping: -0.2 whenever an env terminates, -0.01 per step.
        # NOTE(review): envs that finished earlier keep being stepped until all
        # are done (presumably after an auto-reset by the vector env), so they
        # keep contributing to total_reward and to the loss — confirm intended.
        total_reward[terminated] -= 0.2
        state_value = torch.tensor(ql.Q[state].max(1)).to(device)
        total_reward += reward - 0.01
        # An env counts as done once it has terminated or been truncated.
        done |= terminated | truncated

        vfe = model.vfe(s, batch_reduction=None)
        # Weight each env's VFE by how much the Q-table value changed this step.
        loss = (vfe * (state_value - prev_state_value)).mean()
        epoch_vfes.append(vfe.mean().item())
        optimiser.zero_grad()
        loss.backward()
        optimiser.step()

        prev_state_value = state_value
    final_rewards.append(total_reward)
    vfes.append(sum(epoch_vfes) / len(epoch_vfes))



                                                                                                          

KeyboardInterrupt: 

In [None]:
# Watch the agent play: run the trained Q-learning agent for three rendered
# episodes and report the mean episode return.
env = gym.make('FrozenLake-v1', is_slippery=False, render_mode='human')
# Point the trained *instance* at the rendering env. Assigning to the class
# (`QL.env`) would not affect `ql` if it already holds its own `env`
# attribute — assumes QL keeps the env on the instance; confirm.
ql.env = env
total_rewards = []
try:
    for episode in range(3):
        total_reward = 0.0
        state = env.reset()[0]
        terminated, truncated = False, False
        while not (terminated or truncated):
            action = ql.act(state)
            state, reward, terminated, truncated, _ = env.step(action)
            total_reward += reward
            env.render()
            time.sleep(0.05)
        # Record the whole episode's return, not just the final step's reward.
        total_rewards.append(total_reward)

    print(f"mean reward: {np.array(total_rewards).mean()}")
finally:
    # Always release the render window, even if the cell is interrupted.
    env.close()

KeyboardInterrupt: 