In [10]:
from imitation.data import rollout
from imitation.data.wrappers import RolloutInfoWrapper
from stable_baselines3.common.vec_env import DummyVecEnv
import numpy as np
from Wordle import WordleEnv
from stable_baselines3.common import monitor

# Build the environment used with `imitation`: a Monitor-wrapped WordleEnv,
# exposed through a one-element DummyVecEnv.
env = monitor.Monitor(WordleEnv())

# The factory returns the single `env` instance created above, wrapped in
# RolloutInfoWrapper -- a one-environment setup is only useful for simple
# testing like this.
venv = DummyVecEnv(
    [
        lambda: RolloutInfoWrapper(env),
    ]
)
venv.render_mode = "human"

In [23]:
from stable_baselines3.common.evaluation import evaluate_policy

# Fixed seed so the NumPy generator handed to the BC trainer produces the same
# stream on every Restart & Run All. This seeds only this generator; any other
# source of randomness is not controlled here.
SEED = 0
rng = np.random.default_rng(SEED)

# SECURITY: allow_pickle=True unpickles arbitrary Python objects, which can
# execute code on load. Only point this at trajectory files you produced
# yourself or otherwise trust.
rollouts = np.load('data/trajectories_all.npy', allow_pickle=True)

# Flatten the loaded trajectories (rewards included, per the function name)
# into `transitions`, which is later passed to the BC trainer as demonstrations.
transitions = rollout.flatten_trajectories_with_rew(rollouts)

In [54]:
from imitation.algorithms import bc

# Behavioural-cloning trainer defined over the environment's observation and
# action spaces, fed with the demonstration transitions loaded earlier.
bc_trainer = bc.BC(
    demonstrations=transitions,
    observation_space=env.observation_space,
    action_space=env.action_space,
    rng=rng,
)

# Baseline: score the policy before train() has been called on this trainer.
# With return_episode_rewards=True the first returned element holds the
# per-episode rewards, which are averaged in the print below.
reward_before_training, _ = evaluate_policy(
    bc_trainer.policy,
    venv,
    n_eval_episodes=1000,
    return_episode_rewards=True,
)
print(f"Reward before training: {np.mean(reward_before_training)}")

Reward before training: -8.121


In [13]:
# Train the behavioural-cloning policy for 10 epochs.
# NOTE(review): the saved execution counts show this cell ran as In [13], i.e.
# before bc_trainer was re-created in In [54]. Re-run the cells in order so the
# evaluation that follows scores the trained policy.
bc_trainer.train(n_epochs=10)

0batch [00:00, ?batch/s]

--------------------------------
| batch_size        | 32       |
| bc/               |          |
|    batch          | 0        |
|    ent_loss       | -0.00947 |
|    entropy        | 9.47     |
|    epoch          | 0        |
|    l2_loss        | 0        |
|    l2_norm        | 129      |
|    loss           | 9.46     |
|    neglogp        | 9.47     |
|    prob_true_act  | 7.71e-05 |
|    samples_so_far | 32       |
--------------------------------


496batch [00:07, 61.37batch/s]

--------------------------------
| batch_size        | 32       |
| bc/               |          |
|    batch          | 500      |
|    ent_loss       | -0.00573 |
|    entropy        | 5.73     |
|    epoch          | 2        |
|    l2_loss        | 0        |
|    l2_norm        | 8.9e+03  |
|    loss           | 4.76     |
|    neglogp        | 4.77     |
|    prob_true_act  | 0.173    |
|    samples_so_far | 16032    |
--------------------------------


1000batch [00:16, 56.83batch/s]

--------------------------------
| batch_size        | 32       |
| bc/               |          |
|    batch          | 1000     |
|    ent_loss       | -0.00432 |
|    entropy        | 4.32     |
|    epoch          | 4        |
|    l2_loss        | 0        |
|    l2_norm        | 1.32e+04 |
|    loss           | 3.74     |
|    neglogp        | 3.75     |
|    prob_true_act  | 0.299    |
|    samples_so_far | 32032    |
--------------------------------


1499batch [00:25, 53.11batch/s]

--------------------------------
| batch_size        | 32       |
| bc/               |          |
|    batch          | 1500     |
|    ent_loss       | -0.00578 |
|    entropy        | 5.78     |
|    epoch          | 6        |
|    l2_loss        | 0        |
|    l2_norm        | 1.64e+04 |
|    loss           | 5.18     |
|    neglogp        | 5.18     |
|    prob_true_act  | 0.177    |
|    samples_so_far | 48032    |
--------------------------------


1997batch [00:34, 52.25batch/s]

--------------------------------
| batch_size        | 32       |
| bc/               |          |
|    batch          | 2000     |
|    ent_loss       | -0.00414 |
|    entropy        | 4.14     |
|    epoch          | 8        |
|    l2_loss        | 0        |
|    l2_norm        | 1.96e+04 |
|    loss           | 3.48     |
|    neglogp        | 3.48     |
|    prob_true_act  | 0.405    |
|    samples_so_far | 64032    |
--------------------------------


2460batch [00:41, 58.57batch/s]


In [55]:
# Re-score the policy with the same 1000-episode evaluation used before training.
# NOTE(review): this cell was saved as In [55], right after bc_trainer was
# constructed in In [54], whereas train() was last run as In [13] -- so the
# recorded output probably reflects an untrained policy. Restart the kernel and
# run all cells in order before trusting the number.
reward_after_training, _ = evaluate_policy(
    bc_trainer.policy,
    venv,
    n_eval_episodes=1000,
    return_episode_rewards=True,
)
print(f"Reward after training: {np.mean(reward_after_training)}")

Reward after training: -8.19
