# Frozen Lake 
This notebook shows how to train reinforcement learning agents (PPO, A2C, DQN) on the FrozenLake game and how to replay a trained agent.

# Imports and setup

In [1]:
# Reload imported modules before each cell executes, so edits to the project's
# .py files take effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import sys
from pathlib import Path

import gym
import mlflow

# Add the parent directory to the import path so the `src` package imported in
# the next cell can be found (presumably the repository root — confirm layout).
sys.path.append("..")

In [3]:
from src.model_run import train, play
from src.models import PPO_Agent, A2C_Agent, DQN_Agent

In [4]:
# Keep MLflow runs under ../reports/mlruns. The URI is built with Path.as_uri()
# so it is a well-formed "file:///abs/path" URI (scheme, colon and absolute
# path) on every platform instead of a hand-assembled string.
mlflow.set_tracking_uri((Path("../reports") / "mlruns").resolve().as_uri())

# Deterministic environment

## Experiment setup

In [5]:
# One name drives both the report directory and the MLflow experiment.
experiment_name = "PPO_deterministic"
experiment_path = Path("../reports", experiment_name)
mlflow.set_experiment(experiment_name)

INFO: 'PPO_deterministic' does not exist. Creating a new experiment


## Create and train agent

In [6]:
# Two separate instances of the 8x8 map with slipping disabled:
# the first is used for training, the second is kept for evaluation.
env, eval_env = (
    gym.make("FrozenLake8x8-v0", is_slippery=False) for _ in range(2)
)

In [7]:
# The policy is passed by its string name, as the DQN cell does: the bare name
# MlpPolicy is never imported in this notebook and would raise NameError on a
# fresh kernel.
agent = PPO_Agent(
    "MlpPolicy",
    env,
    tensorboard_log=experiment_path / "tensorboard/",
    verbose=0,
)

In [8]:
# Run training inside an MLflow run. train() receives the evaluation
# environment and ../models/<experiment_name> as the model path.
with mlflow.start_run() as run:
    train(
        model=agent,
        timesteps=5_000_000,
        eval_env=eval_env,
        model_path=Path("../models/", experiment_name),
    )



Eval num_timesteps=10000, episode_reward=0.00 +/- 0.00
Episode length: 200.00 +/- 0.00
New best mean reward!
Eval num_timesteps=20000, episode_reward=0.00 +/- 0.00
Episode length: 200.00 +/- 0.00
Eval num_timesteps=30000, episode_reward=0.00 +/- 0.00
Episode length: 200.00 +/- 0.00
Eval num_timesteps=40000, episode_reward=0.00 +/- 0.00
Episode length: 200.00 +/- 0.00
Eval num_timesteps=50000, episode_reward=0.00 +/- 0.00
Episode length: 200.00 +/- 0.00
Eval num_timesteps=60000, episode_reward=0.00 +/- 0.00
Episode length: 200.00 +/- 0.00
Eval num_timesteps=70000, episode_reward=0.00 +/- 0.00
Episode length: 200.00 +/- 0.00
Eval num_timesteps=80000, episode_reward=0.00 +/- 0.00
Episode length: 200.00 +/- 0.00
Eval num_timesteps=90000, episode_reward=0.00 +/- 0.00
Episode length: 200.00 +/- 0.00
Eval num_timesteps=100000, episode_reward=0.00 +/- 0.00
Episode length: 200.00 +/- 0.00
Eval num_timesteps=110000, episode_reward=0.00 +/- 0.00
Episode length: 200.00 +/- 0.00
Eval num_timesteps=

## Load trained agent

In [9]:
# Drop the in-memory agent, then rebuild it from ../models/PPO_deterministic so
# the playback cell that follows uses the model restored from disk.
del agent
agent = PPO_Agent.load(Path("../models", "PPO_deterministic"))

In [10]:
# Run the reloaded agent in the evaluation environment. The call is the last
# expression, so its return value is shown as the cell output (presumably
# total reward and number of steps — confirm in src.model_run.play).
play(agent, eval_env)


[41mS[0mFFFFFFF
FFFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG

  (Right)
S[41mF[0mFFFFFF
FFFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG

  (Right)
SF[41mF[0mFFFFF
FFFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG

  (Right)
SFF[41mF[0mFFFF
FFFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG

  (Up)
SFF[41mF[0mFFFF
FFFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG

  (Right)
SFFF[41mF[0mFFF
FFFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG

  (Right)
SFFFF[41mF[0mFF
FFFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG

  (Right)
SFFFFF[41mF[0mF
FFFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG

  (Down)
SFFFFFFF
FFFFFF[41mF[0mF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG

  (Down)
SFFFFFFF
FFFFFFFF
FFFHFF[41mF[0mF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG

  (Down)
SFFFFFFF
FFFFFFFF
FFFHFFFF
FFFFFH[41mF[0mF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG

  (

(1.0, 16)

# Stochastic environment

## Experiment setup

In [11]:
# One name drives both the report directory and the MLflow experiment.
experiment_name = "PPO_stochastic"
experiment_path = Path("../reports", experiment_name)
mlflow.set_experiment(experiment_name)

INFO: 'PPO_stochastic' does not exist. Creating a new experiment


## Create and train agent

In [12]:
# Two separate instances of the 8x8 map with default settings (no is_slippery
# override): the first is used for training, the second for evaluation.
env, eval_env = (gym.make("FrozenLake8x8-v0") for _ in range(2))

In [13]:
# The policy is passed by its string name, as the DQN cell does: the bare name
# MlpPolicy is never imported in this notebook and would raise NameError on a
# fresh kernel.
agent = PPO_Agent(
    "MlpPolicy",
    env,
    tensorboard_log=experiment_path / "tensorboard/",
    verbose=0,
)

In [14]:
# Run training inside an MLflow run. train() receives the evaluation
# environment and ../models/<experiment_name> as the model path.
with mlflow.start_run() as run:
    train(
        model=agent,
        timesteps=5_000_000,
        eval_env=eval_env,
        model_path=Path("../models/", experiment_name),
    )

Eval num_timesteps=10000, episode_reward=0.00 +/- 0.00
Episode length: 15.80 +/- 15.34
New best mean reward!
Eval num_timesteps=20000, episode_reward=0.00 +/- 0.00
Episode length: 37.50 +/- 30.66
Eval num_timesteps=30000, episode_reward=0.10 +/- 0.30
Episode length: 53.70 +/- 40.27
New best mean reward!
Eval num_timesteps=40000, episode_reward=0.30 +/- 0.46
Episode length: 51.20 +/- 38.01
New best mean reward!
Eval num_timesteps=50000, episode_reward=0.30 +/- 0.46
Episode length: 42.20 +/- 21.52
Eval num_timesteps=60000, episode_reward=0.90 +/- 0.30
Episode length: 80.90 +/- 45.64
New best mean reward!
Eval num_timesteps=70000, episode_reward=0.80 +/- 0.40
Episode length: 81.20 +/- 48.33
Eval num_timesteps=80000, episode_reward=0.80 +/- 0.40
Episode length: 102.90 +/- 56.62
Eval num_timesteps=90000, episode_reward=1.00 +/- 0.00
Episode length: 101.10 +/- 38.57
New best mean reward!


# Advantage Actor Critic

In [15]:
# One name drives both the report directory and the MLflow experiment.
experiment_name = "A2C_stochastic"
experiment_path = Path("../reports", experiment_name)
mlflow.set_experiment(experiment_name)

INFO: 'A2C_stochastic' does not exist. Creating a new experiment


In [16]:
# Two separate instances of the 8x8 map with default settings (no is_slippery
# override): the first is used for training, the second for evaluation.
env, eval_env = (gym.make("FrozenLake8x8-v0") for _ in range(2))

In [17]:
# The policy is passed by its string name, as the DQN cell does: the bare name
# MlpPolicy is never imported in this notebook and would raise NameError on a
# fresh kernel.
agent = A2C_Agent(
    "MlpPolicy",
    env,
    tensorboard_log=experiment_path / "tensorboard/",
    verbose=0,
)

In [18]:
# Run training inside an MLflow run. train() receives the evaluation
# environment and ../models/<experiment_name> as the model path.
with mlflow.start_run() as run:
    train(
        model=agent,
        timesteps=5_000_000,
        eval_env=eval_env,
        model_path=Path("../models/", experiment_name),
    )



Eval num_timesteps=10000, episode_reward=0.00 +/- 0.00
Episode length: 22.50 +/- 11.13
New best mean reward!
Eval num_timesteps=20000, episode_reward=0.30 +/- 0.46
Episode length: 65.80 +/- 63.57
New best mean reward!
Eval num_timesteps=30000, episode_reward=0.20 +/- 0.40
Episode length: 40.10 +/- 52.16
Eval num_timesteps=40000, episode_reward=0.30 +/- 0.46
Episode length: 42.90 +/- 57.66
Eval num_timesteps=50000, episode_reward=0.70 +/- 0.46
Episode length: 68.50 +/- 45.84
New best mean reward!
Eval num_timesteps=60000, episode_reward=0.90 +/- 0.30
Episode length: 101.20 +/- 43.08
New best mean reward!
Eval num_timesteps=70000, episode_reward=0.80 +/- 0.40
Episode length: 106.30 +/- 50.15
Eval num_timesteps=80000, episode_reward=0.80 +/- 0.40
Episode length: 72.80 +/- 43.83
Eval num_timesteps=90000, episode_reward=0.80 +/- 0.40
Episode length: 122.00 +/- 55.66
Eval num_timesteps=100000, episode_reward=1.00 +/- 0.00
Episode length: 101.50 +/- 29.94
New best mean reward!


# DQN

In [18]:
# NOTE(review): scratch assignment; `a` is not referenced by any other visible
# cell, and this cell repeats execution count 18 — candidate for removal.
a = 0

In [19]:
# One name drives both the report directory and the MLflow experiment.
experiment_name = "DQN_stochastic"
experiment_path = Path("../reports", experiment_name)
mlflow.set_experiment(experiment_name)

In [20]:
# Two separate instances of the 8x8 map with default settings (no is_slippery
# override): the first is used for training, the second for evaluation.
env, eval_env = (gym.make("FrozenLake8x8-v0") for _ in range(2))

In [21]:
# Build the DQN agent on the training environment; the policy is selected by
# its string name and TensorBoard logs go under the experiment's report folder.
agent = DQN_Agent(
    "MlpPolicy",
    env,
    tensorboard_log=experiment_path / "tensorboard/",
    verbose=0,
)

In [22]:
# Run training inside an MLflow run. train() receives the evaluation
# environment and ../models/<experiment_name> as the model path.
with mlflow.start_run() as run:
    train(
        model=agent,
        timesteps=5_000_000,
        eval_env=eval_env,
        model_path=Path("../models/", experiment_name),
    )



Eval num_timesteps=10000, episode_reward=0.00 +/- 0.00
Episode length: 200.00 +/- 0.00
New best mean reward!
Eval num_timesteps=20000, episode_reward=0.00 +/- 0.00
Episode length: 200.00 +/- 0.00
Eval num_timesteps=30000, episode_reward=0.00 +/- 0.00
Episode length: 200.00 +/- 0.00
Eval num_timesteps=40000, episode_reward=0.00 +/- 0.00
Episode length: 200.00 +/- 0.00
Eval num_timesteps=50000, episode_reward=0.00 +/- 0.00
Episode length: 200.00 +/- 0.00
Eval num_timesteps=60000, episode_reward=0.00 +/- 0.00
Episode length: 58.70 +/- 47.07
Eval num_timesteps=70000, episode_reward=0.00 +/- 0.00
Episode length: 161.40 +/- 61.23
Eval num_timesteps=80000, episode_reward=0.00 +/- 0.00
Episode length: 200.00 +/- 0.00
Eval num_timesteps=90000, episode_reward=0.60 +/- 0.49
Episode length: 92.90 +/- 65.85
New best mean reward!
Eval num_timesteps=100000, episode_reward=0.20 +/- 0.40
Episode length: 115.10 +/- 63.85
Eval num_timesteps=110000, episode_reward=0.00 +/- 0.00
Episode length: 136.50 +/- 