In [1]:
import gym
from common import (
    Discretizer,
)
from observers import (
    StateAnalysisLogger,
    WindowMetricLogger,
)
from strategies import (
    EpsilonDecreasingStrategy
)
from training import (
    DiscreteQLearningTrainer,
    DiscreteQLearningTrainingConfig,
)

## Setup

In [2]:
# 1. Create environment
# Instantiate the Gym environment registered under the id 'MountainCar-v0'.
# The resulting `env` is passed to the discretizer, the state-analysis observer
# and the trainer in the cells that follow.
env = gym.make('MountainCar-v0')

WARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.


  result = entry_point.load(False)


In [3]:
# 2. Setup Strategy & Training Config
# Exploration strategy, constructed with epsilon=0.2.
strategy = EpsilonDecreasingStrategy(
    epsilon=0.2,
)
# Q-learning hyperparameters: episode count, learning rate and discount.
training_config = DiscreteQLearningTrainingConfig(episodes=2500, learning_rate=0.1, discount=0.9)

In [4]:
# 3. Discretize Space
# Build a Discretizer from the environment and the list [0.1, 0.01].
# NOTE(review): the two values are presumably per-dimension bucket sizes for
# MountainCar's (position, velocity) observation — confirm against common.Discretizer.
discretizer = Discretizer(env, [0.1, 0.01])

In [5]:
# 4. Add Observers
# Logger for the 'reward' metric, configured with a window of 50.
mean_reward_logger = WindowMetricLogger(
    metric='reward',
    window_size=50,
)
# State-analysis logger bound to the same env/discretizer pair, frequency=100.
state_analysis = StateAnalysisLogger(
    frequency=100,
    env=env,
    discretizer=discretizer,
)

In [6]:
# 5. Put it all together
# Wire the environment, discretizer, exploration strategy, hyperparameters and
# observers into one trainer. Observer order is kept: reward logger first,
# state analysis second.
model = DiscreteQLearningTrainer(
    observers=[mean_reward_logger, state_analysis],
    strategy=strategy,
    training_config=training_config,
    discretizer=discretizer,
    env=env,
)

## Model Training / Evaluation

In [12]:
# Run training with the configuration assembled above. The captured log shows a
# "Window reward" line every 50 epochs and a "Visitation Pct" line every 100,
# which lines up with the observers' window_size=50 / frequency=100 settings
# (presumably those observers emit the lines — confirm in the observers module).
model.train()

Epoch: 0 | Window reward: -180.88
Visitation Pct: 0.6608187134502924
Epoch: 50 | Window reward: -167.54
Epoch: 100 | Window reward: -163.8
Visitation Pct: 0.6608187134502924
Epoch: 150 | Window reward: -172.64
Epoch: 200 | Window reward: -166.12
Visitation Pct: 0.6608187134502924
Epoch: 250 | Window reward: -158.28
Epoch: 300 | Window reward: -174.16
Visitation Pct: 0.6608187134502924
Epoch: 350 | Window reward: -173.0
Epoch: 400 | Window reward: -157.2
Visitation Pct: 0.6608187134502924
Epoch: 450 | Window reward: -151.12
Epoch: 500 | Window reward: -159.36
Visitation Pct: 0.6608187134502924
Epoch: 550 | Window reward: -160.28
Epoch: 600 | Window reward: -170.94
Visitation Pct: 0.6608187134502924
Epoch: 650 | Window reward: -160.4
Epoch: 700 | Window reward: -164.34
Visitation Pct: 0.6608187134502924
Epoch: 750 | Window reward: -166.76
Epoch: 800 | Window reward: -163.02
Visitation Pct: 0.6608187134502924
Epoch: 850 | Window reward: -160.5
Epoch: 900 | Window reward: -164.02
Visitatio

In [13]:
# Call the trainer's render() after training.
# NOTE(review): presumably plays an episode in the Gym viewer using the learned
# policy — confirm against DiscreteQLearningTrainer.render.
model.render()

In [14]:
# Call the trainer's close() once finished.
# NOTE(review): presumably shuts down the Gym environment and its render
# window — confirm against DiscreteQLearningTrainer.close.
model.close()