In [1]:
import gym
from common import (
    Discretizer,
)
from observers import (
    StateAnalysisLogger,
    WindowMetricLogger,
)
from solvers import (
    DiscreteQLearningSolver
)
from strategies import (
    EpsilonDecreasingStrategy
)
from training import (
    QLearningTrainer,
)

## Setup

In [2]:
# 1. Create environment
# Instantiate the Gym 'MountainCar-v0' task. This single `env` object is handed
# to the discretizer, the solver, the state-analysis logger and the trainer in
# the cells that follow.
# NOTE(review): nothing in this notebook sets a random seed, so the training
# log further down is presumably not reproducible run-to-run — consider seeding
# the environment and whatever RNG the exploration strategy uses.
env = gym.make('MountainCar-v0')

WARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.


  result = entry_point.load(False)


In [3]:
# 1b. Discretize the observation space
# (Labelled 1b because the solver/strategy cell below is already numbered 2.)
# [0.1, 0.01] looks like one bin width per observation dimension, i.e. position
# then velocity — TODO confirm against common.Discretizer. This reading is
# consistent with the training log: every "Visitation Pct" value there is a
# multiple of 1/285, and 285 = 19 x 15 is the grid these widths would give over
# MountainCar's position range [-1.2, 0.6] and velocity range [-0.07, 0.07].
discretizer = Discretizer(env, [0.1, 0.01])

In [4]:
# 2. Set up solver & strategy
# The solver receives the environment plus the discretizer built above.
# learning_rate=0.1 and discount=0.9 are presumably the usual Q-learning step
# size (alpha) and discount factor (gamma) — TODO confirm in solvers.
solver = DiscreteQLearningSolver(
    env=env,
    discretizer=discretizer,
    learning_rate=0.1,
    discount=0.9,
)
# epsilon=0.2 is presumably the initial exploration rate; the decay schedule is
# not visible in this notebook — see strategies.EpsilonDecreasingStrategy.
strategy = EpsilonDecreasingStrategy(epsilon=0.2)

In [5]:
# 3. Add Observers
# Tracks the 'reward' metric with window_size=100; the "Window reward" lines in
# the training log presumably come from this logger (TODO confirm).
mean_reward_logger = WindowMetricLogger(window_size=100, metric='reward')
# Shares the env and discretizer used by the solver. frequency=100 matches the
# every-100 cadence of the "Visitation Pct" lines in the training log, which
# presumably originate here (TODO confirm).
state_analysis = StateAnalysisLogger(env=env, discretizer=discretizer, frequency=100)

In [6]:
# 4. Put it all together
# Wire the environment, solver, exploration strategy and both observers into a
# single trainer. `model` is the object the remaining cells call train(),
# render() and close() on.
model = QLearningTrainer(
    env=env, 
    solver=solver,
    strategy=strategy,
    observers=[
        mean_reward_logger,
        state_analysis
    ]
)

## Model Training / Evaluation

In [7]:
# Train for 1000 episodes; progress is printed every 100 (the log labels these
# "Epoch" although the argument is `episodes`).
# NOTE(review): the logged window reward never gets better than about -195.
# On MountainCar-v0 an episode that hits the step cap without reaching the goal
# scores -200, so the agent is presumably still failing almost every episode
# after 1000 episodes — worth revisiting the episode count, epsilon schedule
# or bin widths.
model.train(episodes=1000)

Epoch: 0 | Window reward: -200.0
Visitation Pct: 0.03508771929824561
Epoch: 100 | Window reward: -200.0
Visitation Pct: 0.44912280701754387
Epoch: 200 | Window reward: -200.0
Visitation Pct: 0.5824561403508772
Epoch: 300 | Window reward: -200.0
Visitation Pct: 0.6035087719298246
Epoch: 400 | Window reward: -200.0
Visitation Pct: 0.712280701754386
Epoch: 500 | Window reward: -199.75
Visitation Pct: 0.7368421052631579
Epoch: 600 | Window reward: -199.83
Visitation Pct: 0.7508771929824561
Epoch: 700 | Window reward: -195.33
Visitation Pct: 0.7649122807017544
Epoch: 800 | Window reward: -198.65
Visitation Pct: 0.7649122807017544
Epoch: 900 | Window reward: -196.75
Visitation Pct: 0.7684210526315789


In [8]:
# Presumably replays the learned policy in the environment's viewer; no output
# was captured for this cell. Likely needs a display to run — TODO confirm in
# training.QLearningTrainer.
model.render()

In [9]:
# Final cleanup call on the trainer; presumably closes the underlying Gym
# environment / render window — TODO confirm in training.QLearningTrainer.
model.close()