In [1]:
import numpy as np
from sklearn.preprocessing import KBinsDiscretizer
import time, math, random
from typing import Tuple
import gym

In [2]:
# Create the CartPole-v1 environment used by every cell below.
# render_mode='human' asks gym to draw the simulation in a window while it runs.
env = gym.make('CartPole-v1', render_mode='human')

Q-Learning

First, I divide the continuous state space into buckets so that it becomes discrete.

In [3]:
# Number of buckets per discretized feature, in (pole angle, pole velocity) order.
n_bins = (6, 12)

# The angle limits are read from the environment's observation space (index 2);
# the velocity limits are fixed at +/- 50 degrees, converted to radians.
_velocity_limit = math.radians(50)
lower_bounds = [env.observation_space.low[2], -_velocity_limit]
upper_bounds = [env.observation_space.high[2], _velocity_limit]

# Fitted estimators, keyed by the bin configuration they were fitted with.
_fitted_discretizers = {}


def discretizer( _ , __ , angle, pole_velocity ) -> Tuple[int, ...]:
    """Map a continuous observation onto a discrete (angle, velocity) bucket index.

    The first two observation components are accepted but ignored, so the
    function can be called as ``discretizer(*obs)``; only ``angle`` and
    ``pole_velocity`` are bucketed.

    The estimator is fitted on ``[lower_bounds, upper_bounds]`` with ``n_bins``
    uniform buckets. Fitting depends only on those module-level values, so the
    fitted estimator is cached and reused instead of being rebuilt on every
    call (this function runs once per environment step). The cache key contains
    the current configuration, so changing the globals still takes effect.

    Returns:
        A tuple of ints, one bucket index per discretized feature.
    """
    config = (tuple(n_bins), tuple(lower_bounds), tuple(upper_bounds))
    est = _fitted_discretizers.get(config)
    if est is None:
        est = KBinsDiscretizer(n_bins=n_bins, encode='ordinal', strategy='uniform')
        est.fit([lower_bounds, upper_bounds])
        _fitted_discretizers[config] = est
    return tuple(map(int, est.transform([[angle, pole_velocity]])[0]))

I then initialize the q-table with zeroes.

In [4]:
# Zero-filled Q-table: one axis per discretized feature plus a final axis
# holding one value per available action.
q_table = np.zeros((*n_bins, env.action_space.n))
q_table.shape

(6, 12, 2)

Policy:
Greedy selection of highest q values.

New_q_value:
Computes the learned value used in the update: the reward plus the discounted best Q-value of the new state.

Learning_rate:
Decaying learning rate.

Exploration_rate:
Also decaying as the model is trained.

In [5]:
def policy(state: tuple):
    """Return the greedy action for ``state``.

    Looks up the Q-values stored in ``q_table`` for the given discrete state
    and returns the index of the largest one.
    """
    action_values = q_table[state]
    return action_values.argmax()

def new_q_value(reward: float, state_new: tuple, discount_factor=1) -> float:
    """Return the learned value for a transition into ``state_new``.

    The result is ``reward`` plus ``discount_factor`` times the largest
    Q-value currently stored in ``q_table`` for ``state_new``.
    """
    best_next_value = q_table[state_new].max()
    return reward + discount_factor * best_next_value

def learning_rate(n: int, min_rate=0.01) -> float:
    """Return the learning rate for episode ``n``.

    The rate follows ``1 - log10((n + 1) / 25)``, capped at 1.0 from above
    and at ``min_rate`` from below.
    """
    decayed = 1.0 - math.log10((n + 1) / 25)
    if decayed > 1.0:
        decayed = 1.0
    return decayed if decayed > min_rate else min_rate

def exploration_rate(n: int, min_rate=0.1) -> float:
    """Return the probability of taking a random action in episode ``n``.

    The rate follows ``1 - log10((n + 1) / 25)``, capped at 1.0 from above
    and at ``min_rate`` from below.
    """
    decayed = 1.0 - math.log10((n + 1) / 25)
    if decayed > 1.:
        decayed = 1.
    return decayed if decayed > min_rate else min_rate

In [6]:
# Training
n_episodes = 200
for e in range(n_episodes):
    # reset() returns (observation, info); only the observation is needed here.
    current_state, done = discretizer(*env.reset()[0]), False

    while not done:
        # Epsilon-greedy selection: explore with a random action with
        # probability exploration_rate(e), otherwise act greedily.
        if np.random.random() < exploration_rate(e):
            action = env.action_space.sample()
        else:
            action = policy(current_state)

        # step() reports `terminated` and `truncated` separately. The episode is
        # over when either is set; ignoring `truncated` would keep stepping an
        # episode the environment has already cut off (e.g. by a time limit).
        obs, reward, terminated, truncated, _info = env.step(action)
        done = terminated or truncated
        new_state = discretizer(*obs)

        # Blend the old estimate with the newly learned value.
        lr = learning_rate(e)
        learnt_value = new_q_value(reward, new_state)
        old_value = q_table[current_state][action]
        q_table[current_state][action] = (1-lr)*old_value+lr*learnt_value

        current_state = new_state

        if done:
            # Note: the value printed after "e:" is the learning rate for this episode.
            print("episode: {}/{}, e: {:.2}".format(e, n_episodes, lr))

        env.render()

episode: 0/200, e: 1.0
episode: 1/200, e: 1.0
episode: 2/200, e: 1.0
episode: 3/200, e: 1.0
episode: 4/200, e: 1.0
episode: 5/200, e: 1.0
episode: 6/200, e: 1.0
episode: 7/200, e: 1.0
episode: 8/200, e: 1.0
episode: 9/200, e: 1.0
episode: 10/200, e: 1.0
episode: 11/200, e: 1.0
episode: 12/200, e: 1.0
episode: 13/200, e: 1.0
episode: 14/200, e: 1.0
episode: 15/200, e: 1.0
episode: 16/200, e: 1.0
episode: 17/200, e: 1.0
episode: 18/200, e: 1.0
episode: 19/200, e: 1.0
episode: 20/200, e: 1.0
episode: 21/200, e: 1.0
episode: 22/200, e: 1.0
episode: 23/200, e: 1.0
episode: 24/200, e: 1.0
episode: 25/200, e: 0.98
episode: 26/200, e: 0.97
episode: 27/200, e: 0.95
episode: 28/200, e: 0.94
episode: 29/200, e: 0.92
episode: 30/200, e: 0.91
episode: 31/200, e: 0.89
episode: 32/200, e: 0.88
episode: 33/200, e: 0.87
episode: 34/200, e: 0.85
episode: 35/200, e: 0.84
episode: 36/200, e: 0.83
episode: 37/200, e: 0.82
episode: 38/200, e: 0.81
episode: 39/200, e: 0.8
episode: 40/200, e: 0.79
episode: 41

KeyboardInterrupt: 

: 