In [None]:
# taken from https://github.com/IsaacPatole/CartPole-v0-using-Q-learning-SARSA-and-DNN/blob/master/Qlearning_for_cartpole.py
# taken from https://medium.com/@flomay/using-q-learning-to-solve-the-cartpole-balancing-problem-c0a7f47d3f9d

In [1]:
import gym
import random
import numpy as np
import seaborn as sns
import math
from tqdm import tqdm
import matplotlib.pyplot as plt

In [2]:
# Create the CartPole-v0 environment; the later cells in this notebook all
# reset, step and render this single `env` instance.
env = gym.make("CartPole-v0")

In [3]:
# Reset the environment and print the initial 4-element observation.
observation = env.reset()
print(observation)

# An episode ends when any of the following happens:
#   - the pole is more than 12 degrees from vertical
#   - the cart moves to the edge of the display
#   - the episode length is greater than 200

# The bare string below is a reference table of the observation vector.
# Being the last expression in the cell, it is also echoed as the cell output.
'''
    Observation:
        Type: Box(4)
        Num     Observation               Min                     Max
        0       Cart Position             -4.8                    4.8
        1       Cart Velocity             -Inf                    Inf
        2       Pole Angle                -0.418 rad (-24 deg)    0.418 rad (24 deg)
        3       Pole Angular Velocity     -Inf                    Inf
'''

# Actions:  1 or 0 (right or left)
# Reward:   +1 for every timestep that the pole remains upright
# "Solved": average reward > 195.0 over 100 consecutive trials

[ 0.00713753  0.00772483  0.03280672 -0.02201091]


'\n    Observation:\n        Type: Box(4)\n        Num     Observation               Min                     Max\n        0       Cart Position             -4.8                    4.8\n        1       Cart Velocity             -Inf                    Inf\n        2       Pole Angle                -0.418 rad (-24 deg)    0.418 rad (24 deg)\n        3       Pole Angular Velocity     -Inf                    Inf\n'

In [4]:
# Cell 1: hyper-parameters and discretisation settings

alpha = 0.5          # step size applied to the TD error in update_q
epsilon = 0.8        # initial probability of picking a random action
gamma = 0.9          # discount applied to the best next-state Q value
num_episodes = 1000  # number of training episodes

# Number of discrete bins per observation dimension
# (cart position, cart velocity, pole angle, pole angular velocity).
# Alternative that was tried: buckets = (3,3,6,6)
buckets = (1,1,6,12)

# Value range mapped onto the bins of each dimension. Position and angle use
# the limits reported by the environment; the two velocity entries use
# hand-picked finite limits instead.
upper_bounds = [
    env.observation_space.high[0],
    0.5,
    env.observation_space.high[2],
    math.radians(50),
]
lower_bounds = [
    env.observation_space.low[0],
    -0.5,
    env.observation_space.low[2],
    -math.radians(50),
]

In [5]:
# Cell 2: transform continuous observations to discrete states

def transform_state(observation, lows=None, highs=None, bucket_counts=None):
    """Map a continuous observation onto a tuple of discrete bucket indices.

    Each component is rescaled linearly from ``[low, high]`` to
    ``[0, n_buckets - 1]``, rounded to the nearest integer and clamped to the
    valid index range, so out-of-range values land in the first or last bucket.

    Parameters
    ----------
    observation : sequence of float
        One value per observation dimension.
    lows, highs : sequence of float, optional
        Per-dimension lower / upper bounds. Default to the notebook-level
        ``lower_bounds`` / ``upper_bounds``.
    bucket_counts : sequence of int, optional
        Number of buckets per dimension. Defaults to the notebook-level
        ``buckets``.

    Returns
    -------
    tuple of int
        Bucket index for every dimension, usable directly as a Q-table index.
    """
    # Fall back to the notebook globals only when no override is supplied.
    lows = lower_bounds if lows is None else lows
    highs = upper_bounds if highs is None else highs
    bucket_counts = buckets if bucket_counts is None else bucket_counts

    state = []
    for i, value in enumerate(observation):
        last_index = bucket_counts[i] - 1
        # Subtract the lower bound (rather than adding its absolute value) so
        # the rescaling is also correct for ranges that start above zero.
        scale = (value - lows[i]) / (highs[i] - lows[i])
        index = int(round(last_index * scale))
        state.append(min(last_index, max(0, index)))
    return tuple(state)


In [6]:
# Cell 3: epsilon-greedy action selection

def choose_action_from_state(state, this_epsilon):
    """Pick an action for ``state`` using an epsilon-greedy rule.

    A single uniform random number is drawn. If it is at least
    ``this_epsilon`` the action with the highest value in ``q_table[state]``
    is returned; otherwise an action is sampled from ``env.action_space``.
    """
    draw = np.random.random()
    greedy = draw >= this_epsilon
    if not greedy:
        return env.action_space.sample()
    return np.argmax(q_table[state])

In [7]:
# Cell 4: the Q-learning update rule for the Q-table

def update_q(state,action,reward,next_state):
    """Move ``q_table[state][action]`` towards the one-step Q-learning target.

    The target is ``reward + gamma * max(q_table[next_state])``; the stored
    value is shifted by ``alpha`` times its distance from that target.
    The global ``q_table`` is modified in place and nothing is returned.
    """
    td_target = reward + gamma*np.max(q_table[next_state])
    td_error = td_target - q_table[state][action]
    q_table[state][action] += alpha * td_error
    

In [8]:
# Cell 5: train the agent
#
# Builds a fresh Q-table and runs `num_episodes` episodes of epsilon-greedy
# Q-learning. The cell is safe to re-run: both the Q-table and the working
# exploration rate are re-initialised here.

q_table = np.zeros(buckets + (env.action_space.n,))

# Decay a working copy so the configured `epsilon` is left untouched;
# otherwise a second run of this cell would start with an already-decayed
# exploration rate on a brand-new Q-table.
exploration_rate = epsilon

for episode in tqdm(range(num_episodes)):
    done = False
    observation = env.reset()
    current_state = transform_state(observation)

    # Run one episode of Q-learning until the environment reports `done`.
    while not done:

        # Get an action from the epsilon-greedy policy.
        action = choose_action_from_state(current_state, exploration_rate)

        # Apply it and discretise the resulting observation.
        observation, reward, done, info = env.step(action)
        next_state = transform_state(observation)

        # Q-learning: update the Q-table, then advance to the next state.
        update_q(current_state,action,reward,next_state)
        current_state = next_state

        # Decay the exploration rate after every environment step: it is
        # multiplied by (1 - 1/num_episodes) each time.
        if exploration_rate > 0:
            exploration_rate -= exploration_rate/num_episodes


print('Finished training!')




100%|██████████| 1000/1000 [00:09<00:00, 107.42it/s]

Finished training!





In [9]:
# Cell 6: run the agent
#
# Plays 10 rendered episodes with exploration disabled (epsilon = 0, i.e.
# always the highest-valued action in the Q-table) and prints how many steps
# each episode lasted.
#
# NOTE(review): uncomment to wrap the environment in gym's Monitor wrapper
# with output directory 'cartpoleQ' - confirm it exists in the installed gym.
#env = gym.wrappers.Monitor(env,'cartpoleQ')

try:
    for _ in range(10):

        t = 0
        done = False
        observation = env.reset()
        current_state = transform_state(observation)

        while not done:
            env.render()
            action = choose_action_from_state(current_state, 0)
            observation, reward, done, info = env.step(action)
            current_state = transform_state(observation)
            t += 1

        print("episode completed in ",t)
finally:
    # Release the render window even if the loop is interrupted.
    env.close()


episode completed in  200
