In [1]:
import numpy as np
import pandas as pd
import random
from pylab import plt, mpl
import time

In [4]:
import gym
# Create the CartPole-v0 environment. This single `env` object is shared by
# the training loop and by the rendered evaluation rollout further down.
env = gym.make('CartPole-v0')

In [5]:
def discretize(x, bin_widths=(0.25, 0.25, 0.01, 0.1)):
    """Map a continuous observation onto a hashable tuple of integer bin indices.

    Each component of ``x`` is divided by its bin width and rounded *down*, so
    every bin has the same width and values just below zero fall into a
    different bin (-1) than values just above zero (0). Truncating toward zero
    instead would merge those two bins and hide the sign of small values.

    Parameters
    ----------
    x : array-like
        Observation with one entry per element of ``bin_widths``.
    bin_widths : sequence of float, optional
        Width of one bin for each observation component.

    Returns
    -------
    tuple of int
        Bin index per component; usable as a dictionary key.
    """
    scaled = np.asarray(x, dtype=float) / np.asarray(bin_widths, dtype=float)
    # floor (not int truncation) keeps the bins uniform across zero
    return tuple(np.floor(scaled).astype(np.int_))

# Q-table: maps (discretized_state, action) -> estimated value.
# Entries that have never been written are read back as 0.
Q = dict()

# The actions the policy samples from.
actions = (0, 1)

# Hyperparameters used by the Q-learning loop.
alpha = 0.5     # learning rate: how far each update moves the stored value
gamma = 1.0     # discount applied to the best value of the next state
epsilon = 0.90  # probability of sampling the action from the Q-table;
                # otherwise a uniformly random action is taken

# Look up the stored value of every action in the given state.
def qvalues(state):
    """Return the Q-table entries for ``state``, one per action in ``actions``.

    (state, action) pairs that are not in the table yet are reported as 0.
    The result is a list ordered like ``actions``.
    """
    values = []
    for action in actions:
        values.append(Q.get((state, action), 0))
    return values

# Convert a vector of action values into positive weights that sum to 1.
def probs(v, eps=1e-4):
    """Normalise the array ``v`` into sampling weights.

    The smallest entry is shifted to ``eps`` so every weight stays strictly
    positive, then the shifted values are divided by their total.

    Parameters
    ----------
    v : numpy.ndarray
        Values to normalise (must support ``.min()`` and ``.sum()``).
    eps : float, optional
        Weight floor given to the smallest entry before normalising.

    Returns
    -------
    numpy.ndarray
        Array of the same shape as ``v`` whose entries sum to 1.
    """
    shifted = v - v.min() + eps
    return shifted / shifted.sum()

# Tabular Q-learning: each iteration of the outer loop plays one full episode.
for step in range(20000):
    state = env.reset()
    done = False
    # Play the episode until the environment reports that it is over.
    while not done:
        s = discretize(state)
        if random.random()<epsilon:
            # Sample the action in proportion to the normalised Q-values.
            v = probs(np.array(qvalues(s)))
            a = random.choices(actions,weights=v)[0]
        else:
            # Otherwise pick an action uniformly at random.
            a = np.random.randint(env.action_space.n)

        state, reward, done, info = env.step(a)
        ns = discretize(state)

        # Only bootstrap from the next state when the episode can actually
        # continue from it. A real failure has no future return, so its target
        # is just the reward. Hitting the step cap is not a failure: gym's
        # TimeLimit wrapper marks that case in info['TimeLimit.truncated'],
        # and we keep bootstrapping there.
        truncated = info.get('TimeLimit.truncated', False)
        if done and not truncated:
            future_value = 0
        else:
            future_value = max(qvalues(ns))

        old_value = Q.get((s,a),0)
        Q[(s,a)] = old_value + alpha * (reward + gamma * future_value - old_value)

In [6]:
def run_one_episode(env):
  """Play one rendered episode with the learned Q-table and count its steps.

  Actions are sampled from the normalised Q-values of the current discretized
  state. The rollout stops when the environment reports `done` or after 200
  steps, whichever comes first.

  Parameters:
    env: environment providing reset(), step(action) and render().

  Returns:
    int: number of env.step() calls made during the episode (1..200), so an
    episode that lasts the full 200 steps returns 200.
  """
  state = env.reset()
  steps_taken = 0
  for step in range(200):
    s = discretize(state)
    v = probs(np.array(qvalues(s)))
    a = random.choices(actions,weights=v)[0]
    state, reward, done, info = env.step(a)
    # Count the step just taken; `step` itself is a 0-based index and would
    # under-report the episode length by one.
    steps_taken = step + 1
    if done:
      break
    env.render()
    # Slow the rollout down so the rendering can be followed by eye.
    time.sleep(0.05)
    print(f'step={step}|obs={state}|action={a}|reward={reward}')
  return steps_taken

In [7]:
# Play a single rendered episode and report how it went.
nosteps = run_one_episode(env)

# 200 matches the step cap used inside run_one_episode.
verdict = 'SUCCESS' if nosteps >= 200 else 'FAIL'
print(verdict)
print(f'nosteps = {nosteps}')

# Release the environment (and its render window) once the rollout is done.
env.close()

If you want to render in human mode, initialize the environment in this way: gym.make('EnvName', render_mode='human') and don't call the render method.
See here for more information: https://www.gymlibrary.ml/content/api/
  deprecation(


step=0|obs=[ 0.0403476   0.18933974  0.01567825 -0.31409067]|action=1|reward=1.0
step=1|obs=[ 0.04413439  0.38423488  0.00939644 -0.6017882 ]|action=1|reward=1.0
step=2|obs=[ 0.05181909  0.18898275 -0.00263933 -0.30616048]|action=0|reward=1.0
step=3|obs=[ 0.05559875 -0.00610148 -0.00876254 -0.0143111 ]|action=0|reward=1.0
step=4|obs=[ 0.05547672  0.18914503 -0.00904876 -0.3097458 ]|action=1|reward=1.0
step=5|obs=[ 0.05925962 -0.00584683 -0.01524367 -0.01993025]|action=0|reward=1.0
step=6|obs=[ 0.05914268  0.18949038 -0.01564228 -0.3173835 ]|action=1|reward=1.0
step=7|obs=[ 0.06293249 -0.00540533 -0.02198995 -0.02967434]|action=0|reward=1.0
step=8|obs=[ 0.06282438 -0.20020515 -0.02258344  0.2559902 ]|action=0|reward=1.0
step=9|obs=[ 0.05882028 -0.3949975  -0.01746363  0.5414653 ]|action=0|reward=1.0
step=10|obs=[ 0.05092033 -0.19963449 -0.00663433  0.24333142]|action=1|reward=1.0
step=11|obs=[ 0.04692764 -0.39466107 -0.0017677   0.5339143 ]|action=0|reward=1.0
step=12|obs=[ 0.03903442 -