In [1]:
import gym
import random, time, math
import numpy as np
from typing import Tuple
from sklearn.preprocessing import KBinsDiscretizer


# Shared CartPole-v1 environment: later cells read its observation/action
# spaces and step/render it during training.
env = gym.make('CartPole-v1')

In [2]:
#env = gym.make('CartPole-v1')

#policy = lambda _,__,___, tip_velocity : int( tip_velocity > 0 )                 policy that works
#policy = lambda obs: 1

#for i in range(3):
#    obs = env.reset()
#    for i in range(80):
#        actions = policy(*obs)
#        obs ,reward, done, info = env.step(actions)
#        env.render()
#        time.sleep(0.05)
#env.close()


In [3]:
# Discretization settings: the continuous pole angle is split into 6 buckets
# and the pole velocity into 12.
n_bins = (6, 12)

# Per-feature range over which the buckets are laid out, ordered as
# [pole angle, pole velocity]. The angle range comes from the environment's
# observation space (index 2); the velocity range is fixed at +/- 50 degrees
# converted to radians.
lower_bounds = [env.observation_space.low[2], math.radians(-50)]
upper_bounds = [env.observation_space.high[2], math.radians(50)]

# Fitted estimators keyed by the binning configuration they were built from.
# Fitting is loop-invariant, so it is done once per configuration instead of
# on every environment step.
_fitted_discretizers = {}

def discretizer( _ , __ , angle, pole_velocity ) -> Tuple[int,...]:
    """Map a continuous observation to a discrete (angle_bin, velocity_bin) state.

    Parameters
    ----------
    _, __ :
        First two observation components; accepted so the observation can be
        unpacked with ``discretizer(*obs)`` but otherwise ignored.
    angle :
        Pole angle, bucketed into ``n_bins[0]`` uniform bins.
    pole_velocity :
        Pole velocity, bucketed into ``n_bins[1]`` uniform bins.

    Returns
    -------
    Tuple[int, ...]
        Ordinal bin indices, usable directly as an index into ``Q_table``.

    Notes
    -----
    The bins are defined by fitting a uniform ``KBinsDiscretizer`` on the two
    rows ``lower_bounds`` and ``upper_bounds``. The fitted estimator is cached
    and reused as long as ``n_bins`` and the bounds are unchanged; if any of
    them is reassigned, a new estimator is fitted on the next call.
    """
    config = (
        tuple(n_bins),
        tuple(float(bound) for bound in lower_bounds),
        tuple(float(bound) for bound in upper_bounds),
    )
    est = _fitted_discretizers.get(config)
    if est is None:
        est = KBinsDiscretizer(n_bins=n_bins, encode='ordinal', strategy='uniform')
        est.fit([lower_bounds, upper_bounds ])
        _fitted_discretizers[config] = est
    return tuple(map(int, est.transform([[angle, pole_velocity]])[0]))

In [4]:
# Initialise every state-action value to zero. The table has one axis per
# discretized feature (sizes taken from n_bins) plus a final axis with one
# entry per available action.
Q_table = np.zeros(shape=(*n_bins, env.action_space.n))
Q_table.shape

(6, 12, 2)

In [5]:
# Greedy action selection from the Q-table.
def policy( state : tuple ):
    """Return the index of the highest-valued action for ``state``.

    ``state`` is a tuple of bin indices into ``Q_table``. This function is
    purely greedy: it always picks the arg-max and performs no random
    exploration itself.
    """
    action_values = Q_table[state]
    return action_values.argmax()

In [6]:
# Target value used when updating a Q-table entry.
def new_Q_value( reward : float ,  new_state : tuple , discount_factor=1 ) -> float:
    """Compute the temporal-difference target for a transition.

    Parameters
    ----------
    reward :
        Reward received for the transition.
    new_state :
        Tuple of bin indices of the state reached; used to index ``Q_table``.
    discount_factor :
        Weight applied to the best value available from ``new_state``
        (defaults to 1, i.e. no discounting).

    Returns
    -------
    float
        ``reward + discount_factor * max(Q_table[new_state])``.
    """
    best_next_value = Q_table[new_state].max()
    return reward + discount_factor * best_next_value

In [None]:
# Training: tabular Q-learning with epsilon-greedy exploration.
# Both the exploration rate (epsilon) and the learning rate start at 1 and
# decay geometrically once per episode.
exploration_rate = 1
learning_rate = 1
n_episodes = 10000

try:
    for i in range(n_episodes):

        # Start a new episode from the discretized initial observation.
        current_state, done = discretizer(*env.reset()), False

        # Shrink epsilon and the learning rate by 1% per episode; each stops
        # decaying once it is no longer above its threshold (0.1 and 0.01).
        if exploration_rate > 0.1:
            exploration_rate = exploration_rate * 0.99
        if learning_rate > 0.01:
            learning_rate = learning_rate * 0.99

        total_reward = 0

        while not done:

            # Epsilon-greedy: explore with probability `exploration_rate`,
            # otherwise take the greedy action from the Q-table.
            if np.random.random() < exploration_rate:
                action = env.action_space.sample()
            else:
                action = policy(current_state)

            # Advance the environment and discretize the resulting observation.
            obs, reward, done, info = env.step(action)
            new_state = discretizer(*obs)

            # Build the update target. When the episode really ended (the pole
            # fell or the cart left the track) there is no future return, so
            # the target is just the reward. Terminal observations share bins
            # with non-terminal ones, so bootstrapping from Q_table[new_state]
            # here would inflate the value of failing moves. An episode that
            # was only cut off by the step limit is not a true terminal state,
            # so it still bootstraps.
            truncated = info.get("TimeLimit.truncated", False)
            if done and not truncated:
                learnt_value = reward
            else:
                learnt_value = new_Q_value(reward , new_state )

            # Blend the old estimate with the new target.
            lr = learning_rate
            old_value = Q_table[current_state][action]
            Q_table[current_state][action] = (1-lr)*old_value + lr*learnt_value

            total_reward += reward
            current_state = new_state

            env.render()

        print("Episode: {}, Total Reward: {}, ".format(i, total_reward))
        #print(Q_table)
finally:
    # Release the render window even if training is interrupted part-way
    # through (e.g. Kernel -> Interrupt during the 10000-episode run).
    env.close()

Episode: 0, Total Reward: 20.0, 
Episode: 1, Total Reward: 12.0, 
Episode: 2, Total Reward: 13.0, 
Episode: 3, Total Reward: 16.0, 
Episode: 4, Total Reward: 16.0, 
Episode: 5, Total Reward: 23.0, 
Episode: 6, Total Reward: 21.0, 
Episode: 7, Total Reward: 24.0, 
Episode: 8, Total Reward: 16.0, 
Episode: 9, Total Reward: 14.0, 
Episode: 10, Total Reward: 14.0, 
Episode: 11, Total Reward: 33.0, 
Episode: 12, Total Reward: 21.0, 
Episode: 13, Total Reward: 14.0, 
Episode: 14, Total Reward: 11.0, 
Episode: 15, Total Reward: 17.0, 
Episode: 16, Total Reward: 41.0, 
Episode: 17, Total Reward: 22.0, 
Episode: 18, Total Reward: 37.0, 
Episode: 19, Total Reward: 15.0, 
Episode: 20, Total Reward: 17.0, 
Episode: 21, Total Reward: 12.0, 
Episode: 22, Total Reward: 31.0, 
Episode: 23, Total Reward: 57.0, 
Episode: 24, Total Reward: 43.0, 
Episode: 25, Total Reward: 60.0, 
Episode: 26, Total Reward: 26.0, 
Episode: 27, Total Reward: 12.0, 
Episode: 28, Total Reward: 28.0, 
Episode: 29, Total Rewar

In [None]:
#record 

# 158   exploration rate: 99