# Reinforcement Learning Simulation in OpenAI Gym Environment

## 1. CartPole-v1

In [1]:
import gym
import random
import math
import time
import numpy as np
from IPython.display import clear_output

# Build the CartPole environment and show its observation/action spaces.
env = gym.make("CartPole-v1")
print(env.observation_space, env.action_space, sep="\n")

Box(4,)
Discrete(2)


In [2]:
class Agent():
    """Baseline agent that records a Gym environment's spaces and picks actions.

    The constructor inspects ``env.action_space`` and ``env.observation_space``,
    stores their size (discrete) or bounds and shape (otherwise), and prints a
    one-line summary of each.
    """

    def __init__(self, env):
        """Record the action/observation space description of ``env``."""
        action_space = env.action_space
        observation_space = env.observation_space

        # Exact type match (not isinstance) decides the discrete/continuous path.
        self.is_discrete = type(action_space) == gym.spaces.discrete.Discrete
        self.env_discrete = type(observation_space) == gym.spaces.discrete.Discrete

        if not self.is_discrete:
            self.action_low = action_space.low
            self.action_high = action_space.high
            self.action_shape = action_space.shape
            print("||Action-Space|| range:", self.action_low, self.action_high)
        else:
            self.action_size = action_space.n
            print("||Action-Space|| = {}".format(self.action_size))

        if not self.env_discrete:
            self.state_low = observation_space.low
            self.state_high = observation_space.high
            self.state_shape = observation_space.shape
            print("||State-Space|| range:", self.state_low, self.state_high)
        else:
            self.state_size = observation_space.n
            print("||State-Space|| = {}".format(self.state_size))

        self.states = observation_space
        self.actions = action_space

    def get_action_random(self, state):
        """Return a uniformly random action; ``state`` is ignored."""
        if not self.is_discrete:
            return np.random.uniform(self.action_low, self.action_high, self.action_shape)
        return random.choice(range(self.action_size))

    def get_action(self, state):
        """Return 0 when ``state[2]`` (the pole angle) is negative, otherwise 1."""
        if state[2] < 0:
            return 0
        return 1
    
# Instantiate the baseline agent; its constructor prints the action/state space summary.
myagent = Agent(env)

||Action-Space|| = 2
||State-Space|| range: [-4.8000002e+00 -3.4028235e+38 -4.1887903e-01 -3.4028235e+38] [4.8000002e+00 3.4028235e+38 4.1887903e-01 3.4028235e+38]


In [3]:
# Run `n` rendered episodes with the current `myagent`, printing every observed state.
n = int(input("Enter the no. of episodes for simulation: "))

try:
    for episode in range(n):
        state = env.reset()
        # Hard cap of 10000 steps per episode; the `else` branch below fires
        # only when the cap is reached without the environment reporting done.
        for t in range(10000):
            env.render()
            print("In State: ", state)
            action = myagent.get_action(state)
            state, reward, done, info = env.step(action)
            if done:
                print("Episode {} done in {} timesteps!\n".format(episode+1, t+1))
                break
        else:
            print("Episode {} NOT done in limit of {} timesteps! Agent Terminated!\n".format(episode+1, t+1))
finally:
    # Always release the render window, even if the loop raises or the
    # cell is interrupted part-way through an episode.
    env.close()

Enter the no. of episodes for simulation: 5
In State:  [-0.02678381  0.01326885  0.04522186 -0.03733366]
In State:  [-0.02651843  0.20771412  0.04447518 -0.31541269]
In State:  [-0.02236415  0.40217526  0.03816693 -0.59374457]
In State:  [-0.01432064  0.59674277  0.02629204 -0.87416493]
In State:  [-0.00238579  0.79149758  0.00880874 -1.15846721]
In State:  [ 0.01344416  0.98650363 -0.0143606  -1.4483752 ]
In State:  [ 0.03317424  0.79156116 -0.04332811 -1.16021351]
In State:  [ 0.04900546  0.59702964 -0.06653238 -0.88142457]
In State:  [ 0.06094605  0.40287145 -0.08416087 -0.61037773]
In State:  [ 0.06900348  0.20902047 -0.09636842 -0.34534455]
In State:  [ 0.07318389  0.01539202 -0.10327532 -0.08453796]
In State:  [ 0.07349173 -0.17810957 -0.10496607  0.17385891]
In State:  [ 0.06992954 -0.37158487 -0.1014889   0.43167118]
In State:  [ 0.06249784 -0.56513434 -0.09285547  0.69071496]
In State:  [ 0.05119515 -0.75885355 -0.07904117  0.95278166]
In State:  [ 0.03601808 -0.95282809 -0.05

In State:  [ 0.06877519 -0.39017131 -0.07860984  0.54032877]
In State:  [ 0.06097176 -0.58410538 -0.06780326  0.80724403]
In State:  [ 0.04928966 -0.77823578 -0.05165838  1.07785164]
In State:  [ 0.03372494 -0.97263879 -0.03010135  1.35388635]
In State:  [ 0.01427216 -1.16737022 -0.00302362  1.63700271]
In State:  [-0.00907524 -1.36245658  0.02971643  1.92874197]
In State:  [-0.03632437 -1.1676653   0.06829127  1.6454194 ]
In State:  [-0.05967768 -0.97340565  0.10119966  1.37477117]
In State:  [-0.07914579 -0.77968355  0.12869508  1.11537668]
In State:  [-0.09473946 -0.58646407  0.15100261  0.86567627]
In State:  [-0.10646874 -0.39368417  0.16831614  0.6240232 ]
In State:  [-0.11434243 -0.20126227  0.1807966   0.38872086]
In State:  [-0.11836767 -0.00910517  0.18857102  0.15804833]
In State:  [-0.11854978  0.18288698  0.19173199 -0.06972285]
In State:  [-0.11489204  0.37481704  0.19033753 -0.29631799]
In State:  [-0.10739569  0.56678816  0.18441117 -0.52345234]
In State:  [-0.09605993 

## Q-Learning

In [4]:
class QLAgent(Agent):
    """Tabular Q-learning agent with epsilon-greedy exploration.

    Continuous observations are mapped onto a fixed grid of buckets so that a
    NumPy array indexed by ``(bucket_0, ..., bucket_3, action)`` can serve as
    the Q-table.

    Args:
        env: Environment passed to ``Agent.__init__``; it must yield a
            discrete action space (``self.action_size``) and continuous
            observation bounds (``self.state_low`` / ``self.state_high``).
        discount_rate: Discount factor applied to the best next-state value.
        learning_rate: Step size of the Q-table update.
        buckets: Number of buckets per observation component. The default
            ``(1, 1, 6, 12)`` collapses the first two components to a single
            bucket each.
        eps_decay: Factor ``eps`` is multiplied by at the end of each episode.
        eps_min: Lower bound for ``eps``; the default 0.0 leaves the decay
            unbounded.
    """

    def __init__(self, env, discount_rate=0.97, learning_rate=0.01,
                 buckets=(1, 1, 6, 12), eps_decay=0.99, eps_min=0.0):
        super().__init__(env)

        self.eps = 1.0  # start fully exploratory
        self.discount_rate = discount_rate
        self.learning_rate = learning_rate
        self.eps_decay = eps_decay
        self.eps_min = eps_min

        # Reuses the `state_size` attribute name to hold the per-component bucket counts.
        self.state_size = tuple(buckets)
        self.build_model()
        print("QTable constructed with size: {}".format(self.q_table.shape))

    def discretize(self, state):
        """Map a continuous observation to a tuple of bucket indices.

        Components 0 and 2 use the environment's own bounds. Components 1 and
        3 (presumably the two velocities in CartPole, whose declared bounds
        are effectively infinite) are scaled against +/-0.5 and
        +/-radians(50) instead. Values outside the bounds are clamped to the
        first or last bucket.
        """
        upper_bounds = [self.state_high[0], 0.5, self.state_high[2], math.radians(50)]
        lower_bounds = [self.state_low[0], -0.5, self.state_low[2], -math.radians(50)]

        indices = []
        for value, low, high, n_buckets in zip(state, lower_bounds, upper_bounds, self.state_size):
            # Offset by the lower bound itself (not its absolute value) so the
            # normalisation is also correct for non-negative lower bounds.
            ratio = (value - low) / (high - low)
            index = int(round((n_buckets - 1) * ratio))
            indices.append(min(n_buckets - 1, max(0, index)))
        return tuple(indices)

    def build_model(self):
        """Create the Q-table, filled with small random values in [0, 1e-4)."""
        self.q_table = 1e-4*np.random.random(self.state_size + (self.action_size,))

    def get_action(self, state):
        """Return an epsilon-greedy action for the continuous ``state``.

        With probability ``eps`` a random action is returned, otherwise the
        action with the highest Q-value for the discretized state.
        """
        state = self.discretize(state)
        action_greedy = int(np.argmax(self.q_table[state]))
        # Drawn unconditionally, as before, so RNG consumption is unchanged.
        action_random = self.get_action_random(state)
        return action_random if random.random() < self.eps else action_greedy

    def train(self, experience):
        """Apply one Q-learning update from a single transition.

        Args:
            experience: ``(state, action, next_state, reward, done)`` with
                continuous states; they are discretized here.
        """
        state, action, next_state, reward, done = experience
        state = self.discretize(state)
        next_state = self.discretize(next_state)

        # No bootstrapping from the next state once the episode has ended.
        best_next = 0.0 if done else np.max(self.q_table[next_state])
        q_target = reward + self.discount_rate * best_next

        index = state + (action,)
        self.q_table[index] += self.learning_rate * (q_target - self.q_table[index])

        if done:
            # Decay exploration once per finished episode, never below eps_min.
            self.eps = max(self.eps_min, self.eps * self.eps_decay)
        
# Rebind `myagent` to a Q-learning agent; the training loop in the next cell uses it.
myagent = QLAgent(env)

||Action-Space|| = 2
||State-Space|| range: [-4.8000002e+00 -3.4028235e+38 -4.1887903e-01 -3.4028235e+38] [4.8000002e+00 3.4028235e+38 4.1887903e-01 3.4028235e+38]
QTable constructed with size: (1, 1, 6, 12, 2)


In [5]:
# Train `myagent` online for `n` rendered episodes, updating the Q-table after every step.
n = int(input("Enter the no. of episodes for simulation: "))

# Accumulated over ALL episodes (never reset), so the value printed per
# episode is the running total since the start of this cell.
total_reward = 0
try:
    for ep in range(n):
        state = env.reset()
        # Hard cap of 10000 steps per episode; the `else` branch below fires
        # only when the cap is reached without the environment reporting done.
        for t in range(10000):
            env.render()
            action = myagent.get_action(state)
            next_state, reward, done, info = env.step(action)

            myagent.train((state,action,next_state,reward,done))
            state = next_state
            total_reward += reward

            if done:
                print("Episode {} done in {} timesteps! Total reward: {}, eps: {}\n".format(ep+1, t+1, total_reward, myagent.eps))
                break
        else:
            print("Episode {} NOT done in limit of {} timesteps! Agent Terminated!\n".format(ep+1, t+1))
finally:
    # Always release the render window, even if training raises or the
    # cell is interrupted part-way through an episode.
    env.close()

Enter the no. of episodes for simulation: 100
Episode 1 done in 11 timesteps! Total reward: 11.0, eps: 0.99

Episode 2 done in 14 timesteps! Total reward: 25.0, eps: 0.9801

Episode 3 done in 14 timesteps! Total reward: 39.0, eps: 0.9702989999999999

Episode 4 done in 68 timesteps! Total reward: 107.0, eps: 0.96059601

Episode 5 done in 28 timesteps! Total reward: 135.0, eps: 0.9509900498999999

Episode 6 done in 18 timesteps! Total reward: 153.0, eps: 0.9414801494009999

Episode 7 done in 18 timesteps! Total reward: 171.0, eps: 0.9320653479069899

Episode 8 done in 58 timesteps! Total reward: 229.0, eps: 0.92274469442792

Episode 9 done in 11 timesteps! Total reward: 240.0, eps: 0.9135172474836407

Episode 10 done in 20 timesteps! Total reward: 260.0, eps: 0.9043820750088043

Episode 11 done in 22 timesteps! Total reward: 282.0, eps: 0.8953382542587163

Episode 12 done in 13 timesteps! Total reward: 295.0, eps: 0.8863848717161291

Episode 13 done in 17 timesteps! Total reward: 312.0, 

In [None]:
# Inspect the Q-table values held by the current agent after training.
print(myagent.q_table)