<a href="https://colab.research.google.com/github/isaacchunn/cartpole-balancing/blob/main/cartpole_balancing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Import progress bar and other dependencies
!pip install tqdm
!pip3 install ipywidgets --user



In [None]:
import gym
from gym import logger as gymlogger
from gym.wrappers import RecordVideo
gymlogger.set_level(40) #error only
import tensorflow as tf
import numpy as np
import random
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
import math
import glob
import io
import base64
from IPython.display import HTML
from IPython import display as ipythondisplay
from tqdm.notebook import trange


# Render Video Function
---

In [None]:
def show_video(video_dir="video"):
  """Embed the first recorded ``.mp4`` from ``video_dir`` in the notebook output.

  Args:
    video_dir: Directory searched for ``*.mp4`` files. Defaults to ``"video"``.

  Returns:
    None. Prints "Could not find video" when no ``.mp4`` file is present.
  """
  # Sort so the chosen file does not depend on the filesystem's listing order.
  mp4list = sorted(glob.glob('{0}/*.mp4'.format(video_dir)))
  if not mp4list:
    print("Could not find video")
    return

  # Read-only binary mode inside a context manager so the handle is always closed.
  with open(mp4list[0], 'rb') as video_file:
    encoded = base64.b64encode(video_file.read())

  # The video is inlined as a base64 data URI so no separate file has to be served.
  ipythondisplay.display(HTML(data='''<video alt="test" autoplay
                loop controls style="height: 400px;">
                <source src="data:video/mp4;base64,{0}" type="video/mp4" />
             </video>'''.format(encoded.decode('ascii'))))

# Loading CartPole Environment
---

In [None]:
# Create the CartPole-v1 environment that is later passed to train_agent.
env = gym.make("CartPole-v1")

In [None]:
# Show the observation space and one random sample drawn from it.
# Per the printed Box bounds, the observation has 4 components and the
# 2nd and 4th are effectively unbounded (about +/-3.4e38).
print("Observation Space", env.observation_space)
print("Sample Observation", env.observation_space.sample()) #Display a random observation

Observation Space Box([-4.8000002e+00 -3.4028235e+38 -4.1887903e-01 -3.4028235e+38], [4.8000002e+00 3.4028235e+38 4.1887903e-01 3.4028235e+38], (4,), float32)
Sample Observation [-2.7142329e+00 -9.3176386e+37  4.4986211e-02 -1.9643822e+38]


In [None]:
# Show the action space. Note: `env.action_space.n` is the number of discrete
# actions (2 in the recorded output), even though the label says "Shape".
print("Action Space Shape", env.action_space.n)
print("Action Space Sample", env.action_space.sample())

Action Space Shape 2
Action Space Sample 1


## Initializing Hyperparameters
---


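Definition of each hyperparameter:
1. learning_rate = step size used when moving a Q-value towards its new target
2. discount (`gamma`) = weight given to the best estimated value of the next state
3. epsilon = probability of taking a random (exploratory) action instead of the greedy one; it decays from `max_epsilon` towards `min_epsilon`
4. num_episodes = number of training episodes
5. epsilon_decay_rate = rate of the exponential decay of epsilon per episode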

## Initializing Q-Table
---



In [None]:
#state_space = env.observation_space.shape
#print("There are ", state_space, " possible states")

#action_space = env.action_space.n
#print("There are ", action_space, " possible actions")

In [None]:
#Training parameters
# Step size applied to each Q-value update in train_agent.
learning_rate = 0.2
# Discount factor applied to the best next-state value in the Q-value update.
gamma = 0.95
# Number of training episodes run by train_agent.
num_episodes = 50000

#Environment parameters
# Number of bins per observation component; these become the state axes of the Q-table.
observation_size = [40,40,50,50]
# Bin width per observation component; discretize_state divides the raw state by it.
np_array_win_size = np.array([0.25, 0.25, 0.05, 0.5])

#Exploration parameters
# Exploration rate at episode 0.
max_epsilon = 1.0
# Floor that the exploration rate decays toward.
min_epsilon = 0.05
# Rate of the exponential epsilon decay, applied per episode index.
epsilon_decay_rate = 0.0005

# NOTE(review): nothing in the visible, un-commented code appends to this list —
# confirm whether it is still needed.
rewardArr = []

In [None]:
# Zero-initialised Q-table: one axis per discretized observation component,
# followed by a final axis with one entry per action.
q_table = np.zeros([*observation_size, env.action_space.n])
q_table.shape

(40, 40, 50, 50, 2)

## Discretize the state space
---

In [None]:
def discretize_state(state, bin_widths=None, bin_counts=None):
    """Map a continuous observation to a tuple of valid Q-table bin indices.

    Each component is divided by its bin width, floored, and shifted by half the
    number of bins on its axis, so a value of zero lands in the middle bin. The
    result is clipped to ``[0, bin_count - 1]``; values beyond the covered range
    therefore share the outermost bin instead of producing a negative or
    out-of-range index.

    Args:
        state: Observation values, one per Q-table state axis.
        bin_widths: Width of one bin per component. Defaults to the module-level
            ``np_array_win_size``.
        bin_counts: Number of bins per component. Defaults to the module-level
            ``observation_size``.

    Returns:
        Tuple of Python ints, each within ``[0, bin_count - 1]``.
    """
    widths = np.asarray(np_array_win_size if bin_widths is None else bin_widths, dtype=float)
    counts = np.asarray(observation_size if bin_counts is None else bin_counts, dtype=int)

    # floor (rather than truncation toward zero) keeps every bin the same width
    # on both sides of zero.
    centred = np.floor(np.asarray(state, dtype=float) / widths) + counts // 2

    # Clip while still floating point so very large values cannot overflow the int cast.
    bins = np.clip(centred, 0, counts - 1).astype(int)
    return tuple(int(b) for b in bins)

In [None]:
# Print the action count and observation bounds again, next to the discretization code.
print("Action Space Shape : ", env.action_space.n)
print("Observation Space : ", env.observation_space)

Action Space Shape :  2
Observation Space :  Box([-4.8000002e+00 -3.4028235e+38 -4.1887903e-01 -3.4028235e+38], [4.8000002e+00 3.4028235e+38 4.1887903e-01 3.4028235e+38], (4,), float32)


In [None]:
# Behaviour policy used while training: mostly greedy, sometimes random.
def epsilon_greedy_policy(q_table, state, epsilon):
    """Pick an action for ``state`` with an epsilon-greedy rule.

    One uniform draw from ``np.random.random()`` is compared with ``epsilon``:
    a draw strictly above ``epsilon`` selects the action with the highest
    Q-value for ``state``; otherwise an action is sampled from the action space
    of the module-level ``env``.

    Args:
        q_table: Array indexed by discrete state, with the action on the last axis.
        state: Tuple of bin indices identifying the current discrete state.
        epsilon: Exploration threshold compared against the uniform draw.

    Returns:
        The chosen action.
    """
    explore = not (np.random.random() > epsilon)
    if explore:
        return env.action_space.sample()
    return np.argmax(q_table[state])

## Task 1 - Training the Agent
---


In [None]:
def train_agent(env, num_episodes, learning_rate, gamma, max_epsilon, epsilon_decay_rate, q_table):
  """Train a tabular Q-learning agent using an epsilon-greedy behaviour policy.

  ``q_table`` is updated in place and the same array is returned.

  Args:
    env: Gym environment providing ``reset()`` and ``step(action)``.
    num_episodes: Number of episodes to run.
    learning_rate: Step size of each Q-value update.
    gamma: Discount factor applied to the best next-state value.
    max_epsilon: Exploration rate at episode 0.
    epsilon_decay_rate: Rate of the exponential epsilon decay per episode index.
    q_table: Array indexed by ``discrete_state + (action,)``.

  Returns:
    ``(q_table, curr_discrete_state, new_discrete_state)``. The two states are
    the last (state, next state) pair of the final episode, or ``None`` when no
    episode was run.
  """
  # Defined up front so the return statement is valid even when num_episodes == 0.
  curr_discrete_state = new_discrete_state = None

  for episode in trange(num_episodes):
    # Epsilon decays exponentially from max_epsilon toward the module-level min_epsilon.
    epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(-epsilon_decay_rate * episode)

    # Newer gym releases return (observation, info) from reset(); older ones
    # return only the observation. Accept both.
    reset_result = env.reset()
    observation = reset_result[0] if isinstance(reset_result, tuple) else reset_result
    curr_discrete_state = discretize_state(observation)

    done = False
    while not done:
      # Choose the action with the epsilon-greedy behaviour policy.
      action = epsilon_greedy_policy(q_table, curr_discrete_state, epsilon)

      # step() returns 5 values (terminated and truncated reported separately)
      # on newer gym and 4 values (a single done flag) on older gym.
      step_result = env.step(action)
      if len(step_result) == 5:
        new_state, reward, terminated, truncated, info = step_result
      else:
        new_state, reward, done_flag, info = step_result
        # Older gym marks a time-limit cut-off in the info dict.
        truncated = bool(info.get("TimeLimit.truncated", False))
        terminated = bool(done_flag) and not truncated
      done = terminated or truncated

      new_discrete_state = discretize_state(new_state)

      # A terminated episode has no future return, so bootstrap from the next
      # state only when the episode could have continued.
      future_value = 0.0 if terminated else np.max(q_table[new_discrete_state])
      index = curr_discrete_state + (action,)
      q_table[index] += learning_rate * (reward + gamma * future_value - q_table[index])

      # Advance only while the episode is running, so the returned pair is the
      # final (state, next state) transition.
      if not done:
        curr_discrete_state = new_discrete_state

  return q_table, curr_discrete_state, new_discrete_state


In [None]:
# Train the agent. train_agent updates q_table in place and returns that same
# array, so q_table_cart and q_table refer to one table afterwards.
# cd / nd receive the last current / next discrete states returned by train_agent.
q_table_cart, cd, nd = train_agent(env, num_episodes, learning_rate, gamma, max_epsilon, epsilon_decay_rate, q_table)

#q_table_cart
# Show the last current discrete state as the cell output.
cd

  0%|          | 0/50000 [00:00<?, ?it/s]

(16, 4, -3, 9)

In [None]:
# Inspect the action values stored for one discrete state.
# The -3 on the third axis is a negative index, so NumPy reads bin 47 of that 50-bin axis.
q_table_cart[(15, 10, -3, 8)]

array([7.30845074, 4.28693444])

In [None]:
def count(env, q_table):
  """Run one greedy episode and return how many steps it lasted.

  Args:
    env: Gym environment; it is reset at the start of the episode.
    q_table: Q-table indexed by discrete state, with the action on the last axis.

  Returns:
    Number of ``env.step`` calls made before the episode ended.
  """
  # Start a fresh episode so the first action is chosen from a real observation.
  # reset() returns (observation, info) on newer gym and only the observation on older gym.
  reset_result = env.reset()
  observation = reset_result[0] if isinstance(reset_result, tuple) else reset_result
  discrete_state = discretize_state(observation)

  counter = 0
  done = False
  while not done:
    counter += 1
    env.render()
    # Act greedily with respect to the learned Q-values.
    action = np.argmax(q_table[discrete_state])

    # step() returns 5 values on newer gym (terminated, truncated) and 4 on older gym (done).
    step_result = env.step(action)
    if len(step_result) == 5:
      new_state, reward, terminated, truncated, info = step_result
      done = terminated or truncated
    else:
      new_state, reward, done, info = step_result

    discrete_state = discretize_state(new_state)
  return counter

In [None]:
# Record one greedy episode played with the trained Q-table and show it inline.
# RecordVideo needs image frames, so the environment is created with
# render_mode="rgb_array". A separate name keeps the training `env` (which
# epsilon_greedy_policy reads) open and usable by the other cells.
video_env = RecordVideo(gym.make("CartPole-v1", render_mode="rgb_array"), "./video")

# reset() returns (observation, info) on newer gym and only the observation on older gym.
reset_result = video_env.reset()
observation = reset_result[0] if isinstance(reset_result, tuple) else reset_result
discrete_state = discretize_state(observation)

done = False
while not done:
    # Act greedily: take the action with the highest learned value for this state.
    action = np.argmax(q_table[discrete_state])

    # step() returns 5 values on newer gym (terminated, truncated) and 4 on older gym (done).
    step_result = video_env.step(action)
    if len(step_result) == 5:
        new_state, reward, terminated, truncated, info = step_result
        done = terminated or truncated
    else:
        new_state, reward, done, info = step_result

    discrete_state = discretize_state(new_state)

# Close the wrapper before reading the file back so the recorder can finish writing it.
video_env.close()
show_video()

## Task 2 - Effectiveness of Agent
---

**Episode End**

The episode ends if any one of the following occurs:
1. Termination: Pole Angle is greater than ±12°
2. Termination: Cart Position is greater than ±2.4 (center of the cart reaches the edge of the display)
3. Truncation: Episode length is greater than 500.


  and should_run_async(code)


In [None]:
# def train_agent(env, NUM_EPISODES, LEARNING_RATE, DISCOUNT, max_epsilon, epsilon_decay_rate, q_table):
#   total_reward = 0
#   rewardsArr = []

#   for episode in range(NUM_EPISODES):
#     #reset the environment and initialize the state
#     discrete_state = discretize_state(env.reset()[0])
#     done = False
#     episode_reward = 0

#     #Run all the episodes until done flag is set to True
#     while not done:

#         #Using Q-Table, we choose the best action to take for current state
#         action = np.argmax(q_table[discrete_state])

#         #obtain the new state, reward and done flag after taking the best action
#         updated_env = env.step(action)
#         updated_state, reward, done = updated_env
#         episode_reward += reward

#         #We truncate the training if episodes length is greater than 500
#         if episode_reward > 500:
#           done = True

#         #discretize the new state and update discrete_state
#         discrete_state = discretize_state(updated_state)

#         #if episode is completed, we reset the environment and reinitialize the observation and discrete state
#         if done:
#           observation = env.reset()
#           discrete_state = discretize_state(observation[0])

#     rewardsArr.append(episode_reward)

#     env.close()
#     return q_table, rewardArr

# q_table, rewards = train_agent(env, NUM_EPISODES, LEARNING_RATE, DISCOUNT, max_epsilon, epsilon_decay_rate, q_table)