In [1]:
import numpy as np
import gym
import time
import math
import matplotlib.pyplot as plt 

In [2]:
env = gym.make("CartPole-v1")
print(env.action_space.n)

2


In [3]:
# Learning Rate: learning rate is associated with how big you take a leap
lr = 0.1

#Discount Factor
gamma = 0.95

#Amount of iterations we are going to run until we see our model is trained
epochs = 60000
total_time = 0
total_reward = 0
prev_reward = 0

Observation = [30, 30, 50, 50]
step_size = np.array([0.25, 0.25, 0.01, 0.1])

# epsilon is associated with how random you take an action.
epsilon = 0.3


In [4]:
#randomly initializing values in our q table
q_table = np.random.uniform(low=0, high=1, size=(Observation + [env.action_space.n]))
q_table.shape
print(q_table[0][0])

[[[0.08888318 0.11043499]
  [0.19144053 0.68855304]
  [0.82919344 0.40852539]
  ...
  [0.26321901 0.81223593]
  [0.78816355 0.31318497]
  [0.1533373  0.70473027]]

 [[0.06831697 0.75739909]
  [0.21750777 0.08098787]
  [0.45057348 0.87402463]
  ...
  [0.64799527 0.2161011 ]
  [0.42058057 0.2162505 ]
  [0.98505941 0.30953452]]

 [[0.14763662 0.74359422]
  [0.47821828 0.76279485]
  [0.09868625 0.20715004]
  ...
  [0.78465265 0.76026116]
  [0.96212772 0.46402701]
  [0.57828113 0.91025313]]

 ...

 [[0.06858037 0.07982005]
  [0.12310675 0.00798051]
  [0.10968309 0.70932384]
  ...
  [0.82087392 0.32876739]
  [0.9744877  0.87694726]
  [0.5746189  0.11644789]]

 [[0.21585327 0.86317965]
  [0.4801732  0.70373039]
  [0.05769934 0.88179597]
  ...
  [0.5222696  0.58061199]
  [0.96605412 0.29803704]
  [0.99106708 0.52200592]]

 [[0.79269669 0.85159061]
  [0.62681482 0.23615697]
  [0.03252239 0.50896871]
  ...
  [0.38113312 0.98250726]
  [0.85519393 0.65067975]
  [0.94815247 0.07710507]]]


In [5]:
#converting the state space from box format to discrete format. Used to input into our q learing algorithm.
def get_discrete_state(state):
    discrete_state = state/step_size+ np.array([15,10,1,10])
    return tuple(discrete_state.astype(np.int))

In [6]:
#iterating through n epochs
for epoch in range(epochs + 1): 
    #set the initial time, so we can calculate how much each action takes
    t_initial = time.time() 
    
    #get the discrete state for the restarted environment
    discrete_state = get_discrete_state(env.reset()) 
    
    #we create a boolean that will tell us whether our game is running or not
    done = False
    
    #our reward is intialized at zero at the beginning of every eisode
    epoch_reward = 0 

    #Every 1000 epochs we have an episode. Measuring reward for every set of epochs.
    if epoch % 1000 == 0: 
        print("Episode: " + str(epoch)) 

    while not done: 
        #Now we are in our gameloop
        #if some random number is greater than epsilon, then we take the best possible action (max reward) we have explored so far
        if np.random.random() > epsilon:

            action = np.argmax(q_table[discrete_state])
        
        #else, we will explore and take a random action
        else:

            action = np.random.randint(0, env.action_space.n) 

        #now we will intialize our new_state, reward, and done variables
        new_state, reward, done, _ = env.step(action) 
    
        epoch_reward += reward 
        
        #we discretize our new state
        new_discrete_state = get_discrete_state(new_state)
        
        #we render our environment after 2000 steps
        if epoch % 2000 == 0: 
            env.render()

        #if the game loop is still running, update the q-table
        if not done:
            max_new_q = np.max(q_table[new_discrete_state])

            current_q = q_table[discrete_state + (action,)]

            new_q = (1 - lr) * current_q + lr * (reward + gamma* max_new_q)

            q_table[discrete_state + (action,)] = new_q

        discrete_state = new_discrete_state
    # if our epsilon is greater than .05m , and if our reward is greater than the previous and if we reached past our 10000 epoch, we recalculate episilon
    

    #we calculate the final time
    tfinal = time.time() 
    
    #total epoch time
    episode_total = tfinal - t_initial 
    total_time += episode_total
    
    #calculate and update rewards
    total_reward += epoch_reward
    prev_reward = epoch_reward

    #every 1000 episodes print the average time and the average reward
    if epoch % 1000 == 0: 
        mean_time = total_time / 1000
        print("Time Average: " + str(mean_time))
        total_time = 0

        mean_reward = total_reward / 1000
        print("Mean Reward: " + str(mean_reward))
        total_reward = 0

        print("Epsilon: " + str(epsilon))
   
env.close()


Episode: 0


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  after removing the cwd from sys.path.


Time Average: 0.001366215467453003
Mean Reward: 0.024
Epsilon: 0.3
Episode: 1000
Time Average: 0.002335740327835083
Mean Reward: 23.655
Epsilon: 0.3
Episode: 2000
Time Average: 0.0030568537712097167
Mean Reward: 26.056
Epsilon: 0.3
Episode: 3000
Time Average: 0.0028766300678253172
Mean Reward: 29.254
Epsilon: 0.3
Episode: 4000
Time Average: 0.003846673011779785
Mean Reward: 33.975
Epsilon: 0.3
Episode: 5000
Time Average: 0.003573683738708496
Mean Reward: 37.908
Epsilon: 0.3
Episode: 6000
Time Average: 0.005874752044677734
Mean Reward: 48.092
Epsilon: 0.3
Episode: 7000
Time Average: 0.005986695528030396
Mean Reward: 62.101
Epsilon: 0.3
Episode: 8000
Time Average: 0.008444921731948853
Mean Reward: 69.655
Epsilon: 0.3
Episode: 9000
Time Average: 0.007505507946014405
Mean Reward: 78.201
Epsilon: 0.3
Episode: 10000
Time Average: 0.0090417320728302
Mean Reward: 87.703
Epsilon: 0.3
Episode: 11000
Time Average: 0.008296995401382446
Mean Reward: 88.519
Epsilon: 0.3


KeyboardInterrupt: 