In [35]:
import numpy as np
import gym
import time
import math
import matplotlib.pyplot as plt 

In [36]:
env = gym.make("CartPole-v1")
print(env.action_space.n)

2


In [37]:
# Learning Rate: learning rate is associated with how big you take a leap
lr = 0.1

#Discount Factor
gamma = 0.95

#Amount of iterations we are going to run until we see our model is trained
epochs = 60000
total_time = 0
total_reward = 0
prev_reward = 0

Observation = [30, 30, 50, 50]
step_size = np.array([0.25, 0.25, 0.01, 0.1])

# epsilon is associated with how random you take an action.
epsilon = 1

#exploration is decaying and we will get to a state of full exploitation
epsilon_decay_value = 0.99995

In [38]:
#randomly initializing values in our q table
q_table = np.random.uniform(low=0, high=1, size=(Observation + [env.action_space.n]))
q_table.shape
print(q_table[0][0])

[[[0.72518348 0.90041983]
  [0.98388023 0.78404145]
  [0.04404439 0.05335759]
  ...
  [0.79374952 0.69640752]
  [0.54901046 0.89813909]
  [0.44403952 0.19269215]]

 [[0.81780292 0.1966845 ]
  [0.88855277 0.20422001]
  [0.91927978 0.38199477]
  ...
  [0.51611586 0.67368189]
  [0.12398253 0.31379755]
  [0.31720295 0.45792757]]

 [[0.67284937 0.64415605]
  [0.94294981 0.07878876]
  [0.82179551 0.99994506]
  ...
  [0.15011306 0.75753863]
  [0.33335768 0.23204788]
  [0.73380282 0.19758886]]

 ...

 [[0.7961721  0.61509859]
  [0.67496057 0.34220116]
  [0.11228732 0.49291801]
  ...
  [0.16509234 0.48136071]
  [0.68529075 0.88067023]
  [0.29163903 0.15628938]]

 [[0.77553512 0.87249752]
  [0.69612311 0.8526038 ]
  [0.16428905 0.07940243]
  ...
  [0.90628774 0.30219662]
  [0.46774261 0.27053387]
  [0.13003637 0.27245645]]

 [[0.70315078 0.17038873]
  [0.94270504 0.52137848]
  [0.46131213 0.29738409]
  ...
  [0.70499141 0.67778197]
  [0.42470335 0.02608297]
  [0.85947873 0.7789011 ]]]


In [39]:
#converting the state space from box format to discrete format. Used to input into our q learing algorithm.
def get_discrete_state(state):
    discrete_state = state/step_size+ np.array([15,10,1,10])
    return tuple(discrete_state.astype(np.int))

In [40]:
#iterating through n epochs
for epoch in range(epochs + 1): 
    #set the initial time, so we can calculate how much each action takes
    t_initial = time.time() 
    
    #get the discrete state for the restarted environment
    discrete_state = get_discrete_state(env.reset()) 
    
    #we create a boolean that will tell us whether our game is running or not
    done = False
    
    #our reward is intialized at zero at the beginning of every eisode
    epoch_reward = 0 

    #Every 1000 epochs we have an episode. Measuring reward for every set of epochs.
    if epoch % 1000 == 0: 
        print("Episode: " + str(epoch)) 

    while not done: 
        #Now we are in our gameloop
        #if some random number is greater than epsilon, then we take the best possible action (max reward) we have explored so far
        if np.random.random() > epsilon:

            action = np.argmax(q_table[discrete_state])
        
        #else, we will explore and take a random action
        else:

            action = np.random.randint(0, env.action_space.n) 

        #now we will intialize our new_state, reward, and done variables
        new_state, reward, done, _ = env.step(action) 
    
        epoch_reward += reward 
        
        #we discretize our new state
        new_discrete_state = get_discrete_state(new_state)
        
        #we render our environment after 2000 steps
        if epoch % 2000 == 0: 
            env.render()

        #if the game loop is still running, update the q-table
        if not done:
            max_new_q = np.max(q_table[new_discrete_state])

            current_q = q_table[discrete_state + (action,)]

            new_q = (1 - lr) * current_q + lr * (reward + gamma* max_new_q)

            q_table[discrete_state + (action,)] = new_q

        discrete_state = new_discrete_state
    # if our epsilon is greater than .05m , and if our reward is greater than the previous and if we reached past our 10000 epoch, we recalculate episilon
    
    if epsilon > 0.05: 
        if epoch_reward > prev_reward and epoch > 10000:
            epsilon = math.pow(epsilon_decay_value, epoch - 10000)

           

    #we calculate the final time
    tfinal = time.time() 
    
    #total epoch time
    episode_total = tfinal - t_initial 
    total_time += episode_total
    
    #calculate and update rewards
    total_reward += epoch_reward
    prev_reward = epoch_reward

    #every 1000 episodes print the average time and the average reward
    if epoch % 1000 == 0: 
        mean_time = total_time / 1000
        print("Time Average: " + str(mean_time))
        total_time = 0

        mean_reward = total_reward / 1000
        print("Mean Reward: " + str(mean_reward))
        total_reward = 0

        print("Epsilon: " + str(epsilon))
   
env.close()


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  after removing the cwd from sys.path.


Episode: 0
Time Average: 0.00019043374061584472
Mean Reward: 0.01
Epsilon: 1
Episode: 1000
Time Average: 0.0008965773582458496
Mean Reward: 22.68
Epsilon: 1
Episode: 2000
Time Average: 0.0010990478992462158
Mean Reward: 22.218
Epsilon: 1
Episode: 3000
Time Average: 0.0008782286643981934
Mean Reward: 22.472
Epsilon: 1
Episode: 4000
Time Average: 0.0011349844932556153
Mean Reward: 22.022
Epsilon: 1
Episode: 5000
Time Average: 0.0008684959411621093
Mean Reward: 22.588
Epsilon: 1
Episode: 6000
Time Average: 0.001029799222946167
Mean Reward: 22.04
Epsilon: 1
Episode: 7000
Time Average: 0.0008994495868682861
Mean Reward: 22.724
Epsilon: 1
Episode: 8000
Time Average: 0.0015126793384552001
Mean Reward: 22.542
Epsilon: 1
Episode: 9000
Time Average: 0.0008637683391571045
Mean Reward: 21.901
Epsilon: 1
Episode: 10000
Time Average: 0.0011169664859771728
Mean Reward: 22.31
Epsilon: 1
Episode: 11000
Time Average: 0.005805375814437866
Mean Reward: 141.217
Epsilon: 0.04904089407128572
Episode: 12000
T