In [2]:
import gym
import tensorflow as tf
import numpy as np
from tensorflow import keras

from collections import deque
import time
import random
import matplotlib.pyplot as plt
import seaborn as sns

In [50]:

RANDOM_SEED = 5

# Seed every random number generator this notebook draws from so that a
# "Restart & Run All" is repeatable: TensorFlow (weight initialisation),
# NumPy (the epsilon-greedy draw) and the stdlib `random` module
# (replay-memory sampling via random.sample).
tf.random.set_seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
random.seed(RANDOM_SEED)

env = gym.make('CartPole-v1')
# The environment's own generator is seeded through reset(seed=...); the
# action space (used by env.action_space.sample()) has a separate generator.
env.reset(seed=RANDOM_SEED)
env.action_space.seed(RANDOM_SEED)

print("Action Space: {}".format(env.action_space))
print("State space: {}".format(env.observation_space))

# One episode is one full game
train_episodes = 1500
# test_episodes = 200


Action Space: Discrete(2)
State space: Box([-4.8000002e+00 -3.4028235e+38 -4.1887903e-01 -3.4028235e+38], [4.8000002e+00 3.4028235e+38 4.1887903e-01 3.4028235e+38], (4,), float32)


In [51]:

def agent(state_shape, action_shape):
    """Build and compile the Q-network that maps a state to one Q-value per action.

    e.g. for a network output of [.1, .7, .1, .3] the highest value (0.7) is
    the greedy Q-value and its index (1) is the greedy action.

    Args:
        state_shape: Shape tuple of a single observation; passed to the first
            Dense layer as ``input_shape``.
        action_shape: Number of discrete actions, i.e. the width of the
            linear output layer.

    Returns:
        A compiled ``keras.Sequential`` model (Huber loss, Adam optimizer).
    """
    learning_rate = 0.001
    init = tf.keras.initializers.HeUniform()
    model = keras.Sequential([
        keras.layers.Dense(24, input_shape=state_shape, activation='relu', kernel_initializer=init),
        keras.layers.Dense(12, activation='relu', kernel_initializer=init),
        # Linear head: Q-values are unbounded real numbers, not probabilities.
        keras.layers.Dense(action_shape, activation='linear', kernel_initializer=init),
    ])
    # No 'accuracy' metric: the targets are real-valued Q-values (regression),
    # so a classification accuracy would be meaningless here.
    model.compile(
        loss=tf.keras.losses.Huber(),
        optimizer=tf.keras.optimizers.legacy.Adam(learning_rate=learning_rate),
    )
    return model


In [52]:

def get_qs(model, state, step):
    """Return the model's predicted Q-values for a single state.

    ``state`` is reshaped into a batch containing one row before it is passed
    to ``model.predict``; the first (and only) row of the prediction is
    returned. ``step`` is accepted but not used.
    """
    batch_of_one = state.reshape([1, state.shape[0]])
    predictions = model.predict(batch_of_one)
    return predictions[0]


In [53]:

def train(env, replay_memory, model, target_model, done,
          learning_rate=0.7, discount_factor=0.618,
          min_replay_size=1000, batch_size=64 * 2):
    """Fit the main network once on a random mini-batch of replayed transitions.

    For every sampled transition the target for the taken action is
    ``reward`` (terminal transition) or
    ``reward + discount_factor * max(target_model Q(next_state))`` otherwise;
    it is blended into the main network's current estimate with
    ``learning_rate`` and the main network is fitted on the result.

    Args:
        env: Unused; kept so existing callers keep working.
        replay_memory: Sequence of ``[observation, action, reward,
            new_observation, done]`` transitions.
        model: Main network; must provide ``predict`` and ``fit``.
        target_model: Network used to evaluate next states; must provide
            ``predict``.
        done: Unused; each transition carries its own terminal flag.
        learning_rate: Blend factor between the old Q-value and the new target.
        discount_factor: Discount applied to the best next-state Q-value.
        min_replay_size: Minimum number of stored transitions before any
            training happens.
        batch_size: Number of transitions sampled per call.

    Returns:
        None. Returns early without fitting while the replay memory is still
        too small.
    """
    # random.sample raises if asked for more items than exist, so the guard
    # also covers a batch_size larger than min_replay_size.
    if len(replay_memory) < max(min_replay_size, batch_size):
        return

    mini_batch = random.sample(replay_memory, batch_size)
    current_states = np.array([transition[0] for transition in mini_batch])
    # verbose=0 keeps the per-call progress bar out of the notebook output.
    current_qs_list = model.predict(current_states, verbose=0)
    new_current_states = np.array([transition[3] for transition in mini_batch])
    future_qs_list = target_model.predict(new_current_states, verbose=0)

    X = []
    Y = []
    # The per-transition flag is named `terminal` so it no longer shadows the
    # `done` parameter.
    for index, (observation, action, reward, _new_observation, terminal) in enumerate(mini_batch):
        if not terminal:
            target_q = reward + discount_factor * np.max(future_qs_list[index])
        else:
            # No future reward can follow a terminal transition.
            target_q = reward

        # Only the taken action's entry changes; the other entries keep the
        # network's own prediction and so contribute zero error.
        current_qs = current_qs_list[index]
        current_qs[action] = (1 - learning_rate) * current_qs[action] + learning_rate * target_q

        X.append(observation)
        Y.append(current_qs)
    model.fit(np.array(X), np.array(Y), batch_size=batch_size, verbose=0, shuffle=True)


In [54]:

def main(results):
    """Train a DQN agent on the module-level ``env`` for ``train_episodes`` episodes.

    Uses epsilon-greedy exploration with an exponentially decaying epsilon,
    a replay memory, a main network trained every 4 steps (and at episode
    end), and a target network that is synchronised at the end of an episode
    once at least 100 steps have accumulated since the last synchronisation.

    Args:
        results: List that receives one entry per episode: the total reward
            collected in that episode. It is mutated in place.

    Returns:
        The trained main Keras model.
    """
    epsilon = 1 # Epsilon-greedy algorithm is initialized at 1, meaning every step is random at the start
    max_epsilon = 1 # You can't explore more than 100% of the time
    min_epsilon = 0.01 # At a minimum, we'll always explore 1% of the time
    decay = 0.01

    # 1. Initialize the Target and Main models
    # Main Model (updated every 4 steps)
    model = agent(env.observation_space.shape, env.action_space.n)
    # Target Model (synchronised at episode end after >= 100 accumulated steps)
    target_model = agent(env.observation_space.shape, env.action_space.n)
    target_model.set_weights(model.get_weights())

    replay_memory = deque(maxlen=50_000)

    steps_to_update_target_model = 0

    for episode in range(train_episodes):
        total_training_rewards = 0
        observation = env.reset()[0]
        done = False   # terminated: the pole fell / the cart left the track
        trunc = False  # truncated: the environment's time limit was reached
        # An episode ends on termination OR truncation; ignoring `trunc`
        # would keep stepping an environment that has hit its time limit.
        while not (done or trunc):
            steps_to_update_target_model += 1

            # 2. Explore using the Epsilon Greedy Exploration Strategy
            if np.random.rand() <= epsilon:
                # Explore
                action = env.action_space.sample()
            else:
                # Exploit best known action; the model expects a (batch, features) input
                q_values = model.predict(
                    observation.reshape([1, observation.shape[0]]), verbose=0
                ).flatten()
                action = np.argmax(q_values)
            new_observation, reward, done, trunc, info = env.step(action)
            # Only true termination is stored as the terminal flag: a
            # time-limit truncation is not a failure state, so train() may
            # still bootstrap from the next observation.
            replay_memory.append([observation, action, reward, new_observation, done])

            episode_over = done or trunc

            # 3. Update the Main Network using the Bellman Equation
            if steps_to_update_target_model % 4 == 0 or episode_over:
                train(env, replay_memory, model, target_model, done)

            observation = new_observation
            total_training_rewards += reward

            if episode_over:
                # NOTE: the value printed after "n steps" is the episode index.
                print('Total training rewards: {} after n steps = {} with final reward = {}'.format(total_training_rewards, episode, reward))

                if steps_to_update_target_model >= 100:
                    print('Copying main network weights to the target network weights')
                    target_model.set_weights(model.get_weights())
                    steps_to_update_target_model = 0

        epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(-decay * episode)
        # Record exactly the total that was printed for this episode.
        results.append(total_training_rewards)
    env.close()

    return model


In [55]:

if __name__ == "__main__":
    # main() appends one total reward per episode to `results` and returns
    # the trained model.
    results = list()
    model = main(results)

Total training rewards: 20.0 after n steps = 0 with final reward = 1.0
Total training rewards: 48.0 after n steps = 1 with final reward = 1.0
Total training rewards: 27.0 after n steps = 2 with final reward = 1.0
Total training rewards: 18.0 after n steps = 3 with final reward = 1.0
Copying main network weights to the target network weights


2023-09-05 21:47:12.609348: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


Total training rewards: 43.0 after n steps = 4 with final reward = 1.0
Total training rewards: 37.0 after n steps = 5 with final reward = 1.0
Total training rewards: 23.0 after n steps = 6 with final reward = 1.0
Copying main network weights to the target network weights
Total training rewards: 24.0 after n steps = 7 with final reward = 1.0
Total training rewards: 80.0 after n steps = 8 with final reward = 1.0
Copying main network weights to the target network weights
Total training rewards: 47.0 after n steps = 9 with final reward = 1.0
Total training rewards: 36.0 after n steps = 10 with final reward = 1.0
Total training rewards: 21.0 after n steps = 11 with final reward = 1.0
Copying main network weights to the target network weights
Total training rewards: 75.0 after n steps = 12 with final reward = 1.0
Total training rewards: 22.0 after n steps = 13 with final reward = 1.0
Total training rewards: 13.0 after n steps = 14 with final reward = 1.0
Copying main network weights to the t

2023-09-05 21:47:16.280108: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2023-09-05 21:47:16.430242: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


Total training rewards: 42.0 after n steps = 37 with final reward = 1.0
Total training rewards: 13.0 after n steps = 38 with final reward = 1.0
Total training rewards: 13.0 after n steps = 39 with final reward = 1.0
Total training rewards: 20.0 after n steps = 40 with final reward = 1.0
Copying main network weights to the target network weights
Total training rewards: 37.0 after n steps = 41 with final reward = 1.0
Total training rewards: 35.0 after n steps = 42 with final reward = 1.0
Total training rewards: 13.0 after n steps = 43 with final reward = 1.0
Total training rewards: 10.0 after n steps = 44 with final reward = 1.0
Total training rewards: 13.0 after n steps = 45 with final reward = 1.0
Copying main network weights to the target network weights
Total training rewards: 12.0 after n steps = 46 with final reward = 1.0
Total training rewards: 11.0 after n steps = 47 with final reward = 1.0
Total training rewards: 19.0 after n steps = 48 with final reward = 1.0
Total training rew

In [10]:
sns.set_theme()
# Explicit figure/axes with a title and axis labels so the plot stands on its
# own; `results` holds one total reward per training episode (filled by main()).
fig, ax = plt.subplots()
ax.plot(results, linewidth=0.5)
ax.set(title="DQN training on CartPole-v1", xlabel="Episode", ylabel="Total reward")
plt.show()

NameError: name 'results' is not defined

: 

In [7]:
def reshape(obs):
    """Return ``obs`` reshaped to a single-row batch of shape ``(1, obs.shape[0])``."""
    n_features = obs.shape[0]
    return obs.reshape((1, n_features))

In [8]:
def test(env, model, episodes):
    rewards, steps, solved = 0, 0, 0

    for episode in range(0,episodes):
        steps += 1 
      
        score = 0
        terminated, done = False, False
        observation = env.reset()[0]

        while not terminated:
            # choose best action
            observation.reshape([1, observation.shape[0]])
            action = np.argmax(model.predict(observation))
            
            # Take step
            new_observation, reward, terminated, truncated, info  = env.step(action)
            score+=reward
          
            observation = new_observation

In [9]:
test_env = gym.make("CartPole-v1", render_mode="human")

episodes = 20
# No explicit reset here: test() resets the environment at the start of
# every episode.
try:
    test(test_env, model, episodes)
finally:
    # Always release the render window, even if evaluation fails or is
    # interrupted from the notebook.
    test_env.close()



2023-09-06 09:07:18.147792: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.




In [58]:
# Print the layer-by-layer architecture and parameter counts of the model.
model.summary()

Model: "sequential_10"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_30 (Dense)            (None, 24)                120       
                                                                 
 dense_31 (Dense)            (None, 12)                300       
                                                                 
 dense_32 (Dense)            (None, 2)                 26        
                                                                 
Total params: 446 (1.74 KB)
Trainable params: 446 (1.74 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [59]:
# Persist the model under the path "1400" (the log output below reports
# assets being written to 1400/assets).
model.save("1400")

INFO:tensorflow:Assets written to: 1400/assets


INFO:tensorflow:Assets written to: 1400/assets


In [3]:
# Reload the model previously saved under '1400'.
# NOTE(review): the result is bound to `new_model`, while the evaluation cell
# calls test() with `model` — confirm which of the two is meant to be evaluated.
new_model = tf.keras.models.load_model('1400')

2023-09-06 09:06:26.908504: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M1 Max
2023-09-06 09:06:26.908537: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 64.00 GB
2023-09-06 09:06:26.908544: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 24.00 GB
2023-09-06 09:06:26.908629: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:303] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2023-09-06 09:06:26.908868: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:269] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)
