Julien Gauthier

# Reinforcement Learning (Deep Q) Project

#### WARNING : install the correct versions of gym/tensorflow/keras-rl2 in a virtual environment.

In [None]:
# These are the exact package versions this notebook was written against.
# Install them into the environment used by the notebook kernel before running anything else.
%pip install tensorflow==2.12.0 keras-rl2==1.0.5 gym==0.25.2
# NOTE(review): pygame is not pinned, unlike the packages above — presumably it is only
# needed for the render window; consider pinning it too for reproducibility.
%pip install pygame

### I. Setting up the OpenAI Cart Pole environment

In [2]:
import gym
import random

The environment is where the experiment takes place. The states are the input parameters observed by the agent (in this case: cart position, cart velocity, pole angle and pole angular velocity), and the actions are the possible outputs (push the cart left or right).

In [3]:
# Create the CartPole environment; render_mode="human" requests an on-screen window.
env = gym.make("CartPole-v1", render_mode="human")
# Length of the first axis of the observation space, i.e. the number of state variables.
states = env.observation_space.shape[0]
# Number of discrete actions exposed by the environment.
actions = env.action_space.n

  deprecation(
  deprecation(


Testing the environment with random actions.

In [3]:
# Sanity check: play a few episodes with uniformly random actions and print each score.
episodes = 10
try:
    for episode in range(1, episodes + 1):
        state = env.reset()
        score = 0
        done = False

        while not done:
            env.render()
            # Sample over every action the environment exposes rather than a hard-coded [0, 1].
            action = random.randrange(env.action_space.n)
            # The step result is unpacked as (observation, reward, done, info);
            # only `reward` and `done` are used by this loop.
            next_state, reward, done, info = env.step(action)
            score += reward
        print('Episode:{} Score:{}'.format(episode, score))
finally:
    # Always release the environment (and its render window), even when an
    # episode raises or the cell is interrupted.
    env.close()

Episode:1 Score:22.0
Episode:2 Score:12.0
Episode:3 Score:27.0
Episode:4 Score:11.0
Episode:5 Score:13.0
Episode:6 Score:12.0
Episode:7 Score:23.0
Episode:8 Score:19.0
Episode:9 Score:28.0
Episode:10 Score:27.0


### II. Deep Learning model with Keras.

In [4]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten
from tensorflow.keras.optimizers.legacy import Adam

  import sre_constants


Let's create a function that will build our model.

In [5]:
def build_model(states, actions):
    """Build the fully connected network used to estimate action values.

    Args:
        states: Number of state variables; the network input has shape (1, states).
        actions: Number of units in the output layer (one per action).

    Returns:
        An uncompiled Sequential model made of a Flatten layer, two Dense(24)
        ReLU layers and a linear Dense(actions) output layer.
    """
    layers = [
        # Input arrives with a leading axis of size 1, which Flatten removes.
        Flatten(input_shape=(1, states)),
        Dense(24, activation='relu'),
        Dense(24, activation='relu'),
        # Linear activation: the outputs are left unbounded.
        Dense(actions, activation='linear'),
    ]
    return Sequential(layers)

We can now use our function to create and show an instance of the model :

In [6]:
# Instantiate the network for this environment's state/action sizes, then print its layer summary.
model = build_model(states, actions)
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 flatten (Flatten)           (None, 4)                 0         
                                                                 
 dense (Dense)               (None, 24)                120       
                                                                 
 dense_1 (Dense)             (None, 24)                600       
                                                                 
 dense_2 (Dense)             (None, 2)                 50        
                                                                 
Total params: 770
Trainable params: 770
Non-trainable params: 0
_________________________________________________________________


### III. Agent creation with Keras-RL

In [7]:
from rl.agents import DQNAgent
from rl.policy import BoltzmannQPolicy
from rl.memory import SequentialMemory

Let's create a function to build an agent with a given model and possible actions. We'll use the Boltzmann Q Policy and the DQN Algorithm.

In [8]:
def build_agent(model, actions):
    """Assemble a DQN agent around the given network.

    Args:
        model: The Keras model the agent uses to estimate action values.
        actions: Number of actions, forwarded to the agent as ``nb_actions``.

    Returns:
        A DQNAgent configured with a BoltzmannQPolicy, a SequentialMemory
        (limit=50000, window_length=1), nb_steps_warmup=10 and
        target_model_update=1e-2. The agent is returned without being compiled.
    """
    agent_config = dict(
        model=model,
        nb_actions=actions,
        policy=BoltzmannQPolicy(),
        memory=SequentialMemory(limit=50000, window_length=1),
        nb_steps_warmup=10,
        target_model_update=1e-2,
    )
    return DQNAgent(**agent_config)

Now, let's build the agent

In [9]:
# Wrap the network in a DQN agent; build_agent does not compile it, so that happens in the training cell.
dqn = build_agent(model, actions)

### IV. Training (and visualizing) the agent

We can now train the agent (set `visualize=True` to watch the progress in real time):

In [17]:
# Re-create the environment: the instance used for the random-action test was closed at the end of that cell.
env = gym.make("CartPole-v1", render_mode="human")
states = env.observation_space.shape[0]
actions = env.action_space.n

# The agent returned by build_agent is not compiled, so compile it before fitting.
dqn.compile(Adam(learning_rate=1e-3), metrics=['mae'])
try:
    # Keep the returned history object so the training metrics stay available after the cell ends.
    training_history = dqn.fit(env, nb_steps=100000, visualize=True, verbose=1)
finally:
    # Release the environment and its render window whether training finishes,
    # raises, or is interrupted; otherwise the window is left open after the cell ends.
    env.close()

Training for 100000 steps ...
Interval 1 (0 steps performed)
    4/10000 [..............................] - ETA: 3:22 - reward: 1.0000 

  updates=self.state_updates,
See here for more information: https://www.gymlibrary.ml/content/api/[0m
  deprecation(


   10/10000 [..............................] - ETA: 3:24 - reward: 1.0000

  batch_idxs = np.random.random_integers(low, high - 1, size=size)


   21/10000 [..............................] - ETA: 9:12 - reward: 1.0000 

  batch_idxs = np.random.random_integers(low, high - 1, size=size)
  batch_idxs = np.random.random_integers(low, high - 1, size=size)
  batch_idxs = np.random.random_integers(low, high - 1, size=size)
  batch_idxs = np.random.random_integers(low, high - 1, size=size)
  batch_idxs = np.random.random_integers(low, high - 1, size=size)
  batch_idxs = np.random.random_integers(low, high - 1, size=size)
  batch_idxs = np.random.random_integers(low, high - 1, size=size)
  batch_idxs = np.random.random_integers(low, high - 1, size=size)
  batch_idxs = np.random.random_integers(low, high - 1, size=size)
  batch_idxs = np.random.random_integers(low, high - 1, size=size)


   30/10000 [..............................] - ETA: 7:23 - reward: 1.0000

  batch_idxs = np.random.random_integers(low, high - 1, size=size)
  batch_idxs = np.random.random_integers(low, high - 1, size=size)
  batch_idxs = np.random.random_integers(low, high - 1, size=size)
  batch_idxs = np.random.random_integers(low, high - 1, size=size)
  batch_idxs = np.random.random_integers(low, high - 1, size=size)
  batch_idxs = np.random.random_integers(low, high - 1, size=size)
  batch_idxs = np.random.random_integers(low, high - 1, size=size)
  batch_idxs = np.random.random_integers(low, high - 1, size=size)
  batch_idxs = np.random.random_integers(low, high - 1, size=size)
  batch_idxs = np.random.random_integers(low, high - 1, size=size)


   39/10000 [..............................] - ETA: 6:25 - reward: 1.0000

  batch_idxs = np.random.random_integers(low, high - 1, size=size)


107 episodes - episode_reward: 92.140 [9.000, 400.000] - loss: 2.511 - mae: 19.275 - mean_q: 39.033

Interval 2 (10000 steps performed)
47 episodes - episode_reward: 212.596 [157.000, 368.000] - loss: 3.900 - mae: 41.360 - mean_q: 83.631

Interval 3 (20000 steps performed)
47 episodes - episode_reward: 212.340 [150.000, 335.000] - loss: 2.871 - mae: 45.206 - mean_q: 91.028

Interval 4 (30000 steps performed)
44 episodes - episode_reward: 228.545 [156.000, 379.000] - loss: 1.629 - mae: 43.524 - mean_q: 87.490

Interval 5 (40000 steps performed)
44 episodes - episode_reward: 229.614 [164.000, 320.000] - loss: 0.969 - mae: 40.241 - mean_q: 80.914

Interval 6 (50000 steps performed)
36 episodes - episode_reward: 271.333 [169.000, 500.000] - loss: 0.629 - mae: 38.402 - mean_q: 77.128

Interval 7 (60000 steps performed)
32 episodes - episode_reward: 314.531 [174.000, 466.000] - loss: 2.423 - mae: 43.066 - mean_q: 86.731

Interval 8 (70000 steps performed)
31 episodes - episode_reward: 318.29

<keras.callbacks.History at 0x207c1b9a410>

Save your model right after training, before the pygame window crashes! (TODO: fix the crash)

In [18]:
# Write the agent's current weights to disk; overwrite=True allows replacing an existing file with this name.
dqn.save_weights('pre-trained-model-100ksteps.h5f', overwrite=True)

#### WARNING: this will reset your model! Run this cell ONLY if you want to re-train your model from scratch.

In [16]:
# Rebinding `model` and `dqn` is enough to discard the previous network and agent.
# No explicit `del` is used: it is redundant before a rebind and would raise
# NameError if this cell were run while the names are not defined.
model = build_model(states, actions)
dqn = build_agent(model, actions)

# You can now re-run the training cell above to train the new model, or load a pre-trained model in the next cell

##### Test a saved model in the Cart Pole environment :

In [14]:
# Fresh environment for evaluation (no render_mode is passed to gym.make here).
env = gym.make('CartPole-v1')
actions = env.action_space.n
states = env.observation_space.shape[0]
# Rebuild the network and agent with the same helpers used for training,
# so the architecture matches the one the weights were saved from.
model = build_model(states, actions)
dqn = build_agent(model, actions)
# NOTE(review): presumably the agent must be compiled before weights can be loaded or tested — confirm against keras-rl.
dqn.compile(Adam(learning_rate=1e-3), metrics=['mae'])

dqn.load_weights('pre-trained-model-100ksteps.h5f') # Load the pre-trained weights shipped in the repository

In [15]:
# Run the agent for 3 evaluation episodes with rendering enabled.
# NOTE(review): the stored output of this cell reports 5 episodes, so it was presumably
# produced with a different nb_episodes value — re-run the cell to refresh it.
dqnscores = dqn.test(env, nb_episodes=3, visualize=True)
# Print the mean of the per-episode rewards recorded in the returned history.
print(np.mean(dqnscores.history['episode_reward']))

Testing for 5 episodes ...
Episode 1: reward: 190.000, steps: 190
Episode 2: reward: 239.000, steps: 239
Episode 3: reward: 203.000, steps: 203
Episode 4: reward: 176.000, steps: 176
Episode 5: reward: 180.000, steps: 180
197.6
