In [1]:
# Install Dependencies
# This specific version of TensorFlow is pinned for compatibility with keras-rl2.
!pip install tensorflow==2.8.0

Collecting tensorflow==2.8.0
  Downloading tensorflow-2.8.0-cp310-cp310-manylinux2010_x86_64.whl (497.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m497.6/497.6 MB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
Collecting keras-preprocessing>=1.1.1 (from tensorflow==2.8.0)
  Downloading Keras_Preprocessing-1.1.2-py2.py3-none-any.whl (42 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.6/42.6 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
Collecting tensorboard<2.9,>=2.8 (from tensorflow==2.8.0)
  Downloading tensorboard-2.8.0-py3-none-any.whl (5.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.8/5.8 MB[0m [31m43.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tf-estimator-nightly==2.8.0.dev2021122109 (from tensorflow==2.8.0)
  Downloading tf_estimator_nightly-2.8.0.dev2021122109-py2.py3-none-any.whl (462 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m462.5/462.5 kB[0m [31m20.1 MB/s[0m eta [36m0:0

In [2]:
!pip install keras-rl2

Collecting keras-rl2
  Downloading keras_rl2-1.0.5-py3-none-any.whl (52 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/52.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m52.1/52.1 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: keras-rl2
Successfully installed keras-rl2-1.0.5


In [3]:
# Import Dependencies
import random
import numpy as np
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam

# DQN agent implementation provided by keras-rl2
from rl.agents.dqn import DQNAgent
# Action-selection policy the agent uses to choose among Q-values
from rl.policy import BoltzmannQPolicy
# For our agent we need to maintain memory
from rl.memory import SequentialMemory

Setting up RPSLS Environment

In [4]:
# Set up environment for agent to learn. Includes defining actions and rewards.
class RockPaperScissorsLizardSpockEnvironment:
    """Rock-Paper-Scissors-Lizard-Spock played against a uniformly random opponent.

    The class exposes the ``reset()`` / ``step(action)`` pair used by the
    training loop. Moves are encoded as integers:
    rock=0, paper=1, scissors=2, lizard=3, spock=4.

    The observation returned by ``step`` is the opponent's move for the round
    that was just played. Every opponent move is an independent random draw,
    so an observation carries no information about the next round.

    Args:
        max_steps: Number of rounds per episode. Defaults to 10.
        seed: Optional seed for a private random generator, which makes the
            opponent's moves reproducible. When ``None`` (the default) the
            module-level ``random`` generator is used, so a global
            ``random.seed(...)`` call still controls the draws.
    """

    # For each move, the two moves it defeats.
    _BEATS = {
        0: (2, 3),  # rock beats scissors and lizard
        1: (0, 4),  # paper beats rock and spock
        2: (1, 3),  # scissors beats paper and lizard
        3: (1, 4),  # lizard beats paper and spock
        4: (0, 2),  # spock beats rock and scissors
    }

    def __init__(self, max_steps=10, seed=None):
        self.actions = [0, 1, 2, 3, 4]  # rock: 0, paper: 1, scissors: 2, lizard: 3, spock: 4

        # Reward lookup keyed by (agent_move, opponent_move):
        # +1 for a win, -1 for a loss, 0 for a tie.
        self.rewards = {
            (mine, theirs): self._score(mine, theirs)
            for mine in self.actions
            for theirs in self.actions
        }

        self.max_steps = max_steps
        self.current_step = 0

        # Use the shared module-level generator unless a seed was requested.
        self._rng = random if seed is None else random.Random(seed)

    @classmethod
    def _score(cls, mine, theirs):
        """Return +1 if ``mine`` beats ``theirs``, -1 if it loses, 0 on a tie."""
        if mine == theirs:
            return 0
        return 1 if theirs in cls._BEATS[mine] else -1

    def step(self, action):
        """Play one round against a randomly chosen opponent move.

        Args:
            action: The agent's move; must be one of ``self.actions``.

        Returns:
            A tuple ``(opponent_action, reward, done, info)`` where ``reward``
            is taken from ``self.rewards``, ``done`` becomes True once
            ``max_steps`` rounds have been played, and ``info`` is an empty dict.

        Raises:
            KeyError: If ``action`` is not a valid move.
        """
        opponent_action = self._rng.choice(self.actions)
        reward = self.rewards[(action, opponent_action)]
        self.current_step += 1
        done = self.current_step >= self.max_steps
        info = {}
        return opponent_action, reward, done, info

    def reset(self):
        """Start a new episode and return the initial observation.

        The round counter is cleared and a randomly drawn move is returned as
        the first observation.
        """
        self.current_step = 0
        return self._rng.choice(self.actions)


In [5]:
# Define the Q-network model 2
def build_model(states, actions):
    """Build the feed-forward Q-network.

    Args:
        states: Number of input features per observation; sets the input
            layer shape to ``(states,)``.
        actions: Number of discrete actions; the output layer has one linear
            unit per action.

    Returns:
        An uncompiled ``Sequential`` model with two 24-unit ReLU hidden layers
        followed by a linear output layer of width ``actions``.
    """
    return Sequential([
        # Honour the `states` argument instead of a hard-coded input width.
        Dense(24, activation='relu', input_shape=(states,)),
        Dense(24, activation='relu'),
        Dense(actions, activation='linear'),
    ])

In [6]:
# Create the environment, then build and compile the Q-network that matches it
env = RockPaperScissorsLizardSpockEnvironment()
states = 1  # width passed to build_model for the network input
actions = list(env.actions)  # same five moves the environment defines
model = build_model(states, len(actions))
model.compile(loss='mse', optimizer=Adam(learning_rate=0.001))

In [7]:
# Print the Q-network's layers, output shapes and parameter counts
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 24)                48        
                                                                 
 dense_1 (Dense)             (None, 24)                600       
                                                                 
 dense_2 (Dense)             (None, 5)                 125       
                                                                 
Total params: 773
Trainable params: 773
Non-trainable params: 0
_________________________________________________________________


In [8]:
# Assemble the DQN agent from its parts: replay memory, action-selection policy and Q-network
memory = SequentialMemory(limit=50000, window_length=1)
policy = BoltzmannQPolicy()
dqn = DQNAgent(
    model=model,
    nb_actions=len(actions),
    memory=memory,
    policy=policy,
    nb_steps_warmup=10,
    # NOTE(review): a value below 1 is presumably a soft target-network update rate - confirm in keras-rl docs
    target_model_update=1e-2,
)
dqn.compile(metrics=['mae'], optimizer=Adam(learning_rate=1e-3))

In [9]:
# Train the DQN agent. The returned History object is kept for later inspection,
# which also stops the cell from echoing its bare repr as output.
training_history = dqn.fit(env, nb_steps=50000, visualize=False, verbose=1)

Training for 50000 steps ...
Interval 1 (0 steps performed)
    1/10000 [..............................] - ETA: 8:47 - reward: 1.0000

  updates=self.state_updates,


1000 episodes - episode_reward: 0.203 [-8.000, 8.000] - loss: 0.408 - mae: 0.515 - mean_q: 0.504

Interval 2 (10000 steps performed)
1000 episodes - episode_reward: 0.067 [-8.000, 9.000] - loss: 0.408 - mae: 0.510 - mean_q: 0.475

Interval 3 (20000 steps performed)
1000 episodes - episode_reward: 0.017 [-7.000, 10.000] - loss: 0.405 - mae: 0.418 - mean_q: 0.350

Interval 4 (30000 steps performed)
1000 episodes - episode_reward: 0.077 [-9.000, 10.000] - loss: 0.403 - mae: 0.363 - mean_q: 0.275

Interval 5 (40000 steps performed)
done, took 392.735 seconds


<keras.callbacks.History at 0x7926f66dda80>

In [10]:
# Run the trained agent for 100 test episodes and print its average episode reward
scores = dqn.test(env, nb_episodes=100, visualize=False)
print(np.asarray(scores.history['episode_reward']).mean())

Testing for 100 episodes ...
Episode 1: reward: 1.000, steps: 10
Episode 2: reward: 1.000, steps: 10
Episode 3: reward: -2.000, steps: 10
Episode 4: reward: 5.000, steps: 10
Episode 5: reward: -5.000, steps: 10
Episode 6: reward: -2.000, steps: 10
Episode 7: reward: 3.000, steps: 10
Episode 8: reward: -4.000, steps: 10
Episode 9: reward: 0.000, steps: 10
Episode 10: reward: -1.000, steps: 10
Episode 11: reward: -6.000, steps: 10
Episode 12: reward: -1.000, steps: 10
Episode 13: reward: 3.000, steps: 10
Episode 14: reward: 3.000, steps: 10
Episode 15: reward: 0.000, steps: 10
Episode 16: reward: -1.000, steps: 10
Episode 17: reward: 3.000, steps: 10
Episode 18: reward: -3.000, steps: 10
Episode 19: reward: -2.000, steps: 10
Episode 20: reward: 0.000, steps: 10
Episode 21: reward: 0.000, steps: 10
Episode 22: reward: -1.000, steps: 10
Episode 23: reward: 1.000, steps: 10
Episode 24: reward: -7.000, steps: 10
Episode 25: reward: 2.000, steps: 10
Episode 26: reward: 7.000, steps: 10
Episod

In [11]:
# Summarise the per-episode test rewards: total, average and median
episode_rewards = np.asarray(scores.history['episode_reward'])
sum_reward = episode_rewards.sum()
mean_reward = episode_rewards.mean()
median_reward = np.median(episode_rewards)

print("Sum Reward:", sum_reward)
print("Mean Reward:", mean_reward)
print("Median Reward:", median_reward)

Sum Reward: 1.0
Mean Reward: 0.01
Median Reward: 0.0
