In [1]:
%matplotlib inline

import gym
import itertools
import matplotlib
import random
import numpy as np
import sys
from collections import deque

matplotlib.style.use('ggplot')

In [2]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

print(tf.__version__)
print(tf.keras.__version__)

2.0.0-alpha0
2.2.4-tf


# DQN Agent and Training Functions

In [3]:
class DQNAgent:
    """Deep Q-Network agent with an experience-replay memory.

    The Q-function is approximated by a small fully connected Keras network
    (two hidden ReLU layers, linear output with one unit per action) trained
    with an MSE loss against one-step targets.

    Args:
        observation_size: Width of the network input. With ``one_hot=True`` it
            is also the number of classes used to one-hot encode observations.
        action_size: Number of discrete actions (network outputs).
        one_hot: If truthy, observations are one-hot encoded before being fed
            to the network; otherwise they are reshaped to a single row.
        hidden_units: Units in each of the two hidden layers.
        batch_size: Number of transitions sampled per ``replay`` call.
        gamma: Discount factor applied to the bootstrapped next-state value.
        epsilon: Exploration probability used by the epsilon-greedy policy.
        learning_rate: Learning rate passed to the Adam optimizer.
        memory_size: Maximum number of transitions kept in the replay memory;
            older transitions are discarded first.
    """

    def __init__(self, observation_size, action_size, one_hot,
                 hidden_units=64, batch_size=32, gamma=1.0, epsilon=0.1,
                 learning_rate=0.001, memory_size=2000):
        self.observation_size = observation_size
        self.action_size = action_size
        self.hidden_units = hidden_units
        self.batch_size = batch_size
        self.one_hot = one_hot

        self.gamma = gamma
        self.epsilon = epsilon
        self.learning_rate = learning_rate

        # Bounded FIFO buffer of (observation, action, reward, next_observation, done).
        self.memory = deque(maxlen=memory_size)

        self.model = self._build_model()

    def _build_model(self):
        """Build and compile the Q-network.

        Returns:
            A compiled ``tf.keras.Sequential`` model mapping a batch of
            observations to one Q-value per action.
        """
        model = tf.keras.Sequential([
            layers.Dense(self.hidden_units, activation='relu', input_shape=(self.observation_size,)),
            layers.Dense(self.hidden_units, activation='relu'),
            layers.Dense(self.action_size, activation='linear')
        ])

        # Use the configured learning rate (it was previously stored but ignored).
        # No 'accuracy' metric: the network regresses Q-values, so classification
        # accuracy carries no meaning here.
        model.compile(optimizer=tf.keras.optimizers.Adam(self.learning_rate),
                      loss='mse')
        return model

    def _act_egreedy(self, best_action):
        """Sample an action epsilon-greedily around ``best_action``.

        Every action receives probability ``epsilon / action_size``; the
        remaining ``1 - epsilon`` mass is added to ``best_action``.
        """
        action_probs = np.ones(self.action_size, dtype=float) * self.epsilon / self.action_size
        action_probs[best_action] += (1.0 - self.epsilon)
        return np.random.choice(np.arange(len(action_probs)), p=action_probs)

    def _shape_observation(self, observation):
        """Convert an observation into a single-row batch for the network.

        Returns:
            A one-hot row of width ``observation_size`` when ``one_hot`` is
            set, otherwise the observation reshaped to ``[1, observation_size]``.
        """
        if self.one_hot:
            return tf.keras.utils.to_categorical(observation, self.observation_size)[np.newaxis, :]
        return np.reshape(observation, [1, self.observation_size])

    def remember(self, observation, action, reward, next_observation, done):
        """Store one transition in the replay memory.

        Both observations are stored already shaped as network inputs, so
        ``replay`` can feed them to the model directly.
        """
        observation = self._shape_observation(observation)
        next_observation = self._shape_observation(next_observation)

        self.memory.append((observation, action, reward, next_observation, done))

    def act(self, observation, train=True):
        """Choose an action for ``observation``.

        Args:
            observation: Raw environment observation.
            train: If truthy, explore epsilon-greedily around the greedy
                action; otherwise return the greedy action.

        Returns:
            The index of the selected action.
        """
        observation = self._shape_observation(observation)
        q_values = self.model.predict(observation)[0]
        action = np.argmax(q_values)

        if train:
            # While training, explore around the greedy choice.
            action = self._act_egreedy(action)

        return action

    def replay(self):
        """Fit the network on a random minibatch of remembered transitions.

        Does nothing until the memory holds at least ``batch_size``
        transitions. Each sampled transition is fitted individually, so the
        targets of later samples are computed with the weights already
        updated by earlier ones.
        """
        if len(self.memory) < self.batch_size:
            return

        # Select a batch of replays from memory.
        minibatch = random.sample(self.memory, self.batch_size)

        # Compute the target Q-value for each transition:
        #   target = reward                               if the episode ended at s'
        #   target = reward + gamma * max_a' Q(s', a')    otherwise
        for observation, action, reward, next_observation, done in minibatch:
            target = reward

            if not done:
                target = reward + self.gamma * np.amax(self.model.predict(next_observation)[0])

            # Start from the current predictions so that only the taken
            # action's output contributes a non-zero error.
            target_q = self.model.predict(observation)
            target_q[0][action] = target

            self.model.fit(observation, target_q, epochs=1, verbose=0)
        
            
def train_agent (env, agent, num_episodes):
    """Play ``num_episodes`` episodes, recording transitions and training the agent.

    Each step asks the agent for an action, applies it to the environment and
    stores the resulting transition through ``agent.remember``. After every
    episode a progress line is printed and ``agent.replay()`` is called once.

    Args:
        env: Environment exposing ``reset()`` and ``step(action)``, where
            ``step`` returns a 4-tuple ``(observation, reward, done, info)``.
        agent: Object exposing ``act``, ``remember`` and ``replay``.
        num_episodes: Number of episodes to play.
    """
    for finished_count in range(1, num_episodes + 1):
        # Every episode starts from a freshly reset environment.
        current = env.reset()

        while True:
            chosen_action = agent.act(current)
            following, reward, episode_over, _ = env.step(chosen_action)
            agent.remember(current, chosen_action, reward, following, episode_over)
            current = following
            if episode_over:
                break

        # "\r" keeps the progress counter on a single, rewritten output line.
        print("\repisode: {}/{}".format(finished_count, num_episodes), end="")

        # One replay (training) pass per completed episode.
        agent.replay()
        
def average_episodes (env, agent, num_episodes):
    """Run greedy evaluation episodes and return the total reward collected.

    The agent acts with ``train=False`` and the reward of every step is
    accumulated, so environments that pay out before the terminal step are
    scored correctly.

    Note:
        Despite the function's name, the value returned is the sum over all
        episodes, not a mean. Divide by ``num_episodes`` for the per-episode
        average.

    Args:
        env: Environment exposing ``reset()`` and ``step(action)``, where
            ``step`` returns a 4-tuple ``(observation, reward, done, info)``.
        agent: Object exposing ``act(observation, train=False)``.
        num_episodes: Number of episodes to play.

    Returns:
        The sum of all rewards received across the episodes (``0`` when
        ``num_episodes`` is 0).
    """
    rewards = 0
    for _episode in range(num_episodes):
        # Reset the environment at the beginning of each game.
        done = False
        observation = env.reset()

        while not done:
            action = agent.act(observation, train=False)
            observation, reward, done, _ = env.step(action)
            # Count every step's reward, not only the terminal one.
            rewards += reward

    return rewards

# Blackjack

In [4]:
# Blackjack environment shared by the training and evaluation cells below.
envBlackjack = gym.make('Blackjack-v0')

# Position of each component inside an observation tuple.
PLAYER_STATE, DEALER_STATE, USABLE_ACE = 0, 1, 2

# Number of discrete values each observation component can take, read from
# the environment's observation space.
PLAYER_STATE_COUNT, DEALER_STATE_COUNT, USABLE_ACE_COUNT = (
    envBlackjack.observation_space.spaces[component].n
    for component in (PLAYER_STATE, DEALER_STATE, USABLE_ACE)
)

# Size of the flattened state space: one slot per (player, dealer, ace) combination.
ENVIRONMENT_SPACE = PLAYER_STATE_COUNT * DEALER_STATE_COUNT * USABLE_ACE_COUNT


def get_state_index(state):
    """Flatten a ``(player, dealer, usable_ace)`` state into one integer index.

    The player component is the slowest-varying digit, the dealer component
    the next, and the usable-ace flag contributes 1 when it equals ``True``.

    Args:
        state: Sequence indexable by ``PLAYER_STATE``, ``DEALER_STATE`` and
            ``USABLE_ACE``.

    Returns:
        ``player * (DEALER_STATE_COUNT * USABLE_ACE_COUNT)
        + dealer * USABLE_ACE_COUNT + ace_flag``.
    """
    player_offset = state[PLAYER_STATE] * (DEALER_STATE_COUNT * USABLE_ACE_COUNT)
    dealer_offset = state[DEALER_STATE] * USABLE_ACE_COUNT
    ace_offset = 1 if state[USABLE_ACE] == True else 0
    return player_offset + dealer_offset + ace_offset

# Reference policy table indexed as [player state, dealer state, usable ace].
# Every entry starts at 1 and the slices below are set to 0.
# NOTE(review): presumably 1 = hit and 0 = stick, following gym's Blackjack
# action encoding - confirm against the environment.
correct_policy = np.ones((PLAYER_STATE_COUNT, DEALER_STATE_COUNT, USABLE_ACE_COUNT), dtype=int)

# Without a usable ace (last index 0).
correct_policy[12, 4:7, 0] = 0
correct_policy[13:17, 2:7, 0] = 0
correct_policy[17:22, :, 0] = 0

# With a usable ace (last index 1).
correct_policy[18, 2:9, 1] = 0
correct_policy[19:22, :, 1] = 0

In [5]:
# Network input width: one value per component space of the observation.
observation_size = len(envBlackjack.observation_space.spaces)
# Network output width: one Q-value per discrete action.
action_size = envBlackjack.action_space.n

# one_hot=False: observations are reshaped to a single row and fed to the
# network as-is instead of being one-hot encoded.
agent = DQNAgent(observation_size, action_size, one_hot=False)

# Train for 5000 episodes; train_agent prints an in-place progress counter.
train_agent(envBlackjack, agent, 5000)

episode: 5000/5000

In [6]:
# Query the trained agent's greedy action for every (player, dealer, ace)
# index combination and store it in a table shaped like correct_policy.
view_learned_policy = np.zeros((PLAYER_STATE_COUNT, DEALER_STATE_COUNT, USABLE_ACE_COUNT), dtype=int)
state = np.zeros((1, 3))
for playerIx, dealerIx, usableAceIx in itertools.product(
        range(PLAYER_STATE_COUNT), range(DEALER_STATE_COUNT), range(USABLE_ACE_COUNT)):
    state[0, [PLAYER_STATE, DEALER_STATE, USABLE_ACE]] = (playerIx, dealerIx, usableAceIx)
    view_learned_policy[playerIx, dealerIx, usableAceIx] = agent.act(state, train=False)

# Fraction of entries where the learned and reference policies disagree.
# Only player indices 4..21 are compared.
# NOTE(review): presumably these are the player sums reachable in play - confirm.
view_correct_policy = correct_policy.reshape(view_learned_policy.shape)
error_rate = np.bitwise_xor(view_learned_policy[4:22], view_correct_policy[4:22]).mean()
print(error_rate)

# Reward returned by average_episodes for 10000 greedy evaluation episodes.
rewards = average_episodes(envBlackjack, agent, 10000)
print(rewards)

0.06565656565656566
-549.0
