In [1]:
import matplotlib
import gym
import numpy as np
import pandas as pd
import random
import tensorflow as tf
from tensorflow.keras import layers
from collections import deque, defaultdict, namedtuple

from matplotlib import pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

In [2]:
env = gym.make('Blackjack-v0')

PLAYER_STATE = 0
DEALER_STATE = 1
USABLE_ACE = 2

PLAYER_STATE_COUNT = env.observation_space.spaces[PLAYER_STATE].n
DEALER_STATE_COUNT = env.observation_space.spaces[DEALER_STATE].n
USABLE_ACE_COUNT = env.observation_space.spaces[USABLE_ACE].n

ENVIRONMENT_SPACE = PLAYER_STATE_COUNT * DEALER_STATE_COUNT * USABLE_ACE_COUNT
ACTION_SPACE = env.action_space.n

def get_state_index(state):
    index = state[0,PLAYER_STATE] * (DEALER_STATE_COUNT * USABLE_ACE_COUNT)
    
    index += state[0,DEALER_STATE] * USABLE_ACE_COUNT
    
    if (state[0,USABLE_ACE] == True):
        index += 1
        
    return index

correct_policy = np.ones([PLAYER_STATE_COUNT, DEALER_STATE_COUNT, USABLE_ACE_COUNT]).astype(int)
correct_policy[12, 4:7, 0] = 0
correct_policy[13, 2:7, 0] = 0
correct_policy[14, 2:7, 0] = 0
correct_policy[15, 2:7, 0] = 0
correct_policy[16, 2:7, 0] = 0
correct_policy[17:22, :, 0] = 0
correct_policy[18, 2:9, 1] = 0
correct_policy[19:22, :, 1] = 0
correct_policy = correct_policy.flatten()

In [3]:
 # Deep Q-learning Agent
class DQNAgent:
    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=2000)
        self.gamma = 1.0    # discount rate
        self.epsilon = 1.0  # exploration rate
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.995
        self.learning_rate = 0.001
        self.model = self._build_model()
        
    def _build_model(self):
        # Neural Net for Deep-Q learning Model
        model = tf.keras.Sequential()
        model.add(layers.Dense(128, input_dim=self.state_size, activation='relu'))
        model.add(layers.Dense(128, activation='relu'))
        model.add(layers.Dense(self.action_size, activation='linear'))
        model.compile(loss='mse',
                      optimizer=tf.optimizers.Adam(lr=self.learning_rate))
        return model
    
    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))
        
    def act(self, state, train=True):
        if (train is True):
            if np.random.rand() <= self.epsilon:
                return random.randrange(self.action_size)
        act_values = self.model.predict(state)
        return np.argmax(act_values[0])  # returns action
    
    def replay(self, batch_size):
        if (len(self.memory) < batch_size):
            return
        minibatch = random.sample(self.memory, batch_size)
        for state, action, reward, next_state, done in minibatch:
            target = reward
            if not done:
                target = reward + self.gamma * \
                       np.amax(self.model.predict(next_state)[0])
            target_f = self.model.predict(state)
            target_f[0][action] = target
            self.model.fit(state, target_f, epochs=1, verbose=0)
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

In [4]:
state_size = len(env.observation_space.spaces)
action_size = env.action_space.n

agent = DQNAgent(state_size, action_size)

In [5]:
episodes = 5000
# Iterate the game
for episode in range(episodes):
    # reset state in the beginning of each game
    done = False
    state = env.reset()
    state = np.reshape(state, [1, state_size])
    # time_t represents each frame of the game
    # Our goal is to keep the pole upright as long as possible until score of 500
    # the more time_t the more score
    while not done:
        action = agent.act(state)
        #action = correct_policy[get_state_index(state)]
        next_state, reward, done, _ = env.step(action)
        next_state = np.reshape(next_state, [1, state_size])
        agent.remember(state, action, reward, next_state, done)
        state = next_state
        
    if (episode % 500 == 0):
        print("episode: {}/{}".format(episode, episodes))
            
    # train the agent with the experience of the episode
    agent.replay(64)

episode: 0/5000
episode: 500/5000
episode: 1000/5000
episode: 1500/5000
episode: 2000/5000
episode: 2500/5000
episode: 3000/5000
episode: 3500/5000
episode: 4000/5000
episode: 4500/5000


In [6]:
state = np.zeros((1,3))
view_learned_policy = np.zeros([PLAYER_STATE_COUNT, DEALER_STATE_COUNT, USABLE_ACE_COUNT]).astype(int)
for playerIx in range(PLAYER_STATE_COUNT):
    for dealerIx in range(DEALER_STATE_COUNT):
        for usableAceIx in range(USABLE_ACE_COUNT):
            state[0,PLAYER_STATE] = playerIx
            state[0,DEALER_STATE] = dealerIx
            state[0,USABLE_ACE] = usableAceIx
            view_learned_policy[playerIx, dealerIx, usableAceIx] = agent.act(state, train=False)

view_correct_policy = correct_policy.reshape(PLAYER_STATE_COUNT, DEALER_STATE_COUNT, USABLE_ACE_COUNT)
error_rate = np.mean((np.bitwise_xor(view_learned_policy[4:22, :, :], view_correct_policy[4:22, :, :])))
print(error_rate)

0.13131313131313133


In [7]:
for dealerIx in range(1, DEALER_STATE_COUNT):
    print("Dealer Shows: " + str(dealerIx))
    print(np.bitwise_xor(view_learned_policy[4:22, dealerIx, 0], view_correct_policy[4:22, dealerIx, 0]))

Dealer Shows: 1
[0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0]
Dealer Shows: 2
[0 0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0]
Dealer Shows: 3
[0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0]
Dealer Shows: 4
[0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0]
Dealer Shows: 5
[0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0]
Dealer Shows: 6
[0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0]
Dealer Shows: 7
[0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0]
Dealer Shows: 8
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
Dealer Shows: 9
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
Dealer Shows: 10
[0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0]


In [8]:
rewards = 0

for episode in range(10000):

    done = False
    state = env.reset()

    while not done:
        action = agent.act(np.reshape(state, [1, state_size]), train=False)
        #print(state, action)
        next_state, reward, done, _ = env.step(action)
        state = next_state
    
        rewards += reward

    #print(reward)
    #print("========")
print(rewards)

-760.0
