# Blackjack

### Random Action Player

## DQN Agent

In [54]:
import random
import gym
import numpy as np
from tqdm import tqdm
import tensorflow as tf
from collections import deque
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam, RMSprop
import keras.layers as layers
import matplotlib.pyplot as plt
from keras import backend as K

In [9]:
class QMemoryUnit():
    def __init__(self, state, action, reward, done, next_state):
        self.state = state
        self.action = action
        self.reward = reward
        self.done = done
        self.next_state = next_state

In [68]:
def mse_nan(y_true, y_pred):
    index = ~K.tf.is_nan(y_true)
    y_true = K.tf.boolean_mask(y_true, index)
    y_pred = K.tf.boolean_mask(y_pred, index)
    return K.mean((y_true - y_pred) ** 2)

In [None]:
memory_size = 10000
episodes = 100000
epsilon = 0.1
gamma = 0.2
minibatch_size = 32

tf.set_random_seed(0)
np.random.seed(0)
model = Sequential()
model.add(Dense(10, input_dim=3, activation='relu'))
model.add(Dense(10, activation='relu'))
model.add(Dense(2, activation='linear'))
model.compile(loss=mse_nan, optimizer=RMSprop())

memory = deque(maxlen=memory_size)
env = gym.make('Blackjack-v0')
test_env = gym.make('Blackjack-v0')

action_space = [0, 1]

steps = 0
reward_plot = []
loss_plot = []
rewards = []
for episode in tqdm(range(episodes)):
    done = False
    state = np.array(env.reset()).reshape(1,3)
    total_reward = 0
    while(not done):
        
        #random choice
        if(random.random() < epsilon):
            action = np.random.choice(action_space)
        
        #action with greatest expected value
        else:
            action = np.argmax(model.predict(state))
            
        next_state, reward, done, _ = env.step(action)
        total_reward += reward
        
        memory.append(QMemoryUnit(state, action, reward, done, next_state))
        state = np.array(next_state).reshape(1,3)
        
        minibatch = np.zeros((minibatch_size, 3))
        labels = np.zeros((minibatch_size, 2))
        labels.fill(np.nan)
        #generate minibatch and labels
        for i in range(minibatch_size):
            index = np.random.randint(0, len(memory))
            sample = memory[index]
            
            reward = sample.reward
            if(not sample.done):
                reward += gamma * np.max(model.predict(np.array(sample.next_state).reshape(1,3)))
            
            minibatch[i] = sample.state
            labels[i, sample.action] = reward
            
        history = model.fit(minibatch, labels, verbose=0)
        loss_plot.append(history.history['loss'])
        if(steps%50 == 0):
            total_reward = 0
            for i in range(1000):
                test_env.seed(i)
                state = test_env.reset()
                done = False
                while(not done):
                    action = np.argmax(model.predict(np.array(state).reshape(1,3)))
                    state, reward, done, _ = test_env.step(action)
                total_reward += reward
            reward_plot.append(total_reward)
            
            
        steps+=1
        
    rewards.append(total_reward)


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "

Exception in thread Thread-18:
Traceback (most recent call last):
  File "/anaconda/lib/python3.6/threading.py", line 916, in _bootstrap_inner
    self.run()
  File "/anaconda/lib/python3.6/site-packages/tqdm/_tqdm.py", line 144, in run
    for instance in self.tqdm_cls._instances:
  File "/anaconda/lib/python3.6/_weakrefset.py", line 60, in __iter__
    for itemref in self.data:
RuntimeError: Set changed size during iteration

 12%|█▏        | 11765/100000 [40:50<5:06:20,  4.80it/s]

In [None]:
total_reward = 0
for i in range(1000):
    env.seed(i)
    state = env.reset()
    done = False
    while(not done):
        action = np.argmax(model.predict(np.array(state).reshape(1,3)))
        state, reward, done, _ = env.step(action)
    total_reward += reward

In [None]:
total_reward

In [None]:
plt.plot(loss_plot)
plt.xlabel('Training Step')
plt.ylabel('MSE Loss')
plt.show()

In [None]:
plt.plot(reward_plot)
plt.xlabel('Training Step/50')
plt.ylabel('Total Reward over 1000 Games')
plt.show()