# Blackjack

### Random Action Player

## DQN Agent

In [11]:
import random
import gym
import numpy as np
from tqdm import tqdm
import tensorflow as tf
from collections import deque
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam, RMSprop
import keras.layers as layers

In [9]:
class QMemoryUnit():
    def __init__(self, state, action, reward, done, next_state):
        self.state = state
        self.action = action
        self.reward = reward
        self.done = done
        self.next_state = next_state

In [None]:
memory_size = 10000
episodes = 100000
epsilon = 0.1
gamma = 0.2
minibatch_size = 32

tf.set_random_seed(0)
np.random.seed(0)
model = Sequential()
model.add(Dense(10, input_dim=4, activation='relu'))
model.add(Dense(10, activation='relu'))
model.add(Dense(1, activation='linear'))
model.compile(loss='mean_squared_error', optimizer=RMSprop())

memory = deque(maxlen=memory_size)
env = gym.make('Blackjack-v0')

#valid actions
action_space = np.array([0,1])

#state + actions
action_state = np.zeros((2,4))

steps = 0
reward_plot = []
loss_plot = []
rewards = []
for episode in tqdm(range(episodes)):
    done = False
    state = np.array(env.reset())
    total_reward = 0
    while(not done):
        action_state[:,:3] = state
        action_state[:,3] = action_space
        
        #random choice
        if(random.random() < epsilon):
            action = np.random.choice(action_space)
        
        #action with greatest expected value
        else:
            action = np.argmax(model.predict(action_state))
            
        next_state, reward, done, _ = env.step(action)
        total_reward += reward
        
        memory.append(QMemoryUnit(state, action, reward, done, next_state))
        state = next_state
        
        minibatch = np.zeros((minibatch_size, 4))
        labels = np.zeros(minibatch_size)
        #generate minibatch and labels
        for i in range(minibatch_size):
            index = np.random.randint(0, len(memory))
            sample = memory[index]
            
            reward = sample.reward
            if(not sample.done):
                action_state[:,:3] = sample.next_state
                action_state[:,3] = action_space
                reward += gamma * np.max(model.predict(action_state))
            
            minibatch[i,:3] = sample.state
            minibatch[i,3] = sample.action
            labels[i] = reward
            
        history = model.fit(minibatch, labels, verbose=0)
        loss_plot.append(history.history['loss'])
        if(steps%50 == 0):
            total_reward = 0
            for i in range(1000):
                env.seed(i)
                state = env.reset()
                done = False
                while(not done):
                    action_state[:,:3] = state
                    action = np.argmax(model.predict(action_state))
                    state, reward, done, _ = env.step(action)
                total_reward += reward
            reward_plot.append(total_reward)
            
            
        steps+=1
        
    rewards.append(total_reward)

  0%|          | 174/100000 [00:11<1:50:30, 15.06it/s]

In [19]:
total_reward = 0
for i in range(1000):
    env.seed(i)
    state = env.reset()
    done = False
    while(not done):
        action_state[:,:3] = state
        action = np.argmax(model.predict(action_state))
        state, reward, done, _ = env.step(action)
    total_reward += reward

In [20]:
total_reward

-16.0