<a href="https://colab.research.google.com/github/jsleroux/ReinforcementLearningProjects/blob/master/Monte_Carlo_BlackJack.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# Environment setup (notebook shell escapes; the Colab badge above suggests a Colab runtime).
#remove " > /dev/null 2>&1" to see what is going on under the hood
# Python packages: gym and pyvirtualdisplay. All installer output is discarded.
!pip install gym pyvirtualdisplay > /dev/null 2>&1
# System packages: xvfb, python-opengl and ffmpeg (-y answers the apt prompt automatically).
!apt-get install -y xvfb python-opengl ffmpeg > /dev/null 2>&1

In [0]:
# Refresh the apt package index, then install cmake (output discarded).
!apt-get update > /dev/null 2>&1
!apt-get install cmake > /dev/null 2>&1
# Only stderr is redirected here, so the setuptools upgrade message is shown in the cell output.
!pip install --upgrade setuptools 2>&1
!pip install ez_setup > /dev/null 2>&1
# gym with the "atari" extra.
# NOTE(review): nothing in the visible cells uses an Atari environment - confirm this is still needed.
!pip install gym[atari] > /dev/null 2>&1
#!pip install torch==1.0.0 > /dev/null 2>&1

Requirement already up-to-date: setuptools in /usr/local/lib/python3.6/dist-packages (41.0.1)


In [0]:
import gym
from gym import logger as gymlogger
from gym.wrappers import Monitor

gymlogger.set_level(40) #error only
import tensorflow as tf
import numpy as np
import random
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
from collections import deque
%matplotlib inline

import math
import glob
import io
import base64

from IPython.display import HTML
from IPython import display as ipythondisplay

from mpl_toolkits import mplot3d
#from mpl_toolkits.mplot3d import Axes3D

In [0]:
class Counter(dict):
    """Sparse tally: a dict whose missing keys read as 0 and which never stores zeros.

    Reading an absent key returns 0 without inserting it. Assigning 0 to a key
    removes the key (if present) instead of storing the value.
    """

    def __missing__(self, key):
        """Return the implicit count for a key that is not stored."""
        return 0

    def __setitem__(self, key, value):
        """Store a non-zero value; treat an assignment of zero as a deletion."""
        if value == 0:
            # A zero is what a missing key reads as, so drop the entry (no-op if absent).
            self.pop(key, None)
            return
        super().__setitem__(key, value)

In [0]:
# Create the Blackjack environment used by every experiment below.
env = gym.make('Blackjack-v0')

In [0]:
# Display the observation space: a 3-tuple of discrete components with sizes 32, 11 and 2,
# which are the array dimensions used for the policy and value tables below.
env.observation_space

Tuple(Discrete(32), Discrete(11), Discrete(2))

### Monte Carlo Prediction

##### First-visit MC prediction, for estimating V

In [0]:
def generate_stick20up():
    """Build the deterministic policy that sticks on 20 or more and hits otherwise.

    Returns:
        Integer array of shape (32, 11, 2), indexed by
        [mycount, hiscount, ace]. Each entry is the action to take:
        1 (hit) for mycount < 20 and 0 (stick) for mycount >= 20.
    """
    policy = np.ones([32, 11, 2], dtype='int')  # hit everywhere by default
    policy[20:, :, :] = 0                       # stick from 20 upwards
    return policy

pi = generate_stick20up()
assert pi.shape == (32, 11, 2), "Policy has invalid shape"
assert pi[20:, :, :].sum() == 0, "Invalid policy. Actions must only be stick (0)"
# Rows 0..19 inclusive must all be "hit"; the slice end is exclusive, hence 0:20.
assert np.all(pi[0:20, :, :] == 1), 'Invalid policy. Actions must only be hit(1)'

In [0]:
%%time
env.reset()
runs = 500000
gamma = 1
counter = Counter()
V = np.zeros([32, 11, 2])

    
for run in range(runs):
    episode = [] # Resets episode
    done = False # Not initialized when nhands = 1
    reward = 0   # Not initialized when nhands = 1
    nhands = 0   # Number of blackjack hands
    action = -1  # No action for the initial hand
    
    # Generate an episode from a policy
    while True:  
        if nhands == 0:
            state = env.reset()
        else:
            action = pi[mycount, hiscount, ace]
            state, reward, done, _ = env.step(action)
        
        nhands+=1
        
        mycount, hiscount, ace = state
        ace = 0 if ace is False else 1 # Quick hack, observation is boolean, we need 0 or 1
        
        episode.append((state, action, reward))

        if done == True:
            break

    g = 0
    done_steps_of_episode = [] # first visit MC
        
    # We reverse the episode because MC predictions works the episode
    # starting from the end ...
    episode.reverse()
    for (state, action, reward) in episode:
        mycount, hiscount, ace = state
        ace = 0 if ace is False else 1
        
        g = (gamma * g) + reward
        # This condition is there for first visit MC
        if state not in done_steps_of_episode:
            counter[state]+=1                                
            V[mycount, hiscount, ace] = V[mycount, hiscount, ace] + 1/counter[state]*(g - V[mycount, hiscount, ace])

        done_steps_of_episode.append(state)

CPU times: user 23.8 s, sys: 182 µs, total: 23.8 s
Wall time: 23.8 s


### Create data for graph

In [0]:
# One row per (mycount, hiscount) pair; the axis sizes come from the first two dimensions of V.
n_mycount, n_hiscount = V.shape[0], V.shape[1]
iterables = [range(n_mycount), range(n_hiscount)]
index = pd.MultiIndex.from_product(
    iterables,
    names=['mycount', 'hiscount'],
)
# Empty frame (no columns yet) carrying only the two-level index.
df = pd.DataFrame(index=index)

In [0]:
# Copy the ace == 0 slice of V into a 'V' column in a single vectorised assignment.
# The value for each row is looked up from that row's own (mycount, hiscount) index
# labels, so the result does not depend on the order of the rows in df.
mycount_labels = df.index.get_level_values('mycount').values
hiscount_labels = df.index.get_level_values('hiscount').values
df['V'] = V[mycount_labels, hiscount_labels, 0]

In [0]:
# Turn the (mycount, hiscount) index levels into ordinary columns so they can be
# filtered and plotted by name.
# NOTE: re-running this cell on an already-reset frame adds an extra 'index' column.
df = df.reset_index()

### Plot Data

In [0]:
# 3D surface of the estimated state values (the 'V' column built from the ace == 0 slice).
ax = plt.axes(projection='3d')
ax.set_zlim([-1,1])
# Keep only player totals 12..21 and dealer cards above 0.
df = df[df['mycount'].between(12, 21) & (df['hiscount'] > 0)]
ax.plot_trisurf(df['hiscount'], df['mycount'], df['V'], cmap='viridis')
# Label every axis so the figure can be read on its own.
ax.set_xlabel('hiscount (dealer showing)')
ax.set_ylabel('mycount (player sum)')
ax.set_zlabel('V')
ax.set_title('Approximate state-value functions for blackjack policy that sticks at 20 and 21');

##### On-policy first-visit MC control (for e-soft policies)

#### todo
- small error: the 3rd index of the policy array is not the action but the usable-ace flag
- also, adapt the V array to contain action values and not only state values

In [0]:
def generate_arbitrary():
    """Build the initial stochastic policy: every action equally likely in every state.

    Returns:
        Float array of shape (32, 11, 2, 2), indexed by
        [mycount, hiscount, ace, action], with every entry equal to 0.5.
    """
    return np.full([32, 11, 2, 2], 0.5)  # mycount, hiscount, ace, action

pi = generate_arbitrary()
assert pi.shape == (32, 11, 2, 2), "Policy has invalid shape"
# Compare with a tolerance: probabilities are floats, so exact equality is fragile.
assert np.allclose(pi.sum(axis=3), 1), "Actions probabilities don't sum to 1"

In [0]:
%%time
env.reset()
runs = 500000
gamma = 1
epsilon = 0.6
counter = Counter()
V = np.zeros([32, 11, 2, 2]) # mycount, hiscount, ace, action

    
for run in range(runs):
    episode = [] # Resets episode
    done = False # Not initialized when nhands = 1
    reward = 0   # Not initialized when nhands = 1
    nhands = 0   # Number of blackjack hands
    action = -1  # No action for the initial hand
    
    # Generate an episode from a policy
    while True:  
        if nhands == 0:
            state = env.reset()
        else:
            # argmax with tiebreaking
            action = np.random.choice(np.flatnonzero(pi[mycount, hiscount, ace] == pi[mycount, hiscount, ace].max()))
            
            state, reward, done, _ = env.step(action)
        
        nhands+=1
        
        mycount, hiscount, ace = state
        ace = 0 if ace is False else 1 # Quick hack, observation is boolean, we need 0 or 1
        
        episode.append((state, action, reward))

        if done == True:
            break

    g = 0
    done_steps_of_episode = [] # first visit MC
        
    # We reverse the episode because MC predictions works the episode
    # starting from the end ...
    episode.reverse()
    for (state, action, reward) in episode:
        mycount, hiscount, ace = state
        ace = 0 if ace is False else 1
        
        g = (gamma * g) + reward
        # This condition is there for first visit MC
        if (state, action) not in done_steps_of_episode:
            counter[(state, action)] += 1             
            
            V[mycount, hiscount, ace, action] = V[mycount, hiscount, ace, action] + (1 / counter[(state, action)]) * (g - V[mycount, hiscount, ace, action])
            
            a_ = np.random.choice(np.flatnonzero(V[mycount, hiscount, ace] == V[mycount, hiscount, ace].max()))
            for a in range(env.action_space.n):
                if a == a_:
                    pi[mycount, hiscount, ace, a] = 1 - epsilon + (epsilon / env.action_space.n)
                else:
                    pi[mycount, hiscount, ace, a] = (epsilon / env.action_space.n)
                

        done_steps_of_episode.append((state, action))

CPU times: user 1min, sys: 385 ms, total: 1min
Wall time: 1min


In [0]:
# Sanity checks on the learned action values, indexed [mycount, hiscount, ace, action]:
# for ace index 1 and hiscount 6, the greedy action should be 0 at mycount 20
# and 1 at mycount 15.
# NOTE(review): the estimates come from random sampling with no fixed seed, so these
# checks are not guaranteed to pass on every run.
assert np.argmax(V[20, 6, 1, :]) == 0, 'Policy should be stick'
assert np.argmax(V[15, 6, 1, :]) == 1, 'Policy should be hit'