<a href="https://colab.research.google.com/github/lionadis/Easy21/blob/master/Easy21.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import numpy as np
from numpy.random import choice, rand
from enum import Enum
from copy import copy
from collections import defaultdict
from tqdm import tqdm

In [0]:
class State:
  def __init__(self, player=0, dealer=0, is_terminated=False):
    self.player = player
    self.dealer = dealer
    self.is_terminated = is_terminated
  
  def tuple(self):
    return (self.player, self.dealer)

In [0]:
class Action(Enum):
  HIT = 0
  STICK = 1
  
  @staticmethod
  def get_actions():
    return [Action.HIT, Action.STICK]

In [0]:
class Player:

  def action(self, s):
    raise Exception("Needs to be implemented")

In [0]:
class RandomPlayer(Player):
  def action(self, s):
    return choice(Action.get_actions())

In [0]:
class Dealer:
  def action(self, s):
    return Action.HIT if s.dealer < 17 else Action.STICK

In [0]:
class Easy21Env:

  def __init__(self, dealer):
    self.dealer = dealer
  
  def step(self, s, a):
    if s.is_terminated:
      print("Can't take any actions, Game is over")
      return s, 0
    next_s = copy(s)
    if a == Action.HIT:
      next_s.player += self._deal()
      next_s.is_terminated = self._check_bust(next_s.player)
    else:
      while(not next_s.is_terminated and self.dealer.action(next_s) == Action.HIT):
        next_s.dealer += self._deal()
        next_s.is_terminated = self._check_bust(next_s.dealer)
      next_s.is_terminated = True
    return next_s, self._compute_reward(next_s)
  
  def _compute_reward(self, s):
    if not s.is_terminated or s.player == s.dealer:
      return 0
    if s.player > s.dealer:
      return -1 if self._check_bust(s.player) else 1
    else:
      return 1 if self._check_bust(s.dealer) else -1
    
  
  def _check_bust(self, value):
    return value <= 1 or value >= 22
  
  def _deal(self, color=None):
    value = choice(range(1, 11))
    if color:
      coef = 1 if color=='b' else -1
    else:
      coef = choice([1, -1], p=[2.0/3, 1.0/3])
    return value * coef
   
  def reset(self):
      return State(env._deal('b'), env._deal('b'))


In [38]:
env = Easy21Env(Dealer())
player = RandomPlayer()
s = env.reset()
print(s.__dict__)
while not s.is_terminated:
  action = player.action(s)
  s, reward = env.step(s, action)
  print(action)
  print(s.__dict__)
  print(reward)

{'player': 6, 'dealer': 10, 'is_terminated': False}
Action.HIT
{'player': 11, 'dealer': 10, 'is_terminated': False}
0
Action.STICK
{'player': 11, 'dealer': 25, 'is_terminated': True}
1


#Monte-Carlo Control

Repeat forever

1.   Sample the k-th episode using $\pi_t$, $\{S_1,A_1,R_2,.....,S_T\} \sim \pi_t $
2.   $\forall(S_t,A_t)$:
  *   $N(S_t,A_t) \leftarrow N(S_t,A_t) + 1$
  *   $Q(S_t, A_t) \leftarrow Q(S_t, A_t) + \frac{1}{N(S_t,A_t)}(G_t - Q(S_t,A_t))$
3.   $\pi_{t+1} \leftarrow \epsilon{\text -}greedy(Q)$







In [0]:
class MCAgent(Player):
  def __init__(self, N0=100,gamma=1):
    self.Q = defaultdict(int) #creates a dict with a dictionnary with a default value of 0
    self.N = defaultdict(int) # number of time the pair (s, a) is visited
    self.N0 = N0
    self.gamma = gamma # discount factor
    
  def action(self, s):
    '''
      compute the number of times we visisted state s
      N(S) = sum N(s,a) for all a in actions
    '''
    N_total = 0
    actions = Action.get_actions()
    for action in actions:
      N_total += self.N[(s.tuple(), action)]
    '''
      choose a random action with eps probability
    '''
    eps = 1.0 * self.N0 / (self.N0 `+ N_total)
    if rand() <= eps:
      return choice(actions)
    '''
      else pick the best action, argmax_a Q[(s, a)]
    '''
    best_action, best_value = None, -1e9
    for action in actions:
      if best_value < self.Q[(s, action)]:
        best_action, best_value = action, self.Q[(s, action)]
    return best_action
  
  def learn(self, h):
    '''
      h: the history of an episode
    '''
    h = h[::-1]
    g_t = 0
    for (s, a, r) in h:
      g_t += r
      self.N[(s.tuple(), a)] += 1
      self.Q[(s.tuple(), a)] += 1.0 / self.N[(s.tuple(),a)] * (g_t - self.Q[(s.tuple(), a)])
  
  def value_function(self):
    v = defaultdict(int)
    for (s, a), q in self.Q.items():
      v[s] = max(v[s], q)
    return v

In [114]:
def monte_carlo_control(num_iterations=100):
  agent = MCAgent()
  for _ in tqdm(range(num_iterations)):
    env = Easy21Env(Dealer())
    s = env.reset()
    h = [] # history
    while not s.is_terminated:
      a = agent.action(s)
      next_s, r = env.step(s, a)
      h.append((s, a, r))
      s = next_s
    agent.learn(h)
  return agent.value_function()
v = monte_carlo_control()

100%|██████████| 100/100 [00:00<00:00, 3641.14it/s]


In [105]:
def plot_value_function(v):
  

AttributeError: ignored

In [84]:
v

defaultdict(int, {})

In [0]:
d  = defaultdict(int)

In [14]:
d[(1,2)]

0

In [15]:
dict(d)

{(1, 2): 0}

In [0]:
s1 = State()

In [0]:
s == s1

False

In [0]:
d[(s,1)]=1

In [0]:
d[(s1,1)]

KeyError: ignored

In [79]:
np.meshgrid(range(1,3),range(1,3))

[array([[1, 2],
        [1, 2]]), array([[1, 1],
        [2, 2]])]