In [2]:
# Create the Blackjack environment and start a first episode.
import gym
env = gym.make('Blackjack-v0')
# reset() begins a new episode and returns the initial observation,
# a 3-tuple here (e.g. (17, 10, False)).
# NOTE(review): per the Gym Blackjack docs this is presumably
# (player's current sum, dealer's showing card, usable ace) -- confirm for the installed version.
env.reset()

(17, 10, False)

In [3]:
# The Blackjack action space is discrete and takes the value 0 (Stand) or 1 (Hit).
print(env.action_space)

Discrete(2)


In [4]:
import gym
env = gym.make('Blackjack-v0')
state = env.reset()
print(state)
action = 1  # 1 = Hit

# env.step(action) returns (next_state, reward, done, _):
#   next_state : the state reached after taking the action
#   reward     : the reward for this step
#   done       : whether the hand (episode) has ended -- not whether the agent won
#   _          : an extra dict ({} in the output of this cell)

# done = True means the hand has been decided.
# The reward is 1.0 if the agent wins, -1 if it loses and 0.0 for a draw;
# a step that does not end the hand also returns 0.0 together with done = False.
print(env.step(action))

(13, 5, True)
((16, 5, True), 0.0, False, {})


In [5]:
import gym
env = gym.make('Blackjack-v0')
state = env.reset()

# Roll out one episode under a uniformly random policy, recording a
# (state, action, reward) triple for every step that is taken.
episode = []

# Upper bound on the number of steps taken in this rollout.
num_timesteps = 20

for i in range(num_timesteps):
    random_action = env.action_space.sample()
    new_state, reward, done, _ = env.step(random_action)
    # Record the action that was actually sampled for `state`. The previous
    # code stored `action`, a leftover variable from an earlier cell, so the
    # recorded action was always 1 regardless of what was played.
    episode.append((state, random_action, reward))

    if done:
        break

    state = new_state

print(episode)

[((15, 8, False), 1, 0.0), ((18, 8, False), 1, -1.0)]


In [6]:
# Program for finding the optimal policy that maximizes the agent's (my) win rate in the Blackjack environment.

import gym
import pandas as pd
import random
from collections import defaultdict

env = gym.make('Blackjack-v0')

# All three tables are keyed by (state, action) pairs in the update loop further down.
Q = defaultdict(float)             # current Q-value estimate; unseen pairs read as 0.0
total_return = defaultdict(float)  # running sum of the returns recorded for each pair
N = defaultdict(int)               # number of returns recorded for each pair

In [9]:
# Epsilon-greedy policy and episode generation.

def epsilon_greedy(state, Q, epsilon=0.2):
    """Pick an action for `state` with an epsilon-greedy rule.

    A number is drawn uniformly from [0, 1]; if it is smaller than `epsilon`
    a random action is sampled from the environment's action space, otherwise
    the action with the largest Q[(state, action)] is chosen greedily.

    Args:
        state: the current environment state (used as part of the key into Q).
        Q: mapping from (state, action) to the current Q-value estimate.
        epsilon: exploration probability; defaults to 0.2, the value that was
            previously hard-coded.

    Returns:
        The selected action, an integer in range(env.action_space.n).
        On ties the lowest-numbered action wins, because max() returns the
        first maximal item it encounters.
    """
    if random.uniform(0, 1) < epsilon:
        return env.action_space.sample()
    return max(range(env.action_space.n), key=lambda action: Q[(state, action)])
    
# Upper bound on the number of steps in a generated episode.
# (Previously misspelled as `nun_timesteps`, so generate_episode() silently
# used whatever `num_timesteps` an earlier cell had left in the kernel.)
num_timesteps = 50

def generate_episode(Q):
    """Play one episode with the epsilon-greedy policy derived from Q.

    Args:
        Q: mapping from (state, action) to the current Q-value estimate;
            it is passed through to epsilon_greedy().

    Returns:
        A list of (state, action, reward) tuples, one per step, in the order
        the steps were taken. The episode stops when env.step() reports
        done, or after num_timesteps steps at the latest.
    """
    episode = []
    state = env.reset()

    for t in range(num_timesteps):
        action = epsilon_greedy(state, Q)

        # env.step() is unpacked as a 4-tuple here.
        # NOTE(review): newer Gym / Gymnasium releases return five values
        # (observation, reward, terminated, truncated, info) -- adjust this
        # unpacking if the library is upgraded.
        next_state, reward, done, _ = env.step(action)
        episode.append((state, action, reward))

        if done:
            break

        state = next_state
    return episode

In [10]:
# Use epsilon_greedy() and generate_episode() to generate 500,000 episodes and estimate the optimal Q function.
# Only the first occurrence of each (state, action) pair within an episode contributes a return.

num_episodes = 500000

for i in range(num_episodes):
    episode = generate_episode(Q)
    rewards = [reward for (_, _, reward) in episode]
    visited = set()  # (state, action) pairs already handled in this episode

    for t, (state, action, _) in enumerate(episode):
        pair = (state, action)
        if pair in visited:
            continue
        visited.add(pair)

        # Return from step t onward: the sum of all remaining rewards.
        G = sum(rewards[t:])
        total_return[pair] += G
        N[pair] += 1
        # Q is the average of the returns recorded so far for this pair.
        Q[pair] = total_return[pair] / N[pair]

In [11]:
# Convert the dictionary Q into a DataFrame and display its first 20 Q-values.

# Key of Q : (state, action)
# Value    : the estimated Q-value for that pair

# ((16, 4, False), 0)
# -0.234553
# In the first row the state is (16, 4, False), the action is 0, and the corresponding Q-value is -0.2346.
# For state (16, 4, False) the Q-value of action 0 (Stand) is higher than that of action 1 (Hit, -0.5057),
# so the estimate favours Stand in this state.

df = pd.DataFrame(Q.items(), columns=['state_action_pair', 'Q_value'])
df.head(20)

Unnamed: 0,state_action_pair,Q_value
0,"((16, 4, False), 0)",-0.234553
1,"((16, 4, False), 1)",-0.505682
2,"((19, 3, False), 1)",-0.806034
3,"((21, 6, False), 0)",0.917577
4,"((21, 6, False), 1)",-1.0
5,"((14, 6, False), 1)",-0.340125
6,"((15, 9, False), 0)",-0.552743
7,"((15, 9, False), 1)",-0.532852
8,"((11, 2, False), 0)",-0.3
9,"((11, 2, False), 1)",0.126159


In [12]:
# Show the last five rows of the Q table.
df.tail()

Unnamed: 0,state_action_pair,Q_value
555,"((17, 4, True), 0)",0.017778
556,"((17, 4, True), 1)",-0.044444
557,"((17, 3, True), 0)",-0.159763
558,"((4, 4, False), 0)",-0.207729
559,"((4, 4, False), 1)",-0.375


In [13]:
# Show the Q-value column as a NumPy array.
# NOTE(review): this displays the whole column; a slice or df['Q_value'].describe()
# would keep the cell output short.
df['Q_value'].values

array([-0.23455272, -0.50568182, -0.80603448,  0.91757696, -1.        ,
       -0.34012539, -0.55274262, -0.53285226, -0.3       ,  0.1261586 ,
        0.42998305, -0.86322049, -0.66726084,  0.66138082, -0.88111888,
       -0.64384851, -0.69462366, -0.2748538 ,  0.16970803,  0.92142483,
        0.14754098, -0.14426808, -0.28365019,  0.29276808, -0.57188755,
       -0.53552052, -0.19578497, -0.64720812, -0.2826087 , -0.00928074,
       -0.43340381, -0.52287582, -0.33189655, -0.22222222,  0.06234414,
       -0.49041096, -0.25966608, -0.59083095, -0.47233691, -0.55868545,
        0.10879512, -0.52      , -0.28726147, -0.02303984, -0.33333333,
        0.01188119,  0.92435302, -1.        ,  0.16373057, -0.62539683,
        0.03639241, -0.50439883, -0.57332251, -0.58290006, -0.50354654,
       -0.52941176, -0.25114155,  0.93207941,  0.12149533, -0.37735849,
       -0.03157122, -0.17024165, -0.27021277, -0.88214905, -0.7827476 ,
       -0.49662921, -0.32615464, -0.26947637, -0.48450704, -0.12

In [14]:
import numpy as np

# Summary of all estimated Q-values, printed in this order: mean, minimum, maximum.
q_values = df['Q_value'].values
for summarize in (np.mean, np.min, np.max):
    print(summarize(q_values))

-0.20319428325128291
-1.0
0.9320794148380356


In [18]:
def _print_q_rows(state):
    """Print the df row(s) for `state` under action 0 and then action 1, one print per action."""
    for action in (0, 1):
        matching_rows = df[df['state_action_pair'] == (state, action)]
        print([matching_rows])

# For state (19, 8, True) the Q-value of action = 0 (stand) is larger than that of action = 1 (hit),
# so action = 0 is the optimal policy, i.e. the agent is more likely to win.
_print_q_rows((19, 8, True))
print('-' * 50)

# For state (13, 8, False) both Q-values are negative, so the agent is more likely to lose,
# but action = 1 (hit) is shown to be the better choice.
_print_q_rows((13, 8, False))

[      state_action_pair   Q_value
462  ((19, 8, True), 0)  0.557427]
[      state_action_pair   Q_value
371  ((19, 8, True), 1) -0.126437]
--------------------------------------------------
[       state_action_pair   Q_value
225  ((13, 8, False), 0) -0.539749]
[       state_action_pair   Q_value
240  ((13, 8, False), 1) -0.367865]
