In [41]:
#coding: utf-8
import numpy as np

def random_action():
    """Return one of the two available actions, 0 or 1, drawn at random."""
    candidates = [0, 1]
    return np.random.choice(candidates)

def get_action(next_state, episode):
    """Choose an action for ``next_state`` with an epsilon-greedy policy.

    The exploration rate is ``0.5 / (episode + 1)``, so it shrinks as the
    episode counter grows and the policy becomes increasingly greedy.
    Reads the module-level ``q_table``.
    """
    epsilon = 0.5 * (1 / (episode + 1))

    # One uniform draw in [0, 1) decides between exploring and exploiting.
    if np.random.uniform(0, 1) < epsilon:
        return random_action()

    # Greedy choice; ties between equally valued actions are broken at random.
    q_row = q_table[next_state]
    best_actions = np.where(q_row == q_row.max())[0]
    return np.random.choice(best_actions)

def step(state, action):
    """Advance the toy two-state environment by one move.

    Returns a ``(next_state, reward)`` pair. The only rewarded transition is
    taking a non-zero action while in a non-zero state, which stays in
    state 1 and yields reward 1.
    """
    chose_zero = action == 0

    if state == 0:
        # From state 0: action 0 moves to state 1, anything else stays put.
        return (1, 0) if chose_zero else (0, 0)

    # From the other state: action 0 falls back to state 0, anything else
    # stays in state 1 and collects the reward.
    return (0, 0) if chose_zero else (1, 1)

def update_Qtable(q_table, state, action, reward, next_state, gamma=0.9, alpha=0.5):
    """Apply one Q-learning update to ``q_table[state, action]``.

    The entry is moved toward ``reward + gamma * max(q_table[next_state])``
    by a fraction ``alpha``.

    Parameters
    ----------
    q_table : numpy.ndarray
        Table indexed as ``[state, action]``; modified in place.
    state, action : int
        Index of the entry being updated.
    reward : float
        Reward observed for the transition.
    next_state : int
        Row whose maximum value is used for bootstrapping.
    gamma : float, optional
        Discount factor (default 0.9).
    alpha : float, optional
        Learning rate (default 0.5).

    Returns
    -------
    numpy.ndarray
        The same ``q_table`` object that was passed in.
    """
    next_maxQ = q_table[next_state].max()
    td_target = reward + gamma * next_maxQ
    q_table[state, action] = (1 - alpha) * q_table[state, action] + alpha * td_target

    return q_table

# Training driver for the toy two-state problem: runs num_episodes episodes
# of max_number_of_steps steps each, updating the global q_table in place.
max_number_of_steps = 10    # number of steps per episode
num_episodes = 20    # total number of episodes
q_table = np.zeros((2, 2))  # indexed as [state, action]; read by get_action

for episode in range(num_episodes):
    # Every episode starts from state 0 with a fresh reward counter.
    state = 0
    episode_reward = 0
    
    for t in range(max_number_of_steps):
        action = get_action(state, episode)    # epsilon-greedy action for the current state
        next_state, reward = step(state, action)
        print(state, action, reward)
        episode_reward += reward
        q_table = update_Qtable(q_table, state, action, reward, next_state)
        state = next_state
    
    # Episodes are reported 1-based.
    print("episode: {} total reward {}".format(episode+1, episode_reward))
    print()
        


0 1 0
0 0 0
1 0 0
0 0 0
1 1 1
1 1 1
1 0 0
0 0 0
1 1 1
1 1 1
episode: 1 total reward 4

0 0 0
1 1 1
1 1 1
1 0 0
0 0 0
1 1 1
1 1 1
1 1 1
1 1 1
1 1 1
episode: 2 total reward 7

0 1 0
0 0 0
1 1 1
1 1 1
1 1 1
1 1 1
1 1 1
1 1 1
1 1 1
1 1 1
episode: 3 total reward 8

0 0 0
1 1 1
1 1 1
1 1 1
1 1 1
1 1 1
1 1 1
1 1 1
1 0 0
0 0 0
episode: 4 total reward 7

0 0 0
1 1 1
1 1 1
1 1 1
1 0 0
0 0 0
1 1 1
1 1 1
1 1 1
1 1 1
episode: 5 total reward 7

0 0 0
1 1 1
1 1 1
1 1 1
1 1 1
1 1 1
1 1 1
1 1 1
1 1 1
1 1 1
episode: 6 total reward 9

0 0 0
1 1 1
1 1 1
1 1 1
1 1 1
1 1 1
1 1 1
1 1 1
1 1 1
1 1 1
episode: 7 total reward 9

0 0 0
1 1 1
1 1 1
1 1 1
1 1 1
1 1 1
1 1 1
1 1 1
1 1 1
1 1 1
episode: 8 total reward 9

0 0 0
1 1 1
1 1 1
1 1 1
1 1 1
1 1 1
1 1 1
1 1 1
1 1 1
1 1 1
episode: 9 total reward 9

0 0 0
1 1 1
1 1 1
1 1 1
1 1 1
1 1 1
1 1 1
1 1 1
1 1 1
1 1 1
episode: 10 total reward 9

0 0 0
1 1 1
1 1 1
1 1 1
1 1 1
1 1 1
1 1 1
1 1 1
1 1 1
1 1 1
episode: 11 total reward 9

0 0 0
1 1 1
1 1 1
1 1 1
1 1 1
1 1 1
1 1 1

## OpenAI Gymによる倒立振子

In [52]:
import numpy as np
import gym
import time

def digitize_state(observation, num_bins=None):
    """Map a continuous CartPole observation to a single discrete state index.

    Each of the four observation components is bucketed into ``num_bins``
    bins and the four bin numbers are combined as digits of a base-``num_bins``
    number, so every combination gets its own index in
    ``range(num_bins ** 4)``.

    Parameters
    ----------
    observation : sequence of 4 floats
        Cart position, cart velocity, pole angle, pole angular velocity
        (the order used by Gym's CartPole environment).
    num_bins : int, optional
        Number of bins per component. Defaults to the module-level
        ``num_digitized``.

    Returns
    -------
    int
        Discrete state index.
    """
    d = num_digitized if num_bins is None else num_bins

    # CartPole order is position, velocity, angle, angular velocity; unpacking
    # in any other order would bin the velocity with the angle range and
    # vice versa.
    p, v, a, w = observation

    # Dropping the two outer edges lets out-of-range values fall into the
    # first/last bin, so each component yields a bin number in 0..d-1.
    pn = np.digitize(p, np.linspace(-2.4, 2.4, d+1)[1:-1])
    vn = np.digitize(v, np.linspace(-3.0, 3.0, d+1)[1:-1])
    an = np.digitize(a, np.linspace(-0.5, 0.5, d+1)[1:-1])
    wn = np.digitize(w, np.linspace(-2.0, 2.0, d+1)[1:-1])

    # Each component needs its own power of d; reusing d for two of them
    # would make distinct states share a Q-table row.
    return pn + vn*d + an*d**2 + wn*d**3

def get_action(next_state, episode):
    """Pick the action for ``next_state`` using an epsilon-greedy rule.

    Exploration probability is ``0.5 / (episode + 1)``. Reads the
    module-level ``q_table`` and calls ``random_action`` when exploring.
    """
    epsilon = 0.5 * (1 / (episode + 1))

    # A single uniform sample selects the exploration branch.
    if np.random.uniform(0, 1) < epsilon:
        return random_action()

    # Exploit: choose among the highest-valued actions, random tie-break.
    q_row = q_table[next_state]
    greedy_actions = np.where(q_row == q_row.max())[0]
    return np.random.choice(greedy_actions)

def update_Qtable(q_table, state, action, reward, next_state, gamma=0.9, alpha=0.5):
    """Apply one Q-learning update to ``q_table[state, action]``.

    The entry is blended with the target
    ``reward + gamma * max(q_table[next_state])`` using learning rate
    ``alpha``.

    Parameters
    ----------
    q_table : numpy.ndarray
        Table indexed as ``[state, action]``; modified in place.
    state, action : int
        Index of the entry being updated.
    reward : float
        Reward observed for the transition.
    next_state : int
        Row whose maximum value is used for bootstrapping.
    gamma : float, optional
        Discount factor (default 0.9).
    alpha : float, optional
        Learning rate (default 0.5).

    Returns
    -------
    numpy.ndarray
        The same ``q_table`` object that was passed in.
    """
    next_maxQ = q_table[next_state].max()
    td_target = reward + gamma * next_maxQ
    q_table[state, action] = (1 - alpha) * q_table[state, action] + alpha * td_target

    return q_table

# Tabular Q-learning on CartPole: discretize each observation, act
# epsilon-greedily, and update the global q_table after every step.
env = gym.make("CartPole-v0")
max_number_of_steps = 200    # number of steps per episode
num_episodes = 1000
num_digitized = 6    # number of bins per observation component (read by digitize_state)
q_table = np.random.uniform(low=-1, high=1, size=(num_digitized**4, env.action_space.n))
# To resume from a previously saved table instead:
#q_table = np.loadtxt("Qtable.txt")

try:
    for episode in range(num_episodes):
        # Reset the environment and take the greedy action for the first state.
        observation = env.reset()
        state = digitize_state(observation)
        action = np.argmax(q_table[state])
        episode_reward = 0

        for t in range(max_number_of_steps):
            # Rendering is slow, so only every 10th episode is shown.
            if episode % 10 == 0:
                env.render()
            observation, reward, done, info = env.step(action)
            if done and t < max_number_of_steps - 1:
                reward -= max_number_of_steps    # penalty for ending before the step limit
            episode_reward += reward
            next_state = digitize_state(observation)    # discretize the observation at t+1
            q_table = update_Qtable(q_table, state, action, reward, next_state)
            action = get_action(next_state, episode)    # a_{t+1}
            state = next_state
            if done:
                break
        print("episode:{}\nreward:{}".format(episode, episode_reward))
finally:
    # Release the environment (and its render window) even when training is
    # interrupted, e.g. by KeyboardInterrupt.
    env.close()
np.savetxt("Qtable.txt", q_table)

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
episode:0
reward:-165.0
episode:1
reward:-184.0
episode:2
reward:-171.0
episode:3
reward:-180.0
episode:4
reward:-182.0
episode:5
reward:-176.0
episode:6
reward:-147.0
episode:7
reward:-133.0
episode:8
reward:-173.0
episode:9
reward:-126.0
episode:10
reward:-179.0
episode:11
reward:-123.0
episode:12
reward:-168.0
episode:13
reward:-93.0
episode:14
reward:-183.0
episode:15
reward:-129.0
episode:16
reward:-123.0
episode:17
reward:-55.0
episode:18
reward:-162.0
episode:19
reward:-119.0
episode:20
reward:-173.0
episode:21
reward:-137.0
episode:22
reward:-91.0
episode:23
reward:-107.0
episode:24
reward:-96.0
episode:25
reward:-120.0
episode:26
reward:-135.0
episode:27
reward:-143.0
episode:28
reward:-99.0
episode:29
reward:-165.0
episode:30
reward:-133.0
episode:31
reward:-114.0
episode:32
reward:-149.0
episode:33
reward:-154.0
episode:34
reward:-141.0
episode:35
reward:-132.0
episode

KeyboardInterrupt: 