In [1]:
# coding:utf-8
# [0]ライブラリのインポート
from operator import ne
import gym  #倒立振子(cartpole)の実行環境
from gym import wrappers  #gymの画像保存
import numpy as np
import time
import csv
from statistics import mean
 
# Per-step log records; appended to by memorize() and written out by writer_csv().
memory = []
# NOTE(review): not referenced anywhere in the visible code.
cart_normal = None
 
# [1] Helpers that discretize observations for the tabular Q-function ------------
# Build the bin edges used to turn a continuous observation into a discrete level
def bins(clip_min, clip_max, num):
    """Return the interior edges that split [clip_min, clip_max] into ``num`` equal bins.

    The two outer edges (``clip_min`` and ``clip_max``) are dropped, so the
    result holds ``num - 1`` values.
    """
    edges = np.linspace(clip_min, clip_max, num + 1)
    interior = edges[1:-1]
    return interior
 
# Convert every observed value to a discrete level and fold them into one index
def digitize_state(observation):
    """Map a 4-value observation to a single integer state index.

    Each component is assigned one of ``num_dizitized`` levels with
    ``np.digitize`` and the four levels are combined as the digits of a
    base-``num_dizitized`` number, the cart position being the least
    significant digit.
    """
    cart_pos, cart_v, pole_angle, pole_v = observation
    # (value, lower bound, upper bound) for each observed quantity
    ranges = (
        (cart_pos, -2.4, 2.4),
        (cart_v, -3.0, 3.0),
        (pole_angle, -0.5, 0.5),
        (pole_v, -2.0, 2.0),
    )
    index = 0
    for power, (value, low, high) in enumerate(ranges):
        level = np.digitize(value, bins=bins(low, high, num_dizitized))
        index += level * (num_dizitized**power)
    return index
 
 
# [2] Function that selects the action a(t) -------------------------------------
def get_action(next_state, episode):
    """Choose an action for ``next_state`` with an epsilon-greedy policy.

    With probability ``0.5 * (1 / (episode + 1e-8))`` a random action from
    ``[0, 1]`` is returned; otherwise the action with the largest value in the
    ``next_state`` row of the module-level ``q_table`` is returned. The
    exploration probability therefore shrinks as ``episode`` grows.
    """
    explore_prob = 0.5 * (1 / (episode + 1e-8))
    if np.random.random() < explore_prob:
        # Explore: pick one of the two actions uniformly at random
        return np.random.choice([0, 1])
    # Exploit: greedy action according to the current Q-table
    return np.argmax(q_table[next_state])
 
 
# [3] Function that updates the Q-table -------------------------------------
def update_Qtable(q_table, state, action, reward, next_state):
    """Apply one Q-learning update to ``q_table`` in place and return it.

    Update rule:
        Q(s, a) <- (1 - alpha) * Q(s, a)
                   + alpha * (reward + gamma * max_a' Q(s', a'))

    Args:
        q_table: 2-D array indexed as ``q_table[state, action]``.
        state: Index of the state in which the action was taken.
        action: Index of the action that was taken.
        reward: Reward received for the transition.
        next_state: Index of the state reached after the action.

    Returns:
        The same ``q_table`` object, modified in place.
    """
    gamma = 0.9  # discount factor
    alpha = 0.1  # learning rate
    # Take the maximum over every action column rather than only columns 0 and
    # 1, so the update stays correct for tables with more than two actions.
    next_max_q = np.max(q_table[next_state])
    q_table[state, action] = (
        (1 - alpha) * q_table[state, action]
        + alpha * (reward + gamma * next_max_q)
    )
    return q_table
 
# [4] Main section: parameter setup --------------------------------------------------------
env = gym.make('CartPole-v0')
max_number_of_steps = 200  # number of steps in one episode
num_consecutive_iterations = 100  # number of recent episodes averaged to judge whether learning is complete
num_episodes = 800  # total number of episodes
goal_average_reward = 195  # learning counts as finished once this reward is reached (no control toward the center)
# Discretize the state into 6 levels per variable (4 variables) and build the Q-function (table)
num_dizitized = 6  # number of levels per variable
q_table = np.zeros((num_dizitized**4, env.action_space.n))
 
total_reward_vec = np.zeros(num_consecutive_iterations)  # stores the reward of each recent episode
final_x = np.zeros((num_episodes, 1))  # after learning, stores the x position at t=200 of each episode
islearned = 0  # flag: learning has finished
isrender = 0  # flag: rendering
 
def memorize(ep, a_cnt, action, state, reward, q, cart_pos, cart_v, pole_angle, pole_v):
    """Append one step's log record to the module-level ``memory`` list.

    The record is a dict keyed by the column names that ``writer_csv`` writes.
    """
    record = dict(
        episode=ep,
        a_cnt=a_cnt,
        action=action,
        state=state,
        reward=reward,
        q=q,
        cart_pos=cart_pos,
        cart_v=cart_v,
        pole_angle=pole_angle,
        pole_v=pole_v,
    )
    memory.append(record)

def writer_csv(n):
    """Write every record in the module-level ``memory`` list to ``sampleA-<n>.csv``.

    The file is overwritten if it exists; a header row is written first.
    """
    columns = ['episode', 'a_cnt', 'action', 'state', 'reward', 'q',
               'cart_pos', 'cart_v', 'pole_angle', 'pole_v']
    out_path = "sampleA-{}.csv".format(n)
    with open(out_path, "w", newline="") as out_file:
        table = csv.DictWriter(out_file, fieldnames=columns)
        table.writeheader()
        table.writerows(memory)

log_step = []  # mean number of actions per episode, one entry per 50-episode window
_log_step = []  # number of actions of each episode in the current window
# [5] Main routine --------------------------------------------------
for episode in range(1, num_episodes+1):  # repeat for the number of episodes
    # Reset the environment
    observation = env.reset()
    state = digitize_state(observation)
    action = np.argmax(q_table[state])
    episode_reward = 0
    a_cnt = 0
    for t in range(max_number_of_steps):  # loop over the steps of one episode
        if islearned == 1:  # once learning has finished, render the cart-pole
            env.render()
            time.sleep(0.1)
            print (observation[0])  # print the cart's x position

        # Execute action a_t to obtain s_{t+1}, r_t, etc.
        observation, reward, done, info = env.step(action)
        a_cnt += 1  # count the actions taken
        cart_pos, cart_v, pole_angle, pole_v = observation

        # Decide the reward to give
        if done:
            if t < 195:
                reward = -200  # penalty for falling over
            else:
                reward = 1  # no penalty when the episode ends still upright
        else:
            reward = 1  # reward for every step the pole stays up

        episode_reward += reward  # accumulate the reward

        # Compute the discrete state s_{t+1} and update the Q-function
        next_state = digitize_state(observation)  # discretize the observation at t+1
        if (episode % 50 == 0) or (episode < 100):
            memorize(episode,t,action,state,reward,q_table[state][action],cart_pos, cart_v, pole_angle, pole_v)
        q_table = update_Qtable(q_table, state, action, reward, next_state)

        # Choose the next action a_{t+1}
        # NOTE(review): the step index t (not the episode number) is passed as
        # get_action's `episode` argument, so epsilon decays within each
        # episode rather than across episodes -- confirm this is intended.
        action = get_action(next_state, t)    # a_{t+1}

        state = next_state

        # End-of-episode handling
        if done:
            # The mean printed here does not yet include this episode's reward
            print('%d Episode finished after %f time steps / mean %f' %
                  (episode, t + 1, total_reward_vec.mean()))
            total_reward_vec = np.hstack((total_reward_vec[1:],
                                          episode_reward))  # record the reward
            _log_step.append(a_cnt)
            if episode % 50 == 0:
                log_step.append(mean(_log_step))
                _log_step = []
            if islearned == 1:  # if learning has finished, store the final x position
                # episode is 1-based while final_x has num_episodes rows, so
                # shift by one to stay in bounds on the last episode.
                final_x[episode - 1, 0] = observation[0]
            break

    if (total_reward_vec.mean() >=
            goal_average_reward):  # success if the recent episodes reach the target reward
        #print('Episode %d train agent successfuly!' % episode)
        #islearned = 1
        #np.savetxt('learned_Q_table.csv',q_table, delimiter=",") # to save the Q-table
        if isrender == 0:
            #env = wrappers.Monitor(env, './movie/cartpole-experiment-1') # to save a video
            isrender = 1
    # To see how the agent behaves after only 10 episodes, uncomment the following
    #if episode>10:
    #    if isrender == 0:
    #        env = wrappers.Monitor(env, './movie/cartpole-experiment-1') # to save a video
    #        isrender = 1
    #    islearned=1;
print(log_step)
writer_csv(49)
if islearned:
    np.savetxt('final_x.csv', final_x, delimiter=",")

1 Episode finished after 7.000000 time steps / mean 0.000000
2 Episode finished after 7.000000 time steps / mean -1.940000
3 Episode finished after 8.000000 time steps / mean -3.880000
4 Episode finished after 8.000000 time steps / mean -5.810000
5 Episode finished after 7.000000 time steps / mean -7.740000
6 Episode finished after 7.000000 time steps / mean -9.680000
7 Episode finished after 11.000000 time steps / mean -11.620000
8 Episode finished after 13.000000 time steps / mean -13.520000
9 Episode finished after 8.000000 time steps / mean -15.400000
10 Episode finished after 10.000000 time steps / mean -17.330000
11 Episode finished after 10.000000 time steps / mean -19.240000
12 Episode finished after 17.000000 time steps / mean -21.150000
13 Episode finished after 14.000000 time steps / mean -22.990000
14 Episode finished after 9.000000 time steps / mean -24.860000
15 Episode finished after 27.000000 time steps / mean -26.780000
16 Episode finished after 123.000000 time steps /

131 Episode finished after 194.000000 time steps / mean -39.160000
132 Episode finished after 153.000000 time steps / mean -37.460000
133 Episode finished after 180.000000 time steps / mean -36.060000
134 Episode finished after 92.000000 time steps / mean -34.370000
135 Episode finished after 158.000000 time steps / mean -33.910000
136 Episode finished after 130.000000 time steps / mean -32.460000
137 Episode finished after 148.000000 time steps / mean -31.460000
138 Episode finished after 156.000000 time steps / mean -30.300000
139 Episode finished after 200.000000 time steps / mean -28.820000
140 Episode finished after 119.000000 time steps / mean -25.050000
141 Episode finished after 158.000000 time steps / mean -24.110000
142 Episode finished after 200.000000 time steps / mean -22.980000
143 Episode finished after 200.000000 time steps / mean -19.190000
144 Episode finished after 200.000000 time steps / mean -15.480000
145 Episode finished after 145.000000 time steps / mean -12.630

256 Episode finished after 7.000000 time steps / mean 154.020000
257 Episode finished after 200.000000 time steps / mean 152.250000
258 Episode finished after 200.000000 time steps / mean 152.250000
259 Episode finished after 200.000000 time steps / mean 152.250000
260 Episode finished after 200.000000 time steps / mean 152.250000
261 Episode finished after 132.000000 time steps / mean 152.250000
262 Episode finished after 200.000000 time steps / mean 149.560000
263 Episode finished after 200.000000 time steps / mean 149.560000
264 Episode finished after 188.000000 time steps / mean 149.560000
265 Episode finished after 200.000000 time steps / mean 147.430000
266 Episode finished after 200.000000 time steps / mean 149.670000
267 Episode finished after 200.000000 time steps / mean 149.670000
268 Episode finished after 200.000000 time steps / mean 149.670000
269 Episode finished after 200.000000 time steps / mean 149.670000
270 Episode finished after 154.000000 time steps / mean 149.6700

386 Episode finished after 192.000000 time steps / mean 133.830000
387 Episode finished after 200.000000 time steps / mean 131.740000
388 Episode finished after 200.000000 time steps / mean 131.740000
389 Episode finished after 200.000000 time steps / mean 131.740000
390 Episode finished after 200.000000 time steps / mean 131.740000
391 Episode finished after 200.000000 time steps / mean 131.740000
392 Episode finished after 200.000000 time steps / mean 134.020000
393 Episode finished after 200.000000 time steps / mean 136.120000
394 Episode finished after 200.000000 time steps / mean 136.120000
395 Episode finished after 200.000000 time steps / mean 136.120000
396 Episode finished after 139.000000 time steps / mean 136.120000
397 Episode finished after 200.000000 time steps / mean 137.400000
398 Episode finished after 200.000000 time steps / mean 137.400000
399 Episode finished after 200.000000 time steps / mean 137.430000
400 Episode finished after 200.000000 time steps / mean 137.43

513 Episode finished after 200.000000 time steps / mean 144.710000
514 Episode finished after 200.000000 time steps / mean 146.960000
515 Episode finished after 200.000000 time steps / mean 146.960000
516 Episode finished after 113.000000 time steps / mean 150.880000
517 Episode finished after 200.000000 time steps / mean 148.000000
518 Episode finished after 200.000000 time steps / mean 150.850000
519 Episode finished after 200.000000 time steps / mean 150.870000
520 Episode finished after 200.000000 time steps / mean 150.870000
521 Episode finished after 200.000000 time steps / mean 150.870000
522 Episode finished after 200.000000 time steps / mean 150.870000
523 Episode finished after 200.000000 time steps / mean 153.050000
524 Episode finished after 200.000000 time steps / mean 153.050000
525 Episode finished after 200.000000 time steps / mean 153.050000
526 Episode finished after 188.000000 time steps / mean 153.050000
527 Episode finished after 131.000000 time steps / mean 150.92

639 Episode finished after 200.000000 time steps / mean 134.200000
640 Episode finished after 200.000000 time steps / mean 134.200000
641 Episode finished after 200.000000 time steps / mean 136.960000
642 Episode finished after 200.000000 time steps / mean 139.350000
643 Episode finished after 199.000000 time steps / mean 142.120000
644 Episode finished after 200.000000 time steps / mean 142.110000
645 Episode finished after 200.000000 time steps / mean 142.110000
646 Episode finished after 200.000000 time steps / mean 144.230000
647 Episode finished after 200.000000 time steps / mean 144.230000
648 Episode finished after 200.000000 time steps / mean 144.230000
649 Episode finished after 200.000000 time steps / mean 146.530000
650 Episode finished after 200.000000 time steps / mean 146.530000
651 Episode finished after 200.000000 time steps / mean 146.530000
652 Episode finished after 132.000000 time steps / mean 146.530000
653 Episode finished after 112.000000 time steps / mean 146.07

762 Episode finished after 191.000000 time steps / mean 139.230000
763 Episode finished after 9.000000 time steps / mean 137.130000
764 Episode finished after 200.000000 time steps / mean 135.600000
765 Episode finished after 200.000000 time steps / mean 137.690000
766 Episode finished after 200.000000 time steps / mean 140.760000
767 Episode finished after 200.000000 time steps / mean 143.590000
768 Episode finished after 200.000000 time steps / mean 146.670000
769 Episode finished after 200.000000 time steps / mean 150.270000
770 Episode finished after 200.000000 time steps / mean 152.840000
771 Episode finished after 200.000000 time steps / mean 156.060000
772 Episode finished after 200.000000 time steps / mean 158.720000
773 Episode finished after 200.000000 time steps / mean 158.720000
774 Episode finished after 200.000000 time steps / mean 158.720000
775 Episode finished after 200.000000 time steps / mean 158.720000
776 Episode finished after 200.000000 time steps / mean 162.0300