In [2]:
import  tensorflow as tf
from tensorflow import keras
import numpy as np

In [3]:
def play_one_game(env, obs, model, loss_fn):
    """Play a single environment step and return the gradients for that step.

    The model is evaluated on the current observation and its output
    (``ask_proba``, labelled "buy order" by the original author) is used as
    the probability of choosing action 0. Action 1 is chosen with the
    complementary probability. The loss is computed as if the sampled action
    were the correct one, so the returned gradients can later be scaled by how
    good the action turned out to be.

    Args:
        env: Environment object whose ``step(action)`` returns a 4-tuple
            ``(obs, reward, done, info)``.
        obs: Current observation. It is indexed with ``np.newaxis`` to add a
            leading batch axis before being passed to the model.
        model: Callable model exposing ``trainable_variables``. Its output is
            compared against a ``(1, 1)`` random tensor and read back with
            ``action[0, 0]``, so it is expected to produce a ``(1, 1)``
            probability -- TODO confirm against the model definition.
        loss_fn: Callable taking ``(y_target, prediction)``.

    Returns:
        Tuple ``(obs, reward, done, grads)``: the first three values come
        straight from ``env.step`` and ``grads`` is the list of gradients of
        the loss with respect to ``model.trainable_variables``.
    """
    with tf.GradientTape() as tape:
        ask_proba = model(obs[np.newaxis])
        # tf.random.uniform is a function and must be called with the shape;
        # it draws from [0, 1), so `action` is True with probability
        # 1 - ask_proba and False with probability ask_proba.
        action = (tf.random.uniform([1, 1]) > ask_proba)
        # Target probability for the output: 1 when action 0 was sampled,
        # 0 when action 1 was sampled.
        y_target = tf.constant([[1.]]) - tf.cast(action, tf.float32)
        loss = tf.reduce_mean(loss_fn(y_target, ask_proba))
    grads = tape.gradient(loss, model.trainable_variables)
    # Convert the sampled boolean tensor to a plain int (0 or 1) for the env.
    obs, reward, done, info = env.step(int(action[0, 0].numpy()))
    return obs, reward, done, grads

In [4]:
def play_multiple_episodes(env, n_episodes, n_max_steps, model, loss_fn):
    """Run several episodes and collect per-step rewards and gradients.

    Args:
        env: Environment exposing ``reset()`` (returning the first
            observation) and the ``step`` method used by ``play_one_game``.
        n_episodes: Number of episodes to play.
        n_max_steps: Upper bound on the number of steps in each episode.
        model: Policy model forwarded to ``play_one_game``.
        loss_fn: Loss function forwarded to ``play_one_game``.

    Returns:
        Tuple ``(all_rewards, all_grads)``. Each is a list with one entry per
        episode; each entry is a list with one element per step played
        (the step's reward, and the step's list of gradients, respectively).
    """
    all_rewards = []
    all_grads = []
    for episode in range(n_episodes):
        current_rewards = []
        current_grads = []
        obs = env.reset()
        for step in range(n_max_steps):
            # The single-step helper defined in this notebook is named
            # play_one_game.
            obs, reward, done, grads = play_one_game(env, obs, model, loss_fn)
            current_rewards.append(reward)
            current_grads.append(grads)
            if done:
                # Episode ended before reaching n_max_steps.
                break
        all_rewards.append(current_rewards)
        all_grads.append(current_grads)
    return all_rewards, all_grads




In [None]:
def discount_rewards(rewards, discount_rate):
    """Compute the discounted cumulative reward for every step of an episode.

    Entry ``i`` of the result is
    ``rewards[i] + discount_rate * rewards[i+1] + discount_rate**2 * rewards[i+2] + ...``.

    Args:
        rewards: Sequence of per-step rewards for one episode.
        discount_rate: Multiplicative discount applied per step into the
            future.

    Returns:
        A new float NumPy array with the same length as ``rewards``; the
        input is not modified.
    """
    # Force a float array: with integer rewards np.array() would create an
    # integer array and each discounted sum would be silently truncated when
    # written back into it.
    discounted = np.array(rewards, dtype=float)
    # Walk backwards so each step can reuse the already-discounted next step.
    for step in range(len(rewards) - 2, -1, -1):
        discounted[step] += discounted[step + 1] * discount_rate
    return discounted

def discount_and_normalize_rewards(all_rewards, discount_rate):
    """Discount the rewards of every episode and normalize them jointly.

    Each episode is passed through ``discount_rewards``; the mean and
    standard deviation are then computed over all steps of all episodes
    pooled together and used to standardize every episode's values.

    Args:
        all_rewards: List of episodes, each a sequence of per-step rewards.
        discount_rate: Discount factor forwarded to ``discount_rewards``.

    Returns:
        List of NumPy arrays, one per episode, holding the normalized
        discounted rewards. An empty ``all_rewards`` yields an empty list.
        When every discounted reward is identical (zero standard deviation)
        the values are only mean-centered.
    """
    all_discounted_rewards = [discount_rewards(rewards, discount_rate)
                              for rewards in all_rewards]
    if not all_discounted_rewards:
        # np.concatenate raises on an empty list; nothing to normalize.
        return []
    flat_rewards = np.concatenate(all_discounted_rewards)
    reward_mean = flat_rewards.mean()
    reward_std = flat_rewards.std()
    if reward_std == 0:
        # Dividing by zero would produce NaN/inf values that would then be
        # multiplied into the gradients; skip the scaling in that case.
        return [discounted_rewards - reward_mean
                for discounted_rewards in all_discounted_rewards]
    return [(discounted_rewards - reward_mean) / reward_std
            for discounted_rewards in all_discounted_rewards]

In [5]:
# Worked example: with rewards 10, 0, -50 and a discount of 0.8 the discounted
# sums are -50 for the last step, 0 + 0.8 * -50 = -40 for the middle step and
# 10 + 0.8 * -40 = -22 for the first step.
discount_rewards([10, 0, -50], discount_rate=0.8)

array([-22, -40, -50])

In [6]:
# Worked example with two episodes of different lengths; the mean and standard
# deviation used for normalization are pooled over the steps of both episodes.
discount_and_normalize_rewards([[10, 0, -50], [10, 20]], discount_rate=0.8)

[array([-0.28435071, -0.86597718, -1.18910299]),
 array([1.26665318, 1.0727777 ])]

In [7]:
# Hyperparameters consumed by the training loop cell.
n_iterations = 150  # number of training iterations (one optimizer update each)
n_episodes_per_update = 10  # episodes played per iteration before updating
n_max_steps = 200  # cap on the number of steps in a single episode
discount_rate = 0.95  # discount passed to discount_and_normalize_rewards

In [8]:
# Optimizer and loss used by the policy-gradient training loop.
# `learning_rate` is the supported keyword; the old `lr` alias is deprecated
# and is rejected by current Keras releases.
optimizer = keras.optimizers.Adam(learning_rate=0.01)
# Passed as `loss_fn` to the episode-playing helpers.
loss_fn = keras.losses.binary_crossentropy

In [10]:
# Policy-gradient training loop: play a batch of episodes, turn the rewards
# into normalized discounted rewards, and apply the reward-weighted mean
# gradient once per iteration.
# NOTE(review): `env` and `model` are not defined in the cells shown here, and
# the recorded run of this cell failed with a NameError on `env`. Both must be
# created in an earlier cell before this loop can run.
for iteration in range(n_iterations):
    all_rewards, all_grads = play_multiple_episodes(
        env, n_episodes_per_update, n_max_steps, model, loss_fn)
    # Progress report: the leading "\r" together with end="" rewrites the same
    # output line on every iteration.
    total_rewards = sum(map(sum, all_rewards))                     # Not shown in the book
    print("\rIteration: {}, mean rewards: {:.1f}".format(          # Not shown
        iteration, total_rewards / n_episodes_per_update), end="") # Not shown
    all_final_rewards = discount_and_normalize_rewards(all_rewards,
                                                       discount_rate)
    all_mean_grads = []
    # For each trainable variable, scale every step's gradient by that step's
    # normalized discounted reward and average over all steps of all episodes.
    for var_index in range(len(model.trainable_variables)):
        mean_grads = tf.reduce_mean(
            [final_reward * all_grads[episode_index][step][var_index]
             for episode_index, final_rewards in enumerate(all_final_rewards)
                 for step, final_reward in enumerate(final_rewards)], axis=0)
        all_mean_grads.append(mean_grads)
    optimizer.apply_gradients(zip(all_mean_grads, model.trainable_variables))

NameError: name 'env' is not defined

In [None]:
## FX policy idea: sell when the currency price goes up, buy when it goes down.
def basic_policy(obs):
    """Hard-coded baseline policy driven by the third observation component.

    Args:
        obs: Indexable observation; only ``obs[2]`` is read.

    Returns:
        0 when ``obs[2]`` is negative, otherwise 1.
    """
    # Negative value -> action 0; zero or positive -> action 1.
    if obs[2] < 0:
        return 0
    return 1

# Evaluate the hard-coded basic_policy: play many episodes and record the
# total reward obtained in each one.
# NOTE(review): `env` is not defined in the cells shown here; it must be
# created in an earlier cell.
totals = []

obs = 0  # observation
reward = 0  # reward
done = False  # episode-finished flag


for episode in range(500):  # run 500 episodes
    episode_rewards = 0
    
    obs = env.reset()
    for step in range(200): # at most 200 steps per episode
        action = basic_policy(obs)
        obs, reward, done, info = env.step(action)
        episode_rewards += reward
        if done:
            break
    totals.append(episode_rewards)

In [12]:
import random
#******** FX (foreign exchange) episode ******************
money = 500000  # presumably the starting capital -- TODO confirm unit/currency
not_act = 0  # do nothing
ask_act = 1  # buy order
bid_act = 2  # sell order
act = [not_act, ask_act, bid_act]  # the three available actions
random_act = random.choice(act)  # pick one of the actions at random



print(random_act)

0
