In [2]:
import gym
from gym import spaces
import numpy as np

# カスタム環境の定義（上で提供したものを使用）

class _MyEnv(gym.Env):
    def __init__(self):
        super(MyEnv, self).__init__()
        
        # アクションの空間を定義します。例えば、-1から1の連続値を持つアクションを考えた場合：
        self.action_space = spaces.Box(low=-1, high=1, shape=(1,), dtype=float)
        
        # 観測の空間を定義します。例えば、0から1の値を5つ持つ観測を考えた場合：
        self.observation_space = spaces.Box(low=0, high=1, shape=(5,), dtype=float)
        
    def reset(self):
        # 環境を初期状態にリセットし、初期の観測を返します。
        initial_observation = ...  # 初期の観測を設定します
        return initial_observation

    def step(self, action):
        # アクションを受け取り、次の観測、報酬、終了フラグ、その他の情報を返します。
        observation = ...  # 次の観測を取得します
        reward = ...  # 報酬を計算します
        done = ...  # エピソードが終了したかどうかを判断します
        info = {}  # 追加の情報を含めることができます
        return observation, reward, done, info

    def render(self, mode='human'):
        # 環境の現在の状態を可視化します。
        pass

    def close(self):
        # 環境を閉じる際の後処理を行います。
        pass

class MyEnv(gym.Env):
    def __init__(self):
        super(MyEnv, self).__init__()
        
        # アクションの空間を定義します。例えば、-1から1の連続値を持つアクションを考えた場合：
        self.action_space = spaces.Box(low=-1, high=1, shape=(1,), dtype=float)
        
        # 観測の空間を定義します。例えば、0から1の値を5つ持つ観測を考えた場合：
        self.observation_space = spaces.Box(low=0, high=1, shape=(5,), dtype=float)

    def reset(self):
        # 初期の観測をランダムに設定して返します。
        initial_observation = np.random.rand(5)
        return initial_observation

    def step(self, action):
        # 次の観測をランダムに取得します。
        observation = np.random.rand(5)
        
        # 報酬をランダムに計算します。
        reward = np.random.randn()  # 例として標準正規分布からのランダムな値を報酬とします。
        
        # エピソードが終了したかどうかをランダムに判断します。
        done = np.random.choice([True, False])
        
        info = {}
        return observation, reward, done, info



# エージェントの定義
class RandomAgent:
    def __init__(self, action_space):
        self.action_space = action_space

    def act(self):
        return self.action_space.sample()

# 環境とエージェントの初期化
env = MyEnv()
agent = RandomAgent(env.action_space)

# エピソード数を設定
num_episodes = 10

# エピソードごとのループ
for i in range(num_episodes):
    observation = env.reset()
    done = False
    episode_reward = 0

    while not done:
        action = agent.act()
        next_observation, reward, done, _ = env.step(action)
        episode_reward += reward

    print(f"Episode {i + 1}: Total Reward = {episode_reward}")

env.close()

Episode 1: Total Reward = 0.03079634545408738
Episode 2: Total Reward = -0.14457185647563012
Episode 3: Total Reward = -1.9599700517692185
Episode 4: Total Reward = -3.129478549211365
Episode 5: Total Reward = 0.2681274184778025
Episode 6: Total Reward = -2.454748704243692
Episode 7: Total Reward = 2.17620275295787
Episode 8: Total Reward = -1.1505743286129937
Episode 9: Total Reward = 1.2381142737842674
Episode 10: Total Reward = 0.12865619136223905


In [4]:
import sqlite3
import gym
from gym import spaces
import pandas as pd

class TimeSeriesEnv(gym.Env):
    def __init__(self, db_path):
        super(TimeSeriesEnv, self).__init__()
        
        # Load the time series data from the database
        conn = sqlite3.connect(db_path)
        self.data = pd.read_sql('SELECT * FROM ratest_rate', conn)
        conn.close()
        
        # Define the action and observation spaces
        # self.action_space = spaces.Discrete(2)  # 0: Price will go down, 1: Price will go up 
        # ↓ -1 ~ 1の範囲に変更
        self.action_space = spaces.Box(low=-1, high=1, shape=(1,), dtype=float)

        self.observation_space = spaces.Box(low=0, high=float('inf'), shape=(len(self.data.columns),), dtype=float)
        
        # Additional variables
        self.current_step = 0
        self.done = False

    def reset(self):
        self.current_step = 0
        self.done = False
        return self.data.iloc[self.current_step].values

    def step(self, action):
        self.current_step += 1
        if self.current_step >= len(self.data) - 1:
            self.done = True
        
        # Just an example reward calculation (you might want to change this)
        reward = 0
        if action == 1 and self.data.iloc[self.current_step]['last'] > self.data.iloc[self.current_step - 1]['last']:
            reward = 1
        elif action == 0 and self.data.iloc[self.current_step]['last'] < self.data.iloc[self.current_step - 1]['last']:
            reward = 1
        
        return self.data.iloc[self.current_step].values, reward, self.done, {}

    def render(self, mode='human'):
        pass

    def close(self):
        pass

# Initialize the environment
env = TimeSeriesEnv(r'C:\Users\yamaguchi\MyDocument\pytest\virtual_currency\gmo\original_gym\myenv\2023-09-20_ratest_rate.db')

# Test the environment
initial_state = env.reset()
print("Initial State:", initial_state)

action = env.action_space.sample()
next_state, reward, done, _ = env.step(action)
print("Next State after taking action:", action, "is:", next_state)
print("Reward:", reward)


Initial State: [1 '2023-09-20 00:20:11.931000' 'ticker' 4021627 4020401 4053439 4020401
 3949961 'BTC' 146.6095]
Next State after taking action: 0 is: [2 '2023-09-20 00:20:51.559000' 'ticker' 4026629 4023899 4053439 4023900
 3949961 'BTC' 146.6461]
Reward: 0


In [7]:
class RandomAgent:
    def __init__(self, action_space):
        self.action_space = action_space

    def act(self):
        return self.action_space.sample()

# Initialize the environment and agent
env = TimeSeriesEnv(r'C:\Users\yamaguchi\MyDocument\pytest\virtual_currency\gmo\original_gym\myenv\2023-09-20_ratest_rate.db')
agent = RandomAgent(env.action_space)

# Number of episodes
num_episodes = 10

# Simulate the interaction
for i in range(num_episodes):
    state = env.reset()
    done = False
    episode_reward = 0

    while not done:
        action = agent.act()
        next_state, reward, done, _ = env.step(action)
        episode_reward += reward

    print(f"Episode {i + 1}: Total Reward = {episode_reward}")

env.close()

Episode 1: Total Reward = 657
Episode 2: Total Reward = 709
Episode 3: Total Reward = 668
Episode 4: Total Reward = 673
Episode 5: Total Reward = 729
Episode 6: Total Reward = 719
Episode 7: Total Reward = 665
Episode 8: Total Reward = 682
Episode 9: Total Reward = 664
Episode 10: Total Reward = 685


In [21]:
conn = sqlite3.connect(r'C:\Users\yamaguchi\MyDocument\pytest\virtual_currency\gmo\original_gym\myenv\2023-09-20_ratest_rate.db')
data = pd.read_sql('SELECT * FROM ratest_rate', conn)

print(data.iloc[1])
data.iloc[1]['last']

id                                    2
timestamp    2023-09-20 00:20:51.559000
channel                          ticker
ask                             4026629
bid                             4023899
high                            4053439
last                            4023900
low                             3949961
symbol                              BTC
column                         146.6461
Name: 1, dtype: object


4023900

# ひな型

## Env(環境)

In [27]:
import sqlite3
import gym
from gym import spaces
import pandas as pd
from collections import namedtuple

# 新しいタプルのサブクラスを定義
State = namedtuple('State', ['timestamp', 'ask', 'bid', 'high', 'low', 'last', 'volume', 'jpy', 'btc'])
# Step = collections.namedtuple('Step', ['state', 'action', 'reward'])

class TimeSeriesEnv(gym.Env):
    def __init__(self, db_path, initial_jpy=1e5):
        super(TimeSeriesEnv, self).__init__()
        
        # Load the time series data from the database
        conn = sqlite3.connect(db_path)
        self.data = pd.read_sql('SELECT * FROM ratest_rate', conn)
        conn.close()

        self.data = self.data.drop(['id', 'channel', 'symbol'], axis = 1)
        
        # Initial holdings
        self.initial_jpy = initial_jpy
        self.initial_btc = initial_jpy/self.data.iloc[0]['last']
        self.jpy = self.initial_jpy
        self.btc = self.initial_btc
        
        # Define the action and observation spaces
        self.action_space = spaces.Box(low=-1, high=1, shape=(1,), dtype=float)
        self.observation_space = spaces.Box(low=0, high=float('inf'), shape=(len(self.data.columns) + 2,), dtype=float)
        
        # Additional variables
        self.current_step = 0
        self.done = False

    def reset(self):
        self.jpy = self.initial_jpy
        self.btc = self.initial_btc
        self.current_step = 0
        self.done = False
        return self._next_observation()

    def _next_observation(self):
        obs = self.data.iloc[self.current_step].values.tolist()
        obs.extend([self.jpy, self.btc])
        self.state = State(timestamp = obs[0], ask = obs[1], bid = obs[2], high = obs[3], low = obs[5], last = obs[4], volume = obs[6], jpy = obs[7], btc = obs[8])
        return self.state

    def step(self, action):
        # prev_value = self.jpy + self.btc * self.data.iloc[self.current_step]['last']
        prev_value = self.jpy + self.btc * self.state.last
        

        # tradeはbtcの単位とする
        # Update holdings based on the action
        trade_amount = action[0]  # Assume the action is a numpy array
        if trade_amount > 0:  # Buying BTC
            self.jpy -= trade_amount * self.data.iloc[self.current_step]['last']
            self.btc += trade_amount
        else:  # Selling BTC
            self.jpy += abs(trade_amount) * self.data.iloc[self.current_step]['last']
            self.btc -= abs(trade_amount)

        
        
        self.current_step += 1
        if self.current_step >= len(self.data) - 1:
            self.done = True
        
        # Compute the reward
        current_value = self.jpy + self.btc * self.data.iloc[self.current_step]['last']
        reward = current_value - prev_value
        
        return self._next_observation(), reward, self.done, {}

    def render(self, mode='human'):
        pass

    def close(self):
        pass


# TimeSeriesEnvの動作確認

# ダミーデータベースパスを使用して環境を初期化
dummy_db_path = r"C:\Users\yamaguchi\MyDocument\pytest\virtual_currency\gmo\original_gym\myenv\2023-09-20_ratest_rate.db"
time_series_env = TimeSeriesEnv(dummy_db_path)

# 初期状態の取得
initial_observation_ts = time_series_env.reset()
print(initial_observation_ts)

State(timestamp='2023-09-20 00:20:11.931000', ask=4021627, bid=4020401, high=4053439, low=3949961, last=4020401, volume=146.6095, jpy=100000.0, btc=0.02487314076381933)


In [28]:
# TimeSeriesEnvの動作確認

# ダミーデータベースパスを使用して環境を初期化
dummy_db_path = r"C:\Users\yamaguchi\MyDocument\pytest\virtual_currency\gmo\original_gym\myenv\2023-09-20_ratest_rate.db"
time_series_env = TimeSeriesEnv(dummy_db_path)

# 初期状態の取得
initial_observation_ts = time_series_env.reset()

# ランダムアクションの適用
action_ts = time_series_env.action_space.sample()
next_observation_ts, reward_ts, done_ts, info_ts = time_series_env.step(action_ts)

initial_observation_ts, action_ts, next_observation_ts, reward_ts, done_ts, info_ts


(State(timestamp='2023-09-20 00:20:11.931000', ask=4021627, bid=4020401, high=4053439, low=3949961, last=4020401, volume=146.6095, jpy=100000.0, btc=0.02487314076381933),
 array([0.79272294]),
 State(timestamp='2023-09-20 00:20:51.559000', ask=4026629, bid=4023899, high=4053439, low=3949961, last=4023900, volume=146.6461, jpy=-3087064.100196822, btc=0.8175960806389267),
 2860.768686155323,
 False,
 {})

In [45]:
# namedtupleの使い方

from collections import namedtuple

# 新しいタプルのサブクラスを定義
Person = namedtuple('Person', ['name', 'age', 'gender'])

# インスタンスを作成
john = Person(name="John Doe", age=30, gender="male")

# 属性としてのアクセス
print(john.name)  # "John Doe"
print(john.age)   # 30
print(john[2])    # "male" (通常のタプルのようなインデックスアクセスも可能)
print(john)

John Doe
30
male
Person(name='John Doe', age=30, gender='male')


## Actor, Critic

In [29]:
import gym
import numpy as np
import tensorflow as tf
from tensorflow.python import keras
from tensorflow.keras.optimizers import Adam

from matplotlib import pyplot as plt

from collections import deque
import random


# pip install gym=0.25.2 ※0.26.2だと正しく学習できない（原因は未調査）


class ActorModel(keras.Model):
    def __init__(self, action_space):
        super().__init__()

        self.noise_stdev = 0.2  # ノイズ用の標準偏差
        self.action_space = action_space # 連続値の値域が与えられる. Box(-1.0, 1.0, (1,), float32)]

        # Envアクション用 スケールの変更用(今回は特に意味はない)
        self.action_centor = (action_space.high + action_space.low)/2  # 中心値は0.0
        self.action_scale = action_space.high - self.action_centor     # スケールは2.0

        # 各レイヤーを定義
        self.dense1 = keras.layers.Dense(32, activation="relu")
        self.dense2 = keras.layers.Dense(32, activation="relu")
        self.dense3 = keras.layers.Dense(32, activation="relu")
        self.actions = keras.layers.Dense(action_space.shape[0], activation="tanh")
        
        # optimizer
        self.optimizer = Adam(learning_rate=0.003)

    # Forward pass
    def call(self, inputs, training=False): # __call__メソッドであり，self()の形で実行可能
        x = self.dense1(inputs)
        x = self.dense2(x)
        x = self.dense3(x)
        actions = self.actions(x)
        return actions

    # 状態を元にactionを算出
    def sample_action(self, state, training=False):

        state = np.array(state[1:]) # namedtupleからndarrayに変換

        actions = self(state.reshape((1,-1))) # def call()を呼び出している
        action = actions[0].numpy()

        if training:
            # 学習用
            # ノイズを混ぜる
            noise = np.random.normal(0, self.noise_stdev, size=self.action_space.shape)
            action = np.clip(action + noise, -1, 1) # action は -1 ~ 1の範囲で出力 ∵actionsの出力がtanh

            # 環境用のアクションと学習用のアクションを返す
            return (action * self.action_scale + self.action_centor), action
        else:
            # テスト用、環境に渡すアクションのみを返す
            return action * self.action_scale + self.action_centor


class CriticModel(keras.Model):
    def __init__(self):
        super().__init__()

        # 各レイヤーを定義
        self.dense1 = keras.layers.Dense(32, activation="relu")
        self.dense2 = keras.layers.Dense(32, activation="relu")
        self.dense3 = keras.layers.Dense(32, activation="relu")
        self.value1 = keras.layers.Dense(1, activation="linear")
        self.dense4 = keras.layers.Dense(32, activation="relu")
        self.dense5 = keras.layers.Dense(32, activation="relu")
        self.dense6 = keras.layers.Dense(32, activation="relu")
        self.value2 = keras.layers.Dense(1, activation="linear")

        # optimizer
        self.optimizer = Adam(learning_rate=0.003)

    # Forward pass
    def call(self, states, actions, training=False):
        x = tf.concat([states, actions], axis=1)
        x1 = self.dense1(x)
        x1 = self.dense2(x1)
        x1 = self.dense3(x1)
        q1 = self.value1(x1)
        x2 = self.dense4(x)
        x2 = self.dense5(x2)
        x2 = self.dense6(x2)
        q2 = self.value2(x2)
        return q1, q2

        # CriticModelのインスタンスでそのまま呼び出し可能 ∵__call__メソッド

### ActorがTimeSeriesEnvと合うかチェック

In [30]:
# 環境とエージェントの初期化
db_path = r'C:\Users\yamaguchi\MyDocument\pytest\virtual_currency\gmo\original_gym\myenv\2023-09-20_ratest_rate.db'
env = TimeSeriesEnv(db_path)
actor = ActorModel(env.action_space)
critic = CriticModel()

state = env.reset()
episode_reward = 0
done = False

    
# 学習のループ
num_episodes = 10
for episode in range(num_episodes):
    state = env.reset()
    episode_reward = 0
    done = False
    
    while not done:
        # エージェントのアクションの取得
        action = actor.sample_action(state, training=True)[0]  # 学習時のアクションを取得
        
        # 環境でのステップの実行
        next_state, reward, done, _ = env.step(action)
        
        # Replay bufferにデータを保存、ネットワークの更新などの処理を行います。
        
        state = next_state
        episode_reward += reward
    
    print(f"Episode: {episode + 1}, Total Reward: {episode_reward}")
    

Episode: 1, Total Reward: -21066986.264745712
Episode: 2, Total Reward: -21014713.89624214


KeyboardInterrupt: 

## update

In [31]:
def update_model(
        actor_model, 
        target_actor_model, 
        critic_model, 
        target_critic_model, 
        experiences, 
        batch_size, 
        gamma,
        all_train_count,
        actor_update_interval,
        target_policy_noise_stddev,
        target_policy_clip_range,
    ):

    # ランダムに経験を取得してバッチを作成
    batchs = random.sample(experiences, batch_size)

    # Target Networkを用いて次の状態の価値を出す
    n_states = np.asarray([e["n_state"] for e in batchs])
    n_actions = target_actor_model(n_states)

    # Target Actionのノイズ
    clipped_noise = np.clip(np.random.normal(0, target_policy_noise_stddev, n_actions.shape), -target_policy_clip_range, target_policy_clip_range)
    n_actions = np.clip(n_actions + clipped_noise, -1, 1)

    # 2つのQ値から小さいほうを採用
    n_qvals1, n_qvals2 = target_critic_model(n_states, n_actions)
    n_qvals = [min(q1, q2) for q1, q2 in zip(n_qvals1.numpy(), n_qvals2.numpy())]

    # Qを計算 : reward if done else (reward + gamma * n_qval)
    q_vals = np.asarray([
        [reward] if done else [reward] + gamma * n_qval
        for reward, done, n_qval in zip(
            [e["reward"] for e in batchs],
            [e["done"] for e in batchs],
            n_qvals,
        )
    ])

    # データ整形
    states = np.asarray([e["state"] for e in batchs])
    actions = np.asarray([e["action"] for e in batchs])

    #--- Actorの学習
    # Actorの学習は少し減らす
    if all_train_count % actor_update_interval == 0:
        with tf.GradientTape() as tape:
            actor_actions = actor_model(states, training=True)
            q, _ = critic_model(states, actor_actions)
            actor_loss = -tf.reduce_mean(q)  # 最大化
        
        grads = tape.gradient(actor_loss, actor_model.trainable_variables)
        actor_model.optimizer.apply_gradients(zip(grads, actor_model.trainable_variables))
    else:
        actor_loss = 0

    #--- Criticの学習 MSEで学習
    with tf.GradientTape() as tape:
        q1, q2 = critic_model(states, actions, training=True)
        loss1 = tf.reduce_mean(tf.square(q_vals - q1))
        loss2 = tf.reduce_mean(tf.square(q_vals - q2))
        critic_loss = loss1 + loss2
    
    grads = tape.gradient(critic_loss, critic_model.trainable_variables)
    critic_model.optimizer.apply_gradients(zip(grads, critic_model.trainable_variables))

    return actor_loss, critic_loss



def update_target_model(actor_model, target_actor_model, critic_model, target_critic_model, soft_tau):

    target_actor_model.set_weights(
        (1 - soft_tau) * np.array(target_actor_model.get_weights(), dtype=object)
        + (soft_tau) * np.array(actor_model.get_weights(), dtype=object))

    target_critic_model.set_weights(
        (1 - soft_tau) * np.array(target_critic_model.get_weights(), dtype=object)
        + (soft_tau) * np.array(critic_model.get_weights(), dtype=object))

### updateがうまくいくかチェック

In [34]:
# ハイパーパラメータ
buffer_size = 10000  # キューの最大容量
warmup_size = 500    # 最低限キューに入れる数
train_interval = 10  # 学習間隔
batch_size = 32      # バッチサイズ
gamma = 0.9          # 割引率
soft_tau = 0.02      # Target network の近づく割合
actor_update_interval = 2         # Actorの更新間隔
target_policy_noise_stddev = 0.2  # Target policy ノイズの標準偏差
target_policy_clip_range = 0.5    # Target policy ノイズのclip範囲

# 環境とエージェントの初期化
db_path = r'C:\Users\yamaguchi\MyDocument\pytest\virtual_currency\gmo\original_gym\myenv\2023-09-20_ratest_rate.db'
env = TimeSeriesEnv(db_path)

# モデルの定義
actor_model = ActorModel(env.action_space)
target_actor_model = ActorModel(env.action_space)
critic_model = CriticModel()
target_critic_model = CriticModel()


state = env.reset()
episode_reward = 0
done = False

# 収集する経験は上限を決め、古いものから削除する
experiences = deque(maxlen=buffer_size)

all_step_count = 0
all_train_count = 0


# 記録用
history_rewards = []
history_metrics = []
history_metrics_y = []

    
# 学習のループ
num_episodes = 10
for episode in range(num_episodes):
    state = env.reset()
    total_reward = 0
    done = False

    metrics_list = []
    
    while not done:
        # エージェントのアクションの取得
        action = actor_model.sample_action(state, training=True)[0]  # 学習時のアクションを取得
        
        # 環境でのステップの実行
        next_state, reward, done, _ = env.step(action)
        total_reward += reward
        
        # Replay bufferにデータを保存、ネットワークの更新などの処理を行います。

        experiences.append({
            "state": np.array(state[1:]),
            "action": action, # 今回は範囲が-1~1なので env_action = action
            "reward": reward,
            "n_state": np.array(next_state[1:]),
            "done": done,
        })
        
        state = next_state

        if len(experiences) == warmup_size-1:
            # pdb.set_trace()
            print("train start")

         # warmup貯まったら train_interval 毎に学習する
        if len(experiences) >= warmup_size and all_step_count % train_interval == 0:
            # モデルの更新
            metrics = update_model(
                actor_model, 
                target_actor_model, 
                critic_model, 
                target_critic_model, 
                experiences, 
                batch_size, 
                gamma,
                all_train_count,
                actor_update_interval,
                target_policy_noise_stddev,
                target_policy_clip_range,
            )
            # Soft-target
            update_target_model(
                actor_model, 
                target_actor_model, 
                critic_model, 
                target_critic_model, 
                soft_tau
            )
            all_train_count += 1
            metrics_list.append(metrics)
            
            
        all_step_count += 1


    # 報酬
    history_rewards.append(total_reward)

    # メトリクス
    if len(metrics_list) > 0:
        history_metrics.append(np.mean(metrics_list, axis=0))  # 平均を保存
        history_metrics_y.append(episode)

    #--- print
    interval = 1
    if episode % interval == 0:
        print("{} (min,ave,max)reward {:.1f} {:.1f} {:.1f}".format(
            episode,
            min(history_rewards[-interval:]),
            np.mean(history_rewards[-interval:]),
            max(history_rewards[-interval:]),
        ))
    
    # print(f"Episode: {episode + 1}, Total Reward: {episode_reward}")



train start
0 (min,ave,max)reward 209916.6 209916.6 209916.6
1 (min,ave,max)reward 211034.6 211034.6 211034.6
2 (min,ave,max)reward 211209.0 211209.0 211209.0
3 (min,ave,max)reward 208454.2 208454.2 208454.2
4 (min,ave,max)reward 209975.0 209975.0 209975.0
5 (min,ave,max)reward 208886.9 208886.9 208886.9
6 (min,ave,max)reward 208955.0 208955.0 208955.0
7 (min,ave,max)reward 211044.3 211044.3 211044.3
8 (min,ave,max)reward 207953.9 207953.9 207953.9
9 (min,ave,max)reward 210782.7 210782.7 210782.7


一応学習自体はできているっぽい．
rewardが悪くなっているので，いろいろな指標を試す．

Env再掲

In [33]:
import sqlite3
import gym
from gym import spaces
import pandas as pd
from collections import namedtuple

# 新しいタプルのサブクラスを定義
State = namedtuple('State', ['timestamp', 'ask', 'bid', 'last', 'volume', 'jpy', 'btc'])
# Step = collections.namedtuple('Step', ['state', 'action', 'reward'])

class TimeSeriesEnv(gym.Env):
    def __init__(self, db_path, initial_jpy=1e5):
        super(TimeSeriesEnv, self).__init__()
        
        # Load the time series data from the database
        conn = sqlite3.connect(db_path)
        self.data = pd.read_sql('SELECT * FROM ratest_rate', conn)
        conn.close()

        self.data = self.data.drop(['id', 'channel', 'symbol'], axis = 1)
        
        # Initial holdings
        self.initial_jpy = initial_jpy
        self.initial_btc = initial_jpy/self.data.iloc[0]['last']
        self.jpy = self.initial_jpy
        self.btc = self.initial_btc
        
        # Define the action and observation spaces
        self.action_space = spaces.Box(low=-1, high=1, shape=(1,), dtype=float) 
        self.action_space = spaces.Box(low=-0.01, high=0.01, shape=(1,), dtype=float) # 今は0.01 ~ 0.01 btcの間にする
        self.observation_space = spaces.Box(low=0, high=float('inf'), shape=(len(self.data.columns) + 2,), dtype=float)
        
        # Additional variables
        self.current_step = 0
        self.done = False

    def reset(self):
        self.jpy = self.initial_jpy
        self.btc = self.initial_btc
        self.current_step = 0
        self.done = False
        return self._next_observation()

    def _next_observation(self):
        obs = self.data.iloc[self.current_step].values.tolist()
        obs.extend([self.jpy, self.btc])
        self.state = State(timestamp = obs[0], ask = obs[1], bid = obs[2], last = obs[4], volume = obs[6], jpy = obs[7], btc = obs[8])
        return self.state

    def step(self, action):
        # prev_value = self.jpy + self.btc * self.data.iloc[self.current_step]['last']
        prev_value = self.jpy + self.btc * self.state.last
        

        # tradeはbtcの単位とする
        # Update holdings based on the action
        trade_amount = action[0]  # Assume the action is a numpy array
        # if trade_amount > 0:  # Buying BTC
        #     self.jpy -= trade_amount * self.data.iloc[self.current_step]['last']
        #     self.btc += trade_amount
        # else:  # Selling BTC
        #     self.jpy += abs(trade_amount) * self.data.iloc[self.current_step]['last']
        #     self.btc -= abs(trade_amount)
        
        self.jpy -= trade_amount * self.data.iloc[self.current_step]['last']
        self.btc += trade_amount
        
        self.current_step += 1
        # 終了判定
        if self.current_step >= len(self.data) - 1:
            self.done = True
        
        # Compute the reward
        current_value = self.jpy + self.btc * self.data.iloc[self.current_step]['last']
        reward = current_value - prev_value
        
        return self._next_observation(), reward, self.done, {}

    def render(self, mode='human'):
        pass

    def close(self):
        pass


# TimeSeriesEnvの動作確認

# ダミーデータベースパスを使用して環境を初期化
dummy_db_path = r"C:\Users\yamaguchi\MyDocument\pytest\virtual_currency\gmo\original_gym\myenv\2023-09-20_ratest_rate.db"
time_series_env = TimeSeriesEnv(dummy_db_path)

# 初期状態の取得
initial_observation_ts = time_series_env.reset()
print(initial_observation_ts)

State(timestamp='2023-09-20 00:20:11.931000', ask=4021627, bid=4020401, last=4020401, volume=146.6095, jpy=100000.0, btc=0.02487314076381933)


### 学習時のデータを確認

In [45]:
experiences

deque([{'state': array([ 4.00380000e+06,  4.00365000e+06,  4.00380000e+06,  1.41502100e+02,
                6.77926821e+07, -1.67970619e+01]),
        'action': array([-0.0088305]),
        'reward': -13444.713895082474,
        'n_state': array([ 4.00460000e+06,  4.00391600e+06,  4.00460000e+06,  1.41499100e+02,
                6.78280376e+07, -1.68058924e+01]),
        'done': False},
       {'state': array([ 4.00460000e+06,  4.00391600e+06,  4.00460000e+06,  1.41499100e+02,
                6.78280376e+07, -1.68058924e+01]),
        'action': array([-0.01]),
        'reward': -3363.1784737706184,
        'n_state': array([ 4.00480000e+06,  4.00479900e+06,  4.00480000e+06,  1.41500100e+02,
                6.78680836e+07, -1.68158924e+01]),
        'done': False},
       {'state': array([ 4.00480000e+06,  4.00479900e+06,  4.00480000e+06,  1.41500100e+02,
                6.78680836e+07, -1.68158924e+01]),
        'action': array([-0.01]),
        'reward': -3365.1784737706184,
        '

In [44]:
tmp_experiences = experiences
rewards_history = [dd['reward'] for dd in tmp_experiences]
actions_history = [dd['action'] for dd in tmp_experiences]
rewards_history

[-13444.713895082474,
 -3363.1784737706184,
 -3365.1784737706184,
 -3367.089886173606,
 -23582.383897587657,
 -30335.17508418858,
 -2158.4150048196316,
 -11405.361667335033,
 -20272.305585995317,
 27949.244221299887,
 6759.104343503714,
 -1673.3574889600277,
 -2553.688291415572,
 -6768.322645545006,
 -3386.161322772503,
 -16938.738899469376,
 -13556.593155354261,
 -3391.148288846016,
 -3393.148288846016,
 -3395.1482888311148,
 -3396.485371798277,
 -3398.0391657352448,
 -3399.9082446098328,
 -3401.1549508571625,
 -3403.1549508422613,
 -7831.406936079264,
 58595.25207312405,
 17367.33007630706,
 17.04971708357334,
 10235.830250263214,
 8534.858541890979,
 8522.778824821115,
 17.08971707522869,
 11969.321918234229,
 5132.354532942176,
 17117.848443090916,
 41483.648929178715,
 -376.97849920392036,
 -10284.749097436666,
 -3430.2496991604567,
 -3432.2496991455555,
 -3434.2496991604567,
 -3436.2496991455555,
 -1.4901161193847656e-08,
 -3439.4158964008093,
 -3441.3768555969,
 -3443.3768556118

In [None]:
15
16
17
18
19
20

In [39]:
import json

experiences_list = list(experiences)
# Save to a JSON file
output_file_path = r"C:\Users\yamaguchi\MyDocument\pytest\virtual_currency\gmo\original_gym\myenv\experiences.json"
with open(output_file_path, 'w') as json_file:
    json.dump(experiences_list, json_file)

output_file_path

TypeError: Object of type ndarray is not JSON serializable

In [40]:
# Custom function to handle JSON serialization of ndarray
def default_serialize(obj):
    import numpy as np
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    raise TypeError(f"Object of type {obj.__class__.__name__} is not JSON serializable")

# Save to a JSON file using the custom serialization function
output_file_path = r"C:\Users\yamaguchi\MyDocument\pytest\virtual_currency\gmo\original_gym\myenv\experiences.json"
with open(output_file_path, 'w') as json_file:
    json.dump(experiences_list, json_file, default=default_serialize)

output_file_path


'C:\\Users\\yamaguchi\\MyDocument\\pytest\\virtual_currency\\gmo\\original_gym\\myenv\\experiences.json'

In [20]:

class DDPGAgent:
    def __init__(self, action_space, state_dim, buffer_size=100000, batch_size=64):
        self.action_space = action_space
        self.actor = ActorModel(action_space)
        self.critic = CriticModel()
        self.target_actor = ActorModel(action_space)
        self.target_critic = CriticModel()
        self.memory = deque(maxlen=buffer_size) # replay buffer
        self.batch_size = batch_size
        
        # Initialize target networks with same weights as original networks
        self.target_actor.set_weights(self.actor.get_weights())
        self.target_critic.set_weights(self.critic.get_weights())

    def act(self, state):
        # Use the actor model to get the action
        return self.actor.sample_action(state)

    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))



class RandomAgent:
    def __init__(self, action_space):
        self.action_space = action_space

    def act(self, state):
        return self.action_space.sample()



# Initialize the environment and agent
db_path = r'C:\Users\yamaguchi\MyDocument\pytest\virtual_currency\gmo\original_gym\myenv\2023-09-20_ratest_rate.db'
env = TimeSeriesEnv(db_path, initial_jpy=1e5)
agent = RandomAgent(env.action_space)

# Number of episodes
num_episodes = 10

# Simulate the interaction
for i in range(num_episodes):
    state = env.reset()
    done = False
    episode_reward = 0

    while not done:
        action = agent.act(state)
        next_state, reward, done, _ = env.step(action)
        episode_reward += reward

    print(f"Episode {i + 1}: Total Reward = {episode_reward}")

env.close()


Episode 1: Total Reward = 1306.6490147816949
Episode 2: Total Reward = 6552.967734882375
Episode 3: Total Reward = -2097.5651815774036
Episode 4: Total Reward = -10653.347941236105
Episode 5: Total Reward = 3113.800839958829
Episode 6: Total Reward = 1028.6448619500734
Episode 7: Total Reward = -3046.6106279648375
Episode 8: Total Reward = 7490.688114629826
Episode 9: Total Reward = 3440.4046937696403
Episode 10: Total Reward = -10879.293722467497


# DDPGのモデル

## Actor, Critic

In [21]:
import gym
import numpy as np
import tensorflow as tf
from tensorflow.python import keras
from tensorflow.keras.optimizers import Adam

from matplotlib import pyplot as plt

from collections import deque
import random


# pip install gym=0.25.2 ※0.26.2だと正しく学習できない（原因は未調査）


class ActorModel(keras.Model):
    def __init__(self, action_space):
        super().__init__()

        self.noise_stdev = 0.2  # ノイズ用の標準偏差
        self.action_space = action_space # 連続値の値域が与えられる. Box(-2.0, 2.0, (1,), float32)]
        # Boxについて
        # from gym.space import Box でimportする
        # -2.0 ~ 2.0 の値域で (1,) 1次元 という意味

        # Envアクション用
        self.action_centor = (action_space.high + action_space.low)/2  # 中心値は0.0
        self.action_scale = action_space.high - self.action_centor     # スケールは2.0

        # 各レイヤーを定義
        self.dense1 = keras.layers.Dense(32, activation="relu")
        self.dense2 = keras.layers.Dense(32, activation="relu")
        self.dense3 = keras.layers.Dense(32, activation="relu")
        self.actions = keras.layers.Dense(action_space.shape[0], activation="tanh")
        
        # optimizer
        self.optimizer = Adam(learning_rate=0.003)

    # Forward pass
    def call(self, inputs, training=False): # __call__メソッドであり，self()の形で実行可能
        x = self.dense1(inputs)
        x = self.dense2(x)
        x = self.dense3(x)
        actions = self.actions(x)
        return actions

    # 状態を元にactionを算出
    def sample_action(self, state, training=False):
        actions = self(state.reshape((1,-1))) # def call()を呼び出している
        action = actions[0].numpy()

        if training:
            # 学習用
            # ノイズを混ぜる
            noise = np.random.normal(0, self.noise_stdev, size=self.action_space.shape)
            action = np.clip(action + noise, -1, 1) # action は -1 ~ 1の範囲で出力 ∵actionsの出力がtanh

            # 環境用のアクションと学習用のアクションを返す
            return (action * self.action_scale + self.action_centor), action
        else:
            # テスト用、環境に渡すアクションのみを返す
            return action * self.action_scale + self.action_centor


class CriticModel(keras.Model):
    def __init__(self):
        super().__init__()

        # 各レイヤーを定義
        self.dense1 = keras.layers.Dense(32, activation="relu")
        self.dense2 = keras.layers.Dense(32, activation="relu")
        self.dense3 = keras.layers.Dense(32, activation="relu")
        self.value1 = keras.layers.Dense(1, activation="linear")
        self.dense4 = keras.layers.Dense(32, activation="relu")
        self.dense5 = keras.layers.Dense(32, activation="relu")
        self.dense6 = keras.layers.Dense(32, activation="relu")
        self.value2 = keras.layers.Dense(1, activation="linear")

        # optimizer
        self.optimizer = Adam(learning_rate=0.003)

    # Forward pass
    def call(self, states, actions, training=False):
        x = tf.concat([states, actions], axis=1)
        x1 = self.dense1(x)
        x1 = self.dense2(x1)
        x1 = self.dense3(x1)
        q1 = self.value1(x1)
        x2 = self.dense4(x)
        x2 = self.dense5(x2)
        x2 = self.dense6(x2)
        q2 = self.value2(x2)
        return q1, q2

        # CriticModelのインスタンスでそのまま呼び出し可能 ∵__call__メソッド
    

## update(モデルの更新)

In [22]:
def update_model(
        actor_model, 
        target_actor_model, 
        critic_model, 
        target_critic_model, 
        experiences, 
        batch_size, 
        gamma,
        all_train_count,
        actor_update_interval,
        target_policy_noise_stddev,
        target_policy_clip_range,
    ):

    # ランダムに経験を取得してバッチを作成
    batchs = random.sample(experiences, batch_size)

    # Target Networkを用いて次の状態の価値を出す
    n_states = np.asarray([e["n_state"] for e in batchs])
    n_actions = target_actor_model(n_states)

    # Target Actionのノイズ
    clipped_noise = np.clip(np.random.normal(0, target_policy_noise_stddev, n_actions.shape), -target_policy_clip_range, target_policy_clip_range)
    n_actions = np.clip(n_actions + clipped_noise, -1, 1)

    # 2つのQ値から小さいほうを採用
    n_qvals1, n_qvals2 = target_critic_model(n_states, n_actions)
    n_qvals = [min(q1, q2) for q1, q2 in zip(n_qvals1.numpy(), n_qvals2.numpy())]

    # Qを計算 : reward if done else (reward + gamma * n_qval)
    q_vals = np.asarray([
        [reward] if done else [reward] + gamma * n_qval
        for reward, done, n_qval in zip(
            [e["reward"] for e in batchs],
            [e["done"] for e in batchs],
            n_qvals,
        )
    ])

    # データ整形
    states = np.asarray([e["state"] for e in batchs])
    actions = np.asarray([e["action"] for e in batchs])

    #--- Actorの学習
    # Actorの学習は少し減らす
    if all_train_count % actor_update_interval == 0:
        with tf.GradientTape() as tape:
            actor_actions = actor_model(states, training=True)
            q, _ = critic_model(states, actor_actions)
            actor_loss = -tf.reduce_mean(q)  # 最大化
        
        grads = tape.gradient(actor_loss, actor_model.trainable_variables)
        actor_model.optimizer.apply_gradients(zip(grads, actor_model.trainable_variables))
    else:
        actor_loss = 0

    #--- Criticの学習 MSEで学習
    with tf.GradientTape() as tape:
        q1, q2 = critic_model(states, actions, training=True)
        loss1 = tf.reduce_mean(tf.square(q_vals - q1))
        loss2 = tf.reduce_mean(tf.square(q_vals - q2))
        critic_loss = loss1 + loss2
    
    grads = tape.gradient(critic_loss, critic_model.trainable_variables)
    critic_model.optimizer.apply_gradients(zip(grads, critic_model.trainable_variables))

    return actor_loss, critic_loss


In [23]:
def update_target_model(actor_model, target_actor_model, critic_model, target_critic_model, soft_tau):

    target_actor_model.set_weights(
        (1 - soft_tau) * np.array(target_actor_model.get_weights(), dtype=object)
        + (soft_tau) * np.array(actor_model.get_weights(), dtype=object))

    target_critic_model.set_weights(
        (1 - soft_tau) * np.array(target_critic_model.get_weights(), dtype=object)
        + (soft_tau) * np.array(critic_model.get_weights(), dtype=object))


## 学習のメインコード

In [None]:
def train_main():
    env = gym.make("Pendulum-v1")

    # ハイパーパラメータ
    buffer_size = 10000  # キューの最大容量
    warmup_size = 500    # 最低限キューに入れる数
    train_interval = 10  # 学習間隔
    batch_size = 32      # バッチサイズ
    gamma = 0.9          # 割引率
    soft_tau = 0.02      # Target network の近づく割合
    actor_update_interval = 2         # Actorの更新間隔
    target_policy_noise_stddev = 0.2  # Target policy ノイズの標準偏差
    target_policy_clip_range = 0.5    # Target policy ノイズのclip範囲

    # モデルの定義
    actor_model = ActorModel(env.action_space)
    target_actor_model = ActorModel(env.action_space)
    critic_model = CriticModel()
    target_critic_model = CriticModel()

    # モデルは一度伝搬させないと重みが作成されない
    dummy_state = np.random.normal(0, 0.1, size=(1,) + env.observation_space.shape)
    actor_model(dummy_state)
    target_actor_model(dummy_state)
    target_actor_model.set_weights(actor_model.get_weights())

    dummy_action  = np.random.normal(0, 0.1, size=(1,) + env.action_space.shape)
    critic_model(dummy_state, dummy_action)
    target_critic_model(dummy_state, dummy_action)
    target_critic_model.set_weights(critic_model.get_weights())

    # 収集する経験は上限を決め、古いものから削除する
    experiences = deque(maxlen=buffer_size)

    all_step_count = 0
    all_train_count = 0

    # 記録用
    history_rewards = []
    history_metrics = []
    history_metrics_y = []

    # 学習ループ
    for episode in range(500):
        state = np.asarray(env.reset())
        done = False
        total_reward = 0
        step = 0

        metrics_list = []

        # 1episode
        while not done:
            # アクションを決定
            env_action, action = actor_model.sample_action(state, True)

            # 1step進める
            n_state, reward, done, _ = env.step(env_action)
            n_state = np.asarray(n_state)
            step += 1
            total_reward += reward

            experiences.append({
                "state": state,
                "action": action,
                "reward": reward,
                "n_state": n_state,
                "done": done,
            })
            state = n_state

            if len(experiences) == warmup_size-1:
                print("train start")

            # warmup貯まったら train_interval 毎に学習する
            if len(experiences) >= warmup_size and all_step_count % train_interval == 0:
                # モデルの更新
                metrics = update_model(
                    actor_model,
                    target_actor_model,
                    critic_model,
                    target_critic_model,
                    experiences,
                    batch_size,
                    gamma,
                    all_train_count,
                    actor_update_interval,
                    target_policy_noise_stddev,
                    target_policy_clip_range,
                )
                # Soft-target
                update_target_model(
                    actor_model,
                    target_actor_model,
                    critic_model,
                    target_critic_model,
                    soft_tau
                )
                all_train_count += 1
                metrics_list.append(metrics)

            all_step_count += 1

        # 報酬
        history_rewards.append(total_reward)

        # メトリクス
        if len(metrics_list) > 0:
            history_metrics.append(np.mean(metrics_list, axis=0))  # 平均を保存
            history_metrics_y.append(episode)

        #--- print
        interval = 50
        if episode % interval == 0:
            print("{} (min,ave,max)reward {:.1f} {:.1f} {:.1f}".format(
                episode,
                min(history_rewards[-interval:]),
                np.mean(history_rewards[-interval:]),
                max(history_rewards[-interval:]),
            ))

    return actor_model, history_rewards, history_metrics, history_metrics_y

In [33]:
def myenv_train(db_path):

    # env = gym.make("Pendulum-v1")
    env = TimeSeriesEnv(db_path)  # 新しい環境を初期化

    # ハイパーパラメータ
    buffer_size = 10000  # キューの最大容量
    warmup_size = 500    # 最低限キューに入れる数
    train_interval = 10  # 学習間隔
    batch_size = 32      # バッチサイズ
    gamma = 0.9          # 割引率
    soft_tau = 0.02      # Target network の近づく割合
    actor_update_interval = 2         # Actorの更新間隔
    target_policy_noise_stddev = 0.2  # Target policy ノイズの標準偏差
    target_policy_clip_range = 0.5    # Target policy ノイズのclip範囲

    # モデルの定義
    actor_model = ActorModel(env.action_space)
    target_actor_model = ActorModel(env.action_space)
    critic_model = CriticModel()
    target_critic_model = CriticModel()

    # モデルは一度伝搬させないと重みが作成されない
    dummy_state = np.random.normal(0, 0.1, size=(1,) + env.observation_space.shape)
    actor_model(dummy_state)
    target_actor_model(dummy_state)
    target_actor_model.set_weights(actor_model.get_weights())

    dummy_action  = np.random.normal(0, 0.1, size=(1,) + env.action_space.shape)
    critic_model(dummy_state, dummy_action)
    target_critic_model(dummy_state, dummy_action)
    target_critic_model.set_weights(critic_model.get_weights())

    # 収集する経験は上限を決め、古いものから削除する
    experiences = deque(maxlen=buffer_size)

    all_step_count = 0
    all_train_count = 0

    # 記録用
    history_rewards = []
    history_metrics = []
    history_metrics_y = []

    # 学習ループ
    for episode in range(500):
        state = np.asarray(env.reset())
        done = False
        total_reward = 0
        step = 0

        metrics_list = []

        # 1episode
        while not done:
            # アクションを決定
            env_action, action = actor_model.sample_action(state, True)

            # 1step進める
            n_state, reward, done, _ = env.step(env_action)
            n_state = np.asarray(n_state)
            step += 1
            total_reward += reward

            experiences.append({
                "state": state,
                "action": action,
                "reward": reward,
                "n_state": n_state,
                "done": done,
            })
            state = n_state

            if len(experiences) == warmup_size-1:
                print("train start")

            # warmup貯まったら train_interval 毎に学習する
            if len(experiences) >= warmup_size and all_step_count % train_interval == 0:
                # モデルの更新
                metrics = update_model(
                    actor_model,
                    target_actor_model,
                    critic_model,
                    target_critic_model,
                    experiences,
                    batch_size,
                    gamma,
                    all_train_count,
                    actor_update_interval,
                    target_policy_noise_stddev,
                    target_policy_clip_range,
                )
                # Soft-target
                update_target_model(
                    actor_model,
                    target_actor_model,
                    critic_model,
                    target_critic_model,
                    soft_tau
                )
                all_train_count += 1
                metrics_list.append(metrics)

            all_step_count += 1

        # 報酬
        history_rewards.append(total_reward)

        # メトリクス
        if len(metrics_list) > 0:
            history_metrics.append(np.mean(metrics_list, axis=0))  # 平均を保存
            history_metrics_y.append(episode)

        #--- print
        interval = 50
        if episode % interval == 0:
            print("{} (min,ave,max)reward {:.1f} {:.1f} {:.1f}".format(
                episode,
                min(history_rewards[-interval:]),
                np.mean(history_rewards[-interval:]),
                max(history_rewards[-interval:]),
            ))

    return actor_model, history_rewards, history_metrics, history_metrics_y

# db_pathにはSQLiteデータベースの正確なパスを指定してください。
db_path = r'C:\Users\yamaguchi\MyDocument\pytest\virtual_currency\gmo\original_gym\myenv\2023-09-20_ratest_rate.db'
model, history_rewards, history_metrics, history_metrics_y = myenv_train(db_path)



UnimplementedError: {{function_node __wrapped__Cast_device_/job:localhost/replica:0/task:0/device:CPU:0}} Cast string to float is not supported [Op:Cast] name: 