In [1]:
import argparse
import os
import gym
import random
from gym import spaces
from sklearn import preprocessing
import numpy as np
import pandas as pd
from parl.utils import logger, tensorboard, ReplayMemory
import paddle
from OAC import OAC
import parl
import paddle.nn as nn

[32m[01-19 09:56:39 MainThread @utils.py:73][0m paddlepaddle version: 2.3.2.


  context = pyarrow.default_serialization_context()


In [2]:
# Default scaling constants used to normalise observation features, plus
# episode-level thresholds read by StockTradingEnv.
MAX_ACCOUNT_BALANCE = 2147480        # largest account value; divisor for balance/net worth and for the price, volume and Bollinger columns
MAX_NUM_SHARES = 2147480             # largest number of shares; divisor for shares held / shares sold
MAX_SHARE_PRICE = 5000              # largest per-share price; divisor for cost_basis (NOTE(review): the printed BTC prices are ~19000-30000, so this ratio can exceed 1)
MAX_VOLUME = 1e9                 # largest traded volume (not referenced in the visible code)
MAX_AMOUNT = 1e10                    # largest traded amount (not referenced in the visible code)
MAX_OPEN_POSITIONS = 5              # largest number of open positions (not referenced in the visible code)
MAX_STEPS = 100000                    # largest number of interaction steps (only referenced from commented-out code)
MAX_DAY_CHANGE = 1                  # largest day-over-day change (not referenced in the visible code)
max_loss =-300000                   # largest tolerated loss (not referenced in the visible code)
max_predict_rate = 1.5            # target multiple of the initial balance; reaching it ends the episode as a success
INITIAL_ACCOUNT_BALANCE = 1000000    # starting cash for every episode
# NUM = 150 

class StockTradingEnv(gym.Env):
    """Single-asset trading environment driven by a DataFrame of price bars.

    The DataFrame must be indexed 0..N-1 and contain the columns ``Open``,
    ``High``, ``Low``, ``Close``, ``Volume``, ``return``, ``rsi_rsi14``,
    ``boll_upper``, ``boll_middle`` and ``boll_lower``.

    The action is a one-element array whose first entry is the fraction of
    the available cash (when buying) or of the held shares (when selling) to
    trade. *Whether* a trade happens is not decided by the agent: it is
    gated by a fixed Bollinger-band / RSI rule inside ``_take_action``.

    ``reset()`` must be called before the first ``step()``; the account
    attributes (balance, net worth, ...) are only created there.

    NOTE(review): the trade logs (``buy``, ``sell``, ``buy_v``, ``sell_v``,
    ``get``) are created in ``__init__`` and are never cleared by ``reset()``,
    so they accumulate over every episode run on the same instance.

    NOTE(review): ``observation_space`` declares float32 values in [0, 1],
    but observations are float64 and are not clipped (``return`` can be
    negative, for instance).
    """
    metadata = {'render.modes': ['human']}

    def __init__(self, df):
        """Store the price data and declare the action/observation spaces.

        Args:
            df: DataFrame with the columns listed in the class docstring.
        """
        super(StockTradingEnv, self).__init__()
        self.df = df

        # Action: a single fraction in [0, 1] (share of cash / holdings to trade).
        self.action_space = spaces.Box(
            low=np.array([0]), high=np.array([1]), dtype=np.float32)

        # Observation: 17 features (10 market columns + 7 account values).
        self.observation_space = spaces.Box(
            low=0, high=1, shape=(17,), dtype=np.float32)

        self.current_step = 0
        # Trade logs: step indices where the buy/sell rule fired, the cash
        # value of each executed buy/sell, and the net worth after each step.
        self.buy = []
        self.sell = []
        self.buy_v = []
        self.sell_v = []
        self.get = []

    def seed(self, seed):
        """Seed Python's ``random`` and NumPy's global RNG."""
        random.seed(seed)
        np.random.seed(seed)

    def _build_observation(self, step):
        """Return the 17-element observation for row ``step`` of ``self.df``.

        The market features come from row ``step``; the account features
        always reflect the environment's current account state.
        """
        def col(name):
            return self.df.loc[step, name]

        return np.array([
            col('Open') / MAX_ACCOUNT_BALANCE,
            col('High') / MAX_ACCOUNT_BALANCE,
            col('Low') / MAX_ACCOUNT_BALANCE,
            col('Close') / MAX_ACCOUNT_BALANCE,
            col('Volume') / MAX_ACCOUNT_BALANCE,
            col('return'),
            col('rsi_rsi14') / 1000,
            col('boll_upper') / MAX_ACCOUNT_BALANCE,
            col('boll_middle') / MAX_ACCOUNT_BALANCE,
            col('boll_lower') / MAX_ACCOUNT_BALANCE,
            self.balance / MAX_ACCOUNT_BALANCE,
            self.max_net_worth / MAX_ACCOUNT_BALANCE,
            self.net_worth / MAX_ACCOUNT_BALANCE,
            self.shares_held / MAX_NUM_SHARES,
            self.cost_basis / MAX_SHARE_PRICE,
            self.total_shares_sold / MAX_NUM_SHARES,
            self.total_sales_value / (MAX_NUM_SHARES * MAX_SHARE_PRICE)
        ])

    def _next_observation(self):
        """Return the observation for the current step."""
        return self._build_observation(self.current_step)

    def _take_action(self, action):
        """Apply ``action`` at the current step and update the account.

        The trade price is the current row's ``Close``. A buy is executed
        when Close <= lower Bollinger band, RSI <= 30 and the balance covers
        at least one share; a sell when Close >= upper band, RSI >= 70 and
        shares are held. ``action[0]`` is the fraction traded.
        """
        current_price = self.df.loc[self.current_step, "Close"]
        rsi = self.df.loc[self.current_step, 'rsi_rsi14']
        boll_up = self.df.loc[self.current_step, 'boll_upper']
        boll_low = self.df.loc[self.current_step, 'boll_lower']
        amount = action[0]

        if current_price <= boll_low and rsi <= 30 and self.balance >= current_price:
            # Buy `amount` of the affordable shares. The signal step is
            # logged even when the resulting share count is zero.
            self.buy.append(self.current_step)
            total_possible = int(self.balance / current_price)
            shares_bought = total_possible * amount
            if shares_bought != 0.:
                prev_cost = self.cost_basis * self.shares_held
                additional_cost = shares_bought * current_price
                self.buy_v.append(additional_cost)

                self.balance -= additional_cost
                # Volume-weighted average entry price of the whole position.
                self.cost_basis = (
                    prev_cost + additional_cost) / (self.shares_held + shares_bought)
                self.shares_held += shares_bought
        elif current_price >= boll_up and rsi >= 70 and self.shares_held != 0:
            # Sell `amount` of the held shares.
            self.sell.append(self.current_step)
            shares_sold = self.shares_held * amount
            proceeds = shares_sold * current_price
            self.balance += proceeds
            self.sell_v.append(proceeds)
            self.shares_held -= shares_sold
            self.total_shares_sold += shares_sold
            self.total_sales_value += proceeds
        # Otherwise: hold.

        # Mark the account to market after the action.
        self.net_worth = self.balance + self.shares_held * current_price
        self.get.append(self.net_worth)

        if self.net_worth > self.max_net_worth:
            self.max_net_worth = self.net_worth

        if self.shares_held == 0:
            self.cost_basis = 0

    def step(self, action):
        """Advance one bar.

        Returns:
            ``(obs, reward, done, info)``. ``info`` carries the net worth
            (under the key ``'profit'``), the step counter, a status message
            (``None`` unless the episode ended) and the trade logs.

        The episode ends when net worth reaches ``max_predict_rate`` times
        the initial balance, when the data is exhausted (the step counter
        then wraps to 0, so the returned observation is for row 0), or when
        net worth drops to zero or below.
        """
        self._take_action(action)
        done = False
        status = None
        reward = 0

        self.current_step += 1

        if self.net_worth >= INITIAL_ACCOUNT_BALANCE * max_predict_rate:
            reward += max_predict_rate
            status = f'[ENV] success at step {self.current_step}! Get {max_predict_rate} times worth.'
            done = True
        if self.current_step > len(self.df) - 1:
            status = f'[ENV] Loop training. Max worth was {self.max_net_worth}, final worth is {self.net_worth}.'
            reward += self.net_worth / INITIAL_ACCOUNT_BALANCE
            self.current_step = 0  # loop training
            done = True

        if self.net_worth <= 0:
            status = f'[ENV] Failure at step {self.current_step}. Loss all worth. Max worth was {self.max_net_worth}'
            reward += -1
            done = True
        else:
            # Dense reward: relative profit versus the initial balance. This
            # is added on every non-bankrupt step, terminal ones included.
            profit = self.net_worth - INITIAL_ACCOUNT_BALANCE
            reward += profit / INITIAL_ACCOUNT_BALANCE

        obs = self._next_observation()

        return obs, reward, done, {
            'profit': self.net_worth,
            'current_step': self.current_step,
            'status': status,
            'buy': self.buy,
            'sell': self.sell,
            'buy_v': self.buy_v,
            'sell_v': self.sell_v,
            'get': self.get
        }

    def reset(self, new_df=None):
        """Reset the account to its initial state and rewind to step 0.

        Args:
            new_df: optional replacement DataFrame for the price data.

        Returns:
            The observation for step 0.
        """
        self.balance = INITIAL_ACCOUNT_BALANCE
        self.net_worth = INITIAL_ACCOUNT_BALANCE
        self.max_net_worth = INITIAL_ACCOUNT_BALANCE
        self.shares_held = 0
        self.cost_basis = 0
        self.total_shares_sold = 0
        self.total_sales_value = 0

        # `if new_df:` raises ValueError for a DataFrame (ambiguous truth
        # value), so test against None explicitly.
        if new_df is not None:
            self.df = new_df
        self.current_step = 0

        return self._next_observation()

    def get_obs(self, current_step):
        """Return the observation for row ``current_step``.

        The market features come from the requested row; the account
        features reflect the current account state.
        """
        return self._build_observation(current_step)

    def render(self, mode='human'):
        """Print the number of buy and sell signals logged so far.

        Returns:
            ``(sell_count, buy_count)``.
        """
        # Report counts without overwriting the log lists themselves, which
        # _take_action keeps appending to.
        sell_count = len(self.sell)
        buy_count = len(self.buy)
        print(buy_count)
        print(sell_count)
        return sell_count, buy_count

In [3]:
# Load the training bars and build the training environment on top of them.
df = pd.read_csv('oac/BTC_train.csv')
print(df)
env = StockTradingEnv(df)

# Sizes the model and replay memory need: observation width, action width,
# the action upper bound, and the number of rows (steps) in one pass.
state_dim = env.observation_space.shape[0]
print(state_dim)
action_dim = env.action_space.shape[0]
max_action = float(env.action_space.high[0])
max_step = len(df)

print(f'state: {state_dim}, action: {action_dim}, action max value: {max_action}, max step:{max_step}')

                      Time      Open      High       Low     Close  \
0      2022-07-01 18:00:00  19424.45  19496.80  19344.00  19426.64   
1      2022-07-01 18:30:00  19426.64  19448.69  19000.00  19070.56   
2      2022-07-01 19:00:00  19070.57  19200.00  18975.00  19177.53   
3      2022-07-01 19:30:00  19177.52  19226.68  19136.21  19191.03   
4      2022-07-01 20:00:00  19191.02  19294.14  19130.21  19231.02   
...                    ...       ...       ...       ...       ...   
10493  2023-02-05 08:30:00  23314.05  23349.50  23230.00  23291.00   
10494  2023-02-05 09:00:00  23290.99  23330.58  23261.22  23328.24   
10495  2023-02-05 09:30:00  23329.46  23333.06  23290.66  23319.48   
10496  2023-02-05 10:00:00  23319.86  23347.59  23312.40  23334.60   
10497  2023-02-05 10:30:00  23335.19  23342.72  23292.00  23292.21   

           Volume    return  rsi_rsi14    boll_upper  boll_middle  \
0      1610.29264  0.000113  24.738969  20521.539697   19750.2385   
1     

In [4]:
# Load the held-out bars and keep only the first 3500 rows for evaluation
# during training.
eval_df = pd.read_csv('oac/BTC_test_pro.csv')
eval_df = eval_df[:3500]
print(eval_df)

# Separate environment instance used only for evaluation.
eval_env = StockTradingEnv(eval_df)

                     Time      Open      High       Low     Close      Volume  \
0     2023-02-05 11:00:00  23292.22  23339.00  23289.40  23313.82  3218.47246   
1     2023-02-05 11:30:00  23313.82  23349.73  23301.34  23344.53  2933.81627   
2     2023-02-05 12:00:00  23344.53  23345.60  23326.23  23338.42  2215.77951   
3     2023-02-05 12:30:00  23338.95  23377.54  23337.48  23364.00  2821.43409   
4     2023-02-05 13:00:00  23364.33  23390.00  23360.73  23369.61  2300.42999   
...                   ...       ...       ...       ...       ...         ...   
3495  2023-04-19 07:30:00  30332.79  30408.44  30317.01  30380.01   618.62109   
3496  2023-04-19 08:00:00  30380.01  30413.53  30307.00  30314.36   822.31314   
3497  2023-04-19 08:30:00  30314.35  30358.08  30282.95  30319.25   614.32757   
3498  2023-04-19 09:00:00  30319.25  30348.28  30243.16  30310.08   732.41606   
3499  2023-04-19 09:30:00  30310.09  30327.12  30223.12  30233.42   647.13137   

        return

In [5]:
import parl
import paddle
import paddle.nn as nn
import paddle.nn.functional as F

class StockAgent(parl.Agent):
    """PARL agent that converts NumPy data to Paddle tensors for the algorithm."""

    def __init__(self, algorithm, act_dim, expl_noise=0.1):
        """Wrap ``algorithm`` and copy the online weights into the target net.

        Args:
            algorithm: the PARL algorithm instance to drive.
            act_dim: action dimensionality (stored as ``action_dim``).
            expl_noise: stored as ``expl_noise``; not read by this class.
        """
        super().__init__(algorithm)

        # decay=0 makes the target network an exact copy of the online one.
        self.alg.sync_target(decay=0)
        self.expl_noise = expl_noise
        self.action_dim = act_dim

    def predict(self, obs):
        """Return the algorithm's ``predict`` action for one observation."""
        batch = paddle.to_tensor(obs.reshape(1, -1), dtype='float32')
        out = self.alg.predict(batch)
        return out.cpu().numpy()[0]

    def sample(self, obs):
        """Return an exploratory action for one observation."""
        batch = paddle.to_tensor(obs.reshape(1, -1), dtype='float32')
        out = self.alg.get_optimistic_exploration_action(batch)
        return out.cpu().numpy()[0]

    def learn(self, obs, action, reward, next_obs, terminal):
        """Run one update on a sampled batch.

        ``reward`` and ``terminal`` get a trailing axis before conversion.

        Returns:
            ``(critic_loss, actor_loss)`` as returned by the algorithm.
        """
        terminal = np.expand_dims(terminal, -1)
        reward = np.expand_dims(reward, -1)

        obs_t, action_t, reward_t, next_obs_t, terminal_t = (
            paddle.to_tensor(arr, dtype='float32')
            for arr in (obs, action, reward, next_obs, terminal)
        )
        critic_loss, actor_loss = self.alg.learn(
            obs_t, action_t, reward_t, next_obs_t, terminal_t)
        return critic_loss, actor_loss

In [6]:
class ResidualBlock(nn.Layer):
    """Two Conv1D + BatchNorm1D layers with a skip connection.

    The skip connection adds the input to the convolution output directly,
    so ``in_channels`` must equal ``out_channels``.
    """

    def __init__(self, in_channels, out_channels):
        super(ResidualBlock, self).__init__()

        self.conv1 = nn.Conv1D(in_channels, out_channels, kernel_size=3, stride=1, padding=1)
        self.bn1 = nn.BatchNorm1D(out_channels)
        self.relu = nn.LeakyReLU()
        self.conv2 = nn.Conv1D(out_channels, out_channels, kernel_size=3, stride=1, padding=1)
        self.bn2 = nn.BatchNorm1D(out_channels)

    def forward(self, x):
        """Apply the residual block.

        Args:
            x: features; a trailing length-1 axis is added for the
                convolutions (the callers in this file pass
                ``[batch, channels]`` — TODO confirm for other callers).

        Returns:
            A tensor with the same shape as ``x``.
        """
        # Add a length-1 axis so Conv1D sees [batch, channels, length=1].
        residual = paddle.unsqueeze(x, axis=-1)

        out = self.conv1(residual)
        out = self.bn1(out)
        out = self.relu(out)
        out = self.conv2(out)
        out = self.bn2(out)
        out = out + residual
        out = self.relu(out)

        # Squeeze the *block output*. The previous code squeezed the input
        # `x` here, which discarded everything above and made the block
        # return its input unchanged.
        # NOTE(review): checkpoints trained with the old identity behaviour
        # will produce different outputs after this fix; retrain.
        return paddle.squeeze(out, axis=-1)

In [7]:
class SelfAttention(parl.Model):
    """Linear Q/K/V projections followed by ``nn.MultiHeadAttention``."""

    def __init__(self, hidden_dim, num_heads):
        super().__init__()

        self.hidden_dim = hidden_dim
        self.num_heads = num_heads

        self.query_proj = nn.Linear(hidden_dim, hidden_dim)
        self.key_proj = nn.Linear(hidden_dim, hidden_dim)
        self.value_proj = nn.Linear(hidden_dim, hidden_dim)
        self.attention = nn.MultiHeadAttention(hidden_dim, num_heads)

    def forward(self, x):
        """Attend over ``x`` and return a tensor with the same axis order.

        The first two axes are swapped before the attention layer and
        swapped back afterwards.

        NOTE(review): paddle.nn.MultiHeadAttention documents its inputs as
        [batch_size, sequence_length, embed_dim]. After the swap the caller's
        first axis sits in the sequence position — confirm this is intended.
        """
        swap_first_two = [1, 0, 2]

        q, k, v = (
            proj(x).transpose(swap_first_two)
            for proj in (self.query_proj, self.key_proj, self.value_proj)
        )

        attended = self.attention(q, k, v)

        # Restore the caller's axis order.
        return attended.transpose(swap_first_two)

In [8]:
# Clamp bounds for the actor's log-std output:
# upper and lower limit applied with paddle.clip in Actor.forward.
LOG_SIG_MAX = 2.0
LOG_SIG_MIN = -20.0


class StockModel(parl.Model):
    """Actor-critic container exposing the policy and value networks."""

    def __init__(self, obs_dim, action_dim):
        super().__init__()
        self.actor_model = Actor(obs_dim, action_dim)
        self.critic_model = Critic(obs_dim, action_dim)

    def policy(self, obs):
        """Forward ``obs`` through the actor and return its output."""
        return self.actor_model(obs)

    def value(self, obs, action):
        """Forward ``(obs, action)`` through the critic and return its output."""
        return self.critic_model(obs, action)

    def get_actor_params(self):
        """Return the actor network's parameters."""
        return self.actor_model.parameters()

    def get_critic_params(self):
        """Return the critic network's parameters."""
        return self.critic_model.parameters()
    


class Actor(parl.Model):
    """Policy network producing a tanh-squashed mean and a clipped log-std."""

    def __init__(self, obs_dim, action_dim, hidden_dim=17, num_heads=17):
        super(Actor, self).__init__()
        # Not used by forward(); kept registered so the parameter set (and
        # therefore saved checkpoints) stays the same as before.
        self.self_attention = SelfAttention(hidden_dim, num_heads)
        self.l1 = nn.Linear(obs_dim, 256)
        self.res1 = ResidualBlock(256, 256)
        self.l2 = nn.Linear(256, 256)
        self.res2 = ResidualBlock(256, 256)
        self.mean_linear = nn.Linear(256, action_dim)
        self.std_linear = nn.Linear(256, action_dim)

    def forward(self, obs):
        """Compute the action distribution parameters for ``obs``.

        Args:
            obs: observation batch; the callers in this file pass
                ``[batch, obs_dim]``.

        Returns:
            ``(act_mean, act_log_std)``: the mean passed through ``tanh`` and
            the log-std clipped to ``[LOG_SIG_MIN, LOG_SIG_MAX]``.
        """
        # The previous forward() ran self-attention and an nn.LSTM that was
        # constructed (with fresh random weights) on every call, then
        # overwrote the result with l1(obs). That branch never influenced
        # the output, so it is dropped; the values computed here are the
        # same as before.
        # NOTE(review): if attention/LSTM features were meant to feed the
        # policy, the LSTM must be created once in __init__ and l1's input
        # width changed to match — that is an architecture change.
        x = F.relu(self.l1(obs))
        x = self.res1(x)
        x = F.relu(self.l2(x))
        x = self.res2(x)

        act_mean = paddle.tanh(self.mean_linear(x))
        act_log_std = paddle.clip(
            self.std_linear(x), min=LOG_SIG_MIN, max=LOG_SIG_MAX)
        return act_mean, act_log_std


class Critic(parl.Model):
    """Twin Q-network sharing an attention + LSTM feature extractor.

    ``hidden_dim`` is the width of the concatenated ``[obs, action]`` vector
    (17 + 1 with the defaults used in this notebook) and must be divisible
    by ``num_heads``.
    """

    def __init__(self, obs_dim, action_dim, hidden_dim=18, num_heads=3):
        super(Critic, self).__init__()

        self.self_attention = SelfAttention(hidden_dim, num_heads)
        # Created once so its weights are registered, trained, saved and
        # synced to the target network. It was previously constructed inside
        # forward(), which fed both Q heads features from a different
        # randomly initialised, untrainable LSTM on every call.
        # NOTE(review): this adds parameters, so checkpoints saved before
        # this change do not contain them; retrain.
        self.lstm = nn.LSTM(hidden_dim, 256)

        # Q1 network
        self.l1 = nn.Linear(256, 256)
        self.res1 = ResidualBlock(256, 256)
        self.l2 = nn.Linear(256, 256)
        self.res2 = ResidualBlock(256, 256)
        self.l3 = nn.Linear(256, 1)

        # Q2 network
        self.l4 = nn.Linear(256, 256)
        self.res3 = ResidualBlock(256, 256)
        self.l5 = nn.Linear(256, 256)
        self.res4 = ResidualBlock(256, 256)
        self.l6 = nn.Linear(256, 1)

    def forward(self, obs, action):
        """Return the two Q-value estimates ``(q1, q2)`` for ``(obs, action)``.

        Args:
            obs: observation batch, concatenated with ``action`` on axis 1.
            action: action batch.
        """
        x = paddle.concat([obs, action], 1)
        # Insert a middle axis of length 1 for the attention and LSTM layers.
        x = self.self_attention(x.unsqueeze(axis=-2))

        # Extract features with the LSTM, then drop the length-1 axis again.
        x, _ = self.lstm(x)
        x = x.squeeze(axis=-2)

        # Q1
        q1 = F.relu(self.l1(x))
        q1 = self.res1(q1)
        q1 = F.relu(self.l2(q1))
        q1 = self.res2(q1)
        q1 = self.l3(q1)

        # Q2
        q2 = F.relu(self.l4(x))
        q2 = self.res3(q2)
        q2 = F.relu(self.l5(q2))
        q2 = self.res4(q2)
        q2 = self.l6(q2)

        return q1, q2


In [9]:
SEED = 678 # random seed applied to the training environment each episode
WARMUP_STEPS = 10000  # random actions are used until the replay memory holds this many transitions
EVAL_EPISODES = 3 # number of evaluation episodes
MEMORY_SIZE = int(1e6)  # replay memory capacity
BATCH_SIZE = 512  # batch size sampled from the replay memory
GAMMA = 0.8180365520412749 # discount factor
TAU = 0.0011142374179250148 # share of the online weights used when updating the target network
ACTOR_LR = 0.006732424266773069 # learning rate of the actor network
CRITIC_LR = 0.0006900033317865891 # learning rate of the critic network
alpha = 0.1406079080372761 # entropy regularisation coefficient (SAC parameter)
beta = 4.66  # passed to OAC as `beta`; presumably its optimism coefficient — confirm against OAC.py
EXPL_NOISE = 0.1  # stored on the agent as `expl_noise`; not read in the visible code
MAX_REWARD = -1e9 # initial "best reward" value (not referenced in the visible code)
file_name = f'OACmodel/OAC' # name under which the model is saved

In [10]:
# Network, algorithm, agent and replay memory used for training.
model = StockModel(state_dim, action_dim)

oac_settings = dict(
    gamma=GAMMA,
    tau=TAU,
    actor_lr=ACTOR_LR,
    critic_lr=CRITIC_LR,
    beta=beta,
    delta=23.53,
    alpha=alpha,
)
algorithm = OAC(model, **oac_settings)

agent = StockAgent(algorithm, act_dim=action_dim, expl_noise=EXPL_NOISE)

rpm = ReplayMemory(
    max_size=MEMORY_SIZE, obs_dim=state_dim, act_dim=action_dim)

In [11]:
# Seeds applied to the evaluation environment, one per episode; reused
# cyclically when more episodes are requested than seeds are listed.
eval_seed = [0, 53, 47, 99, 107, 1, 17, 57, 97, 179, 777]


@paddle.no_grad()
def run_evaluate_episodes(agent, env, eval_episodes):
    """Run ``eval_episodes`` episodes with ``agent.predict`` and average the return.

    Args:
        agent: object exposing ``predict(obs)``.
        env: environment exposing ``seed``, ``reset`` and ``step``.
        eval_episodes: number of episodes to run (must be >= 1).

    Returns:
        The total reward summed over all episodes divided by ``eval_episodes``.

    Raises:
        ValueError: if ``eval_episodes`` is smaller than 1.
    """
    if eval_episodes < 1:
        raise ValueError(f'eval_episodes must be >= 1, got {eval_episodes}')

    total_reward = 0.
    for epi in range(eval_episodes):
        env.seed(eval_seed[epi % len(eval_seed)])
        obs = env.reset()
        done = False
        while not done:
            # Take the magnitude of the action, exactly as the training and
            # test loops do, so the environment is evaluated under the same
            # action convention it is trained and tested with.
            action = abs(agent.predict(obs))
            obs, reward, done, _ = env.step(action)
            total_reward += reward
    avg_reward = total_reward / eval_episodes
    print(f'Evaluator: the average reward is {avg_reward:.3f} over {eval_episodes} episodes.')
    return avg_reward

In [12]:
# Run episode for training
import time
def run_train_episode(agent, env, rpm,episode_num):
    """Run one training episode, storing transitions and updating the agent.

    Args:
        agent: object exposing ``sample(obs)`` and ``learn(...)``.
        env: environment exposing ``action_space``, ``seed``, ``reset``, ``step``.
        rpm: replay memory exposing ``size``, ``append`` and ``sample_batch``.
        episode_num: episode counter, used only in the log line.

    Returns:
        ``(episode_reward, episode_steps)``.
    """
    action_dim = env.action_space.shape[0]
    obs = env.reset()
    # NOTE(review): env.seed reseeds NumPy's global RNG, so every episode
    # restarts the same random stream (warm-up actions and anything else
    # drawing from np.random) — confirm this is intended.
    env.seed(SEED)
    done = False
    episode_reward = 0
    episode_steps = 0
    while not done:
        episode_steps += 1
        # Random actions until the replay memory is warm, policy afterwards.
        if rpm.size() < WARMUP_STEPS:
            action = np.random.uniform(-1, 1, size=action_dim)
        else:
            action = agent.sample(obs)
        # The environment reads the action as a fraction, so use its magnitude.
        action = abs(action)

        next_obs, reward, done, info = env.step(action)
        terminal = float(done)

        # Store data in replay memory
        rpm.append(obs, action, reward, next_obs, terminal)

        obs = next_obs
        episode_reward += reward

        # Train agent after collecting sufficient data
        if rpm.size() >= WARMUP_STEPS:
            batch_obs, batch_action, batch_reward, batch_next_obs, batch_terminal = rpm.sample_batch(
                BATCH_SIZE)
            agent.learn(batch_obs, batch_action, batch_reward, batch_next_obs,
                        batch_terminal)

    # Log the episode summary and the environment's termination message.
    print(f'Learner: Episode {episode_num} done. The reward is {episode_reward:.3f}.')
    print(info['status'])
    return episode_reward, episode_steps

In [13]:
def do_train(agent, env, rpm):
    """Train until the step budget is spent, checkpointing the best evaluation.

    The budget is 100 times the row count of the global ``df``. After every
    episode the agent is evaluated on the global ``eval_env``; whenever the
    average evaluation reward improves, the agent is saved under
    ``./models/{file_name}.ckpt``. The per-episode wall time (training plus
    evaluation) is printed.
    """
    episodes_per_eval = 1
    passes = 100
    step_budget = len(df) * passes

    steps_done = 0
    episode_num = 0
    best_avg_reward = -1e9
    while steps_done < step_budget:
        episode_num += 1
        started = time.perf_counter()

        # Train episode
        episode_reward, episode_steps = run_train_episode(agent, env, rpm, episode_num)
        steps_done += episode_steps

        if episode_num % episodes_per_eval == 0:
            avg_reward = run_evaluate_episodes(agent, eval_env, EVAL_EPISODES)
            if avg_reward > best_avg_reward:
                best_avg_reward = avg_reward
                print('Saving best model!')
                agent.save(f"./models/{file_name}.ckpt")

        elapsed = time.perf_counter() - started
        print("运行时间：", elapsed)

do_train(agent, env, rpm)

  "When training, we now always track global mean and variance.")


KeyboardInterrupt: 

In [14]:
def run_test_episodes(agent, env, eval_episodes,max_action_step = 200):
    """Run test episodes and return the average reward plus the trade logs.

    Args:
        agent: object exposing ``predict(obs)``.
        env: environment exposing ``seed``, ``reset`` and ``step``; the
            ``info`` dict returned by ``step`` must contain the keys
            ``'profit'``, ``'buy'``, ``'sell'``, ``'buy_v'``, ``'sell_v'``
            and ``'get'``.
        eval_episodes: number of episodes to run (must be >= 1).
        max_action_step: an episode is cut off (and ``'over'`` printed)
            once this many steps have been taken.

    Returns:
        ``(avg_reward, Buy, Sell, Buy_v, Sell_v, get)`` where the last five
        values are taken from the final ``info`` dict of the last episode.

    Raises:
        ValueError: if ``eval_episodes`` is smaller than 1.
    """
    if eval_episodes < 1:
        raise ValueError(f'eval_episodes must be >= 1, got {eval_episodes}')

    total_reward = 0.
    total_worth = 0.
    info = {}
    for _ in range(eval_episodes):
        obs = env.reset()
        env.seed(1)
        done = False
        steps_taken = 0
        while not done:
            # The environment reads the action as a fraction, so use its magnitude.
            action = abs(agent.predict(obs))
            obs, reward, done, info = env.step(action)
            total_reward += reward
            steps_taken += 1
            if steps_taken == max_action_step:
                print('over')
                break
        # `info['profit']` holds the net worth after the episode's last step.
        total_worth += info['profit']

    avg_reward = total_reward / eval_episodes
    avg_worth = total_worth / eval_episodes
    print(f'Evaluator: The average reward is {avg_reward:.3f} over {eval_episodes} episodes.')
    print(f'Evaluator: The average worth is {avg_worth:.3f} over {eval_episodes} episodes.')

    return (avg_reward, info['buy'], info['sell'], info['buy_v'],
            info['sell_v'], info['get'])

In [None]:
# Load the full test set.
test_df = pd.read_csv('oac/BTC_test_pro.csv')
# Build the environment on the test set.
# NOTE(review): this rebinds the global `env` that previously held the
# training environment; re-running the training cell afterwards would train
# on the test data.
env = StockTradingEnv(test_df)
# Restore the checkpoint written by do_train (./models/{file_name}.ckpt).
agent.restore('models/OACmodel/OAC.ckpt')
# Cap on the number of steps per test episode: one step per row of the test
# set (the rows printed above are 30 minutes apart, not one day).
max_action_step = len(test_df)
avg_reward,Buy,Sell,Buy_v,Sell_v,get = run_test_episodes(agent, env, EVAL_EPISODES,max_action_step)