In [5]:
############################### Import libraries ###############################


import os
import glob
import time
from datetime import datetime

import torch
import torch.nn as nn
from torch.distributions import MultivariateNormal
from torch.distributions import Categorical

import numpy as np
from env.make_env import TradingEnv
import env.config as default_config

import gym
from net import RolloutBuffer, ActorCritic, PPO
# import roboschool
# import pybullet_envs


################################## set device ##################################

print("============================================================================================")

# Select the compute device: the first CUDA GPU when one is available, the CPU otherwise.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
    # Release unused cached GPU memory held by the allocator before training starts.
    torch.cuda.empty_cache()
    print("Device set to : " + str(torch.cuda.get_device_name(device)))
else:
    device = torch.device('cpu')
    print("Device set to : cpu")

print("============================================================================================")


################################### Training ###################################


####### initialize environment hyperparameters ######

env_name = "sim_trade"
has_continuous_action_space = False   # discrete action space

max_ep_len = 400                      # max timesteps in one episode
max_training_timesteps = 100_000      # break training loop if timesteps > max_training_timesteps

# Reporting cadence, expressed in number of environment timesteps.
# Note: print/log frequencies should be greater than max_ep_len.
print_freq = 4 * max_ep_len           # print avg reward in the interval
log_freq = 2 * max_ep_len             # log avg reward in the interval
save_model_freq = 20_000              # save model frequency

# Forwarded unchanged to the PPO constructor; no action std is configured for this run.
action_std = None

#####################################################


################ PPO hyperparameters ################

update_timestep = 4 * max_ep_len      # update policy every n timesteps
K_epochs = 40                         # update policy for K epochs
eps_clip = 0.2                        # clip parameter for PPO
gamma = 0.99                          # discount factor

lr_actor = 3e-4                       # learning rate for actor network
lr_critic = 1e-3                      # learning rate for critic network

random_seed = 0                       # set random seed if required (0 = no random seed)

#####################################################


print("training environment name : " + env_name)

# The environment is not instantiated in this notebook, so the state/action
# dimensions are hard-coded instead of being read from the env spaces:
# env = TradingEnv()
# state_dim = env.observation_space.shape[0]
# action_dim = env.action_space.n
state_dim = 7      # state space dimension
action_dim = 11    # action space dimension
###################### logging ######################

#### log files for multiple runs are NOT overwritten

# Per-environment log directory. The trailing '/' is kept on purpose so that
# plain string concatenation with a file name yields a valid path.
log_dir = "PPO_logs" + '/' + env_name + '/'
# exist_ok=True creates any missing parent directory in a single call and does
# not fail when the directory already exists (no check-then-create race).
os.makedirs(log_dir, exist_ok=True)


#### get number of log files in log directory
# Only files located directly inside log_dir are counted; that count is used
# as the number of the current run.
# NOTE(review): if earlier log files were deleted, this number can collide with
# an existing file name — confirm whether that matters for this project.
current_num_files = next(os.walk(log_dir))[2]
run_num = len(current_num_files)


#### create new log file for each run
# log_dir already ends with '/', so no extra separator is inserted here
# (the previous form produced ".../sim_trade//PPO_...").
log_f_name = log_dir + 'PPO_' + env_name + "_log_" + str(run_num) + ".csv"

print("current logging run number for " + env_name + " : ", run_num)
print("logging at : " + log_f_name)

#####################################################


################### checkpointing ###################

run_num_pretrained = 0      #### change this to prevent overwriting weights in same env_name folder

# Per-environment checkpoint directory; the trailing '/' is kept because the
# checkpoint paths below are built by string concatenation.
directory = "PPO_preTrained" + '/' + env_name + '/'
# exist_ok=True creates any missing parent directory in a single call and does
# not fail when the directory already exists (no check-then-create race).
os.makedirs(directory, exist_ok=True)


# File names encode the environment name, the random seed and the run number.
checkpoint_path = directory + "PPO_{}_{}_{}.pth".format(env_name, random_seed, run_num_pretrained)
training_checkpoint_path = directory + "training_PPO_{}_{}_{}.pth".format(env_name, random_seed, run_num_pretrained)
print("save checkpoint path : " + checkpoint_path)

#####################################################


############# print all hyperparameters #############

print("--------------------------------------------------------------------------------------------")

print("max training timesteps : ", max_training_timesteps)
print("max timesteps per episode : ", max_ep_len)

print("model saving frequency : " + str(save_model_freq) + " timesteps")
print("log frequency : " + str(log_freq) + " timesteps")
print("printing average reward over episodes in last : " + str(print_freq) + " timesteps")

print("--------------------------------------------------------------------------------------------")

print("state space dimension : ", state_dim)
print("action space dimension : ", action_dim)

print("--------------------------------------------------------------------------------------------")

print("Initializing a discrete action space policy")

print("--------------------------------------------------------------------------------------------")

print("PPO update frequency : " + str(update_timestep) + " timesteps") 
print("PPO K epochs : ", K_epochs)
print("PPO epsilon clip : ", eps_clip)
print("discount factor (gamma) : ", gamma)

print("--------------------------------------------------------------------------------------------")

print("optimizer learning rate actor : ", lr_actor)
print("optimizer learning rate critic : ", lr_critic)

# Seeding is skipped entirely when random_seed is 0 (0 = no random seed).
if random_seed:
    print("--------------------------------------------------------------------------------------------")
    print("setting random seed to ", random_seed)
    torch.manual_seed(random_seed)
    # No in-process environment object is created in this notebook (the
    # `env = TradingEnv()` line is commented out), so an unconditional
    # `env.seed(...)` would raise NameError. Seed an environment only when a
    # global `env` exposing `seed` actually exists.
    _env = globals().get("env")
    if _env is not None and hasattr(_env, "seed"):
        _env.seed(random_seed)
    np.random.seed(random_seed)

#####################################################

print("============================================================================================")

################# training procedure ################

# Build the PPO agent from the hyperparameters configured earlier in this cell.
ppo_agent = PPO(
    state_dim, action_dim,
    lr_actor, lr_critic,
    gamma, K_epochs, eps_clip,
    device, action_std,
)
# Save the initial (untrained) model to the training checkpoint path.
ppo_agent.save(training_checkpoint_path)

# Track total training time.
# NOTE(review): datetime.now() returns local time, while the message below says GMT.
start_time = datetime.now().replace(microsecond=0)
print("Started training at (GMT) : ", start_time)

print("============================================================================================")


# Logging file: opened for writing and left open; nothing in this cell closes it.
log_f = open(log_f_name, "w+")
log_f.write('episode,timestep,reward\n')

# Running totals used for printing and logging.
print_running_reward = print_running_episodes = 0
log_running_reward = log_running_episodes = 0

# Global counters.
time_step = i_episode = 0

from tqdm import tqdm
from env.utils import time_delete
import subprocess
import json
import pandas as pd
from datetime import datetime

def convert_time_format(time_str):
    """Turn a plan time such as 'z 09:30:00' into a compact digit string.

    The first two characters (the 'z ' prefix in the example) are dropped, the
    colons are removed, '000' is appended and all leading '0' characters are
    stripped, e.g. 'z 09:30:00' -> '93000000'.

    Note that a time made only of zeros ends up as an empty string, because
    every leading zero is stripped.
    """
    # Drop the two-character prefix, then glue the colon-separated fields together.
    digits = "".join(time_str[2:].split(':'))
    # Append the '000' suffix and strip the leading zeros.
    return (digits + '000').lstrip('0')
def formate_date(date):
    """Reformat a slash-separated date into a compact 'YYYYMMDD' string.

    input: date parsed with the '%Y/%m/%d' pattern (example: '2023/8/29')
    output: the same date rendered with '%Y%m%d' (example: '20230829')
    """
    return datetime.strptime(date, '%Y/%m/%d').strftime('%Y%m%d')
def adjust_row(row):
    """Extract the fields of one trading plan from an (index, record) pair.

    `row` is indexed at position 1 to obtain the record (the training loop
    passes items produced by `DataFrame.iterrows()`). The record is read
    through the keys 'date', 'sym', 'side', 'start-time', 'end-time' and
    'volume'.

    Returns a tuple (date, sym, side, start_time, end_time, volume) where the
    date went through `formate_date`, the side is lower-cased and both times
    went through `convert_time_format`.
    """
    record = row[1]
    return (
        formate_date(record['date']),
        record['sym'],
        record['side'].lower(),
        convert_time_format(record['start-time']),
        convert_time_format(record['end-time']),
        record['volume'],
    )
    
def set_config(default_config, date, sym, side, start_time, end_time, volume):
    """Write one plan's parameters into `default_config` and return it.

    The object passed in is modified in place (no copy is made) and the very
    same object is returned.

    - StrategyParam.trading_day <- `date` with every '-' removed
    - StrategyParam.instrument  <- `sym`
    - TradeParam.volume         <- int(0.05 * volume)
    - TradeParam.start_time / TradeParam.end_time <- `start_time` / `end_time`

    `side` is accepted but not used by this function.
    """
    strategy = default_config.StrategyParam
    strategy.trading_day = date.replace('-', '')
    strategy.instrument = sym

    trade = default_config.TradeParam
    trade.volume = int(0.05 * volume)
    # Disabled split-number computation, kept for reference:
    # split_num = int(min(config.TradeParam.volume//100, time_delete(config.TradeParam.end_time, config.TradeParam.start_time)//(3*config.StrategyParam.time_window)))
    # config.TradeParam.split_num = split_num
    trade.start_time = start_time
    trade.end_time = end_time
    return default_config
def adjust_sample(t_result, tot_vwap):
    """Flatten one step record into a (state, action, action_logprob, state_val, reward) tuple.

    `t_result` is indexed at three positions:
      - t_result[0]['features'] is returned as the state,
      - t_result[1] provides 'action', 'action_logprob' and 'state_val',
      - t_result[2] is a pair used to compute the reward.

    The reward is t_result[2][0] - t_result[2][1] * tot_vwap.
    """
    observation, decision, reward_data = t_result[0], t_result[1], t_result[2]
    reward = reward_data[0] - reward_data[1] * tot_vwap
    return (
        observation['features'],
        decision['action'],
        decision['action_logprob'],
        decision['state_val'],
        reward,
    )
    
def _extract_json_payload(stdout):
    """Return the text printed between the JSON_OUTPUT_START and JSON_OUTPUT_END marker lines.

    Lines containing a marker are dropped, and lines outside a START/END pair
    are ignored. The captured lines are joined with newlines; the result is an
    empty string when nothing was captured.
    """
    payload_lines = []
    capturing = False
    for line in stdout.splitlines():
        if "JSON_OUTPUT_START" in line:
            capturing = True
            continue
        if "JSON_OUTPUT_END" in line:
            capturing = False
            continue
        if capturing:
            payload_lines.append(line)
    return "\n".join(payload_lines)


# train set
train_set = pd.read_csv('./plans/train.csv')
# `total` lets tqdm show real progress; a bare iterrows() generator has no length.
for row in tqdm(train_set.iterrows(), total=len(train_set), desc='training'):
    date, sym, side, start_time, end_time, volume = adjust_row(row)
    config = set_config(default_config, date, sym, side, start_time, end_time, volume)
    # Run the simulator in a separate Python process, pointing it at the
    # current training checkpoint.
    # NOTE(review): the direction is fixed to 'sell' although every plan row
    # carries its own `side` — confirm this is intended.
    command = ['python', './env/make_env.py',
               '--inst', str(config.StrategyParam.instrument),
               '--td', str(config.StrategyParam.trading_day),
               '--volume', str(config.TradeParam.volume),
               '--start_time', str(config.TradeParam.start_time),
               '--end_time', str(config.TradeParam.end_time),
               '--policy', os.path.abspath(training_checkpoint_path),
               '--direction', 'sell'
               ]
    print("command:", '\n', command)
    result = subprocess.run(command, capture_output=True, text=True)

    json_output = _extract_json_payload(result.stdout)
    if not json_output.strip():
        # Without this check a failed run surfaces as an opaque JSONDecodeError
        # on an empty string; report the exit code and stderr instead.
        raise RuntimeError(
            "make_env.py produced no JSON payload (exit code {}); stderr:\n{}".format(
                result.returncode, result.stderr
            )
        )
    output_data = json.loads(json_output)
    train_data = output_data["train_data"]
    tot_vwap = output_data["tot_vwap"]

    for i, t_result in enumerate(train_data):
        state, action, action_logprob, state_val, reward = adjust_sample(t_result, tot_vwap)
        # The last step of the episode is terminal. The bound is len(train_data):
        # the previously used `result_output` is not defined anywhere in this
        # notebook. `done` is computed here but not consumed by this cell.
        done = (i == (len(train_data) - 1))
        print(state, action, action_logprob, state_val, reward)
    # Only the first plan is processed: the loop stops after one iteration.
    break

Device set to : cpu
training environment name : sim_trade
current logging run number for sim_trade :  2
logging at : PPO_logs/sim_trade//PPO_sim_trade_log_2.csv
save checkpoint path : PPO_preTrained/sim_trade/PPO_sim_trade_0_0.pth
--------------------------------------------------------------------------------------------
max training timesteps :  100000
max timesteps per episode :  400
model saving frequency : 20000 timesteps
log frequency : 800 timesteps
printing average reward over episodes in last : 1600 timesteps
--------------------------------------------------------------------------------------------
state space dimension :  7
action space dimension :  11
--------------------------------------------------------------------------------------------
Initializing a discrete action space policy
--------------------------------------------------------------------------------------------
PPO update frequency : 1600 timesteps
PPO K epochs :  40
PPO epsilon clip :  0.2
discount factor 

training: 0it [00:00, ?it/s]

command: 
 ['python', './env/make_env.py', '--inst', '000030.SZ', '--td', '20230829', '--volume', '9205', '--start_time', '93000000', '--end_time', '93930000', '--policy', '/mnt/huangchunyang/RL4Execution/PPO_preTrained/sim_trade/training_PPO_sim_trade_0_0.pth', '--direction', 'sell']


training: 0it [02:45, ?it/s]

[52444.84024972457, 110.0, 640300, 272300, 146.11639196202458, 0.0, 0.0] 7 -1.638455867767334 0.34816962480545044 128717.64622373879
[52623.865877712036, 110.0, 710300, 202800, 87.89197915623473, -0.1111, 0.1111111111111111] 4 -2.293360948562622 0.4894809126853943 128717.64622373879
[52697.597597597596, 105.0, -1787000, 99900, 53.09190145398826, -0.2222, 0.2222222222222222] 8 -2.1863625049591064 -0.517612099647522 239817.6462237388
[52631.1761684643, 100.0, -2246200, 194700, 58.94913061275798, -0.3333, 0.3333333333333333] 6 -2.223736524581909 -0.4946668744087219 17617.64622373879
[52426.4450867052, 100.0, -3328000, 69200, 47.69696007084728, -0.4444, 0.4444444444444444] 6 -2.2934205532073975 -0.517612099647522 -93482.35377626121
[52454.30847212165, 105.0, -3931900, 138100, 55.84576975922169, -0.5555, 0.5555555555555556] 2 -2.5273401737213135 -0.517612099647522 17617.64622373879
[52487.107623318385, 110.0, -2549766, 89200, 25.495097567963924, -0.6666, 0.6666666666666666] 8 -2.18636250495




In [19]:
# Show how each component of the last sample converts to a tensor and which
# dtype torch picks for it. These names come from the training cell above.
# In the recorded output the float values come out as float32, and the large
# reward is rounded (-204582.3537... is shown as -204582.3594).
for a in (state, action, action_logprob, state_val, reward):
    a_tensor = torch.tensor(a)
    print(type(a), a, a_tensor)
    print(a_tensor.dtype)

<class 'list'> [52392.99145299145, 105.0, -2074277, 175500, 93.90819985496474, -0.8888, 0.8888888888888888] tensor([ 5.2393e+04,  1.0500e+02, -2.0743e+06,  1.7550e+05,  9.3908e+01,
        -8.8880e-01,  8.8889e-01])
torch.float32
<class 'int'> 6 tensor(6)
torch.int64
<class 'float'> -2.223736524581909 tensor(-2.2237)
torch.float32
<class 'float'> -0.4946668744087219 tensor(-0.4947)
torch.float32
<class 'float'> -204582.3537762612 tensor(-204582.3594)
torch.float32


In [17]:
# Display the raw Python action next to its tensor form (shown as a tuple).
(action, torch.tensor(action))

(6, tensor(6))

In [15]:
# Suppose we have a list that mixes int and float values.
data = [1, 2.5, 3, 4.75]

# Holds one tensor per item, each created with a dtype matching the item's Python type.
tensor_list = []

for item in data:
    print(item)
    if isinstance(item, int):
        item_dtype = torch.int
    elif isinstance(item, float):
        item_dtype = torch.float
    else:
        # Items that are neither int nor float are skipped.
        continue
    tensor_list.append(torch.tensor(item, dtype=item_dtype))

# Finally, stack all the per-item tensors into a single tensor.
# In the recorded output the result is a float tensor.
final_tensor = torch.stack(tensor_list)

print(final_tensor)

1
2.5
3
4.75
tensor([1.0000, 2.5000, 3.0000, 4.7500])
