In [1]:
#initialise for papermill
iteration = 4
error_type = "gauss"
num_iterations = 100_000
seed = 555

In [2]:
# Parameters
error_type = "both"
iteration = 7
num_iterations = 50000
seed = 735


In [3]:
# Early stopping parameters
patience = 100  
min_delta = 500 

In [4]:
import warnings
warnings.filterwarnings("ignore")

import os
import io
import tempfile
import shutil
import zipfile
# from google.colab import files
from datetime import datetime
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import yfinance as yf
import gym
import quanttrader as qt
import pyfolio as pf

import tensorflow as tf
from tf_agents.agents.dqn import dqn_agent
from tf_agents.drivers import py_driver
from tf_agents.drivers.dynamic_step_driver import DynamicStepDriver
from tf_agents.environments import tf_py_environment
from tf_agents.environments import suite_gym
from tf_agents.eval import metric_utils
from tf_agents.metrics import tf_metrics
from tf_agents.networks import sequential, q_network, network
from tf_agents.policies import py_tf_eager_policy
from tf_agents.policies import random_tf_policy
from tf_agents.policies import policy_saver
from tf_agents.replay_buffers import TFUniformReplayBuffer
from tf_agents.trajectories import trajectory
from tf_agents.specs import tensor_spec
from tf_agents.utils import common

import base64
import imageio
import IPython
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import PIL.Image
import pyvirtualdisplay

# iteration = 4
eval_interval = 1_00
log_interval = 1_00

gym.__version__, qt.__version__, pf.__version__

#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Gym trading env
Unlike live engine or backtest engine, where event loops are driven by live ticks or historical ticks,
here it is driven by step function, similar to
https://github.com/openai/gym/blob/master/gym/envs/classic_control/cartpole.py
The sequence is
1. obs <- env.reset()      # env; obs = OHLCV + technical indicators + holdings
2. action <- pi(obs)       # agent; action = target weights
3. news_obs, reward, <- step(action)      # env
    3.a action to orders   # sum(holdings) * new weights
    3.b fill orders        # portfolio rotates into new holdings
repeat 2, and 3 to interact between agent and env
"""
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import gym

tf.keras.utils.set_random_seed(seed)
tf.config.experimental.enable_op_determinism()

class TradingEnv(gym.Env):
    """
    Description:
        backtest gym engine
        it doesn't normalize; and expects a normalization layer
    Observation:
        Type: Box(lookback_window, n_assets*5+2)
        lookback_window x (n_assets*(ohlcv) + cash+npv)
        TODO: append trades, standing orders, etc
        TODO: stop/limit orders
    Actions:
        Type: Box(n_assets + 1)
        portfolio weights [w1,w2...w_k, cash_weight], add up to one
    Reward:
        pnl every day, similar to space-invaders
    Starting State:
        random timestamp between start_date and (end_date - run_window)
    Episode Termination:
        after predefined window
        If broke, no orders will send
    """
    def __init__(self, n: np.int32, df_obs_scaled : pd.DataFrame, df_exch : pd.DataFrame):
        assert n >= 2

        self._df_obs_scaled = df_obs_scaled     # observation; scaled outside along with TA indicators
        self._df_exch = df_exch                 # exch
        self._df_positions = df_exch * 0.0      # positions plus cash plus nav
        self._df_positions['Cash'] = 0.0
        self._df_positions['NAV'] = 0.0

        self._asset_syms = df_exch.columns
        self._n_assets = len(self._asset_syms)
        self._n_features = df_obs_scaled.shape[1] / self._n_assets      # assume same features across assets

        self._inital_cash = 100_000.0
        self._cash = self._inital_cash
        self._commission_rate = 0.0
        self._look_back = 10                        # observation lookback history
        self._warmup = 50                           # observation ramp-up due to e.g. 10 periods are required to cacl MA(10)
        self._maxsteps = 252                        # max steps in one episode

        self._max_nav_scaler = 1.0
        self._lock_init_step = False
        self._init_step = 0
        self._current_step = 0

        self._n = n             # n=2: buy/sell 100% or [0, 100%]; n=3: [0, 50%, 100%]; n=4: [0, 33.3%, 66.7%, 100%]
        self._pcts = [pct / (self._n-1) for pct in range(self._n)]

        self.action_space = gym.spaces.Discrete(n=self._n)
        # first col is open, second col is high, ..., last col is nav
        self.observation_space = gym.spaces.Box(low=-np.inf, high=np.inf, shape=(self._look_back, df_obs_scaled.shape[1]+1), dtype=np.float32)
                    
    def set_cash(self, cash: np.float32=100_000.0):
        self._inital_cash = cash
        self._cash = self._inital_cash

    def set_commission(self, comm: np.float32=0.0001):
        """
        commission plus slippage
        """
        self._commission_rate = comm

    def set_steps(self, n_lookback : np.int32=10, n_warmup : np.int32=50, n_maxsteps: np.int32=252, n_init_step: np.int32=0):
        self._lookback = n_lookback
        self._warmup = n_warmup
        self._maxsteps = n_maxsteps
        self._init_step = n_init_step
        if n_init_step > 0:
            assert n_init_step >= n_warmup
            self._lock_init_step = True

    def set_feature_scaling(self, max_nav_scaler : np.float32=1.0):
        self._max_nav_scaler = max_nav_scaler

    def _get_observation(self):
        """
        return an array of size self._lookback x features.
        Each column is a feature; last feature is NAV.
        Row is in time ascending order. That is, last row is self._current_step.
        """
        obs = self._df_obs_scaled.iloc[self._current_step-self._look_back+1:self._current_step+1].values
        obs = np.append(obs, self._df_positions.iloc[self._current_step-self._look_back+1:self._current_step+1][['NAV']].values / self._max_nav_scaler, axis=1)

        return obs

    def step(self, action):
        """
        move one step to the next timestamp, accordingly to action
        assume hft condition: execution at today 15:59:59, after observing today's ohl and (almost) close.
        execution immediately using market or market on close, no slippage.
        e.g., assume on 12/31/2019, 1/2/2020, and 1/3/2020 prices are $95, $100, $110. respectively. 
        The state or observation is prices of last two days. 
        We start on 1/2/2020 with $100,000.
        Then
        1. on 1/2/2020, obs = [$95, $100]         # obs <- env.reset()
        2. on 1/2/2020, based on the up-trend observation, we decide to buy. 
            Our allocation action w is [50%, 50%], or half in cash, half in stock.
        3. on 1/2/2020, the step function is      # news_obs, reward <- step(action)
            3.a buy order of $5,000 or 50 shares, filled at $100
            3.b stock market environment transits to 1/3/2020, new observation is [$100, $110];
                our stock is now worth $5,500, and total asset NAV is $10,500, or reward $500.
        :param action:
        :return:
        """
        done = False

        current_size = int(self._df_positions.iloc[self._current_step][self._asset_syms].item())
        current_cash = self._df_positions.iloc[self._current_step]['Cash']
        current_price = self._df_exch.iloc[self._current_step].item()

        # rebalance
        current_nav = current_cash + current_price * current_size       # should equal to the nav column
        new_size = int(np.floor(current_nav * self._pcts[action] / current_price))       # odd size allowed; action[-1] is cash
        delta_size = new_size - current_size
        current_commission = np.abs(delta_size) * current_price * self._commission_rate
        new_cash = current_cash - delta_size * current_price - current_commission

        # move to next timestep
        self._current_step += 1
        new_price = self._df_exch.iloc[self._current_step].item()
        new_nav = new_cash + new_price * new_size
        reward = (new_price - current_price) * new_size - current_commission     # commission is penalty
        info = {'step': self._current_step, 'time': self._df_obs_scaled.index[self._current_step],
                'old_price': current_price, 'old position': current_size, 'old_cash': current_cash, 'old_nav': current_nav,
                'price': new_price, 'position': new_size, 'cash': new_cash, 'nav': new_nav,
                'transaction': delta_size, 'commission': current_commission, 'nav_diff': new_nav-current_nav}     # reward = new_nav - current_nav

        # reward = reward / self._max_nav_scaler
        self._df_positions.loc[self._df_positions.index[self._current_step], self._asset_syms] = new_size
        self._df_positions['Cash'][self._current_step] = new_cash
        self._df_positions['NAV'][self._current_step] = new_nav

        if self._current_step - self._init_step >= self._maxsteps:      # e.g. init=3, current=7, _maxsteps=4
            done = True
        if self._current_step == self._df_exch.shape[0]-1:              # end of data
            done = True

        # s'
        new_state = self._get_observation()

        return new_state, reward, done, info

    def reset(self):
        """
        random start time
        """
        self._cash = self._inital_cash
        self._df_positions = self._df_exch * 0.0
        self._df_positions['Cash'] = 0.0
        self._df_positions['NAV'] = 0.0
        
        if not self._lock_init_step:
            self._init_step = np.random.randint(low=self._warmup-1, high=self._df_obs_scaled.shape[0]-self._maxsteps)    # low (inclusive) to high (exclusive)
        self._current_step = self._init_step

        self._df_positions['Cash'][:self._current_step+1] = self._cash
        self._df_positions['NAV'][:self._current_step+1] = self._cash

        # return current_step
        return self._get_observation()

    def render(self, mode='human'):
        plt.rcParams.update({'font.size': 10})  # Adjust font size globally
        fig, ax = plt.subplots(2, 1, figsize=(15, 8), gridspec_kw={'height_ratios': [3, 1]})  # Increase figure size
        fig.suptitle(f'PPO Agent: {error_type.capitalize()} Error, Dataset {iteration}, {dataset_type}', fontsize=12)        # Adjust layout
        plt.subplots_adjust(
            left=0.1,  # left side margin
            right=0.9,  # right side margin
            bottom=0.1,  # bottom margin
            top=0.9,  # top margin
            hspace=0.4,  # height space between subplots
        )
        # fig.tight_layout()
        x_left = self._init_step
        x_right = self._current_step+1
        x_end = min(self._df_exch.shape[0] - 1, self._init_step + self._maxsteps)
        # x_end = min(self._df_exch.shape[0], self._init_step+self._maxsteps+1)
        df_price = self._df_exch[x_left:x_right]
        df_nav = self._df_positions['NAV'][x_left:x_right]

        # Market Price and Trading Signals
        ax[0].set_title('Market Price and Trading Signals')
        ax[0].set_xlabel('Time')  # Set x-axis label
        ax[0].set_ylabel('Price')  # Set y-axis label
        ax[0].tick_params(
            axis='x',          
            which='both',      
            bottom=True,  # Enable ticks along the bottom edge
            top=False,         
            labelbottom=True)  # Enable labels along the bottom edge
        ax[0].set_xlim([self._df_exch.index[x_left], self._df_exch.index[x_end]])
        ax[0].set_ylim([max(self._df_exch[x_left:x_end].min().item()-5, 0), self._df_exch[x_left:x_end].max().item()+5])
        df_position = self._df_positions['NAV'][x_left:x_right]
        df_position_diff = df_position - df_position.shift(1)
        df_buy = df_price[df_position_diff > 0]
        df_sell = df_price[df_position_diff < 0]
        ax[0].plot(df_price, color='black', label='Price')
        ax[0].plot(df_buy, '^', markersize=5, color='r')
        ax[0].plot(df_sell, 'v', markersize=5, color='g')

        # Net Asset Value (NAV) Over Time
        ax[1].set_title('Net Asset Value (NAV) Over Time')
        ax[1].set_xlabel('Time')  # Set x-axis label
        ax[1].set_ylabel('NAV')  # Set y-axis label
        ax[1].set_xlim([self._df_exch.index[x_left], self._df_exch.index[x_end]])
        ax[1].plot(df_nav, color='black', label='NAV')

        #plt.pause(0.001)

        # https://stackoverflow.com/questions/7821518/matplotlib-save-plot-to-numpy-array
        fig.canvas.draw()
        data = np.frombuffer(fig.canvas.tostring_rgb(), dtype=np.uint8)
        data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,))
        plt.close()
        return data

    def close(self):
        pass

# Render function for trades

def embed_mp4(filename):
  """Embeds an mp4 file in the notebook."""
  video = open(filename,'rb').read()
  b64 = base64.b64encode(video)
  tag = '''
  <video width="640" height="480" controls>
    <source src="data:video/mp4;base64,{0}" type="video/mp4">
  Your browser does not support the video tag.
  </video>'''.format(b64.decode())

  return IPython.display.HTML(tag)

def create_policy_eval_video(env, policy, filename, num_episodes=5, fps=30):
  filename = "results/videos/" +filename + ".mp4"
  with imageio.get_writer(filename, fps=fps) as video:
    for _ in range(num_episodes):
      time_step = env.reset()
      video.append_data(env.pyenv.envs[0].render())

      while not time_step.is_last():
        action_step = policy.action(time_step)
        time_step = env.step(action_step.action)
        video.append_data(env.pyenv.envs[0].render())

  return embed_mp4(filename)

def load_data():
    from datetime import timedelta
    import ta

    start_date = datetime(2010, 1, 1)
    end_date = datetime(2020, 12, 31)
    syms = ['SPY']
    max_price_scaler = 5_000.0
    max_price_scaler = 1
    max_volume_scaler = 1.5e8
    df_obs = pd.DataFrame()             # observation
    df_exch = pd.DataFrame()            # exchange; for order match

    for sym in syms:
        df = yf.download(sym, start=start_date, end=end_date)
        df.index = pd.to_datetime(df.index) + timedelta(hours=15, minutes=59, seconds=59)

        df_exch = pd.concat([df_exch, df['Close'].rename(sym)], axis=1)

        df['Open'] = df['Adj Close'] / df['Close'] * df['Open'] / max_price_scaler
        df['High'] = df['Adj Close'] / df['Close'] * df['High'] / max_price_scaler
        df['Low'] = df['Adj Close'] / df['Close'] * df['Low'] / max_price_scaler
        df['Volume'] = df['Adj Close'] / df['Close'] * df['Volume'] / max_volume_scaler
        df['Close'] = df['Adj Close'] / max_price_scaler
        df = df[['Open', 'High', 'Low', 'Close', 'Volume']]
        df.columns = [f'{sym}:{c.lower()}' for c in df.columns]

        macd = ta.trend.MACD(close=df[f'{sym}:close'])
        df[f'{sym}:macd'] = macd.macd()
        df[f'{sym}:macd_diff'] = macd.macd_diff()
        df[f'{sym}:macd_signal'] = macd.macd_signal()

        rsi = ta.momentum.RSIIndicator(close=df[f'{sym}:close'])
        df[f'{sym}:rsi'] = rsi.rsi()

        bb = ta.volatility.BollingerBands(close=df[f'{sym}:close'], window=20, window_dev=2)
        df[f'{sym}:bb_bbm'] = bb.bollinger_mavg()
        df[f'{sym}:bb_bbh'] = bb.bollinger_hband()
        df[f'{sym}:bb_bbl'] = bb.bollinger_lband()

        atr = ta.volatility.AverageTrueRange(high=df[f'{sym}:high'], low=df[f'{sym}:low'], close=df[f'{sym}:close'])
        df[f'{sym}:atr'] = atr.average_true_range()

        df_obs = pd.concat([df_obs, df], axis=1)

    return df_obs, df_exch

def setindexdate(df):
    df['date'] = pd.to_datetime(df['date'])
    df = df.set_index('date', drop=True)
    df = df.drop(columns = 'tic')
    return df


def compute_avg_return(environment, policy, num_episodes=5):

  total_return = 0.0
  for _ in range(num_episodes):

    time_step = environment.reset()
    episode_return = 0.0
    zeros = 0
    ones = 0

    while not time_step.is_last():
      action_step = policy.action(time_step)
      if action_step.action.numpy()[0] == 1:
        ones+=1
      else:
        zeros+=1
      time_step = environment.step(action_step.action)
      episode_return += time_step.reward
    total_return += episode_return

  avg_return = total_return / num_episodes
  return avg_return.numpy()[0], zeros, ones






In [5]:
train = setindexdate(pd.read_parquet(f"datasets/rl_train_dataset{iteration}.parquet")).astype('float64')
trade = setindexdate(pd.concat([pd.read_parquet(f"datasets/rl_val_dataset{iteration}.parquet"),pd.read_parquet(f"datasets/rl_test_dataset{iteration}.parquet")])).astype('float64')

match(error_type):
    case "gauss": 
        train_df_obs = train.drop(columns=['errors', 'TEDRATE error', 'AAAFF error', 'DTB3 error',
       'S&P 500 COMPOSITE - DS DIVIDEND YIELD error', 'SnP PRICE error'])
        train_df_exch = train_df_obs[['open']]
        trade_df_obs = trade.drop(columns=['errors', 'TEDRATE error', 'AAAFF error', 'DTB3 error',
       'S&P 500 COMPOSITE - DS DIVIDEND YIELD error', 'SnP PRICE error'])
        trade_df_exch = trade_df_obs[['open']]
    case "ae": 
        train_df_obs = train.drop(columns=['gauss_score', 'TEDRATE error', 'AAAFF error', 'DTB3 error',
       'S&P 500 COMPOSITE - DS DIVIDEND YIELD error', 'SnP PRICE error'])
        train_df_exch = train_df_obs[['open']]
        trade_df_obs = trade.drop(columns=['gauss_score', 'TEDRATE error', 'AAAFF error', 'DTB3 error',
       'S&P 500 COMPOSITE - DS DIVIDEND YIELD error', 'SnP PRICE error'])
        trade_df_exch = trade_df_obs[['open']]
    case "both": 
        train_df_obs = train.drop(columns=['TEDRATE error', 'AAAFF error', 'DTB3 error',
       'S&P 500 COMPOSITE - DS DIVIDEND YIELD error', 'SnP PRICE error'])
        train_df_exch = train_df_obs[['open']]
        trade_df_obs = trade.drop(columns=['TEDRATE error', 'AAAFF error', 'DTB3 error',
       'S&P 500 COMPOSITE - DS DIVIDEND YIELD error', 'SnP PRICE error'])
        trade_df_exch = trade_df_obs[['open']]
    case "na":
        train_df_obs = train.drop(columns=['gauss_score', 'errors', 'TEDRATE error', 'AAAFF error', 'DTB3 error',
       'S&P 500 COMPOSITE - DS DIVIDEND YIELD error', 'SnP PRICE error'])
        train_df_exch = train_df_obs[['open']]
        trade_df_obs = trade.drop(columns=['gauss_score', 'errors', 'TEDRATE error', 'AAAFF error', 'DTB3 error',
       'S&P 500 COMPOSITE - DS DIVIDEND YIELD error', 'SnP PRICE error'])
        trade_df_exch = trade_df_obs[['open']]


df_obs = pd.concat([train_df_obs, trade_df_obs])
df_exch = pd.concat([train_df_exch, trade_df_exch])
look_back = 5
cash = 100_000.0
max_nav_scaler = cash

train_qt_env = TradingEnv(3, train_df_obs, train_df_exch)
train_qt_env.set_cash(cash)
train_qt_env.set_commission(0.0001)
train_qt_env.set_steps(n_lookback=10, n_warmup=50, n_maxsteps=250)
train_qt_env.set_feature_scaling(max_nav_scaler)

eval_qt_env = TradingEnv(3, df_obs, df_exch)
eval_qt_env.set_cash(cash)
eval_qt_env.set_commission(0.0001)
eval_qt_env.set_steps(n_lookback=10, n_warmup=50, n_maxsteps=2000, n_init_step=len(train_df_obs)) 
eval_qt_env.set_feature_scaling(max_nav_scaler)


train_qt_env = gym.wrappers.FlattenObservation(train_qt_env)
train_py_env = suite_gym.wrap_env(train_qt_env)
train_env = tf_py_environment.TFPyEnvironment(train_py_env)

eval_qt_env = gym.wrappers.FlattenObservation(eval_qt_env)
eval_py_env = suite_gym.wrap_env(eval_qt_env)
eval_env = tf_py_environment.TFPyEnvironment(eval_py_env)

learning_rate = 1e-5
num_eval_episodes = 10
replay_buffer_max_length = 100000

fc_layer_params = (100, 100)
action_tensor_spec = tensor_spec.from_spec(train_env.action_spec())
num_actions = action_tensor_spec.maximum - action_tensor_spec.minimum + 1

# Define a helper function to create Dense layers configured with the right
# activation and kernel initializer.
def dense_layer(num_units):
  return tf.keras.layers.Dense(
      num_units,
      activation=tf.keras.activations.relu,
      kernel_initializer=tf.keras.initializers.VarianceScaling(
          scale=2.0, mode='fan_in', distribution='truncated_normal'))

# QNetwork consists of a sequence of Dense layers followed by a dense layer
# with `num_actions` units to generate one q_value per available action as
# its output.
dense_layers = [dense_layer(num_units) for num_units in fc_layer_params]
q_values_layer = tf.keras.layers.Dense(
    num_actions,
    activation=None,
    kernel_initializer=tf.keras.initializers.RandomUniform(
        minval=-0.03, maxval=0.03),
    bias_initializer=tf.keras.initializers.Constant(-0.2))
q_net = sequential.Sequential(dense_layers + [q_values_layer])

optimizer = tf.keras.optimizers.legacy.Adam(learning_rate=learning_rate)

train_step_counter = tf.Variable(0)

agent = dqn_agent.DqnAgent(
    train_env.time_step_spec(),
    train_env.action_spec(),
    q_network=q_net,
    epsilon_greedy=0.01,
    gamma = 0.99,
    optimizer=optimizer,
    td_errors_loss_fn=common.element_wise_squared_loss,
    train_step_counter=train_step_counter)

agent.initialize()

print(q_net.summary())

eval_policy = agent.policy
collect_policy = agent.collect_policy

replay_buffer = TFUniformReplayBuffer(
    data_spec=agent.collect_data_spec,
    batch_size=train_env.batch_size,
    max_length=100_000)

train_env.reset()

dataset = replay_buffer.as_dataset(
    num_parallel_calls=3,
    sample_batch_size=64,
    num_steps=2).prefetch(3)
iterator = iter(dataset)
# Create a driver to collect experience.
collect_driver = DynamicStepDriver(
    train_env,
    agent.collect_policy,
    observers=[replay_buffer.add_batch],
    num_steps=5) # collect 4 steps for each training iteration

agent.train_step_counter.assign(0)
time_step = None
policy_state = agent.collect_policy.get_initial_state(train_env.batch_size)

steps_since_last_improvement = 0
best_metric = -float('inf')

returns = np.array([])
metrics = []

while True:
    # Collect a few steps using collect_policy and save to the replay buffer.
    time_step, policy_state = collect_driver.run(time_step, policy_state)

    # Sample a batch of data from the buffer and update the agent's network.
    experience, unused_info = next(iterator)
    train_loss = agent.train(experience).loss

    step = agent.train_step_counter.numpy()
    if step % log_interval == 0:
        avg_return = compute_avg_return(train_env, agent.policy, num_episodes=1)[0]
        returns = np.append(returns, avg_return)
        
        # Store metrics in the list
        metrics.append({'Step': step, 'Average Return': avg_return, 'Train Loss': train_loss.numpy()})

        # Check if there is an improvement
        if avg_return >= (best_metric + min_delta):
            best_metric = avg_return
            steps_since_last_improvement = 0
        else:
            steps_since_last_improvement += 1

        # Early stopping check
        if steps_since_last_improvement >= patience:
            print("Early stopping triggered")
            break

    if step > num_iterations:
        break

# Define the CSV file path
csv_file_path = f'training_metrics_{iteration}_{error_type}.csv'

# Convert the metrics list to a DataFrame
df_metrics = pd.DataFrame(metrics)

# Save the DataFrame to a CSV file
df_metrics.to_csv(csv_file_path, index=False)

print(f"Metrics saved to {csv_file_path}")

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               multiple                  22100     
                                                                 
 dense_1 (Dense)             multiple                  10100     
                                                                 
 dense_2 (Dense)             multiple                  303       
                                                                 
Total params: 32503 (126.96 KB)
Trainable params: 32503 (126.96 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None
Instructions for updating:
Use `tf.data.Dataset.counter(...)` instead.


Instructions for updating:
Use `as_dataset(..., single_deterministic_pass=False) instead.


Instructions for updating:
back_prop=False is deprecated. Consider using tf.stop_gradient instead.
Instead of:
results = tf.foldr(fn, elems, back_prop=False)
Use:
results = tf.nest.map_structure(tf.stop_gradient, tf.foldr(fn, elems))


Early stopping triggered
Metrics saved to training_metrics_7_both.csv


In [6]:
# tf_policy_saver = policy_saver.PolicySaver(agent.policy)
# tf_policy_saver.save(f"/Users/justin/Desktop/URECA/saved_models/DRL_{error_type}_trained_agent_"+str(iteration))

dataset_type = 'Train'
create_policy_eval_video(train_env, agent.policy, f"DRL_{error_type}_trained_agent_train_"+str(iteration), num_episodes=1)
print(compute_avg_return(train_env, agent.policy, num_episodes=1))

x_left = train_env.pyenv.envs[0].env._init_step
x_right = train_env.pyenv.envs[0].env._current_step     # _maxsteps+1
df_price = train_env.pyenv.envs[0].env._df_exch[x_left:x_right].copy()
df_reinforcement = train_env.pyenv.envs[0].env._df_positions['NAV'][x_left:x_right].copy()
df_orders = train_env.pyenv.envs[0].env._df_positions.copy()
df_orders['action'] = df_orders['open'].diff().fillna(0)
df_all = pd.concat([df_reinforcement, df_price], axis=1)
df_all.columns = ['tf-agent', 'benchmark']
df_ret = df_all / df_all.shift(1) - 1
df_ret = df_ret[1:]

agent_perf_stats = pf.timeseries.perf_stats(df_ret['tf-agent'])
benchmark_perf_stats = pf.timeseries.perf_stats(df_ret['benchmark'])
perf_stats = pd.concat([agent_perf_stats, benchmark_perf_stats], axis=1)
perf_stats.columns = ['tf-agent', 'benchmark']
perf_stats

df_all.to_parquet(path=f"results/DRL_{error_type}_returns_train_{iteration}.parquet")
df_orders.to_parquet(path=f"results/DRL_{error_type}_orders_train_{iteration}.parquet")

dataset_type = 'Test'
create_policy_eval_video(eval_env, agent.policy, f"DRL_{error_type}_trained_agent_test_"+str(iteration), num_episodes=1)
print(compute_avg_return(eval_env, agent.policy, num_episodes=1))

x_left = eval_env.pyenv.envs[0].env._init_step
x_right = eval_env.pyenv.envs[0].env._current_step     # _maxsteps+1
df_price = eval_env.pyenv.envs[0].env._df_exch[x_left:x_right].copy()
df_reinforcement = eval_env.pyenv.envs[0].env._df_positions['NAV'][x_left:x_right].copy()
df_orders = eval_env.pyenv.envs[0].env._df_positions.copy()
df_orders['action'] = df_orders['open'].diff().fillna(0)
df_all = pd.concat([df_reinforcement, df_price], axis=1)
df_all.columns = ['tf-agent', 'benchmark']
df_ret = df_all / df_all.shift(1) - 1
df_ret = df_ret[1:]

agent_perf_stats = pf.timeseries.perf_stats(df_ret['tf-agent'])
benchmark_perf_stats = pf.timeseries.perf_stats(df_ret['benchmark'])
perf_stats = pd.concat([agent_perf_stats, benchmark_perf_stats], axis=1)
perf_stats.columns = ['tf-agent', 'benchmark']
perf_stats

df_all.to_parquet(path=f"results/DRL_{error_type}_returns_test_{iteration}.parquet")
df_orders.to_parquet(path=f"results/DRL_{error_type}_orders_test_{iteration}.parquet")





(1824.2832, 196, 54)


(-2365.6384, 60, 12)
