### PPO AGENT:

#### Load the dataset and preprocess the dataframe in the required format.

Technical indicators created for the environment include:

- RSI
- MACD
- Stoch_k
- OBV
- Upper_BB
- ATR_1
- ATR_2
- ATR_5
- ATR_10
- ATR_20

In [1]:
import pandas as pd
import numpy as np
import talib as ta
np.random.seed(42)

class TechnicalIndicators:
    def __init__(self, data):
        self.data = data

    def add_momentum_indicators(self):
        self.data['RSI'] = ta.RSI(self.data['Close'], timeperiod=14)
        self.data['MACD'], self.data['MACD_signal'], self.data['MACD_hist'] = ta.MACD(self.data['Close'], fastperiod=12, slowperiod=26, signalperiod=9)
        self.data['Stoch_k'], self.data['Stoch_d'] = ta.STOCH(self.data['High'], self.data['Low'], self.data['Close'],
                                                              fastk_period=14, slowk_period=3, slowd_period=3)

    def add_volume_indicators(self):
        self.data['OBV'] = ta.OBV(self.data['Close'], self.data['Volume'])

    def add_volatility_indicators(self):
        self.data['Upper_BB'], self.data['Middle_BB'], self.data['Lower_BB'] = ta.BBANDS(self.data['Close'], timeperiod=20)
        self.data['ATR_1'] = ta.ATR(self.data['High'], self.data['Low'], self.data['Close'], timeperiod=1)
        self.data['ATR_2'] = ta.ATR(self.data['High'], self.data['Low'], self.data['Close'], timeperiod=2)
        self.data['ATR_5'] = ta.ATR(self.data['High'], self.data['Low'], self.data['Close'], timeperiod=5)
        self.data['ATR_10'] = ta.ATR(self.data['High'], self.data['Low'], self.data['Close'], timeperiod=10)
        self.data['ATR_20'] = ta.ATR(self.data['High'], self.data['Low'], self.data['Close'], timeperiod=20)

    def add_trend_indicators(self):
        self.data['ADX'] = ta.ADX(self.data['High'], self.data['Low'], self.data['Close'], timeperiod=14)
        self.data['+DI'] = ta.PLUS_DI(self.data['High'], self.data['Low'], self.data['Close'], timeperiod=14)
        self.data['-DI'] = ta.MINUS_DI(self.data['High'], self.data['Low'], self.data['Close'], timeperiod=14)
        self.data['CCI'] = ta.CCI(self.data['High'], self.data['Low'], self.data['Close'], timeperiod=5)
    def add_moving_averages(self):
        self.data['SMA_10'] = ta.SMA(self.data['Close'], timeperiod=10)
        self.data['SMA_50'] = ta.SMA(self.data['Close'], timeperiod=50)
        self.data['EMA_10'] = ta.EMA(self.data['Close'], timeperiod=10)
        self.data['EMA_50'] = ta.EMA(self.data['Close'], timeperiod=50)
        self.data['WMA_10'] = ta.WMA(self.data['Close'], timeperiod=10)
        self.data['WMA_50'] = ta.WMA(self.data['Close'], timeperiod=50)

    def add_other_indicators(self):
        self.data['DLR'] = np.log(self.data['Close'] / self.data['Close'].shift(1))
        self.data['TWAP'] = self.data['Close'].expanding().mean()
        self.data['VWAP'] = (self.data['Volume'] * (self.data['High'] + self.data['Low']) / 2).cumsum() / self.data['Volume'].cumsum()

    def add_all_indicators(self):
        self.add_momentum_indicators()
        self.add_volume_indicators()
        self.add_volatility_indicators()
        self.add_moving_averages()
        self.add_trend_indicators()
        self.add_other_indicators()
        return self.data

In [2]:
# data = pd.read_csv('/Users/hao/Downloads/Blockhouse-Work-Trial-main/xnas-itch-20230703.tbbo.csv')
data = pd.read_csv('./xnas-itch-20230703.tbbo.csv')

reverse_data = False
if reverse_data:
    data = data.iloc[::-1].reset_index(drop=True)
# Preprocessing to create necessary columns
data['price']=data['price']/1e9
data['bid_px_00']=data['bid_px_00']/1e9
data['ask_px_00']=data['ask_px_00']/1e9

data['Close'] = data['price']
data['Volume'] = data['size']
data['High'] = data[['bid_px_00', 'ask_px_00']].max(axis=1)
data['Low'] = data[['bid_px_00', 'ask_px_00']].min(axis=1)
data['Open'] = data['Close'].shift(1).fillna(data['Close'])


ti = TechnicalIndicators(data)
df_with_indicators = ti.add_all_indicators()

market_features_df = df_with_indicators[50:]

print(market_features_df)

                   ts_recv             ts_event  rtype  publisher_id  \
50     1688371242324266534  1688371242324101963      1             2   
51     1688371247317894640  1688371247317729998      1             2   
52     1688371257325756491  1688371257325590403      1             2   
53     1688371257325756491  1688371257325590403      1             2   
54     1688371259762706298  1688371259762541862      1             2   
...                    ...                  ...    ...           ...   
59266  1688417954514485218  1688417954514320323      1             2   
59267  1688417961020718430  1688417961020553920      1             2   
59268  1688417973297905504  1688417973297741235      1             2   
59269  1688417996889779362  1688417996889614660      1             2   
59270  1688417998907430616  1688417998907265922      1             2   

       instrument_id action side  depth   price  size  ...      EMA_50  \
50                32      T    B      0  194.15    10  ...  1

In [3]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
numeric_columns = market_features_df.select_dtypes(include=['float64', 'int64']).columns

# normalize numeric data, save to a new dataframe
normalized_data = scaler.fit_transform(market_features_df[numeric_columns])
normalized_df = pd.DataFrame(normalized_data, columns=numeric_columns, index=market_features_df.index)
suffix = '_normalized'
normalized_df = normalized_df.add_suffix(suffix)

# concatenate new df with the old one
combined_df = pd.concat([market_features_df, normalized_df], axis=1)

pd.set_option('display.max_columns', None)



combined_df.head()

Unnamed: 0,ts_recv,ts_event,rtype,publisher_id,instrument_id,action,side,depth,price,size,flags,ts_in_delta,sequence,bid_px_00,ask_px_00,bid_sz_00,ask_sz_00,bid_ct_00,ask_ct_00,symbol,Close,Volume,High,Low,Open,RSI,MACD,MACD_signal,MACD_hist,Stoch_k,Stoch_d,OBV,Upper_BB,Middle_BB,Lower_BB,ATR_1,ATR_2,ATR_5,ATR_10,ATR_20,SMA_10,SMA_50,EMA_10,EMA_50,WMA_10,WMA_50,ADX,+DI,-DI,CCI,DLR,TWAP,VWAP,ts_recv_normalized,ts_event_normalized,rtype_normalized,publisher_id_normalized,instrument_id_normalized,depth_normalized,price_normalized,size_normalized,flags_normalized,ts_in_delta_normalized,sequence_normalized,bid_px_00_normalized,ask_px_00_normalized,bid_sz_00_normalized,ask_sz_00_normalized,bid_ct_00_normalized,ask_ct_00_normalized,Close_normalized,Volume_normalized,High_normalized,Low_normalized,Open_normalized,RSI_normalized,MACD_normalized,MACD_signal_normalized,MACD_hist_normalized,Stoch_k_normalized,Stoch_d_normalized,OBV_normalized,Upper_BB_normalized,Middle_BB_normalized,Lower_BB_normalized,ATR_1_normalized,ATR_2_normalized,ATR_5_normalized,ATR_10_normalized,ATR_20_normalized,SMA_10_normalized,SMA_50_normalized,EMA_10_normalized,EMA_50_normalized,WMA_10_normalized,WMA_50_normalized,ADX_normalized,+DI_normalized,-DI_normalized,CCI_normalized,DLR_normalized,TWAP_normalized,VWAP_normalized
50,1688371242324266534,1688371242324101963,1,2,32,T,B,0,194.15,10,130,164571,350459,194.02,194.15,180,400,2,1,AAPL,194.15,10,194.15,194.02,194.12,64.961352,0.012237,0.005054,0.007183,47.142857,26.825397,-3332.0,194.185746,194.0655,193.945254,0.13,0.129921,0.142483,0.136696,0.118353,194.04,194.0332,194.070777,194.037204,194.061273,194.04349,94.270606,8.115584,0.394096,107.142857,0.000155,194.034902,194.059871,0.0,0.0,0.0,0.0,0.0,0.0,0.944444,4e-06,1.0,0.00102,0.0,0.91129,0.940711,0.008079,0.011485,0.001742,0.0,0.944444,4e-06,0.940711,0.91129,0.93254,0.649616,0.667895,0.626625,0.556042,0.471429,0.268254,0.006927,0.943705,0.929107,0.905363,0.27907,0.278887,0.309876,0.315408,0.31707,0.912013,0.928627,0.924625,0.935534,0.916909,0.932679,0.940871,0.154466,0.008314,0.660714,0.432868,0.930453,0.958016
51,1688371247317894640,1688371247317729998,1,2,32,T,B,0,194.1,10,130,164642,354918,194.06,194.1,100,10,1,1,AAPL,194.1,10,194.1,194.06,194.15,55.742466,0.013947,0.006833,0.007115,57.97619,42.81746,-3342.0,194.18893,194.068,193.94707,0.09,0.109961,0.131986,0.132026,0.116935,194.048,194.033,194.07609,194.039667,194.072182,194.04611,94.018256,7.698755,0.373855,18.518519,-0.000258,194.036154,194.059909,0.000107,0.000107,0.0,0.0,0.0,0.0,0.924603,4e-06,1.0,0.001114,1.8e-05,0.927419,0.920949,0.004468,0.000259,0.0,0.0,0.924603,4e-06,0.920949,0.927419,0.944444,0.557427,0.677393,0.637301,0.555365,0.579762,0.428175,0.006916,0.944954,0.93012,0.906115,0.186047,0.232467,0.285325,0.303784,0.312922,0.915227,0.928545,0.926761,0.936557,0.921275,0.933759,0.938266,0.146532,0.007887,0.527778,0.300804,0.931326,0.958044
52,1688371257325756491,1688371257325590403,1,2,32,T,A,0,194.1,1,0,166088,363515,194.1,194.12,101,244,2,1,AAPL,194.1,1,194.12,194.1,194.1,55.742466,0.015128,0.008492,0.006636,65.47619,56.865079,-3342.0,194.191904,194.0705,193.949096,0.02,0.06498,0.109589,0.120823,0.112088,194.057,194.0328,194.080437,194.042033,194.081636,194.048737,93.871355,8.819513,0.369315,83.333333,0.0,194.037358,194.059918,0.000321,0.000321,0.0,0.0,0.0,0.0,0.924603,0.0,0.0,0.00303,5.2e-05,0.943548,0.928854,0.004513,0.006995,0.001742,0.0,0.924603,0.0,0.928854,0.943548,0.924603,0.557427,0.683951,0.64726,0.550643,0.654762,0.568651,0.006916,0.946121,0.931132,0.906953,0.023256,0.127861,0.232938,0.275895,0.298739,0.918843,0.928462,0.928509,0.937541,0.925058,0.934841,0.93675,0.167864,0.007791,0.625,0.383346,0.932167,0.95805
53,1688371257325756491,1688371257325590403,1,2,32,T,A,0,194.1,1,130,166088,363516,194.1,194.12,100,244,1,1,AAPL,194.1,1,194.12,194.1,194.1,55.742466,0.015881,0.009969,0.005911,62.5,61.984127,-3342.0,194.194672,194.073,193.951328,0.02,0.04249,0.091671,0.110741,0.107484,194.066,194.0326,194.083994,194.044306,194.089455,194.051373,93.734948,8.705674,0.364548,55.555556,0.0,194.038519,194.059928,0.000321,0.000321,0.0,0.0,0.0,0.0,0.924603,0.0,1.0,0.00303,5.2e-05,0.943548,0.928854,0.004468,0.006995,0.0,0.0,0.924603,0.0,0.928854,0.943548,0.924603,0.557427,0.688132,0.65613,0.543485,0.625,0.619841,0.006916,0.947206,0.932145,0.907877,0.023256,0.075559,0.191028,0.250795,0.285265,0.922459,0.92838,0.929939,0.938486,0.928187,0.935927,0.935342,0.165697,0.00769,0.583333,0.383346,0.932976,0.958057
54,1688371259762706298,1688371259762541862,1,2,32,T,B,0,194.12,50,130,164436,366351,194.1,194.12,99,244,1,1,AAPL,194.12,50,194.12,194.1,194.1,58.672524,0.017885,0.011553,0.006332,66.666667,64.880952,-3292.0,194.199345,194.0765,193.953655,0.02,0.031245,0.077337,0.101667,0.10311,194.078,194.035,194.090541,194.047274,194.099273,194.0548,93.608283,8.58632,0.35955,89.74359,0.000103,194.04,194.060393,0.000373,0.000373,0.0,0.0,0.0,0.0,0.93254,2.1e-05,1.0,0.000841,6.3e-05,0.943548,0.928854,0.004423,0.006995,0.0,0.0,0.93254,2.1e-05,0.928854,0.943548,0.924603,0.586728,0.699264,0.665633,0.547643,0.666667,0.64881,0.00697,0.94904,0.933563,0.90884,0.023256,0.049407,0.1575,0.228205,0.272465,0.92728,0.929371,0.932572,0.93972,0.932116,0.93734,0.934035,0.163425,0.007585,0.634615,0.416365,0.934009,0.95839


In [4]:
combined_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59221 entries, 50 to 59270
Columns: 103 entries, ts_recv to VWAP_normalized
dtypes: float64(85), int64(15), object(3)
memory usage: 46.5+ MB


#### Create the Trading Environment class for the PPO Agent

In [5]:
import gym
from gym import spaces
import torch as th
th.set_num_threads(5)
th.manual_seed(42)

class TradingEnvironment(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self, data, daily_trading_limit):
        super(TradingEnvironment, self).__init__()
        self.data = data
        self.daily_trading_limit = daily_trading_limit
        self.current_step = 0
        self.window = 64

        # Extract state columns
        # self.state_columns = ['Close', 'Volume', 'RSI', 'MACD', 'MACD_signal', 'MACD_hist', 'Stoch_k', 'Stoch_d',
        #                       'OBV', 'Upper_BB', 'Middle_BB', 'Lower_BB', 'ATR_1', 'ADX', '+DI', '-DI', 'CCI','shares_held', 'total_shares_traded']
        self.state_columns = ['Close_normalized', 'Volume_normalized', 'RSI_normalized', 'MACD_normalized', 'MACD_signal_normalized', 'MACD_hist_normalized', 'Stoch_k_normalized', 'Stoch_d_normalized',
                              'OBV_normalized', 'Upper_BB_normalized', 'Middle_BB_normalized', 'Lower_BB_normalized', 'ATR_1_normalized', 'ADX_normalized', '+DI_normalized', '-DI_normalized', 'CCI_normalized',
                              'bid_px_00_normalized','ask_px_00_normalized','ts_in_delta_normalized','ts_recv_normalized',
                              'High_normalized','Low_normalized','Open_normalized',
                              'SMA_10_normalized','SMA_50_normalized','EMA_10_normalized','EMA_50_normalized','WMA_10_normalized','WMA_50_normalized',
                              'shares_held','average_price']

        # Initialize balance, shares held, and total shares traded
        self.balance = 10_000_000.0  # $10 million
        self.shares_held = 0
        self.total_shares_traded = 0
        self.data['shares_held'] = 0.0
        self.data['average_price'] = 0.0
        self.window_interval = 32
        self.money_spend = 0

        # Define action space: [Hold, Buy, Sell]
        self.action_space = spaces.Discrete(3)

        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(self.window, len(self.state_columns)), dtype=np.float32)

    def reset(self):
        self.current_step = 0
        self.balance = 10_000_000.0  # $10 million
        self.shares_held = 0
        self.total_shares_traded = 0
        self.cumulative_reward = 0
        self.trades = []
        self.money_spend = 0
        self.data['shares_held'] = 0.0
        self.data['average_price'] = 0.0
        return self._next_observation()

    def _next_observation(self):
        # current share_held will be observed as a part of the observation
        # the index start with 35 because of the Technical indicator analysis
        self.data.iloc[self.current_step, -2] = self.shares_held/100000.0
        # self.data.iloc[self.current_step, -1] = self.total_shares_traded/100000.0
        if self.shares_held == 0:
            self.data.iloc[self.current_step, -1] = 0.0
        else:
            self.data.iloc[self.current_step, -1] = self.money_spend/  (self.shares_held*192.0)


        
        start_idx = max(self.current_step - (self.window - 1) * self.window_interval, 0)
        end_idx = self.current_step + 1
        obs_indices = list(range(start_idx, end_idx, self.window_interval))
        obs = self.data[self.state_columns].iloc[obs_indices].values
        
         # Ensure the shape is (size, num_columns) by padding with zeros if necessary
        if len(obs) < self.window:
            padding = th.zeros((self.window - len(obs), obs.shape[1]))
            obs = th.cat((padding, th.tensor(obs, dtype=th.float32)), dim=0)
        else:
            obs = th.tensor(obs, dtype=th.float32)
        
        return obs

    def step(self, action):
        expected_price = self.data.iloc[self.current_step]['ask_px_00']
        actual_price = self.data.iloc[self.current_step]['price']
        transaction_time = self.data.iloc[self.current_step]['ts_in_delta']

        reward = 0
        
        if self.shares_held == 0:
            current_average_price = 0.0
        else:
            current_average_price = self.money_spend/ self.shares_held


        if self.current_step >= len(self.data) - 1:
            self.current_step = 0

        is_actioned = self._take_action(action)
        
        # when action was done, calculate additional reward for trading
        if action != 0 and is_actioned:
            transaction_cost= self._calculate_transaction_cost(self.data.iloc[self.current_step]['Volume'], 0.3, self.data['Volume'].mean())
            
            cost = self._calculate_reward(expected_price, actual_price, transaction_time,transaction_cost)*self.trades[-1]['shares']

            reward += cost

            if action==2:
                self.money_spend = max(self.money_spend-current_average_price*self.trades[-1]['shares'],0)
                reward+=(actual_price-current_average_price)*self.trades[-1]['shares']
            self.trades[-1]['reward'] = reward
            self.trades[-1]['previous_price'] =  self.data.iloc[max(self.current_step-1,0)]['price']
            self.trades[-1]['cost'] = cost
            self.trades[-1]['transaction_cost'] = transaction_cost*self.trades[-1]['shares']
            self.trades[-1]['slippage'] = expected_price - actual_price
            self.trades[-1]['time_penalty'] = 100*transaction_time/1e9
                
        done = self.current_step == len(self.data) - 2
        obs = self._next_observation()
        info = {
        'step': self.current_step,
        'action': action,
        'price': actual_price,
        'shares': self.trades[-1]['shares'] if self.trades else 0
        }
        self.current_step += 1
        self.cumulative_reward += reward
        return obs, reward, done, info


    def _take_action(self, action):
        current_price = self.data.iloc[self.current_step]['Close']
        current_time = pd.to_datetime(self.data.iloc[self.current_step]['ts_event'])
        trade_info = {'step': self.current_step, 'timestamp': current_time,'balance':self.balance,'shares_held':self.shares_held, 'action': action, 'price': current_price, 'shares': 0, 'reward': 0, 'transaction_cost': 0, 'slippage': 0, 'time_penalty': 0}

        if action == 1: # and self.total_shares_traded < self.daily_trading_limit:  # Buy
            shares_bought = (self.balance * np.random.uniform(0.001, 0.005)) // current_price
            self.balance -= shares_bought * current_price
            self.money_spend+=shares_bought*current_price
            self.shares_held += shares_bought
            self.total_shares_traded += shares_bought
            trade_info['shares'] = shares_bought
            trade_info['balance'] = self.balance
            trade_info['shares_held'] = self.shares_held
            if(shares_bought>0):
                self.trades.append(trade_info)
                return True
            else:
                # False means even the action is 1, no trade will be done because of currently not enough balance
                return False
        elif action == 2: # and self.total_shares_traded < self.daily_trading_limit:  # Sell
            shares_sold = min((self.balance * np.random.uniform(0.001, 0.005)) // current_price, self.shares_held)
            self.balance += shares_sold * current_price
            self.shares_held -= shares_sold
            self.total_shares_traded += shares_sold
            trade_info['shares'] = shares_sold
            trade_info['balance'] = self.balance
            trade_info['shares_held'] = self.shares_held
            if(shares_sold>0):
                self.trades.append(trade_info)
                return True
            else:
                # False means even the action is 2, no trade will be done because of currently not holding any
                return False

    def _calculate_reward(self, expected_price, actual_price, transaction_time, transaction_cost):
        # The order loss for each share of stock
        slippage = expected_price - actual_price
        time_penalty = 100*transaction_time/1e9
        reward = - (slippage + time_penalty + transaction_cost)
        return reward
    # def _calculate_reward_of_holding(self):
    #     # Calculate the profit or loss based on current holdings
    #     current_price = self.data.iloc[self.current_step]['Close']
    #     profit_or_loss = 0
    #     if self.current_step>0:
    #         profit_or_loss = (current_price - self.data.iloc[max(0, self.current_step - 1)]['Close']) * self.shares_held
    #     return profit_or_loss
    
    
    def _calculate_transaction_cost(self, volume, volatility, daily_volume):
        return volatility * np.sqrt(volume / daily_volume)
    
    def run(self):
        self.reset()
        for _ in range(len(self.data)):
            self.step()
        return self.cumulative_reward, self.trades

    def render(self, mode='human', close=False):
        print(f'Step: {self.current_step}')
        print(f'Balance: {self.balance}')
        print(f'Shares held: {self.shares_held}')
        print(f'Total shares traded: {self.total_shares_traded}')
        print(f'Total portfolio value: {self.balance + self.shares_held * self.data.iloc[self.current_step]["Close"]}')
        print(f'Cumulative reward: {self.cumulative_reward}')
        self.print_trades()

    def print_trades(self):
        # download all trades in a pandas dataframe using .csv
        trades_df = pd.DataFrame(self.trades)
        # Save a csv
        trades_df.to_csv('trades_ppo.csv', index=False)
        for trade in self.trades:
            print(f"Step: {trade['step']}, Timestamp: {trade['timestamp']}, Action: {trade['action']}, Price: {trade['price']}, Shares: {trade['shares']}, Reward: {trade['reward']}, Transaction Cost: {trade['transaction_cost']}, Slippage: {trade['slippage']}, Time Penalty: {trade['time_penalty']}")

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
from typing import Callable, Dict, List, Optional, Tuple, Type, Union

from torch import nn
from stable_baselines3 import PPO
from stable_baselines3.common.policies import ActorCriticPolicy
from stable_baselines3.common.distributions import CategoricalDistribution
from stable_baselines3.common.torch_layers import BaseFeaturesExtractor

class CustomLSTMFeatureExtractor(BaseFeaturesExtractor):
    """
    :param observation_space: (gym.Space)
    :param features_dim: (int) Number of features extracted.
        This corresponds to the number of units for the last layer.
    :param lstm_hidden_size: (int) Number of hidden units in the LSTM.
    :param n_lstm_layers: (int) Number of LSTM layers.
    """

    def __init__(self, observation_space: spaces.Box, input_dim,features_dim: int = 64, lstm_hidden_size: int = 64, n_lstm_layers: int = 1):
        super().__init__(observation_space, features_dim)

        self.lstm = nn.LSTM(input_dim, lstm_hidden_size, n_lstm_layers, batch_first=True)

        # Output layer to map LSTM outputs to the desired feature dimension
        self.linear = nn.Sequential(nn.Linear(lstm_hidden_size, features_dim), nn.ReLU())

    def forward(self, observations: th.Tensor) -> th.Tensor:
        # Assuming observations is of shape (batch_size, sequence_length, input_dim)
        # print("forward")
        lstm_out, _ = self.lstm(observations)
        # Use the last output of the LSTM
        last_out = lstm_out[:, -1, :]
        return self.linear(last_out)


#### Train the PPO Agent with the environment and for different tickers.

In [7]:
# Define the daily trading limit (total number of shares to trade per day)
daily_trading_limit = 1000

ticker = 'AAPL'  # Specify the ticker you want to trade
ticker_data = market_features_df[market_features_df['symbol'] == ticker]

env = TradingEnvironment(ticker_data, daily_trading_limit)  # Adjust window_size if needed

In [8]:

import torch
import torch.nn as nn
import math

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        super(PositionalEncoding, self).__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        return x + self.pe[:x.size(0), :]

class TransAm(nn.Module):
    def __init__(self, feature_size=30, num_layers=2, dropout=0.2):
        super(TransAm, self).__init__()
        self.model_type = 'Transformer'
        self.src_mask = None
        self.pos_encoder = PositionalEncoding(feature_size)
        self.encoder_layer = nn.TransformerEncoderLayer(d_model=feature_size, nhead=10, dropout=dropout)
        self.transformer_encoder = nn.TransformerEncoder(self.encoder_layer, num_layers=num_layers)
        self.decoder = nn.Linear(feature_size, 1)
        self.init_weights()

    def init_weights(self):
        initrange = 0.1
        self.decoder.bias.data.zero_()
        self.decoder.weight.data.uniform_(-initrange, initrange)

    def forward(self, src):
        if self.src_mask is None or self.src_mask.size(0) != len(src):
            device = src.device
            mask = self._generate_square_subsequent_mask(len(src)).to(device)
            self.src_mask = mask
        src = self.pos_encoder(src)
        output = self.transformer_encoder(src, self.src_mask)
        # output = self.decoder(output)
        return output

    def _generate_square_subsequent_mask(self, sz):
        mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
        mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
        return mask
        
class CustomTransformerFeatureExtractor(BaseFeaturesExtractor):
    def __init__(self, observation_space: spaces.Box, features_dim: int = 30):
        super().__init__(observation_space, features_dim)

        #load the pre-trained transformer
        self.transformer = torch.load(f'./best_model_multi22.pt', map_location=torch.device('cpu'))
        self.transformer.decoder = None
        print("finish loading pre-trained network")
        print(self.transformer)
        self.transformation_layer = nn.Sequential(
        nn.Linear(32, 30),
        nn.ReLU(),
        nn.Linear(30, 30),
        nn.ReLU()
    )

    def forward(self, observations: th.Tensor) -> th.Tensor:
        observations = observations.permute(1,0,2)
        out = self.transformation_layer(observations)
        out = self.transformer(out)
        out = out.permute(1,0,2)
        return out[:,-1,:]



In [9]:
import pandas as pd
from stable_baselines3 import PPO
# from stable_baselines3.common.utils import linear_schedule
# Define the daily trading limit (total number of shares to trade per day)
daily_trading_limit = 1000

ticker = 'AAPL'  # Specify the ticker you want to trade
ticker_data = combined_df[market_features_df['symbol'] == ticker]


env = TradingEnvironment(ticker_data, daily_trading_limit)  # Adjust window_size if needed
env.action_space.seed(42)

policy_kwargs_LSTM = dict(
    features_extractor_class=CustomLSTMFeatureExtractor,
    features_extractor_kwargs=dict(features_dim=64, lstm_hidden_size=64, n_lstm_layers=2,input_dim = 32)
)
policy_kwargs_Transformer = dict(
    features_extractor_class=CustomTransformerFeatureExtractor
)

best_hyperparameters = {'learning_rate': 0.0002,'n_steps': 2048,'batch_size': 128, 'gamma': 1,'clip_range': 0.3,'n_epochs': 6,'ent_coef':0.01}

# model = PPO('MlpPolicy', env, policy_kwargs=policy_kwargs_LSTM,verbose=1,**best_hyperparameters)
model = PPO('MlpPolicy', env, policy_kwargs=policy_kwargs_Transformer,verbose=1,**best_hyperparameters)
# Train the model
model.learn(total_timesteps=55000)
model.learn(total_timesteps=55000)
model.learn(total_timesteps=55000)
# Save the model
model.save("trading_agent")

# Evaluate the model
print("evaluating")
obs = env.reset()
from tqdm import tqdm
for _ in tqdm(range(len(ticker_data)), desc="Processing"):
    action, _states = model.predict(obs)
    obs, rewards, done, info = env.step(action)
    if done:
        break

# Render the final state
env.render()

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
finish loading pre-trained network
TransAm(
  (pos_encoder): PositionalEncoding()
  (encoder_layer): TransformerEncoderLayer(
    (self_attn): MultiheadAttention(
      (out_proj): _LinearWithBias(in_features=30, out_features=30, bias=True)
    )
    (linear1): Linear(in_features=30, out_features=2048, bias=True)
    (dropout): Dropout(p=0.2, inplace=False)
    (linear2): Linear(in_features=2048, out_features=30, bias=True)
    (norm1): LayerNorm((30,), eps=1e-05, elementwise_affine=True)
    (norm2): LayerNorm((30,), eps=1e-05, elementwise_affine=True)
    (dropout1): Dropout(p=0.2, inplace=False)
    (dropout2): Dropout(p=0.2, inplace=False)
  )
  (transformer_encoder): TransformerEncoder(
    (layers): ModuleList(
      (0): TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): _LinearWithBias(in_features=30, out_features=30, bias=True)
        )
        (l

Processing: 100%|███████████████████████▉| 59219/59221 [06:14<00:00, 158.22it/s]


Step: 59220
Balance: 10000000.0
Shares held: 0
Total shares traded: 0
Total portfolio value: 10000000.0
Cumulative reward: 0


### TRADING BLOTTER:

#### Preprocess the data for the trading blotter:

In [10]:
import pandas as pd
from scipy.stats import norm
import matplotlib.pyplot as plt

INITIAL_CASH = 10_000_000  # $10 million

def preprocess_data(df):
    df['liquidity'] = df['bid_sz_00'] * df['bid_px_00'] + df['ask_sz_00'] * df['ask_px_00']
    return df

def calculate_rsi(data, window=14):
    delta = data.diff()
    gain = (delta.where(delta > 0, 0)).rolling(window=window).mean()
    loss = (-delta.where(delta < 0, 0)).rolling(window=window).mean()
    rs = gain / loss
    rsi = 100 - (100 / (1 + rs))
    return rsi

def calculate_vol_and_liquidity(price_df, volume_df, window_size):
    # Calculate rolling statistics
    rolling_mean_vol = price_df.pct_change().rolling(window=window_size).mean()
    rolling_std_vol = price_df.pct_change().rolling(window=window_size).std()
    rolling_mean_liq = volume_df.rolling(window=window_size).mean()
    rolling_std_liq = volume_df.rolling(window=window_size).std()
    
    return rolling_mean_vol, rolling_std_vol, rolling_mean_liq, rolling_std_liq

def get_percentile(current_value, mean, std):
    if std > 0:
        z_score = (current_value - mean) / std
        percentile = norm.cdf(z_score)
    else:
        percentile = 0.5  # No variation
    return percentile

def get_trade_price(base_price, current_vol, current_liq, mean_vol, std_vol, mean_liq, std_liq, trade_direction):
    vol_percentile = get_percentile(current_vol, mean_vol, std_vol)
    liq_percentile = get_percentile(current_liq, mean_liq, std_liq)

    # Define price adjustment scenarios based on market conditions
    if vol_percentile >= 0.9 and liq_percentile < 0.1:
        price_adjustment_percent = np.random.uniform(-0.25, -0.15)
    elif vol_percentile <= 0.1 and liq_percentile < 0.1:
        price_adjustment_percent = np.random.uniform(-0.10, -0.05)
    elif vol_percentile >= 0.9 and liq_percentile >= 0.9:
        price_adjustment_percent = np.random.uniform(-0.05, +0.10)
    else:
        price_adjustment_percent = np.random.uniform(-0.05, +0.05)  # Default for normal conditions

    # Adjust price based on trade direction
    if trade_direction == 'BUY':
        adjusted_price = base_price * (1 - price_adjustment_percent)
    else:  # SELL
        adjusted_price = base_price * (1 + price_adjustment_percent)
    
    return adjusted_price


#### Create trading environment for the blotter

In [11]:
class TradingEnvironmentwithBlotter:
    def __init__(self, data, daily_trading_limit, window_size):
        self.data = preprocess_data(data)
        self.daily_trading_limit = daily_trading_limit
        self.window_size = window_size
        self.state_columns = ['price', 'liquidity', 'RSI', 'MACD', 'MACD_signal', 'MACD_hist', 'Stoch_k', 'Stoch_d',
                              'OBV', 'Upper_BB', 'Middle_BB', 'Lower_BB', 'ATR_1', 'ADX', '+DI', '-DI', 'CCI']
        self.reset()

    def reset(self):
        self.current_step = 0
        self.balance = INITIAL_CASH
        self.shares_held = 0
        self.total_shares_traded = 0
        self.cumulative_reward = 0
        self.trades = []
        self.portfolio = {'cash': self.balance, 'holdings': {ticker: 0 for ticker in self.data['symbol'].unique()}}
        self.data['RSI'] = calculate_rsi(self.data['price'])
        self.data['pct_change'] = self.data['price'].pct_change()
        self.data['rolling_mean_vol'], self.data['rolling_std_vol'], self.data['rolling_mean_liq'], self.data['rolling_std_liq'] = calculate_vol_and_liquidity(self.data['price'], self.data['liquidity'], self.window_size)

    def step(self):
        row = self.data.iloc[self.current_step]
        current_price = row['price']
        current_time = pd.to_datetime(row['ts_event'])
        current_rsi = row['RSI']
        current_vol = row['pct_change']
        current_liq = row['liquidity']
        mean_vol = row['rolling_mean_vol']
        std_vol = row['rolling_std_vol']
        mean_liq = row['rolling_mean_liq']
        std_liq = row['rolling_std_liq']

        if current_rsi < 30:  # Entry signal based on RSI
            trade_direction = 'BUY'
            trade_price = get_trade_price(current_price, current_vol, current_liq, mean_vol, std_vol, mean_liq, std_liq, trade_direction)
            trade_size = (self.portfolio['cash'] * np.random.uniform(0.001, 0.005)) / trade_price
            if self.portfolio['cash'] >= trade_size * trade_price:
                self.portfolio['cash'] -= trade_size * trade_price
                self.portfolio['holdings'][row['symbol']] += trade_size
                trade_status = 'filled'
            else:
                trade_status = 'cancelled'
        elif current_rsi > 70:  # Exit signal based on RSI
            trade_direction = 'SELL'
            if self.portfolio['holdings'][row['symbol']] > 0:
                trade_size = min(self.portfolio['holdings'][row['symbol']], self.portfolio['cash']*np.random.uniform(0.001, 0.005) / current_price)
                trade_price = get_trade_price(current_price, current_vol, current_liq, mean_vol, std_vol, mean_liq, std_liq, trade_direction)
                self.portfolio['cash'] += trade_size * trade_price
                self.portfolio['holdings'][row['symbol']] -= trade_size
                trade_status = 'filled'
            else:
                trade_size = 0
                trade_status = 'cancelled'
        else:
            trade_direction = 'HOLD'
            trade_size = 0
            trade_price = current_price
            trade_status = 'skipped'

        if trade_size > 0:
            expected_price = row['ask_px_00']
            actual_price = row['price']
            transaction_time = row['ts_in_delta']
            transaction_cost = self._calculate_transaction_cost(row['Volume'], 0.3, self.data['Volume'].mean())
            slippage = expected_price - actual_price
            time_penalty = 1000 * transaction_time / 1e9
            reward = - (slippage + time_penalty + transaction_cost)
        
            self.cumulative_reward += reward
            self.trades.append({
                'step': self.current_step,
                'timestamp': current_time,
                'action': trade_direction,
                'price': trade_price,
                'shares': trade_size,
                'symbol': row['symbol'],
                'reward': reward,
                'transaction_cost': transaction_cost,
                'slippage': slippage,
                'time_penalty': time_penalty
            })

            

        self.current_step += 1
        if self.current_step >= len(self.data) - 1:
            done=True
            self.current_step = 0

    def _calculate_transaction_cost(self, volume, volatility, daily_volume):
        return volatility * np.sqrt(volume / daily_volume)

    def run(self):
        self.reset()
        for _ in range(len(self.data)):
            self.step()
        return self.cumulative_reward, self.trades

    def render(self):
        print(f'Cumulative reward: {self.cumulative_reward}')
        row = self.data.iloc[self.current_step]
        print(f'Total portfolio value: {self.portfolio["cash"] + self.portfolio["holdings"][row["symbol"]]*row["Close"]}')
        # get trades in a pandas dataframe
        trades_df = pd.DataFrame(self.trades)
        # Save a csv
        trades_df.to_csv('trades_blotter.csv', index=False)
        for trade in self.trades:
            print(f"Step: {trade['step']}, Timestamp: {trade['timestamp']}, Action: {trade['action']}, Price: {trade['price']}, Shares: {trade['shares']}, Symbol: {trade['symbol']}, Reward: {trade['reward']}, Transaction Cost: {trade['transaction_cost']}, Slippage: {trade['slippage']}, Time Penalty: {trade['time_penalty']}")

#### Run the trading blotter

In [12]:
# Filter data for the specified ticker
ticker = 'AAPL'  # Specify the ticker you want to trade
ticker_data = market_features_df[market_features_df['symbol'] == ticker]

window_size = 60
daily_trading_limit = 1000
# Create the trading environment
env = TradingEnvironmentwithBlotter(ticker_data, daily_trading_limit=1000, window_size=window_size)  # Daily trading limit of 1000 shares

# Run the environment
cumulative_reward, trades = env.run()

# Render the results
env.render()

Cumulative reward: -12232.039320421814
Total portfolio value: 10013992.388694532


IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



Step: 47793, Timestamp: 2023-07-03 16:13:21.791577131, Action: SELL, Price: 186.12664133457858, Shares: 0.21957937973827077, Symbol: AAPL, Reward: -0.344315394584932, Transaction Cost: 0.17960739458493197, Slippage: 0.0, Time Penalty: 0.164708
Step: 47794, Timestamp: 2023-07-03 16:13:24.796592552, Action: SELL, Price: 194.10022163499235, Shares: 0.30403153983974635, Symbol: AAPL, Reward: -0.44296521332449773, Transaction Cost: 0.2540032133245068, Slippage: 0.009999999999990905, Time Penalty: 0.178962
Step: 47795, Timestamp: 2023-07-03 16:13:24.796592552, Action: SELL, Price: 194.22043518531967, Shares: 0.4313245119748088, Symbol: AAPL, Reward: -0.44296521332449773, Transaction Cost: 0.2540032133245068, Slippage: 0.009999999999990905, Time Penalty: 0.178962
Step: 47796, Timestamp: 2023-07-03 16:13:24.796592552, Action: SELL, Price: 188.4968837541441, Shares: 0.5761477776394357, Symbol: AAPL, Reward: -0.44296521332449773, Transaction Cost: 0.2540032133245068, Slippage: 0.0099999999999909

In [13]:
df=market_features_df.copy()

In [14]:
df['timestamp']=pd.to_datetime(df['ts_recv'])

In [15]:
df.head()

Unnamed: 0,ts_recv,ts_event,rtype,publisher_id,instrument_id,action,side,depth,price,size,flags,ts_in_delta,sequence,bid_px_00,ask_px_00,bid_sz_00,ask_sz_00,bid_ct_00,ask_ct_00,symbol,Close,Volume,High,Low,Open,RSI,MACD,MACD_signal,MACD_hist,Stoch_k,Stoch_d,OBV,Upper_BB,Middle_BB,Lower_BB,ATR_1,ATR_2,ATR_5,ATR_10,ATR_20,SMA_10,SMA_50,EMA_10,EMA_50,WMA_10,WMA_50,ADX,+DI,-DI,CCI,DLR,TWAP,VWAP,timestamp
50,1688371242324266534,1688371242324101963,1,2,32,T,B,0,194.15,10,130,164571,350459,194.02,194.15,180,400,2,1,AAPL,194.15,10,194.15,194.02,194.12,64.961352,0.012237,0.005054,0.007183,47.142857,26.825397,-3332.0,194.185746,194.0655,193.945254,0.13,0.129921,0.142483,0.136696,0.118353,194.04,194.0332,194.070777,194.037204,194.061273,194.04349,94.270606,8.115584,0.394096,107.142857,0.000155,194.034902,194.059871,2023-07-03 08:00:42.324266534
51,1688371247317894640,1688371247317729998,1,2,32,T,B,0,194.1,10,130,164642,354918,194.06,194.1,100,10,1,1,AAPL,194.1,10,194.1,194.06,194.15,55.742466,0.013947,0.006833,0.007115,57.97619,42.81746,-3342.0,194.18893,194.068,193.94707,0.09,0.109961,0.131986,0.132026,0.116935,194.048,194.033,194.07609,194.039667,194.072182,194.04611,94.018256,7.698755,0.373855,18.518519,-0.000258,194.036154,194.059909,2023-07-03 08:00:47.317894640
52,1688371257325756491,1688371257325590403,1,2,32,T,A,0,194.1,1,0,166088,363515,194.1,194.12,101,244,2,1,AAPL,194.1,1,194.12,194.1,194.1,55.742466,0.015128,0.008492,0.006636,65.47619,56.865079,-3342.0,194.191904,194.0705,193.949096,0.02,0.06498,0.109589,0.120823,0.112088,194.057,194.0328,194.080437,194.042033,194.081636,194.048737,93.871355,8.819513,0.369315,83.333333,0.0,194.037358,194.059918,2023-07-03 08:00:57.325756491
53,1688371257325756491,1688371257325590403,1,2,32,T,A,0,194.1,1,130,166088,363516,194.1,194.12,100,244,1,1,AAPL,194.1,1,194.12,194.1,194.1,55.742466,0.015881,0.009969,0.005911,62.5,61.984127,-3342.0,194.194672,194.073,193.951328,0.02,0.04249,0.091671,0.110741,0.107484,194.066,194.0326,194.083994,194.044306,194.089455,194.051373,93.734948,8.705674,0.364548,55.555556,0.0,194.038519,194.059928,2023-07-03 08:00:57.325756491
54,1688371259762706298,1688371259762541862,1,2,32,T,B,0,194.12,50,130,164436,366351,194.1,194.12,99,244,1,1,AAPL,194.12,50,194.12,194.1,194.1,58.672524,0.017885,0.011553,0.006332,66.666667,64.880952,-3292.0,194.199345,194.0765,193.953655,0.02,0.031245,0.077337,0.101667,0.10311,194.078,194.035,194.090541,194.047274,194.099273,194.0548,93.608283,8.58632,0.35955,89.74359,0.000103,194.04,194.060393,2023-07-03 08:00:59.762706298
