### PPO AGENT:

#### Load the dataset and preprocess the dataframe in the required format.

Technical indicators created for the environment include:

- RSI
- MACD
- Stoch_k
- OBV
- Upper_BB
- ATR_1
- ATR_2
- ATR_5
- ATR_10
- ATR_20

In [1]:
import pandas as pd
import numpy as np
import talib as ta
np.random.seed(42)

class TechnicalIndicators:
    """Attach technical-indicator columns to a price/volume DataFrame.

    Every ``add_*`` method writes new columns straight into the wrapped
    frame, which must provide ``Close``, ``High``, ``Low`` and ``Volume``.
    """

    def __init__(self, data):
        self.data = data

    def add_momentum_indicators(self):
        """Add RSI(14), MACD(12, 26, 9) and the stochastic oscillator (14, 3, 3)."""
        frame = self.data
        frame['RSI'] = ta.RSI(frame['Close'], timeperiod=14)

        macd_line, macd_signal, macd_hist = ta.MACD(
            frame['Close'], fastperiod=12, slowperiod=26, signalperiod=9
        )
        frame['MACD'] = macd_line
        frame['MACD_signal'] = macd_signal
        frame['MACD_hist'] = macd_hist

        stoch_k, stoch_d = ta.STOCH(
            frame['High'], frame['Low'], frame['Close'],
            fastk_period=14, slowk_period=3, slowd_period=3,
        )
        frame['Stoch_k'] = stoch_k
        frame['Stoch_d'] = stoch_d

    def add_volume_indicators(self):
        """Add on-balance volume."""
        frame = self.data
        frame['OBV'] = ta.OBV(frame['Close'], frame['Volume'])

    def add_volatility_indicators(self):
        """Add 20-period Bollinger bands and ATR for periods 1, 2, 5, 10 and 20."""
        frame = self.data
        upper, middle, lower = ta.BBANDS(frame['Close'], timeperiod=20)
        frame['Upper_BB'] = upper
        frame['Middle_BB'] = middle
        frame['Lower_BB'] = lower

        # Columns are created in ascending period order: ATR_1 ... ATR_20.
        for period in (1, 2, 5, 10, 20):
            frame[f'ATR_{period}'] = ta.ATR(
                frame['High'], frame['Low'], frame['Close'], timeperiod=period
            )

    def add_trend_indicators(self):
        """Add ADX(14), +DI(14), -DI(14) and CCI(5)."""
        frame = self.data
        directional = (('ADX', ta.ADX), ('+DI', ta.PLUS_DI), ('-DI', ta.MINUS_DI))
        for column, indicator in directional:
            frame[column] = indicator(
                frame['High'], frame['Low'], frame['Close'], timeperiod=14
            )
        frame['CCI'] = ta.CCI(frame['High'], frame['Low'], frame['Close'], timeperiod=5)

    def add_other_indicators(self):
        """Add the log return (DLR), expanding mean of Close (TWAP) and VWAP."""
        frame = self.data
        close = frame['Close']
        volume = frame['Volume']

        frame['DLR'] = np.log(close / close.shift(1))
        frame['TWAP'] = close.expanding().mean()

        # VWAP uses the High/Low midpoint as the per-row price.
        volume_weighted_mid = volume * (frame['High'] + frame['Low']) / 2
        frame['VWAP'] = volume_weighted_mid.cumsum() / volume.cumsum()

    def add_all_indicators(self):
        """Run every indicator group in a fixed order and return the frame."""
        steps = (
            self.add_momentum_indicators,
            self.add_volume_indicators,
            self.add_volatility_indicators,
            self.add_trend_indicators,
            self.add_other_indicators,
        )
        for add_group in steps:
            add_group()
        return self.data

In [2]:
# data = pd.read_csv('/Users/hao/Downloads/Blockhouse-Work-Trial-main/xnas-itch-20230703.tbbo.csv')
# Load the TBBO CSV from the current working directory.
data = pd.read_csv('./xnas-itch-20230703.tbbo.csv')
# Preprocessing to create necessary columns
# Rescale the trade price and the best bid/ask prices by 1e9
# (presumably the file stores prices as integers in 1e-9 units — TODO confirm).
data['price']=data['price']/1e9
data['bid_px_00']=data['bid_px_00']/1e9
data['ask_px_00']=data['ask_px_00']/1e9

# Build the OHLCV-style columns that TechnicalIndicators reads. Each row is
# treated as one bar: Close is the trade price, Volume the trade size,
# High/Low the larger/smaller of the best bid and ask, and Open the previous
# row's Close (the first row falls back to its own Close).
data['Close'] = data['price']
data['Volume'] = data['size']
data['High'] = data[['bid_px_00', 'ask_px_00']].max(axis=1)
data['Low'] = data[['bid_px_00', 'ask_px_00']].min(axis=1)
data['Open'] = data['Close'].shift(1).fillna(data['Close'])


# Indicator columns are written into `data` in place; add_all_indicators()
# returns that same frame.
ti = TechnicalIndicators(data)
df_with_indicators = ti.add_all_indicators()

# Drop the first 35 rows (presumably the indicator warm-up rows that still
# contain NaNs — TODO confirm the required count).
market_features_df = df_with_indicators[35:]

print(market_features_df)

                   ts_recv             ts_event  rtype  publisher_id  \
35     1688371214386057385  1688371214385893078      1             2   
36     1688371214386063777  1688371214385899379      1             2   
37     1688371215804852019  1688371215804687301      1             2   
38     1688371219671476629  1688371219671312224      1             2   
39     1688371223368835585  1688371223368671235      1             2   
...                    ...                  ...    ...           ...   
59266  1688417954514485218  1688417954514320323      1             2   
59267  1688417961020718430  1688417961020553920      1             2   
59268  1688417973297905504  1688417973297741235      1             2   
59269  1688417996889779362  1688417996889614660      1             2   
59270  1688417998907430616  1688417998907265922      1             2   

       instrument_id action side  depth   price  size  ...     ATR_5  \
35                32      T    N      0  194.05    50  ...  0.0

In [3]:
from sklearn.preprocessing import MinMaxScaler
# Min-max scale every int64/float64 column of the feature frame.
# NOTE(review): the scaler is fitted on the complete frame, so each scaled
# value depends on the min/max of all rows, including later ones.
scaler = MinMaxScaler()
numeric_columns = market_features_df.select_dtypes(include=['float64', 'int64']).columns

# normalize numeric data, save to a new dataframe
normalized_data = scaler.fit_transform(market_features_df[numeric_columns])
normalized_df = pd.DataFrame(normalized_data, columns=numeric_columns, index=market_features_df.index)
# Scaled columns are named '<original>_normalized'.
suffix = '_normalized'
normalized_df = normalized_df.add_suffix(suffix)

# concatenate new df with the old one
combined_df = pd.concat([market_features_df, normalized_df], axis=1)

pd.set_option('display.max_columns', None)


# Same rows in reverse order (last row first), relabelled with the original
# ascending index so that label-based lookups still use the labels 35, 36, ...
reversed_df = combined_df.iloc[::-1]
reversed_df.index = combined_df.index
combined_df.head()

Unnamed: 0,ts_recv,ts_event,rtype,publisher_id,instrument_id,action,side,depth,price,size,flags,ts_in_delta,sequence,bid_px_00,ask_px_00,bid_sz_00,ask_sz_00,bid_ct_00,ask_ct_00,symbol,Close,Volume,High,Low,Open,RSI,MACD,MACD_signal,MACD_hist,Stoch_k,Stoch_d,OBV,Upper_BB,Middle_BB,Lower_BB,ATR_1,ATR_2,ATR_5,ATR_10,ATR_20,ADX,+DI,-DI,CCI,DLR,TWAP,VWAP,ts_recv_normalized,ts_event_normalized,rtype_normalized,publisher_id_normalized,instrument_id_normalized,depth_normalized,price_normalized,size_normalized,flags_normalized,ts_in_delta_normalized,sequence_normalized,bid_px_00_normalized,ask_px_00_normalized,bid_sz_00_normalized,ask_sz_00_normalized,bid_ct_00_normalized,ask_ct_00_normalized,Close_normalized,Volume_normalized,High_normalized,Low_normalized,Open_normalized,RSI_normalized,MACD_normalized,MACD_signal_normalized,MACD_hist_normalized,Stoch_k_normalized,Stoch_d_normalized,OBV_normalized,Upper_BB_normalized,Middle_BB_normalized,Lower_BB_normalized,ATR_1_normalized,ATR_2_normalized,ATR_5_normalized,ATR_10_normalized,ATR_20_normalized,ADX_normalized,+DI_normalized,-DI_normalized,CCI_normalized,DLR_normalized,TWAP_normalized,VWAP_normalized
35,1688371214386057385,1688371214385893078,1,2,32,T,N,0,194.05,50,130,164307,326232,194.0,194.3,3101,19,4,10,AAPL,194.05,50,194.3,194.0,194.05,54.544543,0.006271,-0.00313,0.009401,52.525253,61.952862,-266.0,194.065621,194.017,193.968379,0.3,0.175078,0.098615,0.075141,0.072403,97.257397,30.435801,0.196362,166.666667,0.0,194.02,194.021894,0.0,0.0,0.0,0.0,0.0,0.0,0.904762,2.1e-05,1.0,0.00067,0.0,0.903226,1.0,0.139917,0.000518,0.005226,0.096774,0.904762,2.1e-05,1.0,0.903226,0.904762,0.545448,0.63476,0.577504,0.577937,0.525253,0.619529,0.010229,0.896583,0.909459,0.914933,0.674419,0.383903,0.20727,0.162169,0.182608,0.971696,0.579292,0.004142,0.75,0.383346,0.920057,0.930818
36,1688371214386063777,1688371214385899379,1,2,32,T,N,0,194.05,50,130,164398,326233,194.0,194.3,3101,19,4,10,AAPL,194.05,50,194.3,194.0,194.05,54.544543,0.007108,-0.001082,0.00819,38.383838,52.525253,-266.0,194.06899,194.02,193.97101,0.3,0.237539,0.138892,0.097627,0.083783,97.361721,22.989295,0.14832,83.333333,0.0,194.020811,194.025188,1.382432e-10,1.309672e-10,0.0,0.0,0.0,0.0,0.904762,2.1e-05,1.0,0.000791,3.964976e-09,0.903226,1.0,0.139917,0.000518,0.005226,0.096774,0.904762,2.1e-05,1.0,0.903226,0.904762,0.545448,0.639407,0.589794,0.565983,0.383838,0.525253,0.010229,0.897905,0.910674,0.916022,0.674419,0.529161,0.301477,0.218147,0.215909,0.972773,0.43756,0.003129,0.625,0.383346,0.920623,0.933177
37,1688371215804852019,1688371215804687301,1,2,32,T,B,0,194.21,10,130,164718,328131,194.0,194.21,3101,29,4,1,AAPL,194.21,10,194.21,194.0,194.05,85.890753,0.020446,0.003223,0.017223,40.40404,43.771044,-256.0,194.125889,194.0305,193.935111,0.21,0.22377,0.153114,0.108864,0.090094,97.458593,19.409454,0.125224,79.268293,0.000824,194.025789,194.025596,3.032615e-05,3.032615e-05,0.0,0.0,0.0,0.0,0.968254,4e-06,1.0,0.001215,7.529489e-06,0.903226,0.964427,0.139917,0.000806,0.005226,0.0,0.968254,4e-06,0.964427,0.903226,0.904762,0.858911,0.713487,0.615638,0.65516,0.40404,0.43771,0.01024,0.920225,0.914928,0.901165,0.465116,0.497139,0.334741,0.246123,0.234376,0.973772,0.369425,0.002642,0.618902,0.647473,0.924096,0.933469
38,1688371219671476629,1688371219671312224,1,2,32,T,N,0,194.14,10,130,164405,331406,194.0,194.16,3101,400,4,1,AAPL,194.14,10,194.16,194.0,194.21,64.827662,0.025079,0.007594,0.017484,49.494949,42.760943,-266.0,194.142928,194.0375,193.932072,0.21,0.216885,0.164491,0.118978,0.096089,97.548546,16.622008,0.10724,-3.205128,-0.00036,194.028718,194.025873,0.0001129737,0.0001129737,0.0,0.0,0.0,0.0,0.940476,4e-06,1.0,0.0008,2.051479e-05,0.903226,0.944664,0.139917,0.011485,0.005226,0.0,0.940476,4e-06,0.944664,0.903226,0.968254,0.648279,0.739218,0.641874,0.657745,0.494949,0.427609,0.010229,0.926908,0.917764,0.899907,0.465116,0.481128,0.361353,0.2713,0.25192,0.974701,0.31637,0.002262,0.495192,0.267817,0.926139,0.933667
39,1688371223368835585,1688371223368671235,1,2,32,T,B,0,194.13,10,130,164350,334235,194.0,194.13,3101,400,4,1,AAPL,194.13,10,194.13,194.0,194.14,62.470772,0.027625,0.011601,0.016025,57.575758,49.158249,-276.0,194.155247,194.044,193.932753,0.14,0.178442,0.159593,0.12108,0.098285,97.632074,15.068361,0.097216,-113.095238,-5.2e-05,194.03125,194.026071,0.0001920032,0.0001920032,0.0,0.0,0.0,0.0,0.936508,4e-06,1.0,0.000727,3.17317e-05,0.903226,0.932806,0.139917,0.011485,0.005226,0.0,0.936508,4e-06,0.932806,0.903226,0.940476,0.62471,0.753361,0.665921,0.643333,0.575758,0.491582,0.010219,0.931741,0.920397,0.900189,0.302326,0.391727,0.349896,0.276534,0.258345,0.975563,0.2868,0.002051,0.330357,0.366838,0.927905,0.933809


In [4]:
# Print the dtype and non-null count of every column in the combined frame.
combined_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59236 entries, 35 to 59270
Data columns (total 91 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   ts_recv                   59236 non-null  int64  
 1   ts_event                  59236 non-null  int64  
 2   rtype                     59236 non-null  int64  
 3   publisher_id              59236 non-null  int64  
 4   instrument_id             59236 non-null  int64  
 5   action                    59236 non-null  object 
 6   side                      59236 non-null  object 
 7   depth                     59236 non-null  int64  
 8   price                     59236 non-null  float64
 9   size                      59236 non-null  int64  
 10  flags                     59236 non-null  int64  
 11  ts_in_delta               59236 non-null  int64  
 12  sequence                  59236 non-null  int64  
 13  bid_px_00                 59236 non-null  float64
 14  ask_p

#### Create the Trading Environment class for the PPO Agent

In [5]:
import gym
from gym import spaces
import torch as th
th.set_num_threads(5)
th.manual_seed(42)

class TradingEnvironment(gym.Env):
    """Single-instrument trading environment with Hold/Buy/Sell actions.

    Observation: ``window`` rows of ``state_columns`` sampled every
    ``window_interval`` rows and ending at the current row; the front is
    zero padded while fewer rows are available.

    Reward: zero unless a trade is executed.  An executed trade earns the
    negated per-share execution cost (slippage + time penalty + transaction
    cost) times the traded shares; a sell additionally earns the realised
    profit against the average purchase price.
    """
    metadata = {'render.modes': ['human']}

    def __init__(self, data, daily_trading_limit):
        """
        :param data: DataFrame with the market columns named in
            ``state_columns`` plus ``Close``, ``Volume``, ``price``,
            ``ask_px_00``, ``ts_in_delta`` and ``ts_event``.  The columns
            ``shares_held`` and ``average_price`` are added to it in place.
        :param daily_trading_limit: stored on the instance; the limit checks
            in ``_take_action`` are currently disabled.
        """
        super(TradingEnvironment, self).__init__()
        self.data = data
        self.daily_trading_limit = daily_trading_limit
        self.current_step = 0
        self.window = 64

        # Columns that make up one observation row.
        self.state_columns = ['Close_normalized', 'Volume_normalized', 'RSI_normalized', 'MACD_normalized', 'MACD_signal_normalized', 'MACD_hist_normalized', 'Stoch_k_normalized', 'Stoch_d_normalized',
                              'OBV_normalized', 'Upper_BB_normalized', 'Middle_BB_normalized', 'Lower_BB_normalized', 'ATR_1_normalized', 'ADX_normalized', '+DI_normalized', '-DI_normalized', 'CCI_normalized',
                              'bid_px_00_normalized','ask_px_00_normalized','ts_in_delta_normalized','ts_recv_normalized',
                              'High_normalized','Low_normalized','Open_normalized',
                              'shares_held','average_price']

        # Initialize balance, shares held, and total shares traded
        self.balance = 10_000_000.0  # $10 million
        self.shares_held = 0
        self.total_shares_traded = 0
        # Created here as well as in reset() so that step()/render() do not
        # fail with AttributeError when called before the first reset().
        self.cumulative_reward = 0
        self.trades = []
        self.data['shares_held'] = 0.0
        self.data['average_price'] = 0.0
        self.window_interval = 32
        self.money_spend = 0

        # Define action space: [Hold, Buy, Sell]
        self.action_space = spaces.Discrete(3)

        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(self.window, len(self.state_columns)), dtype=np.float32)

    def reset(self):
        """Restore the initial account state and return the first observation."""
        self.current_step = 0
        self.balance = 10_000_000.0  # $10 million
        self.shares_held = 0
        self.total_shares_traded = 0
        self.cumulative_reward = 0
        self.trades = []
        self.money_spend = 0
        self.data['shares_held'] = 0.0
        self.data['average_price'] = 0.0
        return self._next_observation()

    def _next_observation(self):
        """Record the position on the current row and build the observation.

        :return: float32 tensor of shape ``(window, len(state_columns))``.
        """
        # The position columns are addressed by name so that extra trailing
        # columns in ``data`` cannot redirect the writes.  Note that the
        # 'average_price' column carries the cumulative traded share count.
        columns = self.data.columns
        self.data.iloc[self.current_step, columns.get_loc('shares_held')] = self.shares_held/100000.0
        self.data.iloc[self.current_step, columns.get_loc('average_price')] = self.total_shares_traded/100000.0

        # Walk backwards from the current row so that it is always the last
        # sample; anchoring the window at row 0 would drop the current row
        # (and the position just written) while the history is still short.
        obs_indices = list(range(self.current_step, -1, -self.window_interval)[:self.window])[::-1]
        obs = self.data.iloc[obs_indices][self.state_columns].values

        # Ensure the shape is (window, num_columns) by padding with zeros if necessary
        if len(obs) < self.window:
            padding = th.zeros((self.window - len(obs), obs.shape[1]))
            obs = th.cat((padding, th.tensor(obs, dtype=th.float32)), dim=0)
        else:
            obs = th.tensor(obs, dtype=th.float32)

        return obs

    def step(self, action):
        """Apply ``action`` on the current row and advance by one row.

        :param action: 0 = hold, 1 = buy, 2 = sell.
        :return: ``(obs, reward, done, info)``; ``info['shares']`` is the size
            of the most recent recorded trade (0 when none exists).
        """
        # Wrap around before reading anything so that the prices below come
        # from the same row the trade is executed on.
        if self.current_step >= len(self.data) - 1:
            self.current_step = 0

        row = self.data.iloc[self.current_step]
        expected_price = row['ask_px_00']
        actual_price = row['price']
        transaction_time = row['ts_in_delta']

        reward = 0

        # Average purchase price of the open position before this trade.
        if self.shares_held == 0:
            current_average_price = 0.0
        else:
            current_average_price = self.money_spend / self.shares_held

        is_actioned = self._take_action(action)

        # when action was done, calculate additional reward for trading
        if action != 0 and is_actioned:
            trade = self.trades[-1]
            shares = trade['shares']
            transaction_cost = self._calculate_transaction_cost(row['Volume'], 0.3, self.data['Volume'].mean())

            cost = self._calculate_reward(expected_price, actual_price, transaction_time, transaction_cost) * shares

            reward += cost

            if action == 2:
                # Release the cost basis of the sold shares and book the
                # realised profit against the average purchase price.
                self.money_spend = max(self.money_spend - current_average_price * shares, 0)
                reward += (actual_price - current_average_price) * shares
            trade['reward'] = reward
            trade['previous_price'] = self.data.iloc[max(self.current_step - 1, 0)]['price']
            trade['cost'] = cost
            trade['transaction_cost'] = transaction_cost * shares
            trade['slippage'] = expected_price - actual_price
            trade['time_penalty'] = 100*transaction_time/1e9

        done = self.current_step == len(self.data) - 2
        obs = self._next_observation()
        info = {
            'step': self.current_step,
            'action': action,
            'price': actual_price,
            'shares': self.trades[-1]['shares'] if self.trades else 0
        }
        self.current_step += 1
        self.cumulative_reward += reward
        return obs, reward, done, info

    def _take_action(self, action):
        """Execute a buy or sell at the current ``Close`` price.

        The order value is a random 0.1%-0.5% of the cash balance; sells are
        capped at the shares held.

        :return: True when at least one share was traded and recorded in
            ``self.trades``; False for hold or when nothing could be traded.
        """
        current_price = self.data.iloc[self.current_step]['Close']
        current_time = pd.to_datetime(self.data.iloc[self.current_step]['ts_event'])
        trade_info = {'step': self.current_step, 'timestamp': current_time,'balance':self.balance,'shares_held':self.shares_held, 'action': action, 'price': current_price, 'shares': 0, 'reward': 0, 'transaction_cost': 0, 'slippage': 0, 'time_penalty': 0}

        if action == 1:  # Buy (the daily_trading_limit check is disabled)
            shares_bought = (self.balance * np.random.uniform(0.001, 0.005)) // current_price
            self.balance -= shares_bought * current_price
            self.money_spend += shares_bought * current_price
            self.shares_held += shares_bought
            self.total_shares_traded += shares_bought
            trade_info['shares'] = shares_bought
            trade_info['balance'] = self.balance
            trade_info['shares_held'] = self.shares_held
            if shares_bought > 0:
                self.trades.append(trade_info)
                return True
            # The balance is too small to buy a single share.
            return False

        if action == 2:  # Sell (the daily_trading_limit check is disabled)
            shares_sold = min((self.balance * np.random.uniform(0.001, 0.005)) // current_price, self.shares_held)
            self.balance += shares_sold * current_price
            self.shares_held -= shares_sold
            self.total_shares_traded += shares_sold
            trade_info['shares'] = shares_sold
            trade_info['balance'] = self.balance
            trade_info['shares_held'] = self.shares_held
            if shares_sold > 0:
                self.trades.append(trade_info)
                return True
            # Nothing is held, so nothing can be sold.
            return False

        # Hold (or any other value): no trade.
        return False

    def _calculate_reward(self, expected_price, actual_price, transaction_time, transaction_cost):
        """Return the negated per-share execution cost.

        The cost is slippage (``expected_price - actual_price``) plus a time
        penalty of ``100 * transaction_time / 1e9`` (presumably
        ``transaction_time`` is in nanoseconds — TODO confirm) plus
        ``transaction_cost``.
        """
        slippage = expected_price - actual_price
        time_penalty = 100*transaction_time/1e9
        return -(slippage + time_penalty + transaction_cost)

    def _calculate_transaction_cost(self, volume, volatility, daily_volume):
        """Return ``volatility * sqrt(volume / daily_volume)``."""
        return volatility * np.sqrt(volume / daily_volume)

    def run(self, policy=None):
        """Play one episode and return ``(cumulative_reward, trades)``.

        :param policy: optional callable mapping an observation to an action;
            when omitted, actions are sampled from ``action_space``.
        """
        obs = self.reset()
        for _ in range(len(self.data)):
            action = self.action_space.sample() if policy is None else policy(obs)
            obs, _, done, _ = self.step(action)
            if done:
                break
        return self.cumulative_reward, self.trades

    def render(self, mode='human', close=False):
        """Print the account summary followed by every recorded trade."""
        print(f'Step: {self.current_step}')
        print(f'Balance: {self.balance}')
        print(f'Shares held: {self.shares_held}')
        print(f'Total shares traded: {self.total_shares_traded}')
        print(f'Total portfolio value: {self.balance + self.shares_held * self.data.iloc[self.current_step]["Close"]}')
        print(f'Cumulative reward: {self.cumulative_reward}')
        self.print_trades()

    def print_trades(self):
        """Write all trades to ``trades_ppo.csv`` and print one line per trade."""
        trades_df = pd.DataFrame(self.trades)
        # Save a csv
        trades_df.to_csv('trades_ppo.csv', index=False)
        for trade in self.trades:
            print(f"Step: {trade['step']}, Timestamp: {trade['timestamp']}, Action: {trade['action']}, Price: {trade['price']}, Shares: {trade['shares']}, Reward: {trade['reward']}, Transaction Cost: {trade['transaction_cost']}, Slippage: {trade['slippage']}, Time Penalty: {trade['time_penalty']}")

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
from typing import Callable, Dict, List, Optional, Tuple, Type, Union

from torch import nn
from stable_baselines3 import PPO
from stable_baselines3.common.policies import ActorCriticPolicy
from stable_baselines3.common.distributions import CategoricalDistribution
from stable_baselines3.common.torch_layers import BaseFeaturesExtractor

class CustomLSTMFeatureExtractor(BaseFeaturesExtractor):
    """
    LSTM features extractor: runs the observation sequence through an LSTM and
    maps the output of the last time step to ``features_dim`` values.

    :param observation_space: (gym.Space)
    :param input_dim: (int or None) Number of values per time step. When
        omitted it is taken from the last dimension of ``observation_space``.
    :param features_dim: (int) Number of features extracted.
        This corresponds to the number of units for the last layer.
    :param lstm_hidden_size: (int) Number of hidden units in the LSTM.
    :param n_lstm_layers: (int) Number of LSTM layers.
    """

    def __init__(self, observation_space: spaces.Box, input_dim: Optional[int] = None, features_dim: int = 64, lstm_hidden_size: int = 64, n_lstm_layers: int = 1):
        super().__init__(observation_space, features_dim)

        # Deriving the width from the observation space keeps the extractor in
        # sync with the environment when its state columns change.
        if input_dim is None:
            input_dim = observation_space.shape[-1]

        self.lstm = nn.LSTM(input_dim, lstm_hidden_size, n_lstm_layers, batch_first=True)

        # Output layer to map LSTM outputs to the desired feature dimension
        self.linear = nn.Sequential(nn.Linear(lstm_hidden_size, features_dim), nn.ReLU())

    def forward(self, observations: th.Tensor) -> th.Tensor:
        """Return the projected LSTM output of the final time step.

        Assumes ``observations`` is (batch_size, sequence_length, input_dim),
        matching ``batch_first=True``.
        """
        lstm_out, _ = self.lstm(observations)
        # Use the last output of the LSTM
        last_out = lstm_out[:, -1, :]
        return self.linear(last_out)


#### Train the PPO agent in the trading environment for a selected ticker.

In [7]:
# Define the daily trading limit (total number of shares to trade per day)
daily_trading_limit = 1000

ticker = 'AAPL'  # Specify the ticker you want to trade
# NOTE(review): market_features_df is the frame from before normalisation, so
# it looks like it lacks the '*_normalized' columns listed in
# TradingEnvironment.state_columns — confirm whether combined_df was intended.
ticker_data = market_features_df[market_features_df['symbol'] == ticker]

# The constructor takes only the data frame and the daily trading limit.
env = TradingEnvironment(ticker_data, daily_trading_limit)

In [8]:

import torch
import torch.nn as nn
import math

class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to a sequence-first tensor.

    :param d_model: size of the last (feature) dimension; odd sizes are
        supported.
    :param max_len: number of positions for which encodings are precomputed.
    """

    def __init__(self, d_model, max_len=5000):
        super(PositionalEncoding, self).__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        angles = position * div_term
        pe[:, 0::2] = torch.sin(angles)
        # With an odd d_model there is one cosine slot fewer than sine slots,
        # so the surplus frequency is dropped instead of raising a shape error.
        pe[:, 1::2] = torch.cos(angles[:, :d_model // 2])
        # Stored as (max_len, 1, d_model) so it broadcasts over the batch axis.
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Return ``x`` plus the encodings of its first ``x.size(0)`` positions.

        ``x`` is indexed as (sequence, batch, d_model).
        """
        return x + self.pe[:x.size(0), :]

class TransAm(nn.Module):
    """Transformer encoder with positional encoding and a causal mask.

    :param feature_size: model width; must be divisible by ``nhead``.
    :param num_layers: number of stacked encoder layers.
    :param dropout: dropout probability of the encoder layers.
    :param nhead: number of attention heads (previously fixed at 10).
    """

    def __init__(self, feature_size=30, num_layers=2, dropout=0.2, nhead=10):
        super(TransAm, self).__init__()
        self.model_type = 'Transformer'
        self.src_mask = None
        self.pos_encoder = PositionalEncoding(feature_size)
        self.encoder_layer = nn.TransformerEncoderLayer(d_model=feature_size, nhead=nhead, dropout=dropout)
        self.transformer_encoder = nn.TransformerEncoder(self.encoder_layer, num_layers=num_layers)
        self.decoder = nn.Linear(feature_size, 1)
        self.init_weights()

    def init_weights(self):
        """Zero the decoder bias and draw its weights from U(-0.1, 0.1)."""
        initrange = 0.1
        self.decoder.bias.data.zero_()
        self.decoder.weight.data.uniform_(-initrange, initrange)

    def forward(self, src):
        """Encode ``src`` (sequence first) and return the encoder output.

        The decoder layer is not applied here.
        """
        seq_len = len(src)
        mask = self.src_mask
        # The cached mask is a plain attribute, so it does not follow the
        # module across devices; rebuild it when length or device changes.
        if mask is None or mask.size(0) != seq_len or mask.device != src.device:
            mask = self._generate_square_subsequent_mask(seq_len).to(src.device)
            self.src_mask = mask
        src = self.pos_encoder(src)
        output = self.transformer_encoder(src, self.src_mask)
        return output

    def _generate_square_subsequent_mask(self, sz):
        """Return a (sz, sz) float mask: 0 on/below the diagonal, -inf above."""
        return torch.triu(torch.full((sz, sz), float('-inf')), diagonal=1)
        
class CustomTransformerFeatureExtractor(BaseFeaturesExtractor):
    """Features extractor built around a pre-trained transformer encoder.

    Each observation row is projected to ``features_dim`` values, passed
    through the loaded transformer, and the output of the last time step is
    returned.

    :param observation_space: (gym.Space) its last dimension sets the input
        width of the projection layer.
    :param features_dim: (int) Number of features extracted; this must equal
        the width the loaded transformer operates on.
    """

    def __init__(self, observation_space: spaces.Box, features_dim: int = 30):
        super().__init__(observation_space, features_dim)

        # Load the pre-trained transformer.
        # SECURITY: torch.load unpickles the whole module object; only load
        # checkpoint files from a trusted source.
        self.transformer = torch.load('./best_model_multi22.pt', map_location=torch.device('cpu'))
        # The prediction head is not used for feature extraction.
        self.transformer.decoder = None
        print("finish loading pre-trained network")
        print(self.transformer)
        # Sizes come from the observation space and features_dim rather than
        # fixed numbers, so the layer follows changes to the state columns.
        self.transformation_layer = nn.Sequential(
            nn.Linear(observation_space.shape[-1], features_dim),
            nn.ReLU(),
        )

    def forward(self, observations: th.Tensor) -> th.Tensor:
        """Return the transformer output at the final time step.

        Assumes ``observations`` is (batch, sequence, features); the
        transformer is fed sequence-first data, hence the two permutes.
        """
        sequence_first = observations.permute(1, 0, 2)
        encoded = self.transformer(self.transformation_layer(sequence_first))
        batch_first = encoded.permute(1, 0, 2)
        return batch_first[:, -1, :]



In [None]:
import pandas as pd
from stable_baselines3 import PPO
# from stable_baselines3.common.utils import linear_schedule
# Define the daily trading limit (total number of shares to trade per day)
daily_trading_limit = 1000

ticker = 'AAPL'  # Specify the ticker you want to trade
# The row filter must be built from reversed_df itself: its rows are in
# reverse order under the original labels, so a mask taken from
# market_features_df would be aligned by label and test the wrong rows.
# .copy() gives the environment its own frame to add position columns to.
ticker_data = reversed_df[reversed_df['symbol'] == ticker].copy()


env = TradingEnvironment(ticker_data, daily_trading_limit)
env.action_space.seed(42)

# Two interchangeable feature extractors; the transformer one is used below.
policy_kwargs_LSTM = dict(
    features_extractor_class=CustomLSTMFeatureExtractor,
    features_extractor_kwargs=dict(features_dim=64, lstm_hidden_size=64, n_lstm_layers=2,input_dim = 26)
)
policy_kwargs_Transformer = dict(
    features_extractor_class=CustomTransformerFeatureExtractor
)

best_hyperparameters = {'learning_rate': 0.0002,'n_steps': 2048,'batch_size': 128, 'gamma': 1,'clip_range': 0.3,'n_epochs': 6,'ent_coef':0.01}

# model = PPO('MlpPolicy', env, policy_kwargs=policy_kwargs_LSTM,verbose=1,**best_hyperparameters)
model = PPO('MlpPolicy', env, policy_kwargs=policy_kwargs_Transformer,verbose=1,**best_hyperparameters)
# Train the model
model.learn(total_timesteps=50000)

# Save the model
model.save("trading_agent")

# Evaluate the model
# NOTE(review): evaluation replays the same frame the agent was trained on.
print("evaluating")
obs = env.reset()
from tqdm import tqdm
for _ in tqdm(range(len(ticker_data)), desc="Processing"):
    action, _states = model.predict(obs)
    obs, rewards, done, info = env.step(action)
    if done:
        break

# Render the final state
env.render()

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
finish loading pre-trained network
TransAm(
  (pos_encoder): PositionalEncoding()
  (encoder_layer): TransformerEncoderLayer(
    (self_attn): MultiheadAttention(
      (out_proj): _LinearWithBias(in_features=30, out_features=30, bias=True)
    )
    (linear1): Linear(in_features=30, out_features=2048, bias=True)
    (dropout): Dropout(p=0.2, inplace=False)
    (linear2): Linear(in_features=2048, out_features=30, bias=True)
    (norm1): LayerNorm((30,), eps=1e-05, elementwise_affine=True)
    (norm2): LayerNorm((30,), eps=1e-05, elementwise_affine=True)
    (dropout1): Dropout(p=0.2, inplace=False)
    (dropout2): Dropout(p=0.2, inplace=False)
  )
  (transformer_encoder): TransformerEncoder(
    (layers): ModuleList(
      (0): TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): _LinearWithBias(in_features=30, out_features=30, bias=True)
        )
        (l

### TRADING BLOTTER:

#### Preprocess the data for the trading blotter:

In [None]:
import pandas as pd
from scipy.stats import norm
import matplotlib.pyplot as plt

# Starting cash balance that the blotter environment restores on every reset.
INITIAL_CASH = 10_000_000  # $10 million

def preprocess_data(df):
    """Add a ``liquidity`` column to ``df`` in place and return ``df``.

    Liquidity is the best-bid size times the best-bid price plus the
    best-ask size times the best-ask price.
    """
    bid_value = df['bid_sz_00'] * df['bid_px_00']
    ask_value = df['ask_sz_00'] * df['ask_px_00']
    df['liquidity'] = bid_value + ask_value
    return df

def calculate_rsi(data, window=14):
    """Return the RSI of ``data`` based on simple rolling means.

    :param data: Series of prices.
    :param window: number of rows in each rolling mean.
    :return: Series of RSI values; rows without a full window are NaN.
    """
    change = data.diff()
    # Entries that are not a rise (or not a fall) are replaced by 0; this
    # includes the undefined first difference.
    rises = change.where(change > 0, 0)
    falls = -change.where(change < 0, 0)

    average_gain = rises.rolling(window=window).mean()
    average_loss = falls.rolling(window=window).mean()

    relative_strength = average_gain / average_loss
    return 100 - (100 / (1 + relative_strength))

def calculate_vol_and_liquidity(price_df, volume_df, window_size):
    """Return rolling mean/std of price returns and of ``volume_df``.

    :param price_df: Series of prices; its percentage changes are used.
    :param volume_df: Series whose values are summarised directly.
    :param window_size: number of rows in each rolling window.
    :return: tuple ``(mean of returns, std of returns, mean of volume_df,
        std of volume_df)``.
    """
    return_windows = price_df.pct_change().rolling(window=window_size)
    liquidity_windows = volume_df.rolling(window=window_size)

    return (
        return_windows.mean(),
        return_windows.std(),
        liquidity_windows.mean(),
        liquidity_windows.std(),
    )

def get_percentile(current_value, mean, std):
    """Return the normal-CDF percentile of ``current_value``.

    The value is standardised with ``mean`` and ``std``.  When ``std`` is not
    greater than zero there is no variation to measure against, and 0.5 is
    returned.
    """
    if not std > 0:
        return 0.5  # No variation
    return norm.cdf((current_value - mean) / std)

def get_trade_price(base_price, current_vol, current_liq, mean_vol, std_vol, mean_liq, std_liq, trade_direction):
    """Return ``base_price`` shifted by a random, market-dependent adjustment.

    The volatility and liquidity percentiles select the range from which one
    uniform adjustment is drawn.  For ``'BUY'`` the price is multiplied by
    ``1 - adjustment``; for any other direction by ``1 + adjustment``.
    """
    vol_percentile = get_percentile(current_vol, mean_vol, std_vol)
    liq_percentile = get_percentile(current_liq, mean_liq, std_liq)

    high_volatility = vol_percentile >= 0.9
    low_volatility = vol_percentile <= 0.1
    thin_liquidity = liq_percentile < 0.1
    deep_liquidity = liq_percentile >= 0.9

    # Pick the adjustment range for the current market condition.
    if high_volatility and thin_liquidity:
        lower, upper = -0.25, -0.15
    elif low_volatility and thin_liquidity:
        lower, upper = -0.10, -0.05
    elif high_volatility and deep_liquidity:
        lower, upper = -0.05, +0.10
    else:
        lower, upper = -0.05, +0.05  # Default for normal conditions

    price_adjustment_percent = np.random.uniform(lower, upper)

    # Adjust price based on trade direction
    if trade_direction == 'BUY':
        return base_price * (1 - price_adjustment_percent)
    return base_price * (1 + price_adjustment_percent)


#### Create trading environment for the blotter

In [None]:
class TradingEnvironmentwithBlotter:
    """RSI-rule trading simulation that logs every executed trade (a blotter).

    Each call to ``step`` reads one row of the data: an RSI below 30
    triggers a buy, an RSI above 70 triggers a sell of existing holdings,
    anything else (including a NaN RSI) is a hold. Executed trades are
    appended to ``self.trades`` together with a reward equal to the negated
    sum of slippage, time penalty and transaction cost.

    Columns read from the data by this class: ``price``, ``liquidity``,
    ``symbol``, ``ts_event``, ``ask_px_00``, ``ts_in_delta``, ``Volume``
    and, in ``render`` only, ``Close``.

    Attributes:
        data: output of ``preprocess_data`` with the derived columns added
            by ``reset``.
        daily_trading_limit: stored as given; not consulted anywhere in
            this class.
        window_size: rolling-window length for the vol/liquidity statistics.
        state_columns: column names describing the state; not read by the
            methods of this class.
        portfolio: ``{'cash': float, 'holdings': {symbol: shares}}``.
        trades: list of dicts, one per executed trade.
        cumulative_reward: running sum of per-trade rewards.
    """

    def __init__(self, data, daily_trading_limit, window_size):
        """Preprocess ``data``, store the settings and reset the simulation."""
        self.data = preprocess_data(data)
        self.daily_trading_limit = daily_trading_limit
        self.window_size = window_size
        self.state_columns = ['price', 'liquidity', 'RSI', 'MACD', 'MACD_signal', 'MACD_hist', 'Stoch_k', 'Stoch_d',
                              'OBV', 'Upper_BB', 'Middle_BB', 'Lower_BB', 'ATR_1', 'ADX', '+DI', '-DI', 'CCI']
        self.reset()

    def reset(self):
        """Restore the initial portfolio and recompute the derived columns."""
        self.current_step = 0
        self._last_step = 0        # index of the most recently processed row
        self._mean_volume = None   # lazily cached mean of data['Volume']
        self.balance = INITIAL_CASH
        self.shares_held = 0
        self.total_shares_traded = 0
        self.cumulative_reward = 0
        self.trades = []
        self.portfolio = {'cash': self.balance, 'holdings': {ticker: 0 for ticker in self.data['symbol'].unique()}}
        self.data['RSI'] = calculate_rsi(self.data['price'])
        self.data['pct_change'] = self.data['price'].pct_change()
        (self.data['rolling_mean_vol'],
         self.data['rolling_std_vol'],
         self.data['rolling_mean_liq'],
         self.data['rolling_std_liq']) = calculate_vol_and_liquidity(
            self.data['price'], self.data['liquidity'], self.window_size)

    def _mean_daily_volume(self):
        """Return the mean of ``data['Volume']``, computed once per reset."""
        if getattr(self, '_mean_volume', None) is None:
            self._mean_volume = self.data['Volume'].mean()
        return self._mean_volume

    def step(self):
        """Process the row at ``current_step`` and advance the cursor.

        Returns:
            bool: True when the row just processed was the last one; in
            that case ``current_step`` has been wrapped back to 0.
        """
        row = self.data.iloc[self.current_step]
        self._last_step = self.current_step
        current_price = row['price']
        current_time = pd.to_datetime(row['ts_event'])
        current_rsi = row['RSI']
        current_vol = row['pct_change']
        current_liq = row['liquidity']
        mean_vol = row['rolling_mean_vol']
        std_vol = row['rolling_std_vol']
        mean_liq = row['rolling_mean_liq']
        std_liq = row['rolling_std_liq']

        # Defaults describe "nothing executed"; the branches below override
        # them only when an order is actually filled.
        trade_direction = 'HOLD'
        trade_size = 0
        trade_price = current_price

        if current_rsi < 30:  # Entry signal based on RSI
            trade_direction = 'BUY'
            trade_price = get_trade_price(current_price, current_vol, current_liq, mean_vol, std_vol, mean_liq, std_liq, trade_direction)
            # Spend a random 0.1%-0.5% of available cash.
            trade_size = (self.portfolio['cash'] * np.random.uniform(0.001, 0.005)) / trade_price
            if self.portfolio['cash'] >= trade_size * trade_price:
                self.portfolio['cash'] -= trade_size * trade_price
                self.portfolio['holdings'][row['symbol']] += trade_size
            else:
                # Cancelled: make sure the order is not logged as a trade.
                trade_size = 0
        elif current_rsi > 70:  # Exit signal based on RSI
            trade_direction = 'SELL'
            held = self.portfolio['holdings'][row['symbol']]
            if held > 0:
                # Size is drawn before the price to keep the original order
                # of random draws (seeded runs stay reproducible).
                trade_size = min(held, self.portfolio['cash'] * np.random.uniform(0.001, 0.005) / current_price)
                trade_price = get_trade_price(current_price, current_vol, current_liq, mean_vol, std_vol, mean_liq, std_liq, trade_direction)
                self.portfolio['cash'] += trade_size * trade_price
                self.portfolio['holdings'][row['symbol']] -= trade_size

        if trade_size > 0:
            expected_price = row['ask_px_00']
            actual_price = row['price']
            transaction_time = row['ts_in_delta']
            # 0.3 is passed as the volatility input of the cost formula.
            transaction_cost = self._calculate_transaction_cost(row['Volume'], 0.3, self._mean_daily_volume())
            slippage = expected_price - actual_price
            # NOTE(review): the 1e9 divisor suggests ts_in_delta is in
            # nanoseconds - confirm against the data source.
            time_penalty = 1000 * transaction_time / 1e9
            reward = - (slippage + time_penalty + transaction_cost)

            self.cumulative_reward += reward
            self.trades.append({
                'step': self.current_step,
                'timestamp': current_time,
                'action': trade_direction,
                'price': trade_price,
                'shares': trade_size,
                'symbol': row['symbol'],
                'reward': reward,
                'transaction_cost': transaction_cost,
                'slippage': slippage,
                'time_penalty': time_penalty
            })

        self.current_step += 1
        # Wrap only after the final row has been processed, so a full pass
        # visits every row exactly once.
        done = self.current_step >= len(self.data)
        if done:
            self.current_step = 0
        return done

    def _calculate_transaction_cost(self, volume, volatility, daily_volume):
        """Return ``volatility * sqrt(volume / daily_volume)``."""
        return volatility * np.sqrt(volume / daily_volume)

    def run(self):
        """Reset, step once through every row, and return the results.

        Returns:
            tuple: ``(cumulative_reward, trades)``.
        """
        self.reset()
        for _ in range(len(self.data)):
            self.step()
        return self.cumulative_reward, self.trades

    def render(self):
        """Print a summary and every trade, and write ``trades_blotter.csv``.

        Holdings are valued at the ``Close`` of the most recently processed
        row. Only the holdings of that row's symbol are included.
        """
        print(f'Cumulative reward: {self.cumulative_reward}')
        row = self.data.iloc[self._last_step]
        total_value = self.portfolio["cash"] + self.portfolio["holdings"][row["symbol"]]*row["Close"]
        print(f'Total portfolio value: {total_value}')
        # get trades in a pandas dataframe
        trades_df = pd.DataFrame(self.trades)
        # Save a csv
        trades_df.to_csv('trades_blotter.csv', index=False)
        for trade in self.trades:
            print(f"Step: {trade['step']}, Timestamp: {trade['timestamp']}, Action: {trade['action']}, Price: {trade['price']}, Shares: {trade['shares']}, Symbol: {trade['symbol']}, Reward: {trade['reward']}, Transaction Cost: {trade['transaction_cost']}, Slippage: {trade['slippage']}, Time Penalty: {trade['time_penalty']}")

#### Run the trading blotter

In [None]:
# Filter data for the specified ticker
ticker = 'AAPL'  # Specify the ticker you want to trade
# Copy the filtered rows: the environment adds columns (RSI, pct_change,
# rolling statistics) to its data, and writing into a slice of
# market_features_df can raise SettingWithCopyWarning.
# NOTE(review): redundant if preprocess_data already returns a fresh frame.
ticker_data = market_features_df[market_features_df['symbol'] == ticker].copy()

window_size = 60
daily_trading_limit = 1000  # Daily trading limit of 1000 shares
# Create the trading environment (pass the variables rather than repeating the literals)
env = TradingEnvironmentwithBlotter(
    ticker_data,
    daily_trading_limit=daily_trading_limit,
    window_size=window_size,
)

# Run the environment
cumulative_reward, trades = env.run()

# Render the results
env.render()

In [None]:
# Work on a copy so later column additions leave market_features_df untouched.
df = market_features_df.copy()

In [None]:
# Parse the ts_recv column with pandas and store it as a new 'timestamp' column.
df['timestamp'] = pd.to_datetime(df['ts_recv'])

In [None]:
# Display the first rows of df as the cell output.
df.head()