In [1]:
import torch
import torch.nn as nn
import gymnasium as gym
from gym.spaces.box import Box
import numpy as np
from statsmodels.tsa.seasonal import seasonal_decompose
import pandas as pd
from stable_baselines3.common.policies import BasePolicy, ActorCriticPolicy
from stable_baselines3.common.distributions import CategoricalDistribution
from sklearn.preprocessing import MinMaxScaler
from stable_baselines3.common.utils import get_device

In [2]:
# Load the raw price/indicator table, replace missing values with 0 and make
# sure the volume column is floating point.
df = pd.read_csv("smh_stocks_max.csv").fillna(0)
df = df.astype({"volume": float})

# Additive decomposition of the closing price with a 180-sample period.
close = df["close"].to_numpy()
tsa = seasonal_decompose(close, model="additive", period=180)

# Attach the three components as new columns; any NaNs the decomposition
# leaves in its outputs are replaced with 0 by the final fillna.
df = df.assign(
    trend=tsa.trend,
    seasonal=tsa.seasonal,
    residual=tsa.resid,
).fillna(0)

In [3]:
# Model inputs (technical indicators) and targets (decomposed close price).
features = df[['volume', 'macd_hist', "macd", "signal_line", "%K", "%D", "ema_200",
       'rsi', "roc", 'bb_upper', 'bb_lower']]
targets = df[["trend","seasonal","residual"]]

# Chronological 80/20 split (no shuffling).
train_size = int(0.8 * len(features))
X_train_set = features.iloc[:train_size, :]
X_test_set = features.iloc[train_size:, :]

y_train_set = targets.iloc[:train_size, :]
y_test_set = targets.iloc[train_size:, :]

# Use one scaler per table. Re-fitting a single scaler on the targets throws
# away the feature min/max, so features could no longer be transformed or
# inverse-transformed after this cell. Both scalers are fitted on the training
# rows only and then applied to the test rows.
feature_scaler = MinMaxScaler(feature_range=(0, 1))
target_scaler = MinMaxScaler(feature_range=(0, 1))

X_train_set = feature_scaler.fit_transform(X_train_set.to_numpy())
X_test_set = feature_scaler.transform(X_test_set.to_numpy())

y_train_set = target_scaler.fit_transform(y_train_set.to_numpy())
y_test_set = target_scaler.transform(y_test_set.to_numpy())

# Backward-compatible alias: `scaler` still ends up fitted on the three target
# columns, exactly as it did before.
scaler = target_scaler

assert len(X_train_set) + len(X_test_set) == len(features)
assert len(y_train_set) == len(X_train_set) and len(y_test_set) == len(X_test_set)

In [4]:
class LSTMNetwork(nn.Module):
    """LSTM encoder with separate actor and critic latent heads.

    The input sequence is run through a stacked LSTM; the hidden output of the
    last time step is fed to two independent ``Linear + ReLU`` heads.

    Args:
        input_size: number of features per time step.
        hidden_size: LSTM hidden width.
        n_layers: number of stacked LSTM layers.
        output_size: stored on the instance but not used by this module.
        actor_size: width of the actor latent (exposed as ``latent_dim_pi``).
        critic_size: width of the critic latent (exposed as ``latent_dim_vf``).

    The module is created on the default device; move it with ``.to(device)``.
    Hard-coding ``device="cuda"`` on the LSTM alone left the heads on the CPU
    and made the class unusable on machines without a GPU.
    """

    def __init__(self,input_size, hidden_size, n_layers, output_size, actor_size, critic_size):
        super(LSTMNetwork, self).__init__()

        self.input_size = input_size
        self.hidden_size = hidden_size
        self.n_layers = n_layers
        self.output_size = output_size

        self.latent_dim_pi = actor_size
        self.latent_dim_vf = critic_size

        self.lstm = nn.LSTM(input_size, hidden_size, n_layers, batch_first=True)

        self.policy_network = nn.Sequential(
            nn.Linear(hidden_size, self.latent_dim_pi),
            nn.ReLU()
        )

        self.value_net = nn.Sequential(
            nn.Linear(hidden_size, self.latent_dim_vf),
            nn.ReLU()
        )

    def _encode(self, x):
        """Return the LSTM output at the last time step, shape ``(batch, hidden_size)``.

        A 3-D input is used as ``(batch, seq_len, input_size)``. Any other rank
        (for example a flattened ``(batch, seq_len * input_size)`` tensor) is
        reshaped to ``(batch, -1, input_size)`` first; such inputs previously
        always raised.
        """
        if x.dim() != 3:
            x = x.reshape(x.shape[0], -1, self.input_size)
        lstm_out, _ = self.lstm(x)
        return lstm_out[:, -1, :]

    def forward_actor(self, x):
        """Actor latent for ``x``, shape ``(batch, latent_dim_pi)``."""
        return self.policy_network(self._encode(x))

    def forward_critic(self,x):
        """Critic latent for ``x``, shape ``(batch, latent_dim_vf)``."""
        # Previously this method called itself and recursed forever; the
        # critic head is ``value_net``.
        return self.value_net(self._encode(x))

    def forward(self, x):
        """Return ``(actor_latent, critic_latent)`` using a single LSTM pass."""
        encoded = self._encode(x)
        return self.policy_network(encoded), self.value_net(encoded)
    

class CustomActorCriticPolicy(ActorCriticPolicy):
    """Actor-critic policy whose latent extractor is an ``LSTMNetwork``.

    Behaves like ``ActorCriticPolicy`` except that ``ortho_init`` is always
    forced to ``False`` and the MLP extractor is replaced by the LSTM network.
    """

    def __init__(self, observation_space, action_space, lr_schedule, *args, **kwargs):
        # Override whatever the caller supplied for ortho_init.
        kwargs.update(ortho_init=False)
        super().__init__(observation_space, action_space, lr_schedule, *args, **kwargs)

    def _build_mlp_extractor(self) -> None:
        # Positional arguments, in order: input_size=11, hidden_size=128,
        # n_layers=3, output_size=3, actor_size=64, critic_size=64.
        extractor = LSTMNetwork(11, 128, 3, 3, 64, 64)
        self.mlp_extractor = extractor

In [5]:
# Sanity check: ask PyTorch which CUDA device index is currently selected
# (the output recorded for this cell is 0).
torch.cuda.current_device()

0

In [6]:
class CustomLSTMPolicy(BasePolicy):
    """LSTM encoder followed by small actor and critic MLP heads.

    Observations are consumed as ``(batch, seq_len, n_features)`` tensors (the
    LSTM is built with ``batch_first=True``); only the LSTM output at the last
    time step is passed to the heads. When sampling or scoring discrete
    actions, the actor output is interpreted as unnormalised logits over
    ``output_size`` choices.

    Args:
        observation_space: space whose last axis is the per-step feature count.
        action_space: forwarded to ``BasePolicy``.
        lstm_hidden_size: LSTM hidden width.
        n_layers: number of stacked LSTM layers.
        output_size: width of the actor head output.
        activation_fn: activation class used inside both heads.
        **kwargs: forwarded to ``BasePolicy.__init__``.
    """

    def __init__(self, observation_space, action_space, lstm_hidden_size, n_layers, output_size=1, activation_fn=nn.ReLU, **kwargs):
        super(CustomLSTMPolicy, self).__init__(observation_space, action_space, **kwargs)

        # The feature count is the last axis; indexing with [2] only worked
        # for observation spaces of exactly three (or more) dimensions.
        self.lstm = nn.LSTM(input_size=observation_space.shape[-1], hidden_size=lstm_hidden_size, num_layers=n_layers, batch_first=True)

        self.actor = self.build_fc(lstm_hidden_size, output_size, activation_fn=activation_fn)
        self.critic = self.build_fc(lstm_hidden_size, 1, activation_fn=activation_fn)

        self.n_layers = n_layers
        self.hidden_size = lstm_hidden_size

    def build_fc(self, input_dim, output_dim, activation_fn):
        """Build a ``Linear(input_dim, 64) -> activation -> Linear(64, output_dim)`` head."""
        head = nn.Sequential(
            nn.Linear(input_dim, 64),
            activation_fn(),
            nn.Linear(64, output_dim),
        )
        return head.to(self.device)

    def _encode(self, obs, hidden_state=None):
        """Run the LSTM; return ``(last_step_output, new_hidden_state)``."""
        lstm_out, hidden_state = self.lstm(obs, hidden_state)
        return lstm_out[:, -1, :], hidden_state

    @staticmethod
    def _distribution(logits):
        """Wrap actor logits in a ready-to-use ``CategoricalDistribution``.

        The constructor takes the number of actions, not the logits; the
        logits are supplied through ``proba_distribution``.
        """
        return CategoricalDistribution(logits.shape[-1]).proba_distribution(action_logits=logits)

    def forward(self, obs, hidden_state=None, mask=None, **kwargs):
        """Return ``(actor_out.unsqueeze(0), critic_out, hidden_state)``.

        ``mask`` and extra keyword arguments are accepted but unused. When no
        hidden state is given, a zero state sized to the batch is used.
        """
        if hidden_state is None:
            hidden_state = self.reset_hidden_state(batch_size=obs.shape[0])

        features, hidden_state = self._encode(obs, hidden_state)

        actor_out = self.actor(features)
        critic_out = self.critic(features)

        return actor_out.unsqueeze(0), critic_out, hidden_state

    def _predict(self, observation, deterministic=False):
        """Pick an action index per batch row; returns ``(actions, None)``.

        NOTE(review): the tuple return is kept for compatibility with existing
        callers. Stable-Baselines3's ``BasePolicy.predict`` appears to expect
        ``_predict`` to return a bare tensor -- confirm before relying on
        ``predict()``.
        """
        features, _ = self._encode(observation)
        logits = self.actor(features)

        if deterministic:
            actions = logits.argmax(dim=-1)
        else:
            actions = self._distribution(logits).sample()

        return actions, None

    def get_action_dist(self, observation):
        """Return the categorical action distribution for ``observation``."""
        features, _ = self._encode(observation)
        return self._distribution(self.actor(features))

    def evaluate_actions(self, obs, hidden_state, actions):
        """Return ``(values, log_probs, hidden_state)`` for the given actions.

        ``actions`` holds one action index per batch row.
        """
        features, hidden_state = self._encode(obs, hidden_state)
        logits = self.actor(features)
        values = self.critic(features)

        # The actor emits raw linear outputs, so take log-softmax rather than
        # log() directly; log() of a negative output is NaN.
        log_probs_all = torch.log_softmax(logits, dim=-1)
        log_probs = log_probs_all.gather(1, actions.long().reshape(-1, 1)).squeeze(1)

        return values, log_probs, hidden_state

    def reset_hidden_state(self, batch_size=1):
        """Return a zero ``(h_0, c_0)`` pair of shape ``(n_layers, batch_size, hidden_size)``."""
        shape = (self.n_layers, batch_size, self.hidden_size)
        return (torch.zeros(shape, device=self.device),
                torch.zeros(shape, device=self.device))
        

In [7]:
class StockTradingEnv(gym.Env):
    """Gymnasium environment that scores a forecasting model on sliding windows.

    Each observation is a window of ``seq_length`` consecutive rows of ``data``
    with a leading axis of size 1, matching ``observation_space``. On every
    step the wrapped model predicts scaled price components for the current
    window; these and the target row at ``current_step`` are mapped back with
    ``scaler.inverse_transform`` and summed into a price. The reward is the
    negative squared error between the two prices.

    Args:
        data: 2-D array of scaled features, one row per time step.
        targets: 2-D array of scaled target components aligned with ``data``.
        seq_length: number of rows per observation window.
        n_features: number of feature columns in ``data``.
        policy: model whose ``forward(obs_tensor)`` returns the prediction
            (or a tuple whose first element is the prediction).
        scaler: scaler already fitted on the target components.

    Note: the ``action`` passed to ``step`` is not used; the prediction comes
    from ``policy``.
    """

    def __init__(self, data, targets, seq_length, n_features, policy, scaler):
        super().__init__()

        self.action_space = gym.spaces.box.Box(low=0, high=2000, shape=(1,1,3), dtype=np.float32)
        self.observation_space = gym.spaces.box.Box(low=0, high=1, shape=(1,seq_length,n_features), dtype=np.float32)

        self.data = data
        self.targets = targets
        self.scaler = scaler
        # NOTE(review): this internally built policy is not used by the
        # environment itself (step() uses self.model); kept so the attribute
        # remains available.
        self.policy = CustomLSTMPolicy(self.observation_space, self.action_space, 128, 3, 3, nn.ReLU)
        self.seq_length = seq_length
        self.current_step = seq_length

        self.model = policy

        self.observation = self._window()
        self.done = False
        self.info = {}

    def _window(self):
        """Current observation, shaped ``(1, seq_length, n_features)`` float32."""
        start = self.current_step - self.seq_length
        rows = np.asarray(self.data[start:self.current_step], dtype=np.float32)
        return rows[np.newaxis, ...]

    def _to_price(self, scaled_components):
        """Inverse-scale one row of components and sum them into a price."""
        n_components = self.action_space.shape[-1]
        row = np.asarray(scaled_components, dtype=np.float64).reshape(-1, n_components)
        return float(self.scaler.inverse_transform(row).sum(axis=1)[0])

    def step(self, action):
        """Advance one time step.

        Returns the Gymnasium 5-tuple
        ``(observation, reward, terminated, truncated, info)``.
        """
        if self.current_step >= len(self.data) - 1:
            self.done = True
            return self.observation, 0.0, self.done, False, {}

        with torch.no_grad():
            # Feed the tensor (not the raw numpy window) on the model's device.
            device = getattr(self.model, "device", "cpu")
            obs_tensor = torch.as_tensor(self.observation, dtype=torch.float32, device=device)
            outputs = self.model.forward(obs_tensor)
            # The model may return (actor_out, critic_out, hidden_state).
            actor_out = outputs[0] if isinstance(outputs, (tuple, list)) else outputs

        predicted_price = self._to_price(actor_out.cpu().numpy())
        actual_price = self._to_price(self.targets[self.current_step])

        # Numeric reward: higher is better, so negate the squared error.
        reward = -float((predicted_price - actual_price) ** 2)

        self.current_step += 1
        self.observation = self._window()

        return self.observation, reward, self.done, False, self.info

    def reset(self, *, seed=None, options=None):
        """Rewind to the first full window and return ``(observation, info)``."""
        super().reset(seed=seed)
        self.current_step = self.seq_length
        self.done = False
        self.observation = self._window()
        return self.observation, self.info


In [8]:
from stable_baselines3 import A2C

# Build both spaces from Gymnasium (`gym` is the gymnasium alias). The bare
# `Box` name imported at the top comes from the legacy `gym` package, which
# mixes two incompatible space hierarchies.
# output_size=3 matches the three-component action shape; the default of 1
# would emit a single value that the environment cannot inverse-scale.
custom_policy = CustomLSTMPolicy(
    gym.spaces.Box(low=0, high=1, shape=(1, 20, 11), dtype=np.float32),
    gym.spaces.Box(low=0, high=2000, shape=(3,), dtype=np.float32),
    lstm_hidden_size=128,
    n_layers=3,
    output_size=3,
)

# Pass the scaler that the preprocessing cell left fitted on the target
# columns; a fresh, unfitted MinMaxScaler() cannot inverse_transform.
env = StockTradingEnv(data=X_train_set, targets=y_train_set, seq_length=20, n_features=11, policy=custom_policy, scaler=scaler)

# NOTE(review): the AttributeError recorded under this cell is raised from the
# A2C call. A2C's `policy` argument is documented as a policy class or a
# registered policy name, while this passes an already-built instance.
# CustomLSTMPolicy's constructor and evaluate_actions signatures also differ
# from ActorCriticPolicy, so it presumably cannot be used here as-is -- TODO:
# train with an SB3-compatible policy class.
a2c_model = A2C(custom_policy, env, verbose=1)
a2c_model.learn(total_timesteps=len(X_train_set))

Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


AttributeError: 'Box' object has no attribute 'dim'

In [23]:
from stable_baselines3 import PPO

# Train PPO with the LSTM-backed actor-critic policy on `env` for 5000 timesteps.
model = PPO(policy=CustomActorCriticPolicy, env=env, verbose=1)
model.learn(total_timesteps=5000)

Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


AssertionError: The algorithm only supports (<class 'gymnasium.spaces.box.Box'>, <class 'gymnasium.spaces.discrete.Discrete'>, <class 'gymnasium.spaces.multi_discrete.MultiDiscrete'>, <class 'gymnasium.spaces.multi_binary.MultiBinary'>) as action spaces but Box(0.0, 2000.0, (1, 1, 3), float32) was provided