In [44]:
import gymnasium as gym
from gymnasium.spaces import Discrete
from gymnasium.spaces import Box
import numpy as np
import pandas as pd

In [45]:
class TrainingEnv(gym.Env):

    def __init__(self, data, episode_length = 250, budget=10000):
        self.budget = budget
        self.portfolio_value = budget
        self.cur_row_num = 0
        self.starting_row_num = 0
        self.asset_allocation = 0.0
        self.data = data
        self.episode_length = episode_length
        self.cur_action = 0
    
        # action space: Sell 25%, sell 10%, no change, buy 10%, buy 25% (percentages are of total portfolio value, asset + cash, at each timestep)
        self.action_space = Discrete(5)

        # observation space: Close, Volume, SMA Ratio, RSI, Bandwidth, Asset Allocation
        self.observation_space = Box(low=np.array([0.0, 0.0, 0.0, 0.0, 0.0, 0.0]),
            high=np.array([np.inf, np.inf, np.inf, 100.0, np.inf, 1.0]), dtype=np.float64)

    # returns the current row in dataframe with current asset allocation appended
    def _get_obs(self):
        obs = np.array(self.data.iloc[self.cur_row_num, :])
        obs = np.append(obs, self.asset_allocation)
        return obs

    # returns current portfolio value
    def _get_info(self):
        return {'Portfolio Value': self.portfolio_value, 'Action Taken': self.cur_action, 'Asset Allocation': self.asset_allocation}
    
    def _print_episode_return(self):
        print(f'Current Episode Return: {(self.portfolio_value - self.budget) / self.budget}')

    # sets the starting row and starting asset allocation
    def reset(self, seed=5):
        super().reset(seed=seed)
        self._print_episode_return()
        self.portfolio_value = self.budget
        self.cur_action = 0

        self.starting_row_num = self.np_random.integers(0, 0.7 * len(self.data) - self.episode_length - 1)

        self.cur_row_num = self.starting_row_num

        rand = self.np_random.random()
        if rand < 0.7:
            self.asset_allocation = 0.0
        else:
            self.asset_allocation = rand
        
        return self._get_obs(), self._get_info()

    # moves to the next row in data, updates reward and current portfolio value
    def step(self, action):
        self.cur_action = action
        self.cur_row_num += 1
        if (self.cur_row_num - self.starting_row_num) > self.episode_length:
            terminated = True
        else:
            terminated = False
        truncated = False
        self.asset_allocation = self._action_to_allocation(action)
        obs = self._get_obs()
        rew = self._get_reward()
        info = self._get_info()
        return obs, rew, terminated, truncated, info
    
    # converts action to asset allocation value
    def _action_to_allocation(self, action):
        allocation_change = 0.0
        if action == 0: allocation_change = -.25
        elif action == 1: allocation_change = -.1
        elif action == 2: allocation_change = 0.0
        elif action == 3: allocation_change = .1
        else: allocation_change = 0.25
        return max(0.0, min(1.0, self.asset_allocation + allocation_change))
    
    # calculates new portfolio value 
    # accounts for possible commision costs + slippage by applying a fixed .2% cost to the price of each trade
    def _get_new_portfolio_value(self):
        asset_change = (self.data.iloc[self.cur_row_num, 0] - self.data.iloc[self.cur_row_num - 1, 0]) / self.data.iloc[self.cur_row_num - 1, 0]
        new_portfolio_value = self.portfolio_value * (self.asset_allocation * (1.0 + asset_change) + (1.0 - self.asset_allocation))
        allocation_change = 0.0
        if self.cur_action == 0: allocation_change = .25
        elif self.cur_action == 1: allocation_change = .1
        elif self.cur_action == 2: allocation_change = 0.0
        elif self.cur_action == 3: allocation_change = .1
        else: allocation_change = 0.25
        new_portfolio_value = new_portfolio_value - (.002 * allocation_change * self.portfolio_value)
        return new_portfolio_value
    
    # returns reward in the form of regular percent return of the total portfolio (stock + cash) over this timestep
    def _get_reward(self):
        new_portfolio_value = self._get_new_portfolio_value()
        reward = (new_portfolio_value - self.portfolio_value) / self.portfolio_value
        self.portfolio_value = new_portfolio_value
        return reward * 100     # scaling up rewards for better training


In [51]:
class TestingEnv(TrainingEnv):

    def __init__(self, data):
        TrainingEnv.__init__(data)

    # sets the starting row and starting asset allocation
    def reset(self, seed=5):
        super().reset(seed=seed)
        super().portfolio_value = self.budget
        self.cur_action = 0

        self.starting_row_num = 0.7 * len(self.data)

        self.cur_row_num = self.starting_row_num

        self.asset_allocation = 0.0
        
        return self._get_obs(), self._get_info()
    
    # moves to the next row in data, updates reward and current portfolio value
    def step(self, action):
        self.cur_action = action
        self.cur_row_num += 1
        if self.cur_row_num > 0.8 * len(self.data)
            terminated = True
        else:
            terminated = False
        truncated = False
        self.asset_allocation = self._action_to_allocation(action)
        obs = self._get_obs()
        rew = self._get_reward()
        info = self._get_info()
        return obs, rew, terminated, truncated, info

In [46]:
from stable_baselines3.common.env_checker import check_env

In [47]:
data = pd.read_csv('Amazon Data.csv')
data.drop(labels=['Date'], axis=1, inplace=True)
data

Unnamed: 0,Close,Volume,SMA Ratio,RSI,Bandwidth
0,0.000452,0.022338,0.665141,0.570910,0.354954
1,0.000149,0.033351,0.645005,0.459289,0.322549
2,0.000328,0.031779,0.625469,0.327539,0.317990
3,0.000446,0.003155,0.612184,0.316933,0.268611
4,0.000421,0.010204,0.589496,0.374103,0.237668
...,...,...,...,...,...
6853,5.328879,0.008557,0.391774,0.345607,0.036780
6854,5.410236,0.010644,0.389779,0.328603,0.034522
6855,5.279399,0.008831,0.385461,0.262110,0.035667
6856,5.279874,0.007362,0.381246,0.198390,0.036765


In [48]:
env = TrainingEnv(data)
check_env(env, warn=True)

Current Episode Return: 0.0
Current Episode Return: 0.0
Current Episode Return: -0.0009638082099241728


In [49]:
from stable_baselines3 import DQN

model = DQN("MlpPolicy", env)
model.learn(total_timesteps=250 * 100)

Current Episode Return: 0.016741351770058463
Current Episode Return: 0.2316171827260043
Current Episode Return: 0.1497518524849178
Current Episode Return: 0.08305386038541601
Current Episode Return: 0.19711874161971135
Current Episode Return: 0.230572826508246
Current Episode Return: -0.0802903624427583
Current Episode Return: 0.197251085299401
Current Episode Return: 0.18612912463001993
Current Episode Return: -0.04550119548914645
Current Episode Return: 0.0049286170847604805
Current Episode Return: -0.11092865001668979
Current Episode Return: -0.001766947291415272
Current Episode Return: 0.22924277227252587
Current Episode Return: 0.23651720831655912
Current Episode Return: 0.10415674544893645
Current Episode Return: 0.20889039799234452
Current Episode Return: 0.14846680883338467
Current Episode Return: 0.15598126597516676
Current Episode Return: 0.08320508401616407
Current Episode Return: 0.04467683363435517
Current Episode Return: 0.12609456263528865
Current Episode Return: 0.24840

<stable_baselines3.dqn.dqn.DQN at 0x233738307c0>

In [50]:
obs, info = env.reset()
while True:
    action, _states = model.predict(obs, deterministic=True)
    obs, reward, terminated, truncated, info = env.step(action)
    print(info)
    if terminated or truncated:
        obs, info = env.reset()

Current Episode Return: -0.1942147270508929
{'Portfolio Value': 9993.788896075952, 'Action Taken': array(3, dtype=int64)}
{'Portfolio Value': 10076.028245010499, 'Action Taken': array(3, dtype=int64)}
{'Portfolio Value': 9897.07109585442, 'Action Taken': array(3, dtype=int64)}
{'Portfolio Value': 10217.230143752866, 'Action Taken': array(3, dtype=int64)}
{'Portfolio Value': 10415.229834890843, 'Action Taken': array(3, dtype=int64)}
{'Portfolio Value': 10702.726850541663, 'Action Taken': array(3, dtype=int64)}
{'Portfolio Value': 10617.20289287785, 'Action Taken': array(3, dtype=int64)}
{'Portfolio Value': 10337.965198261265, 'Action Taken': array(3, dtype=int64)}
{'Portfolio Value': 10075.817508732187, 'Action Taken': array(3, dtype=int64)}
{'Portfolio Value': 10066.086996916925, 'Action Taken': array(3, dtype=int64)}
{'Portfolio Value': 9848.071080420936, 'Action Taken': array(3, dtype=int64)}
{'Portfolio Value': 10160.012293988675, 'Action Taken': array(3, dtype=int64)}
{'Portfolio V

KeyboardInterrupt: 

In [None]:
# next step: creating wrapper class for use on testing data, think more about metrics to add to analyze performance during training, incorporate optuna for hyperparameter tuning
# also add tests for custom environment 