In [259]:
import gymnasium as gym
from gymnasium.spaces import Discrete
from gymnasium.spaces import Box
import numpy as np
import pandas as pd

In [260]:
class TrainingEnv(gym.Env):

    def __init__(self, data, episode_length = 250, budget=10000):
        self.budget = budget
        self.portfolio_value = budget
        self.cur_row_num = 0
        self.starting_row_num = 0
        self.asset_allocation = 0.0
        self.data = data
        self.episode_length = episode_length
        self.cur_action = 0
    
        # action space: Sell 25%, sell 10%, no change, buy 10%, buy 25% (percentages are of total portfolio value, asset + cash, at each timestep)
        self.action_space = Discrete(5)

        # observation space: Close, Volume, SMA Ratio, RSI, Bandwidth, Asset Allocation
        self.observation_space = Box(low=np.array([0.0, 0.0, 0.0, 0.0, 0.0, 0.0]),
            high=np.array([np.inf, np.inf, np.inf, 100.0, np.inf, 1.0]), dtype=np.float64)

    # returns the current row in dataframe with current asset allocation appended
    def _get_obs(self):
        obs = np.array(self.data.iloc[self.cur_row_num, :])
        obs = np.append(obs, self.asset_allocation)
        return obs

    # returns current portfolio value
    def _get_info(self):
        return {'Portfolio Value': self.portfolio_value, 'Action Taken': self.cur_action, 'Asset Allocation': self.asset_allocation}

    # sets the starting row and starting asset allocation
    def reset(self, seed=5):
        super().reset(seed=seed)
        self.portfolio_value = self.budget
        self.cur_action = 0

        self.starting_row_num = self.np_random.integers(0, int(len(self.data)) - self.episode_length - 1)

        self.cur_row_num = self.starting_row_num
        self.asset_allocation = 0.0
        
        return self._get_obs(), self._get_info()

    # moves to the next row in data, updates reward and current portfolio value
    def step(self, action):
        self.cur_action = action
        self.cur_row_num += 1
        if (self.cur_row_num - self.starting_row_num) > self.episode_length:
            terminated = True
        else:
            terminated = False
        truncated = False
        self.asset_allocation = self._action_to_allocation(action)
        obs = self._get_obs()
        rew = self._get_reward()
        info = self._get_info()
        return obs, rew, terminated, truncated, info
    
    # converts action to asset allocation value
    def _action_to_allocation(self, action):
        allocation_change = 0.0
        if action == 0: allocation_change = -.25
        elif action == 1: allocation_change = -.1
        elif action == 2: allocation_change = 0.0
        elif action == 3: allocation_change = .1
        else: allocation_change = 0.25
        return max(0.0, min(1.0, self.asset_allocation + allocation_change))
    
    # calculates new portfolio value 
    # accounts for possible commision costs + slippage by applying a fixed .2% cost to the price of each trade
    def _get_new_portfolio_value(self):
        asset_change = (self.data.iloc[self.cur_row_num, 0] - self.data.iloc[self.cur_row_num - 1, 0]) / self.data.iloc[self.cur_row_num - 1, 0]
        new_portfolio_value = self.portfolio_value * (self.asset_allocation * (1.0 + asset_change) + (1.0 - self.asset_allocation))
        allocation_change = 0.0
        if self.cur_action == 0: allocation_change = .25
        elif self.cur_action == 1: allocation_change = .1
        elif self.cur_action == 2: allocation_change = 0.0
        elif self.cur_action == 3: allocation_change = .1
        else: allocation_change = 0.25
        new_portfolio_value = new_portfolio_value - (.002 * allocation_change * self.portfolio_value)
        return new_portfolio_value
    
    # returns reward in the form of regular percent return of the total portfolio (stock + cash) over this timestep
    def _get_reward(self):
        new_portfolio_value = self._get_new_portfolio_value()
        reward = (new_portfolio_value - self.portfolio_value) / self.portfolio_value
        self.portfolio_value = new_portfolio_value
        return reward 

In [261]:
class TestingEnv(TrainingEnv):

    def __init__(self, data):
        TrainingEnv.__init__(self, data)

    # sets the starting row and starting asset allocation
    def reset(self, seed=5):
        super().reset(seed=seed)
        self.portfolio_value = self.budget
        self.cur_action = 0

        self.starting_row_num = 0

        self.cur_row_num = self.starting_row_num

        self.asset_allocation = 0.0
        
        return self._get_obs(), self._get_info()
    
    # moves to the next row in data, updates reward and current portfolio value
    def step(self, action):
        self.cur_action = action
        self.cur_row_num += 1
        if self.cur_row_num >= int(len(self.data)) - 1:
            terminated = True
        else:
            terminated = False
        truncated = False
        self.asset_allocation = self._action_to_allocation(action)
        obs = self._get_obs()
        rew = self._get_reward()
        info = self._get_info()
        return obs, rew, terminated, truncated, info

In [262]:
from stable_baselines3.common.env_checker import check_env

In [263]:
data = pd.read_csv('Amazon Data.csv')
data.drop(labels=['Date'], axis=1, inplace=True)
data

Unnamed: 0,Close,Volume,SMA Ratio,RSI,Bandwidth
0,0.000452,0.022338,0.665141,0.570910,0.354954
1,0.000149,0.033351,0.645005,0.459289,0.322549
2,0.000328,0.031779,0.625469,0.327539,0.317990
3,0.000446,0.003155,0.612184,0.316933,0.268611
4,0.000421,0.010204,0.589496,0.374103,0.237668
...,...,...,...,...,...
6853,5.328879,0.008557,0.391774,0.345607,0.036780
6854,5.410236,0.010644,0.389779,0.328603,0.034522
6855,5.279399,0.008831,0.385461,0.262110,0.035667
6856,5.279874,0.007362,0.381246,0.198390,0.036765


In [264]:
training_data = data.iloc[0:int(0.7 * len(data)), :].copy()
validation_data = data.iloc[int(0.7 * len(data)):int(0.8 * len(data)), :].copy()
testing_data = data.iloc[int(0.8 * len(data)):, :].copy()

In [265]:
training_data

Unnamed: 0,Close,Volume,SMA Ratio,RSI,Bandwidth
0,0.000452,0.022338,0.665141,0.570910,0.354954
1,0.000149,0.033351,0.645005,0.459289,0.322549
2,0.000328,0.031779,0.625469,0.327539,0.317990
3,0.000446,0.003155,0.612184,0.316933,0.268611
4,0.000421,0.010204,0.589496,0.374103,0.237668
...,...,...,...,...,...
4795,0.969075,0.066632,0.363506,0.368270,0.021978
4796,0.919071,0.099708,0.358066,0.275162,0.042683
4797,0.935128,0.047438,0.354769,0.316028,0.049917
4798,0.929883,0.046399,0.350616,0.374791,0.057410


In [266]:
validation_data

Unnamed: 0,Close,Volume,SMA Ratio,RSI,Bandwidth
4800,0.908021,0.032600,0.339769,0.264389,0.077814
4801,0.893772,0.044634,0.332787,0.258751,0.088688
4802,0.929312,0.052938,0.326556,0.390593,0.086616
4803,0.932666,0.028172,0.320953,0.402514,0.085521
4804,0.913790,0.077769,0.315254,0.377905,0.085911
...,...,...,...,...,...
5481,2.368006,0.021343,0.389239,0.421960,0.056087
5482,2.375523,0.020648,0.386700,0.425832,0.049572
5483,2.343420,0.035143,0.384109,0.411629,0.043731
5484,2.306821,0.042756,0.380230,0.393911,0.039110


In [267]:
testing_data

Unnamed: 0,Close,Volume,SMA Ratio,RSI,Bandwidth
5486,2.253867,0.023340,0.369628,0.247232,0.039533
5487,2.216103,0.038361,0.363707,0.222631,0.050578
5488,2.202472,0.040697,0.358006,0.244681,0.062264
5489,2.164315,0.043037,0.351757,0.242447,0.078364
5490,2.095197,0.053649,0.341933,0.190121,0.103498
...,...,...,...,...,...
6853,5.328879,0.008557,0.391774,0.345607,0.036780
6854,5.410236,0.010644,0.389779,0.328603,0.034522
6855,5.279399,0.008831,0.385461,0.262110,0.035667
6856,5.279874,0.007362,0.381246,0.198390,0.036765


In [268]:
training_env = TrainingEnv(training_data)
check_env(training_env, warn=True)
validation_env = TestingEnv(validation_data)
check_env(validation_env, warn=True)
testing_env = TestingEnv(testing_data)
check_env(testing_env, warn=True)

In [269]:
from stable_baselines3 import DQN
from stable_baselines3.common.callbacks import EvalCallback, StopTrainingOnNoModelImprovement

stop_train_callback = StopTrainingOnNoModelImprovement(max_no_improvement_evals=1, min_evals=5, verbose=1)
eval_callback = EvalCallback(validation_env, eval_freq=1000, callback_after_eval=stop_train_callback, verbose=1)

model = DQN("MlpPolicy", training_env, buffer_size=5000, learning_rate=0.001)
model.learn(int(1e10), callback=eval_callback)



Eval num_timesteps=1000, episode_reward=0.91 +/- 0.00
Episode length: 685.00 +/- 0.00
New best mean reward!
Eval num_timesteps=2000, episode_reward=0.90 +/- 0.00
Episode length: 685.00 +/- 0.00
Eval num_timesteps=3000, episode_reward=0.90 +/- 0.00
Episode length: 685.00 +/- 0.00
Eval num_timesteps=4000, episode_reward=0.90 +/- 0.00
Episode length: 685.00 +/- 0.00
Eval num_timesteps=5000, episode_reward=0.90 +/- 0.00
Episode length: 685.00 +/- 0.00
Eval num_timesteps=6000, episode_reward=0.90 +/- 0.00
Episode length: 685.00 +/- 0.00
Eval num_timesteps=7000, episode_reward=0.90 +/- 0.00
Episode length: 685.00 +/- 0.00
Stopping training because there was no new best model in the last 2 evaluations


<stable_baselines3.dqn.dqn.DQN at 0x16b4c911100>

In [270]:
# from stable_baselines3 import DQN
# import optuna

# max_return = 0.0

# def objective(trial):
#     global max_return
#     learning_rate = trial.suggest_float('learning rate', 0.0001, 0.01)
#     gamma = trial.suggest_float('gamma', 0.95, 0.999)
#     exploration_fraction = trial.suggest_float('exploration fraction', 0.1, 0.3)
#     total_timesteps = trial.suggest_int('total_timesteps', 250 * 50, 250 * 1000)


#     model = DQN("MlpPolicy", training_env, learning_rate=learning_rate, 
#                 buffer_size=5000, gamma=gamma, exploration_fraction=exploration_fraction)
    

#     model.learn(total_timesteps=total_timesteps)

#     obs, info = validation_env.reset()
#     while True:
#         action, _states = model.predict(obs, deterministic=True)
#         obs, reward, terminated, truncated, info = validation_env.step(action)
#         if terminated or truncated:
#             break

#     raw_return = (validation_env.portfolio_value - validation_env.budget) / validation_env.budget
#     if raw_return > max_return:
#         model.save("RL_Agent")
#         max_return = raw_return
#     del model
#     return -raw_return

    


# study = optuna.create_study()
# study.optimize(objective, n_trials = 20, show_progress_bar=True)

# print(f'Best Parameters: {study.best_params}')
# print(f'Best value: {study.best_value}')

In [271]:
# next step: think more about metrics to add to analyze performance during training
# use optuna for fine-tuning after experimenting with training first
# also add tests for custom environment 