In [17]:
import gym
import numpy as np
import pandas as pd
from stable_baselines3 import DDPG
from stable_baselines3.common.noise import NormalActionNoise, OrnsteinUhlenbeckActionNoise
from StockTrade import StockTrade
import os
import gym
import matplotlib.pyplot as plt

from stable_baselines3.common import results_plotter
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.results_plotter import load_results, ts2xy, plot_results
from stable_baselines3.common.noise import NormalActionNoise
from stable_baselines3.common.callbacks import BaseCallback

In [18]:
class SaveOnBestTrainingRewardCallback(BaseCallback):
    """
    Callback for saving a model (the check is done every ``check_freq`` steps)
    based on the training reward (in practice, we recommend using ``EvalCallback``).

    Every ``check_freq`` calls, the episode rewards found by ``load_results`` in
    ``log_dir`` are averaged over the last 100 episodes; the model is saved
    whenever that mean exceeds the best value seen so far.

    :param check_freq: (int) Number of callback calls between two checks.
    :param log_dir: (str) Path to the folder where the model will be saved.
      It must contain the file created by the ``Monitor`` wrapper.
    :param verbose: (int) Progress messages are printed when greater than 0.
    """
    def __init__(self, check_freq: int, log_dir: str, verbose=1):
        super().__init__(verbose)
        self.check_freq = check_freq
        self.log_dir = log_dir
        # Base path of the checkpoint; passed as-is to ``model.save``.
        self.save_path = os.path.join(log_dir, 'best_model')
        # Start at -inf so the first measured mean reward always counts as a new best.
        self.best_mean_reward = -np.inf

    def _init_callback(self) -> None:
        # Make sure the folder that will hold the checkpoint exists.
        # ``save_path`` is the checkpoint's base file name (stable-baselines3
        # stores the model as an archive next to it), so only its parent
        # directory is created -- creating ``save_path`` itself would leave an
        # unused, empty ``best_model`` directory behind.
        if self.save_path is not None:
            parent_dir = os.path.dirname(self.save_path)
            if parent_dir:
                os.makedirs(parent_dir, exist_ok=True)

    def _on_step(self) -> bool:
        """
        Check the training reward every ``check_freq`` calls and save on improvement.

        :return: (bool) Always ``True``.
        """
        # Only inspect the logs once every ``check_freq`` calls.
        if self.n_calls % self.check_freq != 0:
            return True

        # Retrieve training reward
        x, y = ts2xy(load_results(self.log_dir), 'timesteps')
        if len(x) == 0:
            # Nothing logged yet, so there is nothing to compare.
            return True

        # Mean training reward over the last 100 episodes
        mean_reward = np.mean(y[-100:])
        if self.verbose > 0:
            print("Num timesteps: {}".format(self.num_timesteps))
            print("Best mean reward: {:.2f} - Last mean reward per episode: {:.2f}".format(self.best_mean_reward, mean_reward))

        # New best model: remember the reward and save the agent
        if mean_reward > self.best_mean_reward:
            self.best_mean_reward = mean_reward
            if self.verbose > 0:
                print("Saving new best model to {}".format(self.save_path))
            self.model.save(self.save_path)

        return True

In [19]:
# Folder handed to both the Monitor wrapper and the best-model callback.
log_dir = "tmp/"
os.makedirs(name=log_dir, exist_ok=True)

In [22]:
# Build the trading environment and wrap it in Monitor, passing log_dir
# as the location for the monitor output.
env = Monitor(
    StockTrade(stock_quantity=1, change_stocks=True),
    log_dir,
)

In [None]:
# Gaussian exploration noise for DDPG: zero mean, sigma 0.1, shaped like the action space
n_actions = env.action_space.shape
action_noise = NormalActionNoise(
    mean=np.zeros(n_actions),
    sigma=np.full(n_actions, 0.1),
)

# Training budget and the checkpoint callback (checked every 1000 calls)
timesteps = 1e4
callback = SaveOnBestTrainingRewardCallback(check_freq=1000, log_dir=log_dir)

# Agent
model = DDPG(
    'MlpPolicy',
    env,
    action_noise=action_noise,
    verbose=1,
)

# Train, plot the monitored results, then save the final model
model.learn(total_timesteps=int(timesteps), callback=callback)
plot_results([log_dir], timesteps, results_plotter.X_TIMESTEPS, "Stock Trade Game")
plt.show()
model.save("123Automator")

In [None]:
# Roll out one episode with the trained policy and render every step.
#
# A dedicated, unwrapped environment is used instead of the Monitor-wrapped
# training `env`: stepping through Monitor would add this evaluation episode
# to the training log, and closing the training env here would leave it
# unusable for any cell that runs afterwards.
eval_env = StockTrade(stock_quantity=1, change_stocks=True)
try:
    obs = eval_env.reset()
    dones = False
    while not dones:
        action, _states = model.predict(obs)
        obs, rewards, dones, info = eval_env.step(action)
        eval_env.render()
finally:
    # Release the environment even if a step or render call raises.
    eval_env.close()

Using cuda device
Wrapping the env in a DummyVecEnv.
Num timesteps: 1000
Best mean reward: -inf - Last mean reward per episode: 24287534.95
Saving new best model to tmp/best_model
Num timesteps: 2000
Best mean reward: 24287534.95 - Last mean reward per episode: 11619183.33
Num timesteps: 3000
Best mean reward: 24287534.95 - Last mean reward per episode: 33030205.25
Saving new best model to tmp/best_model
----------------------------------
| rollout/           |           |
|    ep_len_mean     | 752       |
|    ep_rew_mean     | 5.06e+07  |
| time/              |           |
|    episodes        | 4         |
|    fps             | 68        |
|    time_elapsed    | 43        |
|    total timesteps | 3008      |
| train/             |           |
|    actor_loss      | -3.46e+05 |
|    critic_loss     | 6.37e+08  |
|    learning_rate   | 0.001     |
|    n_updates       | 2009      |
----------------------------------
Num timesteps: 4000
Best mean reward: 33030205.25 - Last mean rewar

In [None]:
# Roll out another episode with the trained policy and render every step.
#
# The rollout runs on its own freshly created, unwrapped environment rather
# than on the Monitor-wrapped training `env`, so it neither depends on that
# env still being open nor adds evaluation episodes to the training log.
eval_env = StockTrade(stock_quantity=1, change_stocks=True)
try:
    obs = eval_env.reset()
    dones = False
    while not dones:
        action, _states = model.predict(obs)
        obs, rewards, dones, info = eval_env.step(action)
        eval_env.render()
finally:
    # Release the environment even if a step or render call raises.
    eval_env.close()