In [1]:
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import gym
from gym import spaces
import copy
import os


In [24]:
#Pull in training data
data = pickle.load(open('train_ta', 'rb'))

In [25]:
#Technical indicators list. TODO: put these in a config file
TECHNICAL_INDICATORS_LIST = ["macd", "macds",
                             "boll_ub","boll_lb",
                             "rsi_5", "rsi_14", "rsi_30", 
                             "cci_30", "dx_30",
                             "open_5_sma", "open_14_sma", "open_30_sma"]

In [26]:
#stock tickers being looked at
tickers = list(data.keys())

In [27]:
#features is a list of all (non-price) features that are used in the observation

features = list(data['DIS'].keys())
features.remove('date')
features.remove('tic')
features.remove('price')

In [255]:
#Helper function to perform softmax
def softmax(x):
    return np.exp(x)/sum(np.exp(x))


INITIAL_BALANCE = 1000.0 #Start with balance of 1000 dollars
NUM_PAST_STATES = 14 #use past 14 days of price data as part of observation
EPISODE_LENGTH = 30 #each episode goes for 30 trading days


#OpenAI Gym style environment for RL
class TradeEnv(gym.Env):
    def __init__(self, tickers):
        super(TradeEnv, self).__init__()

        self.tickers = tickers

        self.data = {}
        self.features = {}
        self.means = {}
        self.stds = {}
        self.prices = {}
        for key, value in data.items():
            self.data[key] = value[features]
            self.means[key] = np.mean(value[features], axis = 0)
            self.stds[key] = np.std(value[features], axis = 0)
            
            #Normalize features to have zero mean and unit standard deviation
            self.features[key] = np.divide(value[features] - self.means[key],
                                          self.stds[key])
            
            self.prices[key] = value['price'].values
        
        self.prices = pd.DataFrame.from_dict(self.prices)
        
        #self.prices is a dataframe with each ticker being a key 
        #and the corresponding series representing the stock prices
        
        #Will be used later for normalization
        self.price_means = np.mean(self.prices, axis = 0).values
        self.price_stds = np.std(self.prices, axis = 0).values
        

        self.episode_length = EPISODE_LENGTH #number of trading minutes in episode

        self.num_past_states = NUM_PAST_STATES #number of past days that are used in state

        self.action_space = spaces.Box(low=-10, high=10, shape=(len(self.tickers) + 1,))
                                            

        obs_length = len(self.tickers)*(self.num_past_states) #observation due to past stacked states
        obs_length += 1 #balance
        obs_length += len(self.tickers) #holdings
        obs_length += len(self.tickers)*len(features) #number of technical analysis features
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf,
                                            shape=(obs_length,))


    def step(self, action_):
        
        #Apply softmax to RL output so that actions sum to 1
        action = softmax(action_)

        #Liquidate past holdings
        self.balance += np.sum(self.holdings)
        
        
        #New Portfolio at end of day
        self.holdings = self.balance*action[:-1]
        self.balance = self.balance*action[-1]
        
        #Net worth at end of day
        self.last_net_worth = self.balance + np.sum(self.holdings)
        
        #Step into next day
        self.index += 1
        #Get stock prices at next day
        stock_obs = self.get_stock_obs(self.index)
        self.next_prices = stock_obs[-1]
        
        #Update value of current holdings
        perc_change = np.divide(self.next_prices, self.curr_prices)
        self.holdings = np.multiply(self.holdings, perc_change)

        self.curr_prices = self.next_prices
        
        self.net_worth = self.balance + np.sum(self.holdings)

        rew = self.net_worth - self.last_net_worth # reward is the delta between last net worth and current net worth

        self.steps += 1
        done = (self.net_worth <= 0) or (self.steps >= self.episode_length)

        obs = self.get_obs(stock_obs, self.balance, self.holdings, self.index)
        self.cum_rew += rew

        return obs, rew, done, {}
    
    
    def get_stock_obs(self, index):

        
        ret= self.prices[index - self.num_past_states:index][self.tickers].values #stack data
        return ret

    def get_obs(self, stock_obs, balance, holdings, index):
        #Normalize stock prices for inclusion in observations
        prices_norm = np.divide(stock_obs - self.price_means,
                               self.price_stds).reshape(-1,)
        
        feature_vals = np.array([])
        ix = index - 1
        #Add in features at current timestep, for each ticker
        for tic in self.tickers:
            feature_vals = np.append(feature_vals, (self.features[tic].iloc[ix][features].values))
        
        #Form observation and normalize balance and holdings
        return np.concatenate([prices_norm, [balance/1000.0], holdings/1000.0, feature_vals])

        
    def reset(self):

        self.cum_rew = 0.0
        self.steps = 0
        self.index = np.random.randint(2*NUM_PAST_STATES, len(self.prices) - EPISODE_LENGTH - 10)

        self.init_prices = self.prices[self.index-1:self.index + EPISODE_LENGTH]
        stock_obs = self.get_stock_obs(self.index)
        self.holdings = np.zeros(len(self.tickers)) #holdings of each stock in number of shares
        self.balance = INITIAL_BALANCE
        self.last_net_worth = INITIAL_BALANCE
        self.net_worth = INITIAL_BALANCE
        
        self.curr_prices = stock_obs[-1]
        

        obs = self.get_obs(stock_obs, self.balance, self.holdings, self.index)
        return obs  # reward, done, info can't be included



In [256]:
test_env = TradeEnv(tickers = tickers)

In [258]:
#This is what self.prices looks like:
test_env.prices

Unnamed: 0,DIS,IFF
0,36.500500,32.250000
1,36.377190,33.500000
2,36.377190,33.375000
3,35.883938,32.000000
4,34.959095,31.312500
...,...,...
4251,108.580002,115.250000
4252,107.900002,116.029999
4253,108.540001,115.279999
4254,108.120003,116.080002


In [260]:
#This is an (example) of what self.features looks like for one ticker
test_env.features['DIS']

Unnamed: 0,volume,macd,macds,boll_ub,boll_lb,rsi_5,rsi_14,rsi_30,cci_30,dx_30,open_5_sma,open_14_sma,open_30_sma
0,-0.401020,0.489463,0.617954,-0.353622,-0.428058,-0.481647,0.307562,0.849076,0.113132,-0.236348,-0.253884,-0.256690,-0.315423
1,-0.870514,0.426991,0.585422,-0.353426,-0.421663,-0.402383,0.333753,0.868129,0.234302,-0.201806,-0.258847,-0.256044,-0.307076
2,-0.819901,0.336381,0.540152,-0.358686,-0.407370,-0.845417,0.091875,0.672550,0.001486,-0.887515,-0.264713,-0.256690,-0.300703
3,-0.507045,0.179425,0.470598,-0.372529,-0.384293,-1.506754,-0.412795,0.248515,-0.278378,-1.161282,-0.268774,-0.256852,-0.296529
4,0.618350,-0.118873,0.351596,-0.373050,-0.383547,-2.092038,-1.237887,-0.529302,-1.183956,-0.093750,-0.282762,-0.259116,-0.291901
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4251,-0.560822,2.733369,2.795886,2.319042,2.394432,1.108077,1.835282,2.232961,0.958134,2.694594,2.319728,2.279055,2.207089
4252,-0.253521,2.740388,2.819063,2.334341,2.391258,1.116279,1.840214,2.236842,0.882031,1.867211,2.338756,2.284824,2.218398
4253,-0.304855,2.825701,2.855721,2.355450,2.388831,1.500313,2.090202,2.437998,1.092081,2.077695,2.352661,2.292429,2.230729
4254,-0.196444,2.655015,2.848802,2.359830,2.397907,-0.161575,0.917907,1.614418,0.775609,1.143172,2.359613,2.301633,2.241658


In [261]:
#Use stable baselines for reinforcement learning
from stable_baselines3 import PPO
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common import results_plotter
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.results_plotter import load_results, ts2xy, plot_results
from stable_baselines3.common.callbacks import BaseCallback

In [262]:
class SaveRewardCallback(BaseCallback):
    """
    Callback for saving a model (the check is done every ``check_freq`` steps)
    based on the training reward (in practice, we recommend using ``EvalCallback``).

    :param check_freq: (int)
    :param log_dir: (str) Path to the folder where the model will be saved.
      It must contains the file created by the ``Monitor`` wrapper.
    :param verbose: (int)
    """
    def __init__(self, check_freq: int, log_dir: str, verbose=1):
        super(SaveRewardCallback, self).__init__(verbose)
        self.check_freq = check_freq
        self.log_dir = log_dir
        self.save_path = os.path.join(log_dir, 'best_model')
        self.best_mean_reward = -np.inf

    def _init_callback(self) -> None:
        # Create folder if needed
        if self.save_path is not None:
            os.makedirs(self.save_path, exist_ok=True)

    def _on_step(self) -> bool:

        if self.n_calls % self.check_freq == 0:

          # Retrieve training reward
          x, y = ts2xy(load_results(self.log_dir), 'timesteps')
          if len(x) > 0:
              # Mean training reward over the last 100 episodes
              mean_reward = np.mean(y[-100:])
              if self.verbose > 0:
                print("Num timesteps: {}".format(self.num_timesteps))
                print("Best mean reward: {:.2f} - Last mean reward per episode: {:.2f}".format(self.best_mean_reward, mean_reward))

              # New best model, you could save the agent here
              if mean_reward > self.best_mean_reward:
                  self.best_mean_reward = mean_reward
                  # Example for saving best model
                  if self.verbose > 0:
                    print("Saving new best model to {}".format(self.save_path))
                  self.model.save(self.save_path)

        return True

In [221]:
#Make diretory for saving training statistics and best model
log_dir = "rl_with_ta/"
os.makedirs(log_dir, exist_ok=True)

In [222]:
#Make wrapped environment
trade_env = TradeEnv(tickers=tickers)
env = Monitor(trade_env, log_dir)

In [223]:
#make callback
callback = SaveRewardCallback(check_freq=10000, log_dir=log_dir)

In [225]:
import torch as th
#neural network architecture and training parameters
policy_kwargs = dict(activation_fn=th.nn.Tanh,
                     net_arch=[dict(vf=[256, 128], pi=[256, 150])])
train_kwargs = dict(batch_size=512,
                   n_epochs = 50)

In [226]:
#Make model
model = PPO('MlpPolicy', env, gamma = .9998, verbose=1)

Using cpu device
Wrapping the env in a DummyVecEnv.


In [253]:
#Learn
model.learn(total_timesteps=int(5e7), callback = callback, log_interval = 10)

Num timesteps: 1120
Best mean reward: 191.41 - Last mean reward per episode: 160.77
Num timesteps: 11120
Best mean reward: 191.41 - Last mean reward per episode: 141.66
---------------------------------------
| rollout/                |           |
|    ep_len_mean          | 30        |
|    ep_rew_mean          | 146       |
| time/                   |           |
|    fps                  | 421       |
|    iterations           | 10        |
|    time_elapsed         | 48        |
|    total_timesteps      | 20480     |
| train/                  |           |
|    approx_kl            | 1.2885337 |
|    clip_fraction        | 0.698     |
|    clip_range           | 0.2       |
|    entropy_loss         | 0.972     |
|    explained_variance   | 0.752     |
|    learning_rate        | 0.001     |
|    loss                 | 154       |
|    n_updates            | 33520     |
|    policy_gradient_loss | 0.0758    |
|    std                  | 0.178     |
|    value_loss           | 484

Num timesteps: 171120
Best mean reward: 191.41 - Last mean reward per episode: 125.27
Num timesteps: 181120
Best mean reward: 191.41 - Last mean reward per episode: 120.63
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 30         |
|    ep_rew_mean          | 116        |
| time/                   |            |
|    fps                  | 439        |
|    iterations           | 90         |
|    time_elapsed         | 419        |
|    total_timesteps      | 184320     |
| train/                  |            |
|    approx_kl            | 0.36105424 |
|    clip_fraction        | 0.606      |
|    clip_range           | 0.2        |
|    entropy_loss         | -0.89      |
|    explained_variance   | 0.804      |
|    learning_rate        | 0.001      |
|    loss                 | 72.3       |
|    n_updates            | 34320      |
|    policy_gradient_loss | 0.00482    |
|    std                  | 0.326      |
|    val

Num timesteps: 331120
Best mean reward: 191.41 - Last mean reward per episode: 159.64
Num timesteps: 341120
Best mean reward: 191.41 - Last mean reward per episode: 162.46
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 30         |
|    ep_rew_mean          | 153        |
| time/                   |            |
|    fps                  | 437        |
|    iterations           | 170        |
|    time_elapsed         | 795        |
|    total_timesteps      | 348160     |
| train/                  |            |
|    approx_kl            | 0.10335225 |
|    clip_fraction        | 0.493      |
|    clip_range           | 0.2        |
|    entropy_loss         | -0.966     |
|    explained_variance   | 0.802      |
|    learning_rate        | 0.001      |
|    loss                 | 69.1       |
|    n_updates            | 35120      |
|    policy_gradient_loss | -0.0184    |
|    std                  | 0.334      |
|    val

Num timesteps: 501120
Best mean reward: 191.41 - Last mean reward per episode: 147.09
Num timesteps: 511120
Best mean reward: 191.41 - Last mean reward per episode: 145.86
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 30          |
|    ep_rew_mean          | 146         |
| time/                   |             |
|    fps                  | 438         |
|    iterations           | 250         |
|    time_elapsed         | 1166        |
|    total_timesteps      | 512000      |
| train/                  |             |
|    approx_kl            | 0.105347104 |
|    clip_fraction        | 0.496       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.941      |
|    explained_variance   | 0.725       |
|    learning_rate        | 0.001       |
|    loss                 | 95.2        |
|    n_updates            | 35920       |
|    policy_gradient_loss | -0.0209     |
|    std                  | 0.

Num timesteps: 661120
Best mean reward: 191.41 - Last mean reward per episode: 146.36
Num timesteps: 671120
Best mean reward: 191.41 - Last mean reward per episode: 139.70
---------------------------------------
| rollout/                |           |
|    ep_len_mean          | 30        |
|    ep_rew_mean          | 138       |
| time/                   |           |
|    fps                  | 436       |
|    iterations           | 330       |
|    time_elapsed         | 1548      |
|    total_timesteps      | 675840    |
| train/                  |           |
|    approx_kl            | 0.1384635 |
|    clip_fraction        | 0.533     |
|    clip_range           | 0.2       |
|    entropy_loss         | -0.976    |
|    explained_variance   | 0.784     |
|    learning_rate        | 0.001     |
|    loss                 | 56.8      |
|    n_updates            | 36720     |
|    policy_gradient_loss | -0.0176   |
|    std                  | 0.336     |
|    value_loss           | 

Num timesteps: 821120
Best mean reward: 191.41 - Last mean reward per episode: 153.01
Num timesteps: 831120
Best mean reward: 191.41 - Last mean reward per episode: 143.11
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 30         |
|    ep_rew_mean          | 157        |
| time/                   |            |
|    fps                  | 428        |
|    iterations           | 410        |
|    time_elapsed         | 1960       |
|    total_timesteps      | 839680     |
| train/                  |            |
|    approx_kl            | 0.21375106 |
|    clip_fraction        | 0.554      |
|    clip_range           | 0.2        |
|    entropy_loss         | -0.942     |
|    explained_variance   | 0.833      |
|    learning_rate        | 0.001      |
|    loss                 | 91.4       |
|    n_updates            | 37520      |
|    policy_gradient_loss | -0.0216    |
|    std                  | 0.334      |
|    val

Num timesteps: 991120
Best mean reward: 191.41 - Last mean reward per episode: 143.15
Num timesteps: 1001120
Best mean reward: 191.41 - Last mean reward per episode: 154.16
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 30         |
|    ep_rew_mean          | 140        |
| time/                   |            |
|    fps                  | 421        |
|    iterations           | 490        |
|    time_elapsed         | 2381       |
|    total_timesteps      | 1003520    |
| train/                  |            |
|    approx_kl            | 0.19285901 |
|    clip_fraction        | 0.587      |
|    clip_range           | 0.2        |
|    entropy_loss         | -0.799     |
|    explained_variance   | 0.846      |
|    learning_rate        | 0.001      |
|    loss                 | 54.3       |
|    n_updates            | 38320      |
|    policy_gradient_loss | -0.00607   |
|    std                  | 0.316      |
|    va

Num timesteps: 1151120
Best mean reward: 191.41 - Last mean reward per episode: 158.78
Num timesteps: 1161120
Best mean reward: 191.41 - Last mean reward per episode: 137.65
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 30         |
|    ep_rew_mean          | 145        |
| time/                   |            |
|    fps                  | 415        |
|    iterations           | 570        |
|    time_elapsed         | 2807       |
|    total_timesteps      | 1167360    |
| train/                  |            |
|    approx_kl            | 0.24746326 |
|    clip_fraction        | 0.577      |
|    clip_range           | 0.2        |
|    entropy_loss         | -0.849     |
|    explained_variance   | 0.848      |
|    learning_rate        | 0.001      |
|    loss                 | 57.8       |
|    n_updates            | 39120      |
|    policy_gradient_loss | -0.0212    |
|    std                  | 0.322      |
|    v

Num timesteps: 1311120
Best mean reward: 191.41 - Last mean reward per episode: 143.47
Num timesteps: 1321120
Best mean reward: 191.41 - Last mean reward per episode: 154.69
Num timesteps: 1331120
Best mean reward: 191.41 - Last mean reward per episode: 153.48
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 30         |
|    ep_rew_mean          | 151        |
| time/                   |            |
|    fps                  | 418        |
|    iterations           | 650        |
|    time_elapsed         | 3178       |
|    total_timesteps      | 1331200    |
| train/                  |            |
|    approx_kl            | 0.19494024 |
|    clip_fraction        | 0.548      |
|    clip_range           | 0.2        |
|    entropy_loss         | -1.12      |
|    explained_variance   | 0.738      |
|    learning_rate        | 0.001      |
|    loss                 | 63         |
|    n_updates            | 39920      |
|

Num timesteps: 1481120
Best mean reward: 191.41 - Last mean reward per episode: 166.98
Num timesteps: 1491120
Best mean reward: 191.41 - Last mean reward per episode: 166.19
---------------------------------------
| rollout/                |           |
|    ep_len_mean          | 30        |
|    ep_rew_mean          | 159       |
| time/                   |           |
|    fps                  | 423       |
|    iterations           | 730       |
|    time_elapsed         | 3532      |
|    total_timesteps      | 1495040   |
| train/                  |           |
|    approx_kl            | 0.2291115 |
|    clip_fraction        | 0.59      |
|    clip_range           | 0.2       |
|    entropy_loss         | -0.928    |
|    explained_variance   | 0.854     |
|    learning_rate        | 0.001     |
|    loss                 | 75.7      |
|    n_updates            | 40720     |
|    policy_gradient_loss | -0.013    |
|    std                  | 0.33      |
|    value_loss           

Num timesteps: 1641120
Best mean reward: 191.41 - Last mean reward per episode: 159.98
Num timesteps: 1651120
Best mean reward: 191.41 - Last mean reward per episode: 143.49
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 30         |
|    ep_rew_mean          | 149        |
| time/                   |            |
|    fps                  | 426        |
|    iterations           | 810        |
|    time_elapsed         | 3885       |
|    total_timesteps      | 1658880    |
| train/                  |            |
|    approx_kl            | 0.16006964 |
|    clip_fraction        | 0.538      |
|    clip_range           | 0.2        |
|    entropy_loss         | -0.868     |
|    explained_variance   | 0.864      |
|    learning_rate        | 0.001      |
|    loss                 | 108        |
|    n_updates            | 41520      |
|    policy_gradient_loss | -0.0234    |
|    std                  | 0.325      |
|    v

Num timesteps: 1811120
Best mean reward: 191.41 - Last mean reward per episode: 139.41
Num timesteps: 1821120
Best mean reward: 191.41 - Last mean reward per episode: 158.72
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 30         |
|    ep_rew_mean          | 155        |
| time/                   |            |
|    fps                  | 429        |
|    iterations           | 890        |
|    time_elapsed         | 4239       |
|    total_timesteps      | 1822720    |
| train/                  |            |
|    approx_kl            | 0.16283824 |
|    clip_fraction        | 0.53       |
|    clip_range           | 0.2        |
|    entropy_loss         | -0.905     |
|    explained_variance   | 0.77       |
|    learning_rate        | 0.001      |
|    loss                 | 126        |
|    n_updates            | 42320      |
|    policy_gradient_loss | -0.0204    |
|    std                  | 0.325      |
|    v

Num timesteps: 1971120
Best mean reward: 191.41 - Last mean reward per episode: 161.79
Num timesteps: 1981120
Best mean reward: 191.41 - Last mean reward per episode: 158.02
---------------------------------------
| rollout/                |           |
|    ep_len_mean          | 30        |
|    ep_rew_mean          | 143       |
| time/                   |           |
|    fps                  | 432       |
|    iterations           | 970       |
|    time_elapsed         | 4592      |
|    total_timesteps      | 1986560   |
| train/                  |           |
|    approx_kl            | 0.3548528 |
|    clip_fraction        | 0.595     |
|    clip_range           | 0.2       |
|    entropy_loss         | -0.871    |
|    explained_variance   | 0.784     |
|    learning_rate        | 0.001     |
|    loss                 | 59.7      |
|    n_updates            | 43120     |
|    policy_gradient_loss | -0.00456  |
|    std                  | 0.323     |
|    value_loss           

Num timesteps: 2131120
Best mean reward: 191.41 - Last mean reward per episode: 146.18
Num timesteps: 2141120
Best mean reward: 191.41 - Last mean reward per episode: 134.38
---------------------------------------
| rollout/                |           |
|    ep_len_mean          | 30        |
|    ep_rew_mean          | 138       |
| time/                   |           |
|    fps                  | 434       |
|    iterations           | 1050      |
|    time_elapsed         | 4945      |
|    total_timesteps      | 2150400   |
| train/                  |           |
|    approx_kl            | 0.4673414 |
|    clip_fraction        | 0.639     |
|    clip_range           | 0.2       |
|    entropy_loss         | -0.864    |
|    explained_variance   | 0.802     |
|    learning_rate        | 0.001     |
|    loss                 | 40.7      |
|    n_updates            | 43920     |
|    policy_gradient_loss | -0.00725  |
|    std                  | 0.324     |
|    value_loss           

Num timesteps: 2301120
Best mean reward: 191.41 - Last mean reward per episode: 135.33
Num timesteps: 2311120
Best mean reward: 191.41 - Last mean reward per episode: 135.31
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 30         |
|    ep_rew_mean          | 154        |
| time/                   |            |
|    fps                  | 436        |
|    iterations           | 1130       |
|    time_elapsed         | 5298       |
|    total_timesteps      | 2314240    |
| train/                  |            |
|    approx_kl            | 0.21381235 |
|    clip_fraction        | 0.533      |
|    clip_range           | 0.2        |
|    entropy_loss         | -0.842     |
|    explained_variance   | 0.751      |
|    learning_rate        | 0.001      |
|    loss                 | 45.2       |
|    n_updates            | 44720      |
|    policy_gradient_loss | -0.0192    |
|    std                  | 0.32       |
|    v

Num timesteps: 2461120
Best mean reward: 191.41 - Last mean reward per episode: 135.92
Num timesteps: 2471120
Best mean reward: 191.41 - Last mean reward per episode: 154.21
---------------------------------------
| rollout/                |           |
|    ep_len_mean          | 30        |
|    ep_rew_mean          | 124       |
| time/                   |           |
|    fps                  | 438       |
|    iterations           | 1210      |
|    time_elapsed         | 5651      |
|    total_timesteps      | 2478080   |
| train/                  |           |
|    approx_kl            | 0.5567544 |
|    clip_fraction        | 0.616     |
|    clip_range           | 0.2       |
|    entropy_loss         | -0.781    |
|    explained_variance   | 0.727     |
|    learning_rate        | 0.001     |
|    loss                 | 75.1      |
|    n_updates            | 45520     |
|    policy_gradient_loss | 0.00259   |
|    std                  | 0.316     |
|    value_loss           

Num timesteps: 2631120
Best mean reward: 191.41 - Last mean reward per episode: 112.62
Num timesteps: 2641120
Best mean reward: 191.41 - Last mean reward per episode: 127.87
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 30         |
|    ep_rew_mean          | 119        |
| time/                   |            |
|    fps                  | 439        |
|    iterations           | 1290       |
|    time_elapsed         | 6004       |
|    total_timesteps      | 2641920    |
| train/                  |            |
|    approx_kl            | 0.34078717 |
|    clip_fraction        | 0.617      |
|    clip_range           | 0.2        |
|    entropy_loss         | -0.927     |
|    explained_variance   | 0.758      |
|    learning_rate        | 0.001      |
|    loss                 | 61.7       |
|    n_updates            | 46320      |
|    policy_gradient_loss | -0.0167    |
|    std                  | 0.328      |
|    v

Num timesteps: 2791120
Best mean reward: 191.41 - Last mean reward per episode: 133.71
Num timesteps: 2801120
Best mean reward: 191.41 - Last mean reward per episode: 136.19
---------------------------------------
| rollout/                |           |
|    ep_len_mean          | 30        |
|    ep_rew_mean          | 115       |
| time/                   |           |
|    fps                  | 441       |
|    iterations           | 1370      |
|    time_elapsed         | 6358      |
|    total_timesteps      | 2805760   |
| train/                  |           |
|    approx_kl            | 5.1125007 |
|    clip_fraction        | 0.852     |
|    clip_range           | 0.2       |
|    entropy_loss         | -0.84     |
|    explained_variance   | 0.783     |
|    learning_rate        | 0.001     |
|    loss                 | 76        |
|    n_updates            | 47120     |
|    policy_gradient_loss | 0.127     |
|    std                  | 0.324     |
|    value_loss           

Num timesteps: 2951120
Best mean reward: 191.41 - Last mean reward per episode: 135.47
Num timesteps: 2961120
Best mean reward: 191.41 - Last mean reward per episode: 127.29
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 30         |
|    ep_rew_mean          | 117        |
| time/                   |            |
|    fps                  | 442        |
|    iterations           | 1450       |
|    time_elapsed         | 6711       |
|    total_timesteps      | 2969600    |
| train/                  |            |
|    approx_kl            | 0.37740672 |
|    clip_fraction        | 0.614      |
|    clip_range           | 0.2        |
|    entropy_loss         | -1.06      |
|    explained_variance   | 0.659      |
|    learning_rate        | 0.001      |
|    loss                 | 47.3       |
|    n_updates            | 47920      |
|    policy_gradient_loss | -0.0161    |
|    std                  | 0.344      |
|    v

Num timesteps: 3121120
Best mean reward: 191.41 - Last mean reward per episode: 118.45
Num timesteps: 3131120
Best mean reward: 191.41 - Last mean reward per episode: 126.02
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 30         |
|    ep_rew_mean          | 122        |
| time/                   |            |
|    fps                  | 443        |
|    iterations           | 1530       |
|    time_elapsed         | 7064       |
|    total_timesteps      | 3133440    |
| train/                  |            |
|    approx_kl            | 0.46805215 |
|    clip_fraction        | 0.606      |
|    clip_range           | 0.2        |
|    entropy_loss         | -1.15      |
|    explained_variance   | 0.661      |
|    learning_rate        | 0.001      |
|    loss                 | 86.1       |
|    n_updates            | 48720      |
|    policy_gradient_loss | -0.00279   |
|    std                  | 0.358      |
|    v

Num timesteps: 3281120
Best mean reward: 191.41 - Last mean reward per episode: 128.25
Num timesteps: 3291120
Best mean reward: 191.41 - Last mean reward per episode: 124.01
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 30         |
|    ep_rew_mean          | 143        |
| time/                   |            |
|    fps                  | 444        |
|    iterations           | 1610       |
|    time_elapsed         | 7418       |
|    total_timesteps      | 3297280    |
| train/                  |            |
|    approx_kl            | 0.35282692 |
|    clip_fraction        | 0.651      |
|    clip_range           | 0.2        |
|    entropy_loss         | -1.05      |
|    explained_variance   | 0.788      |
|    learning_rate        | 0.001      |
|    loss                 | 47.7       |
|    n_updates            | 49520      |
|    policy_gradient_loss | -0.0118    |
|    std                  | 0.341      |
|    v

Num timesteps: 3441120
Best mean reward: 191.41 - Last mean reward per episode: 148.12
Num timesteps: 3451120
Best mean reward: 191.41 - Last mean reward per episode: 130.56
Num timesteps: 3461120
Best mean reward: 191.41 - Last mean reward per episode: 115.24
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 30         |
|    ep_rew_mean          | 115        |
| time/                   |            |
|    fps                  | 445        |
|    iterations           | 1690       |
|    time_elapsed         | 7772       |
|    total_timesteps      | 3461120    |
| train/                  |            |
|    approx_kl            | 0.38932618 |
|    clip_fraction        | 0.601      |
|    clip_range           | 0.2        |
|    entropy_loss         | -1.09      |
|    explained_variance   | 0.834      |
|    learning_rate        | 0.001      |
|    loss                 | 78         |
|    n_updates            | 50320      |
|

Num timesteps: 3611120
Best mean reward: 191.41 - Last mean reward per episode: 135.64
Num timesteps: 3621120
Best mean reward: 191.41 - Last mean reward per episode: 132.54
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 30         |
|    ep_rew_mean          | 123        |
| time/                   |            |
|    fps                  | 446        |
|    iterations           | 1770       |
|    time_elapsed         | 8125       |
|    total_timesteps      | 3624960    |
| train/                  |            |
|    approx_kl            | 0.34249324 |
|    clip_fraction        | 0.619      |
|    clip_range           | 0.2        |
|    entropy_loss         | -1.03      |
|    explained_variance   | 0.775      |
|    learning_rate        | 0.001      |
|    loss                 | 63.7       |
|    n_updates            | 51120      |
|    policy_gradient_loss | -0.00719   |
|    std                  | 0.342      |
|    v

Num timesteps: 3771120
Best mean reward: 191.41 - Last mean reward per episode: 126.09
Num timesteps: 3781120
Best mean reward: 191.41 - Last mean reward per episode: 130.37
---------------------------------------
| rollout/                |           |
|    ep_len_mean          | 30        |
|    ep_rew_mean          | 137       |
| time/                   |           |
|    fps                  | 446       |
|    iterations           | 1850      |
|    time_elapsed         | 8478      |
|    total_timesteps      | 3788800   |
| train/                  |           |
|    approx_kl            | 0.9747775 |
|    clip_fraction        | 0.764     |
|    clip_range           | 0.2       |
|    entropy_loss         | -1.03     |
|    explained_variance   | 0.789     |
|    learning_rate        | 0.001     |
|    loss                 | 52.8      |
|    n_updates            | 51920     |
|    policy_gradient_loss | 0.0577    |
|    std                  | 0.344     |
|    value_loss           

Num timesteps: 3941120
Best mean reward: 191.41 - Last mean reward per episode: 154.78
Num timesteps: 3951120
Best mean reward: 191.41 - Last mean reward per episode: 143.47
---------------------------------------
| rollout/                |           |
|    ep_len_mean          | 30        |
|    ep_rew_mean          | 148       |
| time/                   |           |
|    fps                  | 447       |
|    iterations           | 1930      |
|    time_elapsed         | 8832      |
|    total_timesteps      | 3952640   |
| train/                  |           |
|    approx_kl            | 0.3790453 |
|    clip_fraction        | 0.629     |
|    clip_range           | 0.2       |
|    entropy_loss         | -0.886    |
|    explained_variance   | 0.785     |
|    learning_rate        | 0.001     |
|    loss                 | 54.9      |
|    n_updates            | 52720     |
|    policy_gradient_loss | -0.0145   |
|    std                  | 0.326     |
|    value_loss           

Num timesteps: 4101120
Best mean reward: 191.41 - Last mean reward per episode: 157.09
Num timesteps: 4111120
Best mean reward: 191.41 - Last mean reward per episode: 133.56
---------------------------------------
| rollout/                |           |
|    ep_len_mean          | 30        |
|    ep_rew_mean          | 139       |
| time/                   |           |
|    fps                  | 448       |
|    iterations           | 2010      |
|    time_elapsed         | 9185      |
|    total_timesteps      | 4116480   |
| train/                  |           |
|    approx_kl            | 0.4201376 |
|    clip_fraction        | 0.691     |
|    clip_range           | 0.2       |
|    entropy_loss         | -0.904    |
|    explained_variance   | 0.831     |
|    learning_rate        | 0.001     |
|    loss                 | 42.5      |
|    n_updates            | 53520     |
|    policy_gradient_loss | -0.00282  |
|    std                  | 0.329     |
|    value_loss           

Num timesteps: 4261120
Best mean reward: 191.41 - Last mean reward per episode: 137.85
Num timesteps: 4271120
Best mean reward: 191.41 - Last mean reward per episode: 156.62
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 30         |
|    ep_rew_mean          | 148        |
| time/                   |            |
|    fps                  | 448        |
|    iterations           | 2090       |
|    time_elapsed         | 9538       |
|    total_timesteps      | 4280320    |
| train/                  |            |
|    approx_kl            | 0.23805878 |
|    clip_fraction        | 0.594      |
|    clip_range           | 0.2        |
|    entropy_loss         | -0.993     |
|    explained_variance   | 0.856      |
|    learning_rate        | 0.001      |
|    loss                 | 67.8       |
|    n_updates            | 54320      |
|    policy_gradient_loss | -0.0327    |
|    std                  | 0.337      |
|    v

Num timesteps: 4431120
Best mean reward: 191.41 - Last mean reward per episode: 144.98
Num timesteps: 4441120
Best mean reward: 191.41 - Last mean reward per episode: 143.21
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 30         |
|    ep_rew_mean          | 150        |
| time/                   |            |
|    fps                  | 449        |
|    iterations           | 2170       |
|    time_elapsed         | 9892       |
|    total_timesteps      | 4444160    |
| train/                  |            |
|    approx_kl            | 0.20316765 |
|    clip_fraction        | 0.569      |
|    clip_range           | 0.2        |
|    entropy_loss         | -0.917     |
|    explained_variance   | 0.851      |
|    learning_rate        | 0.001      |
|    loss                 | 113        |
|    n_updates            | 55120      |
|    policy_gradient_loss | -0.0363    |
|    std                  | 0.327      |
|    v

Num timesteps: 4591120
Best mean reward: 191.41 - Last mean reward per episode: 146.45
Num timesteps: 4601120
Best mean reward: 191.41 - Last mean reward per episode: 124.88
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 30         |
|    ep_rew_mean          | 130        |
| time/                   |            |
|    fps                  | 449        |
|    iterations           | 2250       |
|    time_elapsed         | 10245      |
|    total_timesteps      | 4608000    |
| train/                  |            |
|    approx_kl            | 0.18609178 |
|    clip_fraction        | 0.581      |
|    clip_range           | 0.2        |
|    entropy_loss         | -0.98      |
|    explained_variance   | 0.77       |
|    learning_rate        | 0.001      |
|    loss                 | 192        |
|    n_updates            | 55920      |
|    policy_gradient_loss | -0.0298    |
|    std                  | 0.334      |
|    v

Num timesteps: 4761120
Best mean reward: 191.41 - Last mean reward per episode: 133.52
Num timesteps: 4771120
Best mean reward: 191.41 - Last mean reward per episode: 135.44
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 30         |
|    ep_rew_mean          | 138        |
| time/                   |            |
|    fps                  | 450        |
|    iterations           | 2330       |
|    time_elapsed         | 10599      |
|    total_timesteps      | 4771840    |
| train/                  |            |
|    approx_kl            | 0.33934146 |
|    clip_fraction        | 0.603      |
|    clip_range           | 0.2        |
|    entropy_loss         | -0.905     |
|    explained_variance   | 0.759      |
|    learning_rate        | 0.001      |
|    loss                 | 137        |
|    n_updates            | 56720      |
|    policy_gradient_loss | -0.0221    |
|    std                  | 0.329      |
|    v

Num timesteps: 4921120
Best mean reward: 191.41 - Last mean reward per episode: 143.71
Num timesteps: 4931120
Best mean reward: 191.41 - Last mean reward per episode: 122.84
---------------------------------------
| rollout/                |           |
|    ep_len_mean          | 30        |
|    ep_rew_mean          | 118       |
| time/                   |           |
|    fps                  | 450       |
|    iterations           | 2410      |
|    time_elapsed         | 10952     |
|    total_timesteps      | 4935680   |
| train/                  |           |
|    approx_kl            | 0.2032316 |
|    clip_fraction        | 0.598     |
|    clip_range           | 0.2       |
|    entropy_loss         | -0.874    |
|    explained_variance   | 0.812     |
|    learning_rate        | 0.001     |
|    loss                 | 143       |
|    n_updates            | 57520     |
|    policy_gradient_loss | -0.0373   |
|    std                  | 0.324     |
|    value_loss           

Num timesteps: 5081120
Best mean reward: 191.41 - Last mean reward per episode: 130.85
Num timesteps: 5091120
Best mean reward: 191.41 - Last mean reward per episode: 115.26
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 30         |
|    ep_rew_mean          | 117        |
| time/                   |            |
|    fps                  | 451        |
|    iterations           | 2490       |
|    time_elapsed         | 11306      |
|    total_timesteps      | 5099520    |
| train/                  |            |
|    approx_kl            | 0.21412724 |
|    clip_fraction        | 0.58       |
|    clip_range           | 0.2        |
|    entropy_loss         | -0.812     |
|    explained_variance   | 0.79       |
|    learning_rate        | 0.001      |
|    loss                 | 110        |
|    n_updates            | 58320      |
|    policy_gradient_loss | -0.0125    |
|    std                  | 0.318      |
|    v

Num timesteps: 5251120
Best mean reward: 191.41 - Last mean reward per episode: 106.31
Num timesteps: 5261120
Best mean reward: 191.41 - Last mean reward per episode: 124.37
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 30         |
|    ep_rew_mean          | 119        |
| time/                   |            |
|    fps                  | 451        |
|    iterations           | 2570       |
|    time_elapsed         | 11660      |
|    total_timesteps      | 5263360    |
| train/                  |            |
|    approx_kl            | 0.17219836 |
|    clip_fraction        | 0.568      |
|    clip_range           | 0.2        |
|    entropy_loss         | -1.09      |
|    explained_variance   | 0.783      |
|    learning_rate        | 0.001      |
|    loss                 | 128        |
|    n_updates            | 59120      |
|    policy_gradient_loss | -0.0303    |
|    std                  | 0.345      |
|    v

Num timesteps: 5411120
Best mean reward: 191.41 - Last mean reward per episode: 120.61
Num timesteps: 5421120
Best mean reward: 191.41 - Last mean reward per episode: 133.50
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 30         |
|    ep_rew_mean          | 107        |
| time/                   |            |
|    fps                  | 451        |
|    iterations           | 2650       |
|    time_elapsed         | 12013      |
|    total_timesteps      | 5427200    |
| train/                  |            |
|    approx_kl            | 0.39515918 |
|    clip_fraction        | 0.648      |
|    clip_range           | 0.2        |
|    entropy_loss         | -1.06      |
|    explained_variance   | 0.658      |
|    learning_rate        | 0.001      |
|    loss                 | 52.6       |
|    n_updates            | 59920      |
|    policy_gradient_loss | -0.00669   |
|    std                  | 0.344      |
|    v

Num timesteps: 5571120
Best mean reward: 191.41 - Last mean reward per episode: 129.72
Num timesteps: 5581120
Best mean reward: 191.41 - Last mean reward per episode: 119.02
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 30         |
|    ep_rew_mean          | 119        |
| time/                   |            |
|    fps                  | 452        |
|    iterations           | 2730       |
|    time_elapsed         | 12366      |
|    total_timesteps      | 5591040    |
| train/                  |            |
|    approx_kl            | 0.89426106 |
|    clip_fraction        | 0.74       |
|    clip_range           | 0.2        |
|    entropy_loss         | -0.785     |
|    explained_variance   | 0.669      |
|    learning_rate        | 0.001      |
|    loss                 | 75.4       |
|    n_updates            | 60720      |
|    policy_gradient_loss | 0.0466     |
|    std                  | 0.318      |
|    v

Num timesteps: 5741120
Best mean reward: 191.41 - Last mean reward per episode: 132.83
Num timesteps: 5751120
Best mean reward: 191.41 - Last mean reward per episode: 116.63
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 30         |
|    ep_rew_mean          | 124        |
| time/                   |            |
|    fps                  | 452        |
|    iterations           | 2810       |
|    time_elapsed         | 12720      |
|    total_timesteps      | 5754880    |
| train/                  |            |
|    approx_kl            | 0.37308145 |
|    clip_fraction        | 0.646      |
|    clip_range           | 0.2        |
|    entropy_loss         | -0.996     |
|    explained_variance   | 0.857      |
|    learning_rate        | 0.001      |
|    loss                 | 69.6       |
|    n_updates            | 61520      |
|    policy_gradient_loss | -0.0219    |
|    std                  | 0.335      |
|    v

Num timesteps: 5901120
Best mean reward: 191.41 - Last mean reward per episode: 124.87
Num timesteps: 5911120
Best mean reward: 191.41 - Last mean reward per episode: 132.88
---------------------------------------
| rollout/                |           |
|    ep_len_mean          | 30        |
|    ep_rew_mean          | 143       |
| time/                   |           |
|    fps                  | 452       |
|    iterations           | 2890      |
|    time_elapsed         | 13073     |
|    total_timesteps      | 5918720   |
| train/                  |           |
|    approx_kl            | 1.7511499 |
|    clip_fraction        | 0.772     |
|    clip_range           | 0.2       |
|    entropy_loss         | -0.86     |
|    explained_variance   | 0.88      |
|    learning_rate        | 0.001     |
|    loss                 | 52        |
|    n_updates            | 62320     |
|    policy_gradient_loss | 0.034     |
|    std                  | 0.327     |
|    value_loss           

Num timesteps: 6071120
Best mean reward: 191.41 - Last mean reward per episode: 125.48
Num timesteps: 6081120
Best mean reward: 191.41 - Last mean reward per episode: 143.42
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 30         |
|    ep_rew_mean          | 141        |
| time/                   |            |
|    fps                  | 452        |
|    iterations           | 2970       |
|    time_elapsed         | 13427      |
|    total_timesteps      | 6082560    |
| train/                  |            |
|    approx_kl            | 0.40532106 |
|    clip_fraction        | 0.644      |
|    clip_range           | 0.2        |
|    entropy_loss         | -1.03      |
|    explained_variance   | 0.709      |
|    learning_rate        | 0.001      |
|    loss                 | 95.2       |
|    n_updates            | 63120      |
|    policy_gradient_loss | -0.024     |
|    std                  | 0.34       |
|    v

Num timesteps: 6231120
Best mean reward: 191.41 - Last mean reward per episode: 147.53
Num timesteps: 6241120
Best mean reward: 191.41 - Last mean reward per episode: 130.18
---------------------------------------
| rollout/                |           |
|    ep_len_mean          | 30        |
|    ep_rew_mean          | 146       |
| time/                   |           |
|    fps                  | 453       |
|    iterations           | 3050      |
|    time_elapsed         | 13781     |
|    total_timesteps      | 6246400   |
| train/                  |           |
|    approx_kl            | 0.7345991 |
|    clip_fraction        | 0.656     |
|    clip_range           | 0.2       |
|    entropy_loss         | -1.07     |
|    explained_variance   | 0.725     |
|    learning_rate        | 0.001     |
|    loss                 | 73.1      |
|    n_updates            | 63920     |
|    policy_gradient_loss | -0.0137   |
|    std                  | 0.343     |
|    value_loss           

KeyboardInterrupt: 