In [1]:
import gym
import PortfolioAllocationGym
import numpy as np
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_checker import check_env
import pandas as pd
# Turn off pandas' chained-assignment (SettingWithCopy) check for the whole
# kernel session. The library default is 'warn'; None disables it entirely.
# NOTE(review): this also hides genuine chained-assignment mistakes in any
# later cell — confirm the suppression is still needed.
pd.options.mode.chained_assignment = None  # default='warn'

In [2]:
# Keyword arguments forwarded verbatim to the 'PortfolioAllocation-v0'
# environment constructor for the training period (2008-01-01 .. 2017-12-31).
# NOTE(review): the unit of `risk_free_rate` (fraction vs. percent) is not
# visible from this notebook — confirm against the environment.
env_kwargs = dict(
    filename='sp500.csv',
    date_from='2008-01-01',
    date_to='2017-12-31',
    investment=1_000_000,
    risk_free_rate=0.5,  # approx US Treasury Note return
    sample_size=100,
    random_sample=False,
    reward_function='portfolio_value',
)

# Instantiate the training environment through the gym registry.
train_env = gym.make('PortfolioAllocation-v0', **env_kwargs)

In [3]:
# Run Stable-Baselines3's environment checker on the training environment
# before building a model on top of it. No output was recorded for this cell.
check_env(train_env)



In [4]:
# Ask the environment for its Stable-Baselines3-ready wrapper. The call returns
# a pair: `venv` is what the model and evaluate_policy are given below.
# NOTE(review): `obs` is presumably the initial observation from a reset and is
# not used again in the visible cells — confirm in get_sb_env().
venv, obs = train_env.get_sb_env()

In [5]:
from stable_baselines3 import A2C
from stable_baselines3.a2c import MlpPolicy

In [6]:
import torch

# Hyperparameters copied from a tuning run
# (Users/kbines/rl-baselines3-zoo/ac2_50_pv_normalized2).
# Everything in `model_kwargs` is passed straight to the A2C constructor;
# `policy_kwargs` is forwarded by A2C to the policy class.
model_kwargs = dict(
    gamma=0.98,
    normalize_advantage=False,
    max_grad_norm=1,
    use_rms_prop=False,
    gae_lambda=1.0,
    n_steps=5,
    learning_rate=0.006091038442400068,
    ent_coef=4.071869686147734e-06,
    vf_coef=0.36340337458493177,
    policy_kwargs={
        'log_std_init': 0.5523434134392059,
        'ortho_init': True,
        # torch.nn.Tanh is the same class as torch.nn.modules.activation.Tanh.
        'activation_fn': torch.nn.Tanh,
        # Separate 256x256 stacks for the policy (pi) and value (vf) heads.
        'net_arch': [{'pi': [256, 256], 'vf': [256, 256]}],
    },
)

# Build the (still untrained) agent on the wrapped training environment.
# NOTE(review): no `seed` is supplied, so runs are presumably not reproducible
# from one kernel restart to the next — confirm whether that is intended.
a2c_model = A2C(MlpPolicy, venv, **model_kwargs)


In [7]:
from PortfolioAllocationGym.callbacks import TensorBoardCallback as tbc
from datetime import datetime

In [9]:
# Preview the first rows of the data table held by the training environment
# (one row per ticker and date, as shown in the recorded output).
train_env.data.head()

Unnamed: 0,tic,date,open,low,high,close,adj_close,ema_50,ema_200,bb_bbm,bb_bbh,bb_bbl,bb_bbhi,bb_bbli,stoch,stoch_signal,macd,macd_signal,obv,daily_returns
0,ADS,2008-01-02,74.23,73.48,75.0,0.236298,0.248366,0.281037,0.284947,0.267884,0.256992,0.279459,-0.220206,-0.193997,0.066845,0.051535,-0.452985,-0.424347,-0.213241,-1.081389
0,AET,2008-01-02,57.81,56.31,57.96,0.070947,0.071515,0.072913,0.047411,0.084877,0.071229,0.099954,-0.220206,-0.193997,-0.536298,-0.31009,0.129,0.232274,-0.165315,-1.895462
0,AIG,2008-01-02,1161.2,1124.0,1181.4,10.151912,7.105668,7.596702,8.68034,7.398261,7.460343,7.311878,-0.220206,-0.193997,-0.819693,-0.585422,-0.794233,-1.098376,-0.285696,-3.430649
0,AIZ,2008-01-02,66.99,65.54,66.99,0.156356,0.056326,0.050446,0.021367,0.065733,0.059553,0.072498,-0.220206,-0.193997,-0.962372,-0.683983,0.213489,0.323027,-0.185744,-1.79913
0,AMG,2008-01-02,117.49,111.96,118.36,0.595848,0.628865,0.734786,0.761461,0.722121,0.743306,0.696765,-0.220206,-0.193997,0.281832,0.373337,-0.914791,-0.643394,-0.18026,-4.374945


In [8]:
# Baseline: evaluate the freshly initialised (untrained) agent for 5 episodes
# on the training environment wrapper, before any call to learn().
# evaluate_policy returns the mean and standard deviation of the per-episode
# reward; both are printed with two decimals.
mean_reward, std_reward = evaluate_policy(a2c_model, venv, n_eval_episodes=5)
print(f"mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}")

day: 2516                 reward: 0.000                 sharpe: -0.008                  psr: 0.000                  cum. rtns: -100.000                 portf val: 0.00
day: 2516                 reward: 0.000                 sharpe: -0.008                  psr: 0.000                  cum. rtns: -100.000                 portf val: 0.00
day: 2516                 reward: 0.000                 sharpe: -0.008                  psr: 0.000                  cum. rtns: -100.000                 portf val: 0.00
day: 2516                 reward: 0.000                 sharpe: -0.008                  psr: 0.000                  cum. rtns: -100.000                 portf val: 0.00
day: 2516                 reward: 0.000                 sharpe: -0.008                  psr: 0.000                  cum. rtns: -100.000                 portf val: 0.00
mean_reward:27542794.00 +/- 0.00


In [10]:
# Training budget: twice (number of unique dates in the wrapped env's data - 1).
# NOTE(review): presumably (unique dates - 1) is the length of one episode, so
# this is two passes over the training period — confirm against the env.
total_timesteps = 2 * (len(venv.venv.envs[0].data.date.unique())-1)
# Disabled alternative that reads the dates from `train_env` directly:
#total_timesteps = 2 * (len(train_env.data.date.unique())-1)
# Train the agent. The log name is 'A2C' followed by the current hour-minute.
# NOTE(review): the A2C constructor call in this notebook does not pass
# `tensorboard_log`, so `tb_log_name` presumably has no effect here — confirm
# whether TensorBoard logging was intended.
trained_a2c_model= a2c_model.learn(total_timesteps=total_timesteps,
                                   tb_log_name='A2C'+datetime.now().strftime("%H-%M"))

day: 2516                 reward: 0.000                 sharpe: -0.009                  psr: 0.000                  cum. rtns: -100.000                 portf val: 0.00
day: 2516                 reward: 0.000                 sharpe: -0.009                  psr: 0.000                  cum. rtns: -100.000                 portf val: 0.00


In [25]:
# Re-evaluate after training: 5 episodes on the same environment wrapper that
# was used for training (in-sample), overwriting the baseline
# mean_reward / std_reward values from the earlier evaluation cell.
mean_reward, std_reward = evaluate_policy(trained_a2c_model, venv, n_eval_episodes=5)
print(f"mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}")

day: 2516                 reward: 2489789.039                 sharpe: 0.276                  psr: 0.000                  cum. rtns: 148.979                 portf val: 2,489,789.04
day: 2516                 reward: 2489401.154                 sharpe: 0.276                  psr: 0.000                  cum. rtns: 148.940                 portf val: 2,489,401.15
day: 2516                 reward: 2489093.883                 sharpe: 0.276                  psr: 0.000                  cum. rtns: 148.909                 portf val: 2,489,093.88
day: 2516                 reward: 2488820.449                 sharpe: 0.276                  psr: 0.000                  cum. rtns: 148.882                 portf val: 2,488,820.45
day: 2516                 reward: 2488584.486                 sharpe: 0.276                  psr: 0.000                  cum. rtns: 148.858                 portf val: 2,488,584.49
mean_reward:57.55 +/- 0.54


In [None]:
# Disabled out-of-sample evaluation (2018-01-01 .. 2020-12-31). The code is
# wrapped in a bare string literal, so nothing inside it is executed.
# NOTE(review): unlike the training configuration, these kwargs omit
# sample_size / random_sample and use the 'daily_returns' reward instead of
# 'portfolio_value' — confirm the intended setup before re-enabling.
'''

eval_kwargs = {'filename':'sp500.csv',
    'date_from':'2018-01-01',
    'date_to':'2020-12-31',
    'investment':1000000,
    'risk_free_rate': 0.5,
    'reward_function':'daily_returns'}

eval_env =  Monitor(gym.make('PortfolioAllocation-v0', **eval_kwargs))

mean_reward, std_reward = evaluate_policy(trained_a2c_model, eval_env, n_eval_episodes=10)
print(f"mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}")
'''

In [39]:
# Persist the trained agent under the relative path 'sp500_08_17_opt_49'
# (resolved against the kernel's current working directory).
trained_a2c_model.save('sp500_08_17_opt_49')



