In [7]:
import gym
import PortfolioAllocationGym
import numpy as np
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_checker import check_env
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'

In [16]:
# Configuration of the training environment: prices come from sp500.csv,
# restricted to the 2008-01-01 .. 2017-12-31 window, starting from an
# investment of 1,000,000 and rewarding the agent with 'portfolio_value'.
env_kwargs = dict(
    filename='sp500.csv',
    date_from='2008-01-01',
    date_to='2017-12-31',
    investment=1000000,
    # Approx. US Treasury Note return.
    # NOTE(review): the units expected by the environment (percent vs.
    # fraction, annual vs. daily) are not visible here -- confirm 0.5 is right.
    risk_free_rate=0.5,
    sample_size=100,
    random_sample=False,
    reward_function='portfolio_value',
)

# Instantiate the registered environment with the settings above.
train_env = gym.make('PortfolioAllocation-v0', **env_kwargs)

In [17]:
# Sanity-check the custom environment with stable-baselines3's env checker
# before training on it. The recorded run of this cell produced no output.
check_env(train_env)



In [18]:
# get_sb_env() is provided by the environment (its implementation is not
# visible here) and returns a pair; `venv` is what the model is built on and
# evaluated with below.
# NOTE(review): presumably `venv` is a stable-baselines3 vectorized wrapper
# around train_env and `obs` its initial observation -- confirm. `obs` is not
# used by any of the cells shown in this notebook.
venv, obs = train_env.get_sb_env()

In [19]:
from stable_baselines3 import A2C
from stable_baselines3.a2c import MlpPolicy

In [21]:
import torch

# Hyperparameters for the A2C agent.
# NOTE(review): the high-precision values look like the result of an automated
# hyperparameter search (the model is later saved as '..._opt_49'); record
# their provenance in the notebook so the run can be reproduced.
model_kwargs = {
    'gamma': 0.9999,
    'normalize_advantage': False,
    'max_grad_norm': 0.7,
    # Per the SB3 A2C documentation, False selects Adam instead of RMSprop.
    'use_rms_prop': False,
    'gae_lambda': 0.92,
    'n_steps': 10,
    'learning_rate': 0.0038610316815332825,
    'ent_coef': 0.012292116134058367,
    'vf_coef': 0.7960524189522955,
    'policy_kwargs': dict(
        log_std_init=-3.353286611055509,
        ortho_init=False,
        # torch.nn.ReLU is the public alias of torch.nn.modules.activation.ReLU.
        activation_fn=torch.nn.ReLU,
        # Separate three-layer (64-unit) heads for the policy and value nets.
        # NOTE(review): the list-wrapped dict form is the legacy net_arch
        # format; newer SB3 releases expect a plain dict -- confirm against the
        # installed version before upgrading.
        net_arch=[dict(pi=[64, 64, 64], vf=[64, 64, 64])],
    ),
}

# Build the agent on the vectorized training environment.
# tensorboard_log must be set here: the training cell passes `tb_log_name` to
# learn(), which is silently ignored (nothing is written) when the model has no
# TensorBoard log directory configured.
a2c_model = A2C(policy=MlpPolicy,
                env=venv,
                tensorboard_log='./tensorboard_log/',
                **model_kwargs)


In [22]:
from PortfolioAllocationGym.callbacks import TensorBoardCallback as tbc
from datetime import datetime

In [23]:
# Baseline: score the freshly initialised (not yet trained) agent over five
# episodes on the training environment.
# NOTE(review): the environment prints per-episode rewards in the millions
# while the reported mean_reward is below 100 in the recorded output -- the two
# numbers are evidently not on the same scale; confirm which one is intended.
mean_reward, std_reward = evaluate_policy(
    model=a2c_model,
    env=venv,
    n_eval_episodes=5,
)
print(f"mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}")

day: 2516                 reward: 3393953.530                 sharpe: 0.395                  psr: 0.000                  cum. rtns: 239.395                 portf val: 3,393,953.53
day: 2516                 reward: 3361192.112                 sharpe: 0.399                  psr: 0.000                  cum. rtns: 236.119                 portf val: 3,361,192.11
day: 2516                 reward: 3355335.076                 sharpe: 0.399                  psr: 0.000                  cum. rtns: 235.534                 portf val: 3,355,335.08
day: 2516                 reward: 3350199.976                 sharpe: 0.399                  psr: 0.000                  cum. rtns: 235.020                 portf val: 3,350,199.98
day: 2516                 reward: 3347830.353                 sharpe: 0.399                  psr: 0.000                  cum. rtns: 234.783                 portf val: 3,347,830.35
mean_reward:73.04 +/- 27.95


In [24]:
# Training budget: ten times (number of distinct dates in the training data
# minus one).
# NOTE(review): presumably one environment step per date-to-date transition,
# i.e. roughly ten passes over the training window -- confirm.
unique_dates = train_env.data.date.unique()
total_timesteps = (len(unique_dates) - 1) * 10

# TensorBoard run name: 'A2C' followed by the current hour and minute.
run_name = 'A2C' + datetime.now().strftime("%H-%M")

# NOTE(review): in stable-baselines3, learn() returns the model it was called
# on, so trained_a2c_model is expected to be the same object as a2c_model --
# confirm if the untrained agent is needed afterwards.
trained_a2c_model = a2c_model.learn(
    total_timesteps=total_timesteps,
    tb_log_name=run_name,
)

day: 2516                 reward: 2975263.327                 sharpe: 0.392                  psr: 0.000                  cum. rtns: 197.526                 portf val: 2,975,263.33
day: 2516                 reward: 2845850.310                 sharpe: 0.340                  psr: 0.000                  cum. rtns: 184.585                 portf val: 2,845,850.31
day: 2516                 reward: 2584910.383                 sharpe: 0.298                  psr: 0.000                  cum. rtns: 158.491                 portf val: 2,584,910.38
day: 2516                 reward: 2805226.133                 sharpe: 0.379                  psr: 0.000                  cum. rtns: 180.523                 portf val: 2,805,226.13
day: 2516                 reward: 2456507.539                 sharpe: 0.284                  psr: 0.000                  cum. rtns: 145.651                 portf val: 2,456,507.54
day: 2516                 reward: 2201985.942                 sharpe: 0.287                  psr: 0.

In [25]:
# Score the trained agent over five episodes. This uses the same `venv` the
# agent was trained on (2008-2017 data), so the figure is in-sample and says
# nothing about generalisation to unseen dates.
mean_reward, std_reward = evaluate_policy(
    model=trained_a2c_model,
    env=venv,
    n_eval_episodes=5,
)
print(f"mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}")

day: 2516                 reward: 2489789.039                 sharpe: 0.276                  psr: 0.000                  cum. rtns: 148.979                 portf val: 2,489,789.04
day: 2516                 reward: 2489401.154                 sharpe: 0.276                  psr: 0.000                  cum. rtns: 148.940                 portf val: 2,489,401.15
day: 2516                 reward: 2489093.883                 sharpe: 0.276                  psr: 0.000                  cum. rtns: 148.909                 portf val: 2,489,093.88
day: 2516                 reward: 2488820.449                 sharpe: 0.276                  psr: 0.000                  cum. rtns: 148.882                 portf val: 2,488,820.45
day: 2516                 reward: 2488584.486                 sharpe: 0.276                  psr: 0.000                  cum. rtns: 148.858                 portf val: 2,488,584.49
mean_reward:57.55 +/- 0.54


In [None]:
# Disabled out-of-sample evaluation (2018-01-01 .. 2020-12-31), kept as a bare
# string literal so that nothing in it executes.
# NOTE(review): before enabling, reconcile it with the training setup -- it
# uses reward_function 'daily_returns' (training used 'portfolio_value') and
# omits 'sample_size' / 'random_sample', so rewards and possibly the
# observation layout may not be comparable with the trained model. Either turn
# it into a real cell or delete it.
'''

eval_kwargs = {'filename':'sp500.csv',
    'date_from':'2018-01-01',
    'date_to':'2020-12-31',
    'investment':1000000,
    'risk_free_rate': 0.5,
    'reward_function':'daily_returns'}

eval_env =  Monitor(gym.make('PortfolioAllocation-v0', **eval_kwargs))

mean_reward, std_reward = evaluate_policy(trained_a2c_model, eval_env, n_eval_episodes=10)
print(f"mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}")
'''

In [39]:
# Persist the trained agent under the name 'sp500_08_17_opt_49' (relative to
# the current working directory).
# NOTE(review): this cell's execution count (39) is well past the evaluation
# cell (25), so other cells were run in between that are not in the notebook;
# verify the saved weights correspond to the training run shown above.
trained_a2c_model.save('sp500_08_17_opt_49')



