In [1]:
import gym
import PortfolioAllocationGym
import numpy as np
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_checker import check_env
import pandas as pd
# Turn off pandas' chained-assignment check for the whole session
# (None disables it; the pandas default is 'warn').
pd.options.mode.chained_assignment = None  # default='warn'

In [2]:
# Keyword arguments forwarded to the 'PortfolioAllocation-v0' environment
# for the training window (2008-01-01 .. 2017-12-31).
env_kwargs = dict(
    filename='sp500.csv',
    date_from='2008-01-01',
    date_to='2017-12-31',
    investment=1000000,
    # approx US Treasury Note return (original author's note).
    # NOTE(review): unit (percent vs. fraction) is not visible here - confirm.
    risk_free_rate=0.5,
    sample_size=100,
    random_sample=False,
    reward_function='portfolio_value',
)

# Instantiate the training environment through the Gym registry, passing the
# configuration above as keyword arguments.
# NOTE(review): the 'PortfolioAllocation-v0' id is presumably registered as a
# side effect of `import PortfolioAllocationGym` - confirm.
train_env = gym.make('PortfolioAllocation-v0', **env_kwargs)

In [None]:
# Run Stable-Baselines3's environment checker against the training env
# before handing it to an algorithm.
check_env(train_env)

In [4]:
# Ask the environment for the object used with Stable-Baselines3 below.
# `venv` is what the model is built on and evaluated against; `obs` is not
# used again in this notebook.
# NOTE(review): presumably `venv` is a vectorised wrapper and `obs` the
# initial observation - confirm against PortfolioAllocationGym.
venv, obs = train_env.get_sb_env()

In [5]:
from stable_baselines3 import A2C
from stable_baselines3.a2c import MlpPolicy

In [6]:
import torch

# A2C hyper-parameters taken from a tuning run
# (source: Users/kbines/rl-baselines3-zoo/ac2_50_pv_normalized2).
# They are unpacked into the A2C constructor; `policy_kwargs` is forwarded
# as-is to the policy.
model_kwargs = dict(
    gamma=0.98,
    normalize_advantage=False,
    max_grad_norm=1,
    use_rms_prop=False,
    gae_lambda=1.0,
    n_steps=5,
    learning_rate=0.006091038442400068,
    ent_coef=4.071869686147734e-06,
    vf_coef=0.36340337458493177,
    policy_kwargs={
        'log_std_init': 0.5523434134392059,
        'ortho_init': True,
        # torch.nn.Tanh is the same class as torch.nn.modules.activation.Tanh
        'activation_fn': torch.nn.Tanh,
        # separate 256-256 heads for the policy (pi) and value (vf) networks
        'net_arch': [{'pi': [256, 256], 'vf': [256, 256]}],
    },
)
# Build the A2C agent on the environment object obtained from
# train_env.get_sb_env(), using the tuned hyper-parameters in `model_kwargs`.
#
# A fixed seed is passed so that a fresh-kernel "Run All" starts from the
# same RNG state each time; without it every run of this notebook
# initialises and samples differently. Change SEED to try another run.
SEED = 0
a2c_model = A2C(policy=MlpPolicy,
                env=venv,
                seed=SEED,
                **model_kwargs)


In [7]:
from PortfolioAllocationGym.callbacks import TensorBoardCallback as tbc
from datetime import datetime

In [None]:
# Preview the first rows of the data held by the training environment.
train_env.data.head()

In [None]:
# Baseline before any training: score the freshly constructed (untrained)
# A2C policy over 5 episodes on the training environment and report the
# mean and standard deviation of the episode reward.
mean_reward, std_reward = evaluate_policy(a2c_model, venv, n_eval_episodes=5)
print(f"mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}")

In [None]:
# Training budget: twice (number of distinct dates in the wrapped env's
# data, minus one) timesteps.
# Alternative source for the same count, via the unwrapped env:
#   total_timesteps = 2 * (len(train_env.data.date.unique())-1)
unique_dates = venv.venv.envs[0].data.date.unique()
total_timesteps = 2 * (len(unique_dates) - 1)

# Run name is 'A2C' followed by the current hour-minute.
# NOTE(review): no `tensorboard_log` is passed to A2C(...) in this notebook,
# so `tb_log_name` likely has no effect - confirm.
run_name = 'A2C' + datetime.now().strftime("%H-%M")
trained_a2c_model = a2c_model.learn(
    total_timesteps=total_timesteps,
    tb_log_name=run_name,
)

In [None]:
# Same evaluation as the pre-training baseline (5 episodes, training env),
# now on the model returned by learn(), so the two printed lines can be
# compared directly.
mean_reward, std_reward = evaluate_policy(trained_a2c_model, venv, n_eval_episodes=5)
print(f"mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}")

In [None]:
# Disabled out-of-sample evaluation: the code is wrapped in a string literal
# so it is not executed. It would build a Monitor-wrapped environment for
# 2018-01-01 .. 2020-12-31 with the 'daily_returns' reward and evaluate the
# trained model over 10 episodes.
# NOTE(review): unlike env_kwargs, eval_kwargs sets no 'sample_size' /
# 'random_sample' - confirm the environments are compatible before enabling.
'''

eval_kwargs = {'filename':'sp500.csv',
    'date_from':'2018-01-01',
    'date_to':'2020-12-31',
    'investment':1000000,
    'risk_free_rate': 0.5,
    'reward_function':'daily_returns'}

eval_env =  Monitor(gym.make('PortfolioAllocation-v0', **eval_kwargs))

mean_reward, std_reward = evaluate_policy(trained_a2c_model, eval_env, n_eval_episodes=10)
print(f"mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}")
'''

In [39]:
# Persist the trained model under the relative name 'sp500_08_17_opt_49'
# (resolved against the notebook's working directory).
trained_a2c_model.save('sp500_08_17_opt_49')



