In [7]:
import gym
import PortfolioAllocationGym
import numpy as np
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_checker import check_env
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'

In [8]:
# Configuration for the training environment: price data file, the
# 2008-2017 date window, starting capital, and the reward signal.
env_kwargs = dict(
    filename='sp500.csv',
    date_from='2008-01-01',
    date_to='2017-12-31',
    investment=1000000,
    risk_free_rate=0.5,  # approx US Treasury Note return (units not shown here — TODO confirm)
    sample_size=100,
    random_sample=False,
    reward_function='portfolio_value',
)

# Instantiate the registered portfolio-allocation environment with the settings above.
train_env = gym.make('PortfolioAllocation-v0', **env_kwargs)

In [9]:
# Run the stable-baselines3 environment checker against the training env.
check_env(env=train_env)



In [10]:
# Unpack the two values returned by the environment's get_sb_env() helper;
# presumably a vectorized env for stable-baselines3 and its first observation — TODO confirm.
(venv, obs) = train_env.get_sb_env()

In [11]:
from stable_baselines3 import A2C
from stable_baselines3.a2c import MlpPolicy

In [12]:
import torch

# Hyperparameters copied from a tuning run
# (Users/kbines/rl-baselines3-zoo/ac2_50_pv_normalized2).
model_kwargs = dict(
    gamma=0.98,
    normalize_advantage=False,
    max_grad_norm=1,
    use_rms_prop=False,
    gae_lambda=1.0,
    n_steps=5,
    learning_rate=0.006091038442400068,
    ent_coef=4.071869686147734e-06,
    vf_coef=0.36340337458493177,
    policy_kwargs={
        'log_std_init': 0.5523434134392059,
        'ortho_init': True,
        # torch.nn.Tanh is the same class as torch.nn.modules.activation.Tanh.
        'activation_fn': torch.nn.Tanh,
        # Separate 256x256 networks for the policy (pi) and value (vf) heads.
        'net_arch': [{'pi': [256, 256], 'vf': [256, 256]}],
    },
)

# Build the A2C agent on the vectorized training environment.
a2c_model = A2C(MlpPolicy, venv, **model_kwargs)


In [13]:
from PortfolioAllocationGym.callbacks import TensorBoardCallback as tbc
from datetime import datetime

In [14]:
# Preview the first five rows of the data held by the training environment.
train_env.data.head(n=5)

Unnamed: 0,tic,date,open,low,high,close,adj_close,ema_50,ema_200,bb_bbm,bb_bbh,bb_bbl,bb_bbhi,bb_bbli,stoch,stoch_signal,macd,macd_signal,obv,daily_returns
0,ADS,2008-01-02,74.23,73.48,75.0,74.18,69.52,72.000821,69.983579,71.1745,73.988632,68.360368,0.0,0.0,-61.929825,-65.658055,-0.947575,-0.815673,-59951500.0,-1.081389
0,AET,2008-01-02,57.81,56.31,57.96,56.64,51.24,50.843909,46.863639,52.39,53.828833,50.951167,0.0,0.0,-182.374101,-137.44732,0.425078,0.630939,46705600.0,-1.895462
0,AIG,2008-01-02,1161.2,1124.0,1181.4,1126.0,778.32,815.676941,887.124643,803.0615,855.727415,750.395585,0.0,0.0,-238.966565,-192.106023,-1.752432,-2.300637,-221198000.0,-3.430649
0,AIZ,2008-01-02,66.99,65.54,66.99,65.7,49.67,48.560024,44.328743,50.425,52.561638,48.288362,0.0,0.0,-267.458867,-211.672085,0.624353,0.830878,1241900.0,-1.79913
0,AMG,2008-01-02,117.49,111.96,118.36,112.32,108.85,118.126769,116.363668,117.799,126.765511,108.832489,0.0,0.0,-18.998167,-1.774477,-2.036778,-1.298258,13445500.0,-4.374945


In [9]:
# Baseline: score the agent over five episodes before any training has happened.
mean_reward, std_reward = evaluate_policy(
    model=a2c_model,
    env=venv,
    n_eval_episodes=5,
)
print(f"mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}")

day: 2516                 reward: 2386601.100                 sharpe: 0.363                  psr: 0.000                  cum. rtns: 138.660                 portf val: 2,386,601.10
day: 2516                 reward: 2206407.131                 sharpe: 0.349                  psr: 0.000                  cum. rtns: 120.641                 portf val: 2,206,407.13
day: 2516                 reward: 2180480.656                 sharpe: 0.345                  psr: 0.000                  cum. rtns: 118.048                 portf val: 2,180,480.66
day: 2516                 reward: 2172903.532                 sharpe: 0.343                  psr: 0.000                  cum. rtns: 117.290                 portf val: 2,172,903.53
day: 2516                 reward: 2167495.626                 sharpe: 0.342                  psr: 0.000                  cum. rtns: 116.750                 portf val: 2,167,495.63
mean_reward:79.85 +/- 27.42


In [15]:
# Train for one step per distinct date in the wrapped env's data, minus one.
# Scale this expression by an integer factor (e.g. 2 * ...) to train for longer.
total_timesteps = len(venv.venv.envs[0].data.date.unique()) - 1

# The log name is tagged with the current hour and minute.
# NOTE(review): no tensorboard_log is passed when the A2C model is built in this
# notebook, so tb_log_name probably has no effect — confirm.
trained_a2c_model = a2c_model.learn(
    total_timesteps=total_timesteps,
    tb_log_name=f"A2C{datetime.now():%H-%M}",
)

day: 2516                 reward: 2439331.456                 sharpe: 0.291                  psr: 0.000                  cum. rtns: 143.933                 portf val: 2,439,331.46


In [14]:
# Persist the trained agent to disk under this name.
trained_a2c_model.save(path='ac2_tuned_2000')

In [15]:
# Score the trained agent over five episodes on the same vectorized environment.
mean_reward, std_reward = evaluate_policy(
    model=trained_a2c_model,
    env=venv,
    n_eval_episodes=5,
)
print(f"mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}")

day: 2516                 reward: 2863442.950                 sharpe: 0.406                  psr: 0.000                  cum. rtns: 186.344                 portf val: 2,863,442.95
day: 2516                 reward: 2863442.950                 sharpe: 0.406                  psr: 0.000                  cum. rtns: 186.344                 portf val: 2,863,442.95
day: 2516                 reward: 2863442.950                 sharpe: 0.406                  psr: 0.000                  cum. rtns: 186.344                 portf val: 2,863,442.95
day: 2516                 reward: 2863442.950                 sharpe: 0.406                  psr: 0.000                  cum. rtns: 186.344                 portf val: 2,863,442.95
day: 2516                 reward: 2863442.950                 sharpe: 0.406                  psr: 0.000                  cum. rtns: 186.344                 portf val: 2,863,442.95
mean_reward:68.48 +/- 0.01


In [None]:
# Disabled out-of-sample evaluation (2018-2020 window). The code is wrapped in
# a bare string literal, so none of it executes when this cell runs.
# NOTE(review): eval_kwargs below has no 'sample_size'/'random_sample' keys and
# uses reward_function 'daily_returns', whereas the training env_kwargs set
# sample_size=100 and 'portfolio_value' — confirm the intended setup before
# re-enabling.
'''

eval_kwargs = {'filename':'sp500.csv',
    'date_from':'2018-01-01',
    'date_to':'2020-12-31',
    'investment':1000000,
    'risk_free_rate': 0.5,
    'reward_function':'daily_returns'}

eval_env =  Monitor(gym.make('PortfolioAllocation-v0', **eval_kwargs))

mean_reward, std_reward = evaluate_policy(trained_a2c_model, eval_env, n_eval_episodes=10)
print(f"mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}")
'''

In [39]:
# Persist the trained agent under a name that records the data set and date range.
trained_a2c_model.save(path='sp500_08_17_opt_49')




In [20]:
# Scratch check: softmax of a small vector, i.e. exp(x) normalised to sum to 1.
x = np.array([-0.2, -0.4, 0, 0.2, 0.4])
np.exp(x) / np.exp(x).sum()

array([0.15737927, 0.12885125, 0.19222347, 0.23478228, 0.28676373])