In [1]:
import gym
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.ticker import Formatter
import datetime as dt
import pandas as pd

from stable_baselines.common.policies import MlpLstmPolicy
from stable_baselines.common.vec_env import DummyVecEnv
from stable_baselines.ppo2 import PPO2
from stable_baselines import bench, logger
from stable_baselines.results_plotter import load_results, ts2xy

from importlib import reload
from time import time
import os.path

The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.



In [2]:
class MyFormatter(Formatter):
    """Tick formatter that labels integer tick positions with dates.

    The tick value is truncated to an integer and used as an index into
    ``dates``; positions that fall outside the sequence get an empty label.
    """

    def __init__(self, dates, fmt='%Y-%m-%d'):
        # Indexable collection whose items provide ``strftime``.
        self.dates = dates
        # ``strftime`` pattern used to render the selected date.
        self.fmt = fmt

    def __call__(self, x, pos=0):
        """Return the label for time x at position pos."""
        idx = int(x)
        if 0 <= idx < len(self.dates):
            return self.dates[idx].strftime(self.fmt)
        return ''

def evaluate(model, environment, num_steps=40000):
    """Run the policy through one episode and collect per-step statistics.

    The environment is reset, then stepped with deterministic actions until
    either ``num_steps`` steps have been taken or the first sub-environment
    reports ``done``. Only index 0 of the vectorised outputs is read.

    Parameters
    ----------
    model : object exposing ``predict(obs, state=..., mask=..., deterministic=...)``
    environment : vectorised environment exposing ``reset()`` and ``step(action)``;
        ``info[0]`` must carry a ``"date"`` (with ``to_pydatetime()``) and a
        ``"transaction_cost"`` entry.
    num_steps : int
        Upper bound on the number of steps taken.

    Returns
    -------
    pnl : np.ndarray
        Reward of sub-environment 0 at every step.
    dates : np.ndarray
        Python ``datetime`` of every step.
    trans_cost : list
        Transaction cost reported at every step.
    actions : np.ndarray
        Stacked actions returned by ``model.predict``.
    """
    pnl = []
    dates = []
    trans_cost = []
    action_ls = []
    obs = environment.reset()
    # Recurrent (LSTM) policies only keep their memory if the state returned by
    # ``predict`` is passed back in; discarding it restarts the LSTM from its
    # initial state on every step. ``None`` lets the model use its defaults
    # on the first call.
    state = None
    done_mask = None
    for _ in range(num_steps):
        action, state = model.predict(obs, state=state, mask=done_mask, deterministic=True)
        # action, rewards and dones are arrays because the env is vectorised
        obs, rewards, dones, info = environment.step(action)
        done_mask = dones
        step_info = info[0]
        # Stats
        pnl.append(rewards[0])
        dates.append(step_info["date"].to_pydatetime())
        trans_cost.append(step_info["transaction_cost"])
        action_ls.append(action)
        if dones[0]:
            break

    pnl = np.array(pnl)
    dates = np.array(dates)
    actions = np.array(action_ls)

    return pnl, dates, trans_cost, actions


def evaluate_short(model, environment, num_steps=40000):
    """Run one deterministic episode and return the total reward.

    The environment is reset and stepped until ``num_steps`` steps have been
    taken or the first sub-environment reports ``done``; the rewards of
    sub-environment 0 are summed.

    Parameters
    ----------
    model : object exposing ``predict(obs, state=..., mask=..., deterministic=...)``
    environment : vectorised environment exposing ``reset()`` and ``step(action)``
    num_steps : int
        Upper bound on the number of steps taken.

    Returns
    -------
    Sum of the per-step rewards of sub-environment 0.
    """
    pnl = []
    obs = environment.reset()
    # Feed the recurrent state and the done mask back into ``predict``; otherwise
    # an LSTM policy is evaluated from its initial state at every step.
    state = None
    done_mask = None
    for _ in range(num_steps):
        action, state = model.predict(obs, state=state, mask=done_mask, deterministic=True)
        # action, rewards and dones are arrays because the env is vectorised
        obs, rewards, dones, info = environment.step(action)
        done_mask = dones
        pnl.append(rewards[0])
        if dones[0]:
            break
    return sum(pnl)


def annual_sharpe(pnl, periods_per_day=390, days_per_year=252):
    """Annualised Sharpe ratio of a per-period PnL series.

    The per-period ratio ``mean / std`` is scaled by ``sqrt(periods_per_day)``
    and then by ``sqrt(days_per_year)``. The defaults (390 periods per day,
    252 days per year) reproduce the previously hard-coded scaling.

    Parameters
    ----------
    pnl : array-like
        Per-period profit and loss.
    periods_per_day : int
        Number of observations in one trading day.
    days_per_year : int
        Number of trading days in one year.

    Returns
    -------
    Annualised Sharpe ratio. A series with zero standard deviation yields a
    non-finite value (``nan`` or ``inf``), as the ratio is undefined.
    """
    pnl = np.asarray(pnl, dtype=float)
    mean = pnl.mean()
    std = pnl.std()
    day_sharpe = (mean / std) * np.sqrt(periods_per_day)
    year_sharpe = day_sharpe * np.sqrt(days_per_year)
    return year_sharpe

def annual_return(pnl, principal=1000000):
    """Annualised simple return of a per-period PnL series.

    Each PnL value is divided by ``principal``; the mean of those per-period
    returns is scaled by 390 and then by 252.
    """
    mean_period_return = np.mean(pnl / principal)
    return mean_period_return * 390 * 252

def annual_volatility(pnl, principal=1000000, periods_per_day=390, days_per_year=252):
    """Annualised volatility of per-period log returns.

    Log returns are ``log(1 + pnl / principal)``; their standard deviation is
    scaled by ``sqrt(periods_per_day * days_per_year)``.

    The series is treated as having ``periods_per_day`` observations per
    trading day, matching ``annual_sharpe`` and ``annual_return``, which both
    scale by 390 and 252. Scaling by ``sqrt(252)`` alone would only annualise
    a daily series and understate volatility by ``sqrt(390)``.

    Parameters
    ----------
    pnl : array-like
        Per-period profit and loss.
    principal : number
        Capital base used to convert PnL into returns.
    periods_per_day : int
        Number of observations in one trading day.
    days_per_year : int
        Number of trading days in one year.
    """
    pnl = np.asarray(pnl, dtype=float)
    log_ret = np.log(1 + pnl / principal)
    return log_ret.std() * np.sqrt(periods_per_day * days_per_year)

def maximum_drawdown(pnl):
    """Largest peak-to-trough drop of cumulative PnL, relative to that peak.

    The drop is measured on the cumulative sum of ``pnl`` and divided by the
    running maximum at the point where the drop is largest.
    """
    equity = np.cumsum(pnl)
    running_peak = np.maximum.accumulate(equity)
    # Position where cumulative PnL sits furthest below its running peak.
    worst = np.argmax(running_peak - equity)
    peak_at_worst = running_peak[worst]
    return (peak_at_worst - equity[worst]) / peak_at_worst

def annual_turnover(weights):
    """Annualised turnover of a sequence of weights.

    Sums the absolute change between consecutive rows, divides by the number
    of rows, and scales by 390 and then by 252.
    """
    later, earlier = weights[1:], weights[:-1]
    total_change = np.sum(np.abs(later - earlier))
    per_period = total_change / weights.shape[0]
    return per_period * 390 * 252

## Load Environment

In [3]:
from envs import equity_env

In [4]:
# Build the training environment (split_data=True) and wrap it in a
# single-environment DummyVecEnv, which is what the PPO2 model is given below.
env = equity_env.EquityEnv(split_data=True)
# NOTE(review): the lambda reads the global `env`, which this same statement
# rebinds; this presumably relies on DummyVecEnv calling the factory while it is
# being constructed — confirm.
env = DummyVecEnv([lambda: env])

-- Data Loaded --
-- Environment Created --


## Initialize Model

In [None]:
# PPO2 agent with an LSTM policy; training curves are written to the
# TensorBoard directory given below.
model = PPO2(
    MlpLstmPolicy,
    env,
    n_steps=240,
    learning_rate=1e-6,
    verbose=0,
    nminibatches=1,
    policy_kwargs={"n_lstm": 36, "layers": [36, 36]},
    tensorboard_log="./outputs/equity_train_tensorboard/",
)





Instructions for updating:
Use keras.layers.flatten instead.
Instructions for updating:
Please use `layer.__call__` method instead.






## Train Model for 100 episodes

In [None]:
# Time a single training run of the model built above.
start = time()
# NOTE(review): `total_timesteps` looks like a step budget rather than an episode
# count, and 100 is smaller than the n_steps=240 rollout configured on the model —
# confirm that any policy update actually happens with this budget.
model.learn(total_timesteps=100, tb_log_name="0bps_new_env")
end = time()
# Elapsed wall-clock seconds; printed truncated to an integer by %d.
t = end - start
print("Takes %d s to run" % t)

## Hyper-parameter tuning

In [None]:
import optunity

In [None]:
# Development environment (split_data=False, test_option="develop"), wrapped in a
# single-environment DummyVecEnv. `performance` scores every trial on this global.
dev_env = equity_env.EquityEnv(split_data=False, test_option="develop")
# NOTE(review): the lambda reads the global `dev_env`, which this same statement
# rebinds; presumably the factory is invoked during construction — confirm.
dev_env = DummyVecEnv([lambda: dev_env])

In [None]:
# Structured search space: one branch per episode length ("120", "480", "1200"),
# each carrying the same [low, high] ranges for the tuned hyper-parameters.
search = {
    "episode_len": {
        length: {
            "ent_coef": [0.01, 0.1],
            "vf_coef": [0.3, 0.8],
            "cliprange": [0.1, 0.5],
            "learning_rate": [1e-7, 1e-3],
            "n_lstm": [8, 256],
        }
        for length in ("120", "480", "1200")
    }
}

In [None]:
def performance(episode_len, ent_coef, vf_coef, cliprange, learning_rate, n_lstm):
    """Train a fresh PPO2 agent with the given settings and score it on ``dev_env``.

    ``episode_len`` and ``n_lstm`` are converted with ``int`` before use. The
    number of training episodes is ``int(200 / (ep_len / 120))``, so it shrinks
    as the episode length grows. The returned value is whatever
    ``evaluate_short`` reports for the trained agent on the global ``dev_env``.
    """
    ep_len = int(episode_len)

    # Fresh training environment for this trial, wrapped as a one-env vector env.
    base_env = equity_env.EquityEnv(split_data=True, episode_length=ep_len)
    train_env = DummyVecEnv([lambda: base_env])

    agent = PPO2(
        MlpLstmPolicy,
        train_env,
        n_steps=240,
        ent_coef=ent_coef,
        vf_coef=vf_coef,
        cliprange=cliprange,
        learning_rate=learning_rate,
        verbose=0,
        nminibatches=1,
        policy_kwargs={"n_lstm": int(n_lstm), "layers": [36, 36]},
    )

    episode_num = int(200 / (ep_len / 120))
    agent.learn(total_timesteps=ep_len * episode_num)
    return evaluate_short(agent, dev_env)

In [None]:
# Time one call of `performance` with fixed hyper-parameters.
# episode_len is passed as the string "1"; `performance` converts it with int().
t = time()
res = performance("1", 0.01, 0.5, 0.2, 1e-4, 36)
print("Takes %d s to run" % (time() - t))

In [None]:
# Show the score returned by the timed `performance` call.
res

#### Warning: the following two cells can take a long time to run.

In [None]:
# Maximise `performance` over the structured `search` space with 50 evaluations.
# Every evaluation calls `performance`, which trains a new PPO2 model.
optimal_configuration, info, _ = optunity.maximize_structured(performance, search_space=search, num_evals=50)

In [None]:
# NOTE(review): the hyper-parameters here are hard-coded rather than read from
# `optimal_configuration`, and this cell evaluates the model without calling
# `learn` on it — confirm that this is intended.
model = PPO2(MlpLstmPolicy, env, n_steps=240, ent_coef=0.01, vf_coef=0.5,
             cliprange=0.2, learning_rate=1e-5, verbose=0, 
             nminibatches=1, policy_kwargs={"n_lstm":36, "layers":[36,36]})

# Per-step PnL, dates, transaction costs and actions on the development environment.
pnl, dates, trans_cost, actions = evaluate(model, dev_env)

In [None]:
# Annualised Sharpe ratio of the PnL returned by `evaluate`.
annual_sharpe(pnl)

In [None]:
# Annualised return of the PnL returned by `evaluate` (default principal).
annual_return(pnl)

In [None]:
# Annualised volatility of the PnL returned by `evaluate` (default principal).
annual_volatility(pnl)

In [None]:
# Maximum drawdown of the cumulative PnL returned by `evaluate`.
maximum_drawdown(pnl)

In [None]:
# Annualised turnover computed from the actions recorded by `evaluate`.
annual_turnover(actions)

## Construct Baseline Strategies

In [None]:
# Load the price table; column 0 is parsed as timestamps while reading.
prices = pd.read_csv("data/price.csv", parse_dates=[0])
# Convert the timestamp column in one vectorised call instead of one
# pd.to_datetime call per row.
dates = pd.to_datetime(prices.iloc[:, 0])
# Capital base used to turn returns into currency PnL.
principal = 1000000

### 1. Momentum Strategy

In [None]:
# First, need to get return. need to define a look back period
def momentum_signal(price_data, look_back, normalize=True):
    """Momentum signal: relative price change over ``look_back`` rows.

    Parameters
    ----------
    price_data : pandas.DataFrame
        Price table; the columns at positions 2, 4 and 6 are used.
    look_back : int
        Number of rows between the current price and the reference price.
    normalize : bool
        If True, each row is de-meaned across the three columns and divided
        by the sum of its positive entries.

    Returns
    -------
    np.ndarray of shape (rows, 3). The first ``look_back`` rows are NaN
    because no reference price exists for them.
    """
    # Read from the frame that was passed in (not a notebook global), and cast
    # to float so the NaN warm-up rows can be stored even for integer prices.
    price_values = price_data.iloc[:, [2, 4, 6]].values.astype(float)
    delay_price = np.roll(price_values, look_back, axis=0)
    # np.roll wraps around, so the first rows hold prices from the end of the
    # table; blank them out.
    delay_price[:look_back] = np.nan
    mom_sig = (price_values - delay_price) / delay_price
    if normalize:
        mom_sig = mom_sig - mom_sig.mean(axis=1, keepdims=True)
        mom_sig = mom_sig / ((mom_sig > 0) * mom_sig).sum(axis=1, keepdims=True)
    return mom_sig

# One-row relative price change (look_back=1, no normalisation); row 0 is NaN.
ret1 = momentum_signal(prices, 1, False)

In [None]:
# Normalised momentum signal with a look-back of 2100 rows.
mom = momentum_signal(prices, 2100)

# np.roll(ret1, -1) moves the next row's return onto the current row, so each
# signal row is multiplied by the following return; summed over the three
# columns and scaled by the principal.
pnl_mom = np.sum(mom * np.roll(ret1, -1, axis=0), axis=1) * principal
# Replace NaN rows (signal warm-up, and the last row where the roll wraps
# around to the NaN first return) with 0.
pnl_mom = np.nan_to_num(pnl_mom)

# Cumulative momentum PnL; x positions are row indices that MyFormatter
# turns into date labels.
plt.style.use("ggplot")
formatter = MyFormatter(dates)
fig, ax = plt.subplots(figsize=(11, 7))
ax.xaxis.set_major_formatter(formatter)
ax.plot(np.arange(pnl_mom.shape[0]), np.cumsum(pnl_mom))
fig.autofmt_xdate()
plt.show()

print(annual_sharpe(pnl_mom))

### 2. Buy-and-hold Strategy

In [None]:
# Gross one-row returns; `ret1+1` is a new array, so `ret1` itself is not modified.
mut = ret1+1
# Row 0 of ret1 is NaN; use a neutral growth factor of 1 there.
mut[0] = 1
# Cumulative return per column, a third of the principal in each of the three
# columns, summed to a cumulative PnL for every row.
pnl_hold = np.sum((np.cumprod(mut, axis=0)-1) / 3 * principal, axis=1)
# Difference the cumulative series into per-row PnL; np.roll wraps around, so
# the first element is reset to 0.
pnl_hold = pnl_hold - np.roll(pnl_hold, 1)
pnl_hold[0] = 0
# Cumulative buy-and-hold PnL; x positions are row indices labelled by MyFormatter.
plt.style.use("ggplot")
formatter = MyFormatter(dates)
fig, ax = plt.subplots(figsize=(11, 7))
ax.xaxis.set_major_formatter(formatter)
ax.plot(np.arange(pnl_hold.shape[0]), np.cumsum(pnl_hold))
fig.autofmt_xdate()
plt.show()

print(annual_sharpe(pnl_hold))

### 3. Compare RL with the baseline strategies

In [None]:
import pickle

In [None]:
# Load the saved PPO PnL series.
# NOTE: pickle.load can execute arbitrary code; only load files from a trusted source.
with open("outputs/model_pnl_35000.dms", "rb") as f:
    pnl_ppo = pickle.load(f)

In [None]:
# Cut both baseline series to the length of the PPO series.
# NOTE(review): this keeps the first n rows, which assumes the PPO PnL starts at
# the same row as the price file — confirm the alignment.
n = pnl_ppo.shape[0]
pnl_mom = pnl_mom[:n]
pnl_hold = pnl_hold[:n]

In [None]:
# Cumulative PnL of the three strategies on one axis; x positions are row
# indices that MyFormatter turns into date labels.
plt.style.use("ggplot")
formatter = MyFormatter(dates)
fig, ax = plt.subplots(figsize=(11, 7))
ax.xaxis.set_major_formatter(formatter)
ax.plot(np.arange(pnl_mom.shape[0]), np.cumsum(pnl_mom), c="r", label="Momentum")
ax.plot(np.arange(pnl_hold.shape[0]), np.cumsum(pnl_hold), c="b", label="Buy and Hold")
ax.plot(np.arange(pnl_ppo.shape[0]), np.cumsum(pnl_ppo), c="k", label="PPO Agent")
fig.autofmt_xdate()
plt.legend()
plt.show()

In [None]:
# Annualised Sharpe ratio of the loaded PPO PnL.
annual_sharpe(pnl_ppo)

In [None]:
# Annualised return of the loaded PPO PnL (default principal).
annual_return(pnl_ppo)

In [None]:
# Annualised volatility of the loaded PPO PnL (default principal).
annual_volatility(pnl_ppo)

In [None]:
# NOTE(review): `annual_turnover` takes a `weights` sequence, and the earlier
# cell passes the recorded actions; here it is given the PnL series, so the result
# measures changes in PnL rather than in positions — confirm the intended input.
annual_turnover(pnl_ppo)

In [None]:
# Maximum drawdown of the cumulative loaded PPO PnL.
maximum_drawdown(pnl_ppo)