In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn as sk
%matplotlib inline

In [2]:
from keras import initializations
from keras.optimizers import SGD, Adam

Using Theano backend.


In [3]:
import trading as trd
from trading import State, create_model, copy_model, track_model

In [4]:
# Reload the `trading` package after modifying it.
# The package is reloaded first, then each submodule, then the package once
# more (presumably so the package picks up the refreshed submodules).
import importlib
importlib.reload(trd)
importlib.reload(trd.stock_history)
importlib.reload(trd.portfolio)
importlib.reload(trd.benchmarks)
importlib.reload(trd.rl)
importlib.reload(trd)

# importlib.reload does not rebind names that were pulled in with
# `from trading import ...`; without this line State, create_model,
# copy_model and track_model would keep pointing at the pre-reload objects.
from trading import State, create_model, copy_model, track_model

# show which file the package was loaded from
trd

<module 'trading' from 'C:\\Users\\hamza\\Documents\\school\\cs_229\\cs229_final_project\\src\\trading\\__init__.py'>

## Formulate Problem

In [5]:
# Problem setup used when building the portfolio States below.
# transaction cost to buy/sell a stock (passed to State as `trans_cost`;
#  presumably a fraction of the traded value -- TODO confirm units)
trans_cost = 0.001
# starting cash for each portfolio
cash = 1e6
# starting portfolio allocation (%lo, %hi), passed to State as `target_weights`
starting_weights = (0.5, 0.5)
# reward function, called on each State after it has been stepped
#  (either Sharpe Ratio or last reward)
reward = trd.sharpe_ratio_reward

In [6]:
# number of inputs (passed to create_model as `n`)
n = State.num_states()
# number of outputs (passed to create_model as `k`); one per entry of trd.actions
k = trd.actions.size
# size of training set (argument to trd.get_stock_pairs)
m = 27

## Data

In [7]:
# get the training and testing stock pairs; `m` is the size of the training set
train_data, test_data = trd.get_stock_pairs(m)

In [8]:
# one State per training pair, all starting from the same cash, weights and cost
portfolio_states = [
    State(pair, cash=cash, target_weights=starting_weights, trans_cost=trans_cost)
    for pair in train_data
]
# working list that finished States get removed from during training;
#  list(...) builds a new (shallow) list, so portfolio_states keeps every State
available_states = list(portfolio_states)

## Hyperparameters

In [9]:
# size of experience replay: number of portfolios sampled per training iteration
D = 6
# discount factor applied to the target network's max Q-value
γ = 0.99
# ϵ-greedy parameter passed to trd.choose_actions
#  (presumably the probability of picking a random action -- TODO confirm)
ϵ = 0.15
# hidden layer size (create_model's `H` argument)
H = 100
# activation function (create_model's `non_linearity` argument)
non_lin = 'relu'

# custom weight initializer
#  starting from small values seems to help
scale = 1E-4
def my_init(shape, name=None):
    """Initialize weights with `initializations.normal`, using the notebook-level `scale`.

    `shape` and `name` are forwarded unchanged; `scale` is read from the
    notebook namespace at call time.
    """
    init_kwargs = dict(scale=scale, name=name)
    return initializations.normal(shape, **init_kwargs)
#my_init = 'glorot_normal'

# Optimizer: Adam with its default arguments.
# An SGD alternative is kept below for reference:
# alpha / learning rate
#α = 0.0001
# momentum in [0.5, 0.9, 0.95, 0.99]
#opt = SGD(lr=α, decay=1e-5, momentum=0.95, nesterov=True)
opt = Adam()

In [10]:
# online network: the one that is trained and used to choose actions
model = create_model(n=n, k=k, H=H, non_linearity=non_lin, init=my_init, optimizer=opt)

## Target Network

In [11]:
# target network drift: passed to track_model(target, model, τ) every iteration
#  (presumably the fraction of the online weights blended into the target -- TODO confirm)
τ = 0.001

In [12]:
# target network: same architecture arguments as `model`; in the cells shown it is
#  only used for predict / copy_model / track_model, never trained directly
# NOTE(review): this reuses the same `opt` instance as `model` -- confirm that is safe
target = create_model(n=n, k=k, H=H, non_linearity=non_lin, init=my_init, optimizer=opt)

In [13]:
# start off with exact same init
#  (presumably copies the weights of `model` into `target` -- confirm argument order)
copy_model(target, model)

## Start Training

In [14]:
# when DEBUG is true, the training loop prints the iteration number and loss
#  every DEBUG_EVERY iterations
DEBUG = True
DEBUG_EVERY = 2500

In [17]:
# training log: row i holds the mean reward and the loss of iteration i
train_record = pd.DataFrame(columns=('reward', 'loss'))
# iteration counter; initialised here so the training cell itself never resets it
i = 0

In [18]:
# Q-learning training loop.
# Each iteration samples up to D portfolios, picks an ϵ-greedy action for each
# with the online network, executes it and steps one day forward, then fits the
# online network towards  reward + γ * max_a' Q_target(s', a').
# Runs until every portfolio has run out of data or training diverges.
while True:
    if not available_states:
        # nothing left :(
        break

    # fill the experience replay
    if len(available_states) < D:
        # getting close to the end: use everything that is left, shuffled
        exp_rep = np.random.permutation(available_states)
    else:
        exp_rep = np.random.choice(available_states, size=D, replace=False)

    # the actual size of the experience replay (smaller than D near the end)
    d = len(exp_rep)

    # actual state values of each portfolio, before acting
    states = np.array([st.state for st in exp_rep])

    # Q(s, .) from the online network
    qvalues = model.predict(states)

    # max_a w/ ϵ
    chosen_actions = trd.choose_actions(qvalues, ϵ)

    for (st, a) in zip(exp_rep, trd.actions[chosen_actions]):
        # execute the action
        st.execute_trade(a)

        # step forward to the next day
        try:
            st.step()
        except StopIteration:
            # reached end of data; no more stepping for this one
            # NOTE(review): the State stays in exp_rep for this iteration, so its
            # s' and reward are read below (presumably without it having advanced)
            # and the target still bootstraps from Q(s', .) -- confirm this is
            # intended for the final transition.
            available_states.remove(st)

    states_prime = np.array([st.state for st in exp_rep])
    rewards = np.array([reward(st) for st in exp_rep])

    # max_a' Q(s', a')
    # use target network
    qvalues_prime = np.max(target.predict(states_prime), axis=1)

    # the values we want (to minimize the MSE of):
    # only the entry of the chosen action is replaced in each row
    qvalues[np.arange(d), chosen_actions] = rewards + γ * qvalues_prime

    # train the network
    loss = model.train_on_batch(states, qvalues)
    # keep the last reported value as a plain Python float
    # (np.asscalar no longer exists in current NumPy)
    loss = float(np.ravel(loss)[-1])

    # np.isfinite catches NaN and both +inf and -inf; checking before
    # track_model keeps the target network free of diverged weights
    diverged = not (
        np.isfinite(loss)
        and np.all(np.isfinite(qvalues))
        and np.all(np.isfinite(qvalues_prime))
    )
    if diverged:
        # we hit the rails . . .
        # again
        break

    # allow the target to drift behind
    track_model(target, model, τ)

    # append new value
    # not very efficient, but this probably not the slowest step
    train_record.loc[i, :] = [np.mean(rewards), loss]

    if DEBUG and i % DEBUG_EVERY == 0:
        print('\niter:  {:7d}\tloss:  {:<16g}'.format(i, loss))

    i += 1


iter:       0
reward: +0              
loss:   0.000464857     

iter:    1000
reward: +0.119442       
loss:   0.00356383      

iter:    2000
reward: -0.0481931      
loss:   0.00165883      

iter:    3000
reward: -0.0133415      
loss:   0.00122855      

iter:    4000
reward: +0.0393694      
loss:   0.000479116     

iter:    5000
reward: -0.00444697     
loss:   0.000369338     

iter:    6000
reward: +0.0405035      
loss:   0.000209102     

iter:    7000
reward: +0.0324255      
loss:   7.01475e-05     

iter:    8000
reward: +0.0201244      
loss:   0.000200227     

iter:    9000
reward: +0.0432288      
loss:   0.000184521     

iter:   10000
reward: +0.0524292      
loss:   9.21818e-05     

iter:   11000
reward: +0.0334165      
loss:   0.000158889     

iter:   12000
reward: +0.0395492      
loss:   2.6841e-05      

iter:   13000
reward: +0.0195314      
loss:   2.8149e-05      

iter:   14000
reward: +0.0255102      
loss:   0.000167486     

iter:   15000
reward: +0

## Testing

In [24]:
# one State per test pair, set up exactly like the training portfolios
test_states = [
    State(pair, cash=cash, target_weights=starting_weights, trans_cost=trans_cost)
    for pair in test_data
]
# separate (shallow) list to remove from, so test_states keeps every State
available_test_states = list(test_states)

In [None]:
while True:
    if available_test_states == []:
        break
    
    for st in available_test_states:
        for 