In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn as sk
%matplotlib inline

In [3]:
from keras import initializations
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.optimizers import SGD

Using Theano backend.


In [4]:
import trading as trd
from trading import State

In [5]:
# relaod module after modifying it
import importlib
importlib.reload(trd)
importlib.reload(trd.stock_history)
importlib.reload(trd.portfolio)
importlib.reload(trd.benchmarks)
importlib.reload(trd.environment)
importlib.reload(trd)

<module 'trading' from 'C:\\Users\\hamza\\Documents\\school\\cs_229\\cs229_final_project\\src\\trading\\__init__.py'>

## Formulate Problem

In [6]:
# transaction cost to buy/sell a stock
trans_cost = 0.001
# starting cash
cash = 1e4
# starting portfolio allocation (%lo, %hi)
starting_weights = (0.5, 0.5)
# reward function (either Sharpe Ratio or last reward)
reward = trd.sharpe_ratio_reward

In [7]:
# number of inputs
n = State.num_states()
# number of outputs
k = trd.actions.size

## Hyperparameters

In [8]:
# size of training set
m = 27
# size of experience replay
d = 6
# alpha / learning rate
α = 0.0001
# discount factor
γ = 0.9
# ϵ-greedy parameter
ϵ = 0.15

In [9]:
scale = 0.001
def my_init(shape, name=None):
    return initializations.normal(shape, scale=scale, name=name)

In [10]:
model = Sequential([
    Dense(input_dim=n, output_dim=100, init=my_init),
    Activation('relu'),
    Dense(output_dim=100, init=my_init),
    Activation('relu'),
    Dense(output_dim=100, init=my_init),
    Activation('relu'),
    Dense(output_dim=k)])

# momentum in [0.5, 0.9, 0.95, 0.99]
# use Adam?
sgd = SGD(lr=α, decay=1e-6, momentum=0.05, nesterov=True)
model.compile(loss='mean_squared_error', optimizer=sgd, metrics=['mean_squared_error'])

In [11]:
train_data, test_data = trd.get_stock_pairs(m)

In [12]:
portfolio_states = [State(p, cash=cash, target_weights=starting_weights, trans_cost=trans_cost) for p in train_data]
# list to delete from, keep all the portfolio states in portfolio_states
#  generates a (shallow) copy rather than copy the list's reference 
available_states = portfolio_states[:]

## Start Training

In [13]:
# prints debugging info every so many iteration 
DEBUG = True
DEBUG_EVERY = 50

In [None]:
train_record = pd.DataFrame(columns=('reward', 'loss'))
i = 1

while True:
    if available_states == []:
        # nothing left :(
        break

    # fill the experience replay
    exp_rep = np.random.choice(available_states, size=d, replace=False)

    # actual state values of each portfolio
    states = np.array([st.state for st in exp_rep])
    qvalues = model.predict(states)

    # max_a w/ ϵ
    chosen_actions = trd.choose_actions(qvalues, ϵ)

    for (st, a) in zip(exp_rep, trd.actions[chosen_actions]):
        # execute the action
        st.execute_trade(a)

        # step forward to the next day
        try:
            st.step()
        except StopIteration:
            # reached end of data; no more stepping for this one
            available_states.remove(st)

    states_prime = np.array([st.state for st in exp_rep])
    rewards = np.array([reward(st) for st in  exp_rep])

    # Q(s, a) of the choosen actions (!= max_a' Q(s, a'))
    # qvalues_curr = np.choose(chosen_action, qvalues.T)

    # max_a' Q(s', a')
    # use target network here (if doing one)
    qvalues_prime = model.predict(states_prime)
    qvalues_prime = np.max(qvalues_prime, axis=1)

    # the target we want (to minimize the MSE of)
    qvalues[np.arange(0,d), chosen_actions] += rewards + γ * qvalues_prime
    
    loss = model.train_on_batch(states, qvalues)
    
    # append new value
    # not very efficient, but this probably not the slowest step
    train_record.loc[i,:] = [np.mean(rewards), np.asscalar(loss[-1])]
    
    if (DEBUG) and (i % DEBUG_EVERY == 0):
        print('\niter:   {:d}'.format(i))
        print('reward: {:g}'.format(np.mean(rewards)))
        print('loss:   {:16g}'.format(np.asscalar(loss[-1])))
        
    i += 1


iter:   50
reward: 0.234929
loss:         0.00834743

iter:   100
reward: 0.151324
loss:         0.00543099

iter:   150
reward: 0.0271389
loss:         0.00371745
