In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn as sk
%matplotlib inline

In [50]:
from keras import initializations
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.optimizers import SGD

In [3]:
import trading as trd
from trading import State

In [4]:
# relaod module after modifying it
import importlib
importlib.reload(trd)
importlib.reload(trd.stock_history)
importlib.reload(trd.portfolio)
importlib.reload(trd.benchmarks)
importlib.reload(trd.environment)
importlib.reload(trd)

<module 'trading' from 'C:\\Users\\hamza\\Documents\\school\\cs_229\\cs229_final_project\\src\\trading\\__init__.py'>

## Formulate Problem

In [5]:
# number of inputs
n = State.num_states()
# number of outputs
k = trd.actions.size

In [84]:
# size of training set
m = 27
# size of experience replay
d = 3
# alpha / learning rate
α = 0.0001
# discount factor
γ = 0.95
# ϵ-greedy parameter
ϵ = 0.2

In [36]:
# transaction cost to buy/sell a stock
trans_cost = 0.001
# starting cash
cash = 1e4
# starting portfolio allocation (%lo, %hi)
starting_weights = (0.5, 0.5)
# reward function (either Sharpe Ratio or last reward)
reward = trd.sharpe_ratio_reward

In [8]:
# prints debugging info every so many iteration 
DEBUG = True
DEBUG_EVERY = 1

In [51]:
scale = 0.001
def my_init(shape, name=None):
    return initializations.normal(shape, scale=scale, name=name)

In [86]:
model = Sequential([
    Dense(input_dim=n, output_dim=100, init=my_init),
    Activation('relu'),
    Dense(output_dim=100, init=my_init),
    Activation('relu'),
    Dense(output_dim=k)])

# momentum in [0.5, 0.9, 0.95, 0.99]
# use Adam?
sgd = SGD(lr=α, decay=1e-6, momentum=0.05, nesterov=True)
model.compile(loss='mean_squared_error', optimizer=sgd, metrics=['mean_squared_error'])

In [32]:
train_data, test_data = trd.get_stock_pairs(m)

In [64]:
portfolio_states = [State(p, cash=cash, target_weights=starting_weights, trans_cost=trans_cost) for p in train_data]
# list to delete from, keep all the portfolio states in portfolio_states
#  generates a (shallow) copy rather than copy the list's reference 
available_states = portfolio_states[:]

## Start Training

In [87]:
train_record = pd.DataFrame(columns=('reward', 'loss'))
loss=[]
i = 1

for _ in range(1, 50):
    #while True:
    # fill the experience replay
    exp_rep = np.random.choice(available_states, size=d, replace=False)

    # actual state values of each portfolio
    states = np.array([st.state for st in exp_rep])
    qvalues = model.predict(states)

    # max_a w/ ϵ
    chosen_actions = trd.choose_actions(qvalues, ϵ)

    for (st, a) in zip(exp_rep, trd.actions[chosen_actions]):
        # execute the action
        st.execute_trade(a)

        # step forward to the next day
        try:
            st.step()
        except StopIteration:
            # reached end of data; no more stepping for this one
            available_states.remove(st)

    states_prime = np.array([st.state for st in exp_rep])
    rewards = np.array([reward(st) for st in  exp_rep])

    # Q(s, a) of the choosen actions (!= max_a' Q(s, a'))
    # qvalues_curr = np.choose(chosen_action, qvalues.T)

    # max_a' Q(s', a')
    # use target network here (if doing one)
    qvalues_prime = model.predict(states_prime)
    qvalues_prime = np.max(qvalues_prime, axis=1)

    # the target we want (to minimize the MSE of)
    qvalues[np.arange(0,d), chosen_actions] += rewards + γ * qvalues_prime
    
    loss += model.train_on_batch(states, qvalues)
    
    # append new value
    # not very efficient, but this probably not the slowest step
    train_record.loc[i,:] = [1, 2]
    
    if (DEBUG) and (i % DEBUG_EVERY == 0):
        print('\niter:   {:d}'.format(i))
        print('reward: {:g}'.format(np.mean(rewards)))
        print('loss:   {:16g}'.format(np.asscalar(loss[-1])))
        
    i += 1


iter:   1
reward: -0.120583
loss:         0.00922819
[[-0.08477008  0.06704208 -0.05822471 -0.06357161 -0.09334806 -0.08105459
   0.0821226 ]
 [-0.08923924  0.06977071 -0.06013535  0.49448817 -0.09852659 -0.08430305
   0.08611676]
 [-0.08414466  0.06588989 -0.05724438 -0.03698598 -0.09208937 -0.08000246
   0.08079929]]

iter:   2
reward: 0.224866
loss:          0.0217754
[[-0.09000338  0.07090481 -0.06195276  0.58599728 -0.09747821 -0.0856939
   0.08629722]
 [-0.08679777  0.06867784 -0.05973517  0.24999789 -0.09482667 -0.08294706
   0.08371108]
 [-0.08966844  0.07037794 -0.06072672  0.62809672 -0.09848593 -0.08497159
   0.08638484]]

iter:   3
reward: 0.102966
loss:          0.0248774
[[-0.11334131  0.09088433 -0.08457876  0.23713785 -0.08334183 -0.10827741
   0.68073891]
 [-0.11233083  0.09248442 -0.088455    0.6591182  -0.07891841 -0.11038841
   0.09043629]
 [-0.10911554  0.08625171 -0.07891845  0.1929775  -0.08119555 -0.1024804
   0.08760024]]

iter:   4
reward: 0.117645
loss:     