In [None]:
%pylab inline

import time
from collections import Counter

import gym
import pandas as pd
import progressbar as pb
from gym import error, spaces, utils
from gym.utils import seeding

from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten, LSTM, Reshape, Dropout
from keras.optimizers import Adam

from rl.agents.dqn import DQNAgent
from rl.policy import BoltzmannQPolicy
from rl.memory import SequentialMemory

In [None]:
# load the market data
# Context managers make sure both file handles are closed once the arrays are read.
with open('data_spy.npy', 'rb') as features_file:
    input_source = np.load(features_file)
with open('data_spy_targets.npy', 'rb') as targets_file:
    to_predict = np.load(targets_file)

In [None]:
# Sanity check: dimensions of the feature array and of the target array.
tuple(arr.shape for arr in (input_source, to_predict))

In [None]:
# Keep only row 3 of the loaded targets, flattened to a 1-D series.
# The ndim guard makes the cell safe to re-run: once the array is already 1-D,
# indexing it with [3, :] again would raise an IndexError.
if to_predict.ndim > 1:
    to_predict = to_predict[3, :].reshape(-1)

In [None]:
# Quick look at the series that the environment uses to value positions.
plt.plot(to_predict);

In [None]:
# Swap the axes so that rows are bars and columns are features, which is the
# layout the environment slices (rows) and sizes its observation space by (columns).
# NOTE: not idempotent -- running this cell a second time transposes the array back.
input_source = np.transpose(input_source)
input_source.shape

In [None]:
# Wrap the feature matrix in a DataFrame (one column per feature) so that the
# pairwise correlations can be computed below. Requires `import pandas as pd`
# in the import cell; `%pylab` does not provide it.
df = pd.DataFrame(input_source)

In [None]:
# Pairwise correlation of the feature columns, drawn as a heat map with one
# labelled tick per column on both axes.
corr = df.corr()
fig, ax = plt.subplots(figsize=(12, 12))
ax.matshow(corr)
tick_positions = range(len(corr.columns))
ax.set_xticks(tick_positions)
ax.set_xticklabels(corr.columns)
ax.set_yticks(tick_positions)
ax.set_yticklabels(corr.columns);

In [None]:
bars_per_episode = 1000  # maximum number of steps (bars) in one episode
winlen = 10              # number of consecutive bars in one observation


class TradingEnv(gym.Env):
    """Simple single-position trading environment for reinforcement learning.

    An observation is the block of ``winlen`` rows of the module-level
    ``input_source`` array that immediately precede the current bar. Profit and
    loss are computed from the module-level ``to_predict`` series.

    Actions (``spaces.Discrete(3)``):
        0 -- buy:  open a long position  (stored as ``position == -1``)
        1 -- sell: open a short position (stored as ``position == 1``)
        2 -- close the open position, if there is one

    Only one position can be open at a time; buy/sell requests are ignored
    while a position is open. A position that has been held for more than
    ``max_holding_bars`` bars is closed automatically. The reward of a step is
    the change in realised balance since the previous step, so it is non-zero
    only on steps where a position is closed.
    """

    metadata = {'render.modes': ['human']}

    initial_balance = 1000   # balance at the start of every episode
    point_value = 1000       # multiplier applied to the price difference on close
    max_holding_bars = 8     # positions held longer than this are force-closed

    def __init__(self):
        self.action_space = spaces.Discrete(3)
        # Loose symmetric bounds; the per-feature min/max of input_source
        # would be a tighter alternative.
        bound = np.ones((winlen, input_source.shape[1])) * 999999
        self.observation_space = spaces.Box(-bound, bound)
        self.reset()

    def _configure(self, display=None):
        """Store the display handle passed in by the caller."""
        self.display = display

    def _seed(self, seed=None):
        """Create ``self.np_random`` from ``seed`` and return ``[seed]``.

        Note that ``reset`` draws the episode start from the global
        ``np.random`` state, not from ``self.np_random``.
        """
        self.np_random, seed = seeding.np_random(seed)
        return [seed]

    def seed(self, seed=None):
        """Public entry point for seeding; delegates to ``_seed``."""
        return self._seed(seed)

    def step(self, action):
        """Advance one bar, apply ``action`` and return the transition.

        Returns:
            tuple: ``(observation, reward, done, info)`` where ``observation``
            is the ``winlen``-row window ending just before the current bar,
            ``reward`` is the change in balance caused by this step, ``done``
            is True once the episode end has been reached or the balance is no
            longer positive, and ``info`` is an empty dict.
        """
        # The episode-end check happens before the action is executed; on the
        # terminal step the bar index is not advanced any further.
        if (self.idx < self.end_idx) and (self.balance > 0):
            self.idx += 1
            done = False
        else:
            done = True

        info = {}

        observation = input_source[self.idx - winlen : self.idx, :]

        # Open a position only when flat.
        if self.position == 0:
            if action == 0:    # buy
                self.position = -1
                self.open_idx = self.idx
            elif action == 1:  # sell
                self.position = 1
                self.open_idx = self.idx

        # Close on request, or force-close an OPEN position that has been held
        # for too long. (Testing for a flat position here would make the
        # time-based exit a no-op.)
        held_too_long = (self.position != 0 and
                         (self.idx - self.open_idx) > self.max_holding_bars)
        if action == 2 or held_too_long:
            if self.position == -1:   # long
                self.balance += (to_predict[self.idx] - to_predict[self.open_idx]) * self.point_value
            elif self.position == 1:  # short
                self.balance += (to_predict[self.open_idx] - to_predict[self.idx]) * self.point_value
            self.position = 0

        reward = self.balance - self.prev_balance
        self.prev_balance = self.balance

        return observation, reward, done, info

    def reset(self):
        """Start a new episode at a random bar and return the first observation.

        The start index is drawn from ``[winlen, n_rows - bars_per_episode - winlen)``
        so that a full ``winlen``-row window always exists before the first bar.

        Raises:
            ValueError: if ``input_source`` has too few rows for one episode.
        """
        first_start = winlen
        last_start = input_source.shape[0] - bars_per_episode - winlen
        if last_start <= first_start:
            raise ValueError(
                "input_source has %d rows; more than %d are needed for one episode"
                % (input_source.shape[0], bars_per_episode + 2 * winlen))
        self.idx = np.random.randint(first_start, last_start)
        self.end_idx = self.idx + bars_per_episode
        self.position = 0
        self.open_idx = 0
        self.balance = self.initial_balance
        self.prev_balance = self.balance
        return input_source[self.idx - winlen : self.idx, :]

    def _render(self, mode='human', close=False):
        """Rendering is not implemented yet (TODO)."""
        pass

    def render(self, mode='human', close=False):
        """Public entry point for rendering; delegates to ``_render``."""
        return self._render(mode=mode, close=close)



In [None]:
# Build the environment; __init__ calls reset(), so a random episode start is
# already selected when this returns.
env = TradingEnv()

In [None]:
# Shape of a single observation; the Box bounds are built with shape
# (winlen, input_source.shape[1]).
env.observation_space.shape

In [None]:
# Q-network. The input carries a leading axis of size 1 (matching
# window_length=1 of the replay memory below), which the Reshape layer drops so
# that the LSTM sees a (winlen, n_features) sequence.
model = Sequential()
model.add(Reshape(env.observation_space.shape, input_shape=(1,) + env.observation_space.shape))
model.add(LSTM(64))
model.add(Dropout(0.5))
model.add(Dense(64))
model.add(Activation('relu'))
# Q-value head: linear, so the outputs are unbounded action-value estimates
# rather than a probability distribution over actions.
model.add(Dense(env.action_space.n, activation='linear'))

# Replay memory and exploration policy for the agent.
memory = SequentialMemory(limit=10000, window_length=1)
policy = BoltzmannQPolicy()
dqn = DQNAgent(model=model,
               nb_actions=env.action_space.n,
               memory=memory,
               nb_steps_warmup=10,
               enable_double_dqn=True,
               enable_dueling_network=True,
               dueling_type='avg',
               target_model_update=1e-2,
               policy=policy)
dqn.compile(Adam(lr=0.002), metrics=['mae'])

In [None]:
# training is here
h = dqn.fit(env, nb_steps=300000, nb_max_episode_steps=bars_per_episode, visualize=False, verbose=1)
rewards = h.history['episode_reward']

In [None]:
# Reward per training episode, in the order the episodes were played.
plt.plot(rewards);

In [None]:
# visualize the behavior for one random episode
observation = env.reset()
done = False
navs = []
while not done:
    action = dqn.forward(observation)
    observation, reward, done, info = env.step(action)
    navs.append(reward)

kl = []
t = 0
for n in navs:
    t += n
    kl.append(t)
plot(kl);

In [None]:
# calculate the likelihood of success for any given episode
l = 1000
krl = []
p = pb.ProgressBar(max_value=l)
for i in range(l):
    p.update(i)
    observation = env.reset()
    done = False
    navs = []
    while not done:
        action = dqn.forward(observation)
        observation, reward, done, info = env.step(action)
        navs.append(reward)
    krl.append(sum(navs))
p.finish()

In [None]:
# Share of evaluation episodes that finished with a positive total reward.
krl = np.array(krl)
n_profitable = sum(krl > 0)
print('Profit likelihood: %3.3f%%' % (100*(n_profitable / len(krl))))