In [None]:
%pylab inline

import time
from collections import Counter

import gym
import pandas as pd
import progressbar as pb
from gym import error, spaces, utils
from gym.utils import seeding

from keras.models import Sequential, Model
from keras.layers import Dense, Activation, Flatten, LSTM, Reshape, Dropout, Input, Concatenate
from keras.optimizers import Adam

from rl.agents.cem import CEMAgent
from rl.agents.ddpg import DDPGAgent
from rl.policy import BoltzmannQPolicy
from rl.memory import SequentialMemory, EpisodeParameterMemory
from rl.processors import WhiteningNormalizerProcessor
from rl.agents import DDPGAgent
from rl.memory import SequentialMemory
from rl.random import OrnsteinUhlenbeckProcess

In [None]:
from empyrical import sortino_ratio, calmar_ratio, omega_ratio

In [None]:
# Load the market data.  Passing the path lets np.load open and close the file
# itself, so no file handle is left open behind the arrays.
input_source = np.load('data_spy.npy')
to_predict = np.load('data_spy_targets.npy')

In [None]:
# Display the shapes of the feature array and the targets array as loaded.
input_source.shape, to_predict.shape

In [None]:
# Keep a single row of the targets array as the 1-D series that the trading
# environment indexes by bar number.  The ndim guard makes the cell safe to
# re-run: a second pass over the already-flattened series would otherwise
# raise IndexError.
TARGET_ROW = 3
if to_predict.ndim > 1:
    to_predict = to_predict[TARGET_ROW, :].reshape(-1)

In [None]:
# Quick look at the selected target series (pyplot called explicitly instead
# of through the %pylab star namespace).
plt.plot(to_predict);

In [None]:
# Reverse the axes so that the first axis is the one later cells slice and
# step along.
# NOTE: this rebinds input_source in place, so running the cell a second time
# flips the array back.
input_source = np.transpose(input_source)
input_source.shape

In [None]:
# Split by row position: the first 80% of the rows stay in input_source, the
# remaining rows are held out as test_input_source.  to_predict is not split
# here.
split_at = int(0.8 * len(input_source))
test_input_source = input_source[split_at:, :]
input_source = input_source[:split_at, :]

In [None]:
# Wrap the retained feature rows in a DataFrame (one column per feature column
# of input_source) so pandas' correlation tools can be used on them.
# Requires `import pandas as pd` in the import cell; %pylab does not provide it.
df = pd.DataFrame(input_source)

In [None]:
# Pairwise correlation of the feature columns, drawn as a matrix plot with one
# tick per column on both axes.
corr = df.corr()
n_cols = len(corr.columns)
fig, ax = plt.subplots(figsize=(12, 12))
ax.matshow(corr)
ax.set_xticks(range(n_cols))
ax.set_xticklabels(corr.columns)
ax.set_yticks(range(n_cols))
ax.set_yticklabels(corr.columns);

In [None]:
bars_per_episode = 1000  # maximum number of bars walked in one episode
winlen = 10              # number of past rows in each observation window


class TradingEnv(gym.Env):
    """Simple trading environment for reinforcement learning.

    The environment reads the module-level names ``input_source`` (2-D array,
    stepped along its first axis), ``to_predict`` (series indexed with the same
    bar number and used to price trades), ``bars_per_episode`` and ``winlen``
    at call time, so rebinding those names changes what the environment walks.

    Observation: the ``winlen`` rows of ``input_source`` preceding the current
    bar.

    Action: either an integer index or a vector of scores whose argmax is used.

    * 0 -- if flat, open a position recorded as ``position = -1`` (long)
    * 1 -- if flat, open a position recorded as ``position = 1`` (short)
    * 2 -- close the open position and add its P&L (price difference scaled by
      1000) to ``balance``
    * anything else -- hold

    Reward: the Sortino ratio of the step-to-step changes of
    ``balance - 1000`` recorded so far in the episode; 0 until more than five
    steps have been recorded and 0 whenever the ratio is NaN or infinite.
    """

    metadata = {'render.modes': ['human']}

    def __init__(self):
        self.action_space = spaces.Discrete(3)
        # Deliberately loose bounds; one observation is a (winlen, n_columns) window.
        window_shape = (winlen, input_source.shape[1])
        self.observation_space = spaces.Box(np.full(window_shape, -999999.0),
                                            np.full(window_shape, 999999.0))
        self.reset()

    def _configure(self, display=None):
        """Store the display handle passed by the caller."""
        self.display = display

    def _seed(self, seed=None):
        """Create ``self.np_random`` from ``seed`` and return ``[seed]``.

        Note that ``reset`` draws from the global ``np.random`` and not from
        ``self.np_random``, so seeding here does not fix the episode starts.
        """
        self.np_random, seed = seeding.np_random(seed)
        return [seed]

    @staticmethod
    def _action_index(action):
        """Return the discrete action index encoded by ``action``.

        A scalar is taken as the index itself; anything with at least one axis
        is treated as a vector of scores and reduced with argmax.
        """
        action = np.asarray(action)
        if action.ndim == 0:
            return int(action)
        return int(np.argmax(action))

    def step(self, action):
        """Advance one bar, apply ``action`` and return the gym 4-tuple.

        Returns:
            (observation, reward, done, info) where ``info`` is always an empty
            dict and ``done`` becomes True once the episode end, the end of the
            data, or a non-positive balance has been reached.
        """
        # Never walk past the last bar for which both a full observation
        # window and a price exist, even if end_idx lies beyond the data.
        last_idx = min(self.end_idx, len(input_source), len(to_predict) - 1)
        if self.idx < last_idx and self.balance > 0:
            self.idx += 1
            done = False
        else:
            done = True

        info = {}
        observation = input_source[self.idx - winlen:self.idx, :]

        # Execute the action.
        choice = self._action_index(action)
        if choice == 0 and self.position == 0:  # buy
            self.position = -1
            self.open_idx = self.idx
        elif choice == 1 and self.position == 0:  # sell
            self.position = 1
            self.open_idx = self.idx
        elif choice == 2:  # close
            if self.position == -1:  # long
                self.balance += (to_predict[self.idx] - to_predict[self.open_idx]) * 1000
            elif self.position == 1:  # short
                self.balance += (to_predict[self.open_idx] - to_predict[self.idx]) * 1000
            self.position = 0
        # Any other index is a hold: nothing to do.

        # Cumulative P&L relative to the starting balance of 1000.
        self.returns.append(self.balance - 1000)

        if len(self.returns) > 5:
            reward = sortino_ratio(np.diff(np.array(self.returns)))
            # A NaN or infinite ratio carries no usable signal.
            if not np.isfinite(reward):
                reward = 0
        else:
            reward = 0
        self.prev_balance = self.balance

        return observation, reward, done, info

    def reset(self):
        """Start a new episode at a random bar and return its first observation."""
        low = winlen + 1
        high = input_source.shape[0] - bars_per_episode - winlen
        if high > low:
            start = np.random.randint(low, high)
        else:
            # The data is too short for a full episode (randint would raise
            # ValueError): start at the first usable bar and let step() stop
            # at the end of the data.
            start = winlen
        return self._begin_episode(start)

    def reset2(self):
        """Start a new episode at the first usable bar and return its first observation."""
        return self._begin_episode(winlen)

    def _begin_episode(self, start_idx):
        """Reset all per-episode state for an episode starting at ``start_idx``."""
        self.idx = start_idx
        self.end_idx = self.idx + bars_per_episode
        self.position = 0
        self.open_idx = 0
        self.balance = 1000
        self.prev_balance = self.balance
        self.returns = []
        return input_source[self.idx - winlen:self.idx, :]

    def _render(self, mode='human', close=False):
        """Rendering is not implemented (TODO)."""
        pass


In [None]:
# Build the environment.  TradingEnv.__init__ ends by calling reset(), so a
# random episode start is already drawn here.
env = TradingEnv()

In [None]:
# Shape of one observation: (winlen, number of columns of input_source).
env.observation_space.shape

In [None]:
# Next, we build a very simple model.

# Actor: maps one observation window to one tanh-squashed score per action.
actor = Sequential()
# The model input carries a leading axis of size 1 in front of the observation
# shape; the Reshape removes it so the LSTM receives the bare observation window.
actor.add(Reshape(env.observation_space.shape, input_shape=(1,) + env.observation_space.shape))
actor.add(LSTM(16))
actor.add(Dense(32))
actor.add(Activation('relu'))
actor.add(Dense(env.action_space.n, activation='tanh'))
# summary() prints the table itself and returns None, so wrapping it in
# print() would only add a stray "None" line.
actor.summary()

# Critic: takes the action scores and the observation, returns one linear value.
action_input = Input(shape=(env.action_space.n,), name='action_input')
observation_input = Input(shape=(1,) + env.observation_space.shape, name='observation_input')
flattened_observation = Flatten()(observation_input)
x = Dense(64)(flattened_observation)
x = Activation('relu')(x)
x = Concatenate()([x, action_input])
x = Dense(64)(x)
x = Activation('relu')(x)
x = Dense(1)(x)
x = Activation('linear')(x)
critic = Model(inputs=[action_input, observation_input], outputs=x)
critic.summary()

In [None]:
#class MyProcessor(WhiteningNormalizerProcessor):
#    def process_action(self, action):
#        return np.clip(action, -1., 1.)

In [None]:
# Assemble the DDPG agent from the actor/critic pair, a replay memory and an
# Ornstein-Uhlenbeck process for exploration noise.
n_actions = env.action_space.n
memory = SequentialMemory(limit=100000, window_length=1)
random_process = OrnsteinUhlenbeckProcess(size=n_actions, theta=.15, mu=0., sigma=.1)
agent = DDPGAgent(
    nb_actions=n_actions,
    actor=actor,
    critic=critic,
    critic_action_input=action_input,
    memory=memory,
    nb_steps_warmup_critic=10,
    nb_steps_warmup_actor=10,
    random_process=random_process,
    gamma=.99,
    target_model_update=1e-3,
    processor=WhiteningNormalizerProcessor()
)
# Two optimizers are passed as a pair, each with its own learning rate.
optimizers = [Adam(lr=1e-4), Adam(lr=1e-3)]
agent.compile(optimizers, metrics=['mae'])

In [None]:
# Train the agent; every episode is capped at bars_per_episode steps.
h = agent.fit(
    env,
    nb_steps=50000,
    nb_max_episode_steps=bars_per_episode,
    visualize=False,
    verbose=1
)
# Per-episode reward series taken from the training history.
rewards = h.history['episode_reward']

In [None]:
# Episode rewards over the course of training (pyplot called explicitly
# instead of through the %pylab star namespace).
plt.plot(rewards);

In [None]:
# Visualize the behavior for one episode on the held-out test rows.
#
# NOTE: TradingEnv reads the module-level names bars_per_episode, input_source
# and to_predict, so this cell deliberately rebinds them.  Re-run the data
# cells before training again.
input_source = test_input_source

# Cap the episode length so the whole episode (plus the observation window on
# either side) fits inside the test rows; otherwise the environment would be
# asked to step past the end of the data.
bars_per_episode = min(100000, len(test_input_source) - 2 * winlen - 2)

# The environment prices trades with to_predict[idx] using the same bar index
# as input_source, so the targets have to be cut down to the test rows too.
# Assumes to_predict originally has one entry per row of the full feature
# array -- TODO confirm.  Re-running the cell is harmless: the offset is 0 then.
test_offset = len(to_predict) - len(test_input_source)
assert test_offset >= 0, "to_predict is shorter than the test split"
to_predict = to_predict[test_offset:]

# fit() leaves the agent in training mode, in which exploration noise is added
# to every action; switch it off for the evaluation and restore it afterwards.
was_training = agent.training
agent.training = False
try:
    observation = env.reset()
    done = False
    navs = []  # per-step rewards (name kept because a later cell displays it)
    while not done:
        action = agent.forward(observation)
        observation, reward, done, info = env.step(action)
        navs.append(reward)
finally:
    agent.training = was_training

# Running total of the per-step rewards.
kl = np.cumsum(navs)
plt.plot(kl)
plt.xlabel('step')
plt.ylabel('cumulative reward');

In [None]:
# Display the per-step rewards collected in `navs` by the most recent rollout.
navs

In [None]:
# Run the agent through l episodes and record the summed reward of each one in
# krl, for estimating how often an episode ends with a positive total.
# NOTE(review): reset2() always starts at bar `winlen`, so every iteration
# replays the same stretch of data -- confirm that is intended.
l = 100
krl = []
p = pb.ProgressBar(max_value=l)
for i in range(l):
    p.update(i)
    observation = env.reset2()
    navs = []
    while True:
        action = agent.forward(observation)
        observation, reward, done, info = env.step(action)
        navs.append(reward)
        if done:
            break
    # Under %pylab the bare name `sum` is numpy's sum; spelled out here.
    krl.append(np.sum(navs))
p.finish()

In [None]:
# Share of the evaluation episodes whose summed reward is positive.
krl = np.array(krl)
n_positive = np.count_nonzero(krl > 0)
print('Profit likelihood: %3.3f%%' % (100*(n_positive / len(krl))))