In [1]:
import torch
from torch import nn
import numpy as np
import random
import math 
from collections import deque
import gym
import trading_env
import datetime

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [38]:
rnn = nn.RNN(2, 16, 1, nonlinearity='relu', batch_first=True)
x = torch.randn( 64, 100, 2 )
y, _ = rnn(x, )
y[:, -1, :].std(dim=0)

tensor([0.1823, 0.0011, 0.1685, 0.1584, 0.0353, 0.0116, 0.0603, 0.1194, 0.0552,
        0.0384, 0.1925, 0.0722, 0.1355, 0.0838, 0.1881, 0.0849],
       grad_fn=<StdBackward1>)

In [37]:
rnn = nn.RNN(2, 16, 1, nonlinearity='tanh', batch_first=True)
x = torch.randn( 64, 100, 2 )
y, _ = rnn(x, )
y[:, -1, :].std(dim=0)

tensor([0.2459, 0.2060, 0.2549, 0.2907, 0.2015, 0.2075, 0.2868, 0.2838, 0.2824,
        0.2549, 0.2592, 0.4063, 0.2579, 0.1398, 0.3846, 0.2637],
       grad_fn=<StdBackward1>)

In [36]:
lstm = nn.LSTM(2, 16, 5, batch_first=True)
x = torch.randn( 64, 100, 2 )
y, _ = lstm(x, )
y[:, -1, :].std(dim=0)

tensor([2.1250e-05, 1.6928e-04, 1.3342e-04, 5.8910e-05, 5.5610e-05, 1.2909e-04,
        1.1839e-04, 1.5173e-04, 5.6734e-05, 4.0919e-05, 3.1744e-05, 8.9828e-05,
        9.1484e-05, 1.0185e-04, 1.1654e-04, 1.7232e-04],
       grad_fn=<StdBackward1>)

In [13]:
class Actor():

    def __init__(self, env, learning_rate = 0.0001, tau = 0.001, quiet = True):
        
        self.env = env
        self._quiet = quiet
        self._tau = tau

        self.state_dim = np.prod(np.array(env.observation_space.shape))
        
        # Actor network
        self.model = self.create_actor_network().to(device)
        # Target network
        self.target = self.create_actor_network().to(device)

        self.optimizer = torch.optim.Adam(self.model.parameters(), lr = learning_rate)

        # syncronize actor and target
        for var1, var2 in zip(self.model.parameters(), self.target.parameters()):
            var2 = var1

    def create_actor_network(self):
        
        # neural featurizer parameters
        h1 = 256
        h2 = 128
        h3 = 128
        
        class LSTM_Last(nn.Module):
            
            def __init__(self, *args, **kwargs):
                super(LSTM_Last, self).__init__()
                self.lstm = nn.LSTM(*args, **kwargs)
            
            def forward(self, x):
                x, _ = self.lstm(x)
                return x[:, -1, :]
        
        model = nn.Sequential(
            LSTM_Last(2, h1, 2, batch_first=True),
            nn.Linear(h1, h2),
            nn.ReLU(),
            nn.Linear(h2, h3),
            nn.ReLU(),
            nn.Linear(h3, 1),
            nn.Tanh(),
        )
        
        return model 
    
    def act(self, state):
        state = torch.Tensor([state]).float().to(device)      
        action = self.model(state).detach().cpu().item()
        return action

    def act_target(self, states):
        # called at calculating q_targets
        action = self.target(states)
        return action

    def update_target_network(self):
        
        with torch.no_grad():
            for var1, var2 in zip(self.model.parameters(), self.target.parameters()):
                var2 = self._tau * var1 + (1-self._tau) * var2

In [30]:
class Critic():

    def __init__(self, env, learning_rate = 0.001, tau = 0.001, quiet = True):
        
        self._env = env
        self._quiet = quiet
        self._tau = tau
        
        self._state_dim = np.prod(np.array(env.observation_space.shape))

        # critic network
        self.model = self.create_critic_network().to(device)

        # critic target network
        self.target = self.create_critic_network().to(device)

        # optimizer only applies to model network
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr = learning_rate)
        
        # synchronize critic and target
        with torch.no_grad():
            for var1, var2 in zip(self.model.parameters(), self.target.parameters()):
                var2 = var1
        
    def create_critic_network(self):
        
        # neural featurizer parameters
        h1 = 256
        h2 = 128
        h3 = 128
        
        class LSTM_Last(nn.Module):
            
            def __init__(self, *args, **kwargs):
                super(LSTM_Last, self).__init__()
                self.lstm = nn.LSTM(*args, **kwargs)
                self.model = nn.Sequential(
                                nn.Linear(h1+1, h2),
                                nn.ReLU(),
                                nn.Linear(h2, h3),
                                nn.ReLU(),
                                nn.Linear(h3, 1),
                            )
            
            def forward(self, x, a):
                x, _ = self.lstm(x)
                x = torch.cat([x[:,-1,:], a], 1)
                return self.model(x)
        
        model = LSTM_Last(2, h1, 2, batch_first=True)

        return model

    def get_q_values(self, states, actions):
        
        return self.model(states, actions).squeeze()

    def get_target_q_values(self, states, actions):

        return self.target(states, actions).squeeze()
    
    def update_target_network(self):
        
        with torch.no_grad():
            for var1, var2 in zip(self.model.parameters(), self.target.parameters()):
                var2 = self._tau * var1 + (1-self._tau) * var2

In [31]:
class DDPGAgent():
    def __init__(self, env, buffer_size = 1000000, discount_rate = 0.99, batch_size = 128, tau = 0.001, 
                 actor_lr = 1e-5, critic_lr = 1e-4, update_steps = 100,
                 quiet = True):
        
        self.actor = Actor(env, learning_rate = actor_lr, quiet = quiet, tau = tau)
        self.critic = Critic(env, learning_rate = critic_lr, quiet = quiet, tau = tau)
        
        self._batch_size = batch_size
        self._discount_rate = discount_rate
        self._update_steps = update_steps
        
        # Memory
        self.memory = deque( maxlen = buffer_size )
        
    def get_gradient_norm(self, model):

        total_norm = 0.0
        for p in model.parameters():
            param_norm = p.grad.data.norm(2)
            total_norm += param_norm.item() ** 2
        total_norm = total_norm ** (1. / 2)

        return total_norm

    def store_step(self, state, action, reward, next_state, done):
        
        self.memory.append([state, action, reward, next_state, done])
    
    def train(self):
        
        actor_grad = 0
        critic_grad = 0
        for _ in range(self._update_steps):
            batch = random.sample(self.memory, self._batch_size)
            
            states, actions, rewards, next_states, dones = zip(*batch)

            states = torch.Tensor(states).float().to(device)
            actions = torch.Tensor(actions).float().to(device).unsqueeze(1)
            rewards = torch.Tensor(rewards).float().to(device)
            next_states = torch.Tensor(next_states).float().to(device)
            dones = torch.Tensor(dones).float().to(device)

            target_actions = self.actor.act_target(next_states)
            q_targets = rewards + (1.0 - dones) * self._discount_rate * self.critic.get_target_q_values(next_states, target_actions)
            
            mse_loss = nn.MSELoss()
            
            self.critic.optimizer.zero_grad()
            q_values = self.critic.get_q_values(states, actions)
            critic_loss = mse_loss( q_values, q_targets )
            critic_loss.backward()
            critic_grad += self.get_gradient_norm(self.critic.model)
            self.critic.optimizer.step()

            # freeze critic network for actor update, to avoid unnecessary grad calculation
            for p in self.critic.model.parameters():
                p.requires_grad = False

            self.actor.optimizer.zero_grad()
            actor_loss = -self.critic.get_q_values( states, self.actor.model(states) ).mean()
            # print("actor loss before update: {}".format(actor_loss.item()))
            actor_loss.backward()
            actor_grad += self.get_gradient_norm(self.actor.model)
            self.actor.optimizer.step()
            # actor_loss = -self.critic.get_q_values( states, self.actor.model(states) ).mean()
            # print("actor loss after update: {}".format(actor_loss.item()))

            # unfreeze critic network after actor update
            for p in self.critic.model.parameters():
                p.requires_grad = True

            self.actor.update_target_network()
            self.critic.update_target_network()
        
        return actor_grad / self._update_steps, critic_grad / self._update_steps


In [32]:
env_trading = gym.make('test_trading-v2')
NUM_EP = 400

agentDDPG = DDPGAgent(env_trading, 
                    buffer_size=1000000,
                    tau = 0.01, 
                    actor_lr = 1e-4, 
                    critic_lr = 1e-4)

# Ornstein-Uhlenbeck noise by lirnli/OpenAI-gym-solutions    
def UONoise():
    theta = 0.15
    sigma = 0.2
    state = 0
    while True:
        yield state
        state += -theta*state+sigma*np.random.randn()

Loading historical data file


In [6]:
state = env_trading.reset()

In [7]:
import sys

In [33]:
date = datetime.datetime(2017, 7, 15, 0, 0)
date_test = datetime.datetime(2017, 7, 15, 0, 0)
noise = UONoise()
scores = []
scores_test = []
sample_actions = [] # Keep track of actions every 100 episode
portfolios = []
actor_grads = []
critic_grads = []

while (len(agentDDPG.memory) < 10000):
    state = env_trading.reset(date = date)
#     state = np.reshape(state,200)
    while (True):
        # action = agentDDPG.actor.act(state)
        # action = np.clip( action + next(noise), -1, 1 )
        action = env_trading.action_space.sample()[0]
        next_state, reward, done, _ = env_trading.step(action)
#         state = state.reshape(200)
#         next_state = next_state.reshape(200)
        agentDDPG.store_step(state, action, reward, next_state, done)
        state = next_state
        print("\rPopulating memory buffer: {:5d}/10000".format(len(agentDDPG.memory)), end="")
        sys.stdout.flush()
        if done:
            break

print("\n")

Populating memory buffer: 11520/10000



In [34]:
for e in range(NUM_EP):
#     state = np.reshape(env_trading.reset(date=date), 200)
    score = 0

    rewards = []
    actions = []
    while(True):
        action = agentDDPG.actor.act(state)
        action += next( noise )
        action = np.clip(action, -1, 1)
        actions.append(action)
        next_state, reward, done, _ = env_trading.step( action )
#         next_state = np.reshape(next_state, 200)
        score += reward
        rewards.append( reward )

        agentDDPG.store_step(state, action, reward, next_state, done)

        if done:
            actor_grad, critic_grad = agentDDPG.train()
            actor_grads.append(actor_grad)
            critic_grads.append(critic_grad)
            scores.append(score)
            # print("Episode: {}, Total reward: {}".format(e, score))
            break
        state = next_state

    # Testing session
#     state = np.reshape(env_trading.reset( date = date_test ), 200)
    score_test = 0
    actions_test = []
    while(True):
        action = agentDDPG.actor.act(state)
        next_state, reward, done, _ = env_trading.step( action )
        actions_test.append( action )
#         next_state = np.reshape(next_state, 200)
        score_test += reward
        if done:
            # agentDDPG.actor.update_averages( rewards, [score_test] )
            # agentDDPG.actor.record_summary( e )
            scores_test.append(score_test)
            portfolios.append( env_trading.portfolio_value )
            if e % 100 == 0:
                sample_actions.append( actions_test )
            print("\rEpisode: {}, Training reward: {:.2f}, Testing reward: {:.2f}, Actor grad: {:.4f}, Critic grad: {:.4f}, Actions: {:.4f}+/-{:.4f}, Test Actions: {:.4f}+/-{:.4f}".format(e, score, score_test, actor_grad, critic_grad, np.mean(actions), np.std(actions), np.mean(actions_test), np.std(actions_test)), end="")
            sys.stdout.flush()
            break
        state = next_state

Episode: 3, Training reward: 1.30, Testing reward: 0.09, Actor grad: 0.0068, Critic grad: 0.0791, Actions: 0.0318+/-0.0000, Test Actions: 0.2360+/-0.0000000

KeyboardInterrupt: 

In [27]:
batch = random.sample(agentDDPG.memory, 64)
            
states, actions, rewards, next_states, dones = zip(*batch)

states = torch.Tensor(states).float().to(device)
actions = torch.Tensor(actions).float().to(device).unsqueeze(1)

In [29]:
agentDDPG.critic.model((states, actions))

TypeError: forward() missing 1 required positional argument: 'a'