In [1]:
from machin.frame.algorithms import DDPGPer
import torch
import torch.nn as nn

from drl4dypm.env import *

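# networks

Each subsection below defines its own `Actor`/`Critic` pair under the same class names, so whichever network cell was executed last determines the variant that the setup section instantiates.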

## without action

In [43]:
# without action

class Actor(nn.Module):
    """Feed-forward policy network that sees only the state.

    Two 16-unit ReLU hidden layers feed a linear output layer of width
    ``action_dim``; the result is normalised with a softmax over dim 1, so
    each row of the output is non-negative and sums to one.
    """

    def __init__(self, state_dim, action_dim):
        super().__init__()

        hidden_units = 16
        self.fc1 = nn.Linear(state_dim, hidden_units)
        self.fc2 = nn.Linear(hidden_units, hidden_units)
        self.fc3 = nn.Linear(hidden_units, action_dim)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, state):
        """Map ``state`` (last dimension ``state_dim``) to softmax scores."""
        hidden = state
        for layer in (self.fc1, self.fc2):
            hidden = torch.relu(layer(hidden))
        logits = self.fc3(hidden)
        return self.softmax(logits)


class Critic(nn.Module):
    """Feed-forward Q-network over the concatenated (state, action) pair.

    The state and action are joined along dim 1 and passed through two
    16-unit ReLU hidden layers and a final linear layer with one output.
    """

    def __init__(self, state_dim, action_dim):
        super().__init__()

        hidden_units = 16
        self.fc1 = nn.Linear(state_dim + action_dim, hidden_units)
        self.fc2 = nn.Linear(hidden_units, hidden_units)
        self.fc3 = nn.Linear(hidden_units, 1)

    def forward(self, state, action):
        """Return one unbounded value per row of the (state, action) batch."""
        features = torch.cat((state, action), dim=1)
        for layer in (self.fc1, self.fc2):
            features = torch.relu(layer(features))
        return self.fc3(features)

## with action

In [39]:
class Actor(nn.Module):
    """Feed-forward policy network conditioned on the previous action.

    The state and the previous action are concatenated along dim 1, passed
    through two 16-unit ReLU hidden layers, and projected to ``action_dim``
    scores that are normalised with a softmax over dim 1.
    """

    def __init__(self, state_dim, action_dim):
        super().__init__()

        hidden_units = 16
        self.fc1 = nn.Linear(state_dim + action_dim, hidden_units)
        self.fc2 = nn.Linear(hidden_units, hidden_units)
        self.fc3 = nn.Linear(hidden_units, action_dim)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, state, last_action):
        """Return softmax scores for each row of (state, last_action)."""
        hidden = torch.cat((state, last_action), dim=1)
        for layer in (self.fc1, self.fc2):
            hidden = torch.relu(layer(hidden))
        logits = self.fc3(hidden)
        return self.softmax(logits)


class Critic(nn.Module):
    """Feed-forward Q-network over (state, previous action, action).

    The three inputs are concatenated along dim 1, so the first layer takes
    ``state_dim + 2 * action_dim`` features. Two 16-unit ReLU hidden layers
    are followed by a linear layer with a single output.
    """

    def __init__(self, state_dim, action_dim):
        super().__init__()

        hidden_units = 16
        self.fc1 = nn.Linear(state_dim + 2 * action_dim, hidden_units)
        self.fc2 = nn.Linear(hidden_units, hidden_units)
        self.fc3 = nn.Linear(hidden_units, 1)

    def forward(self, state, last_action, action):
        """Return one unbounded value per row of the joined inputs."""
        features = torch.cat((state, last_action, action), dim=1)
        for layer in (self.fc1, self.fc2):
            features = torch.relu(layer(features))
        return self.fc3(features)

## with LSTM

In [39]:
class Actor(nn.Module):
    """Recurrent policy network: bidirectional LSTM encoder plus MLP head.

    The LSTM runs over a ``(batch, seq_len, state_dim)`` window
    (``batch_first=True``). Its per-step outputs for both directions are
    flattened per sample, passed through ReLU, two 16-unit ReLU layers and
    a linear layer, and finally normalised with a softmax over dim 1.

    Args:
        state_dim: number of input features per time step.
        action_dim: width of the softmax output.
        seq_len: number of time steps in each input window. Defaults to 10,
            the value that was previously hard-coded.
        hidden_size: LSTM hidden units per direction. Defaults to 20.
        num_layers: number of stacked LSTM layers. Defaults to 2.
    """

    def __init__(self, state_dim, action_dim, seq_len=10, hidden_size=20, num_layers=2):
        super(Actor, self).__init__()

        # Flattened encoder width: every time step contributes the hidden
        # state of both directions.
        self.lstm_out = hidden_size * seq_len * 2
        self.lstm = nn.LSTM(state_dim, hidden_size, num_layers,
                            batch_first=True, bidirectional=True)
        self.fc1 = nn.Linear(self.lstm_out, 16)
        self.fc2 = nn.Linear(16, 16)
        self.fc3 = nn.Linear(16, action_dim)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, state):
        """Return softmax scores of shape ``(batch, action_dim)``.

        Raises:
            RuntimeError: if the window length of ``state`` does not match
                ``seq_len`` (the flattened width no longer fits ``fc1``).
        """
        encoded, _ = self.lstm(state)
        # Flatten per sample instead of reshape(-1, lstm_out): with a wrong
        # window length the old reshape silently merged or split batch rows,
        # whereas this keeps the batch dimension and lets fc1 reject it.
        a = torch.relu(encoded.flatten(start_dim=1))
        a = torch.relu(self.fc1(a))
        a = torch.relu(self.fc2(a))
        a = self.softmax(self.fc3(a))
        return a


class Critic(nn.Module):
    """Recurrent Q-network: bidirectional LSTM state encoder plus MLP head.

    The LSTM runs over a ``(batch, seq_len, state_dim)`` window
    (``batch_first=True``). Its outputs are flattened per sample, passed
    through ReLU, concatenated with the action along dim 1, and fed through
    two 16-unit ReLU layers and a linear layer with a single output.

    Args:
        state_dim: number of input features per time step.
        action_dim: width of the action vector appended to the encoding.
        seq_len: number of time steps in each input window. Defaults to 10,
            the value that was previously hard-coded.
        hidden_size: LSTM hidden units per direction. Defaults to 20.
        num_layers: number of stacked LSTM layers. Defaults to 2.
    """

    def __init__(self, state_dim, action_dim, seq_len=10, hidden_size=20, num_layers=2):
        super(Critic, self).__init__()

        # Flattened encoder width: every time step contributes the hidden
        # state of both directions.
        self.lstm_out = hidden_size * seq_len * 2
        self.lstm = nn.LSTM(state_dim, hidden_size, num_layers,
                            batch_first=True, bidirectional=True)
        # forward() appends the action to the encoded state, so the first
        # dense layer must accept lstm_out + action_dim features (it was
        # sized for lstm_out only, which made every forward call fail).
        self.fc1 = nn.Linear(self.lstm_out + action_dim, 16)
        self.fc2 = nn.Linear(16, 16)
        self.fc3 = nn.Linear(16, 1)

    def forward(self, state, action):
        """Return a ``(batch, 1)`` tensor of values for (state, action).

        Raises:
            RuntimeError: if the window length of ``state`` does not match
                ``seq_len`` or the batch sizes of the inputs differ.
        """
        encoded, _ = self.lstm(state)
        # Flatten per sample so the batch dimension always lines up with
        # ``action``; reshape(-1, lstm_out) could silently regroup rows.
        a = torch.relu(encoded.flatten(start_dim=1))
        state_action = torch.cat([a, action], 1)
        q = torch.relu(self.fc1(state_action))
        q = torch.relu(self.fc2(q))
        q = self.fc3(q)
        return q

# setup

In [13]:
# --- environment -------------------------------------------------------
# Steps per episode (passed to the environment as num_steps); the training
# loops also run this many optimiser updates after every episode.
trading_days = 252

# Alternative asset universe (disabled):
#   ['AAL','AMZN','GOOG','FB','TSLA','CVS','FDX']
asset_names = ['AAL', 'CVS', 'FDX', 'F', 'AIG', 'CAT']

# Passed to the environment as k; the networks are built with
# state_dim * k input features.
k = 10
cost_bps = 1e-3

# Alternative data file (disabled): 'data/stock_price.csv'
path_to_data = 'data/stock_price_1D.csv'

# --- agent -------------------------------------------------------------
num_assets = len(asset_names)
state_dim = 3 * num_assets      # three features per asset
action_dim = num_assets + 1     # one output per asset plus one extra slot

# --- training ----------------------------------------------------------
max_episode = 100
min_episode_to_train = 10

# Arguments forwarded to ddpg_per.act_with_noise(noise_param=..., mode=...).
noise_param = (0, 0.01)
noise_mode = "normal"

In [40]:
# Every network is built with state_dim * k input features. The online and
# target copies are separate instances created in the order
# actor, actor_t, critic, critic_t.
flat_state_dim = state_dim * k
actor, actor_t = (Actor(flat_state_dim, action_dim) for _ in range(2))
critic, critic_t = (Critic(flat_state_dim, action_dim) for _ in range(2))

# DDPG agent: the optimiser is passed as a class, the criterion as an
# instance.
ddpg_per = DDPGPer(
    actor, actor_t, critic, critic_t,
    torch.optim.Adam,
    nn.MSELoss(reduction='sum'),
)




In [37]:
# Collect the environment settings in one place, then build the environment
# with a single agent named 'base' (the key the training loops use for
# actions and rewards).
env_settings = dict(
    num_steps=trading_days,
    asset_names=asset_names,
    k=k,
    cost_bps=cost_bps,
    agent_names=['base'],
    path_to_data=path_to_data,
    reward_scale_factor=1,
)
env = TradingEnvironment(**env_settings)

# training

## with action

In [42]:
def _row(values):
    """Convert an array-like into a float32 tensor reshaped to (1, n)."""
    return torch.tensor(values, dtype=torch.float32).view(1, -1)


def _obs_with_action(market_state, previous_action):
    """Build the actor input dict: market state plus the previous action."""
    return {'state': _row(market_state),
            'last_action': _row(previous_action)}


# Header of the progress table.
cols = ['episode','reward','critic_loss','actor_loss']
line = '|'.join([f'{col:<12}' for col in cols])
print(line)

# Exponentially smoothed episode reward; this is what the "reward" column shows.
smoothed_total_reward = 0

for e in range(max_episode):
    state, end = env.init_step()
    last_actions = env.simulator.last_actions
    total_reward = 0

    # ---- roll out one episode, recording every transition ----
    while not end:
        with torch.no_grad():
            # Noisy action from the current policy.
            action = ddpg_per.act_with_noise(
                _obs_with_action(state[1], last_actions['base']),
                noise_param=noise_param,
                mode=noise_mode,
            )

            # Apply it in the environment.
            actions = {'base': action.numpy().reshape(-1)}
            rewards, next_state, end = env.take_step(actions, state[0])

            # Record the transition; the action just taken becomes the
            # "last_action" component of the next state.
            ddpg_per.store_transition({
                'state': _obs_with_action(state[1], last_actions['base']),
                'action': {'action': _row(actions['base'])},
                'next_state': _obs_with_action(next_state[1], actions['base']),
                'reward': rewards['base'],
                'terminal': end,
            })

        total_reward += rewards['base']
        state = next_state
        last_actions = actions

    # ---- learn: one update per trading day; only the last losses are kept ----
    for _ in range(trading_days):
        actor_loss, critic_loss = ddpg_per.update()

    smoothed_total_reward = 0.9 * smoothed_total_reward + 0.1 * total_reward

    line = '|'.join(
        [f'{e:<12}']
        + [f'{col:<12.4f}' for col in (smoothed_total_reward, critic_loss, actor_loss)]
    )
    print(line)

    env.reset()

The framework is not responsible for any un-matching device issues caused by this operation.[0m


episode     |reward      |critic_loss |actor_loss  


The framework is not responsible for any un-matching device issues caused by this operation.[0m
The framework is not responsible for any un-matching device issues caused by this operation.[0m
The framework is not responsible for any un-matching device issues caused by this operation.[0m


0           |0.8777      |0.0002      |-0.1300     
1           |1.6425      |0.0003      |-0.0995     
2           |2.3271      |0.0001      |-0.0588     
3           |2.8785      |0.0003      |-0.0103     
4           |3.4439      |0.0003      |0.0379      
5           |3.9524      |0.0003      |0.0742      
6           |4.4288      |0.0003      |0.1111      
7           |4.8399      |0.0003      |0.1556      
8           |5.1882      |0.0004      |0.2179      
9           |5.5283      |0.0004      |0.2587      
10          |5.8383      |0.0006      |0.2995      
11          |6.1015      |0.0007      |0.3050      
12          |6.2857      |0.0008      |0.3435      
13          |6.5074      |0.0006      |0.4106      
14          |6.7557      |0.0009      |0.4337      
15          |6.9239      |0.0006      |0.4485      
16          |7.1098      |0.0013      |0.4849      
17          |7.2645      |0.0009      |0.5054      
18          |7.4014      |0.0009      |0.5520      
19          

## without action

In [38]:
def _row(values):
    """Convert an array-like into a float32 tensor reshaped to (1, n)."""
    return torch.tensor(values, dtype=torch.float32).view(1, -1)


def _obs_state_only(market_state):
    """Build the actor input dict containing only the market state."""
    return {'state': _row(market_state)}


# Header of the progress table.
cols = ['episode','reward','critic_loss','actor_loss']
line = '|'.join([f'{col:<12}' for col in cols])
print(line)

# Exponentially smoothed episode reward; this is what the "reward" column shows.
smoothed_total_reward = 0

for e in range(max_episode):
    state, end = env.init_step()
    total_reward = 0

    # ---- roll out one episode, recording every transition ----
    while not end:
        with torch.no_grad():
            # Noisy action from the current policy.
            action = ddpg_per.act_with_noise(
                _obs_state_only(state[1]),
                noise_param=noise_param,
                mode=noise_mode,
            )

            # Apply it in the environment.
            actions = {'base': action.numpy().reshape(-1)}
            rewards, next_state, end = env.take_step(actions, state[0])

            # Record the transition in the replay buffer.
            ddpg_per.store_transition({
                'state': _obs_state_only(state[1]),
                'action': {'action': _row(actions['base'])},
                'next_state': _obs_state_only(next_state[1]),
                'reward': rewards['base'],
                'terminal': end,
            })

        total_reward += rewards['base']
        state = next_state

    # ---- learn: one update per trading day; only the last losses are kept ----
    for _ in range(trading_days):
        actor_loss, critic_loss = ddpg_per.update()

    smoothed_total_reward = 0.9 * smoothed_total_reward + 0.1 * total_reward

    line = '|'.join(
        [f'{e:<12}']
        + [f'{col:<12.4f}' for col in (smoothed_total_reward, critic_loss, actor_loss)]
    )
    print(line)

    env.reset()

episode     |reward      |critic_loss |actor_loss  
0           |0.8563      |1019.7191   |724.9994    
1           |1.6298      |94.5890     |745.6669    
2           |2.3550      |1014.6796   |755.8207    
3           |3.0166      |968.8516    |766.0856    
4           |3.5588      |346.4882    |767.2852    
5           |4.0107      |873.3723    |762.9828    
6           |4.3967      |503.6833    |756.4333    
7           |4.7507      |641.8921    |752.8463    
8           |5.1071      |1060.1573   |743.0658    
9           |5.4160      |281.6003    |739.4130    
10          |5.7740      |443.4556    |724.6275    
11          |6.0103      |646.4091    |716.1813    
12          |6.2924      |365.4511    |736.3979    
13          |6.5409      |539.7708    |755.8531    
14          |6.7320      |198.5395    |762.7327    
15          |6.9298      |654.8489    |764.5886    
16          |7.1208      |136.3229    |764.9364    
17          |7.2766      |322.9198    |766.0290    
18          