In [11]:
import argparse
from data_preprocessing import Data
from trading_env import TradingEnv, RunAgent
from agent import Agent, Informer_Agent
from model import DQN
import torch
import numpy as np
import random
import os

def training(args):
    T = args.T
    M = args.bs # minibatch size
    alpha = args.lr # Learning rate
    gamma = args.gamma # Discount factor
    theta = args.para_target # Target network
    n_units = args.n_units # number of units in a hidden layer
    closing_path = os.path.join(args.root_path, 'data_closing/' + args.stock + '-closing.json')
    states_path = os.path.join(args.root_path, 'data_states/' + args.stock + '-states.json')

    RunAgent(TradingEnv(Data(closing_path, states_path, T)), Agent()).run(5000, args)

    # weight initialization!!




In [12]:
parser = argparse.ArgumentParser(description='DQN_Trading')
parser.add_argument('--root_path', type=str, default='./', help="root path")
parser.add_argument('--gpu_id', type=str, default='0', help="device id to run")
parser.add_argument('--agent', type=float, default=0.001, help="the parameter which controls the soft update")
parser.add_argument('--bs', type=int, default=16, help="training batch size")
parser.add_argument('--lr', type=float, default=0.00025, help="training learning rate")
parser.add_argument('--gamma', type=float, default=0.001, help="the discount factor of Q learning")
parser.add_argument('--n_units', type=int, default=32, help="the number of units in a hidden layer")
parser.add_argument('--T', type=int, default=84, help="the length of series data")
parser.add_argument('--stock', type=str, default='AIG', help="determine which stock")
parser.add_argument('--seed', type=int, default=2037, help="random seed")
parser.add_argument('--para_target', type=float, default=0.001, help="the parameter which controls the soft update")


args = parser.parse_args([])

SEED = args.seed
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
np.random.seed(SEED)
random.seed(SEED)
file_path = './data_closing/'
datafile_list = os.listdir(file_path)


In [13]:
args.stock = "KRW-ADA_20210320_20220324_84"

In [None]:
print(args.stock)
SEED = args.seed
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
np.random.seed(SEED)
random.seed(SEED)
training(args)

KRW-ADA_20210320_20220324_84
4367100.0


In [10]:
print(args.stock)
SEED = args.seed
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
np.random.seed(SEED)
random.seed(SEED)
training(args)

KRW-ADA_20210320_20220324_84


  return torch.max_pool1d(input, kernel_size, stride, padding, dilation, ceil_mode)


6706600.0
5961050.0
7418050.0
10237350.0
10410650.0


In [5]:
T = args.T
M = args.bs # minibatch size
alpha = args.lr # Learning rate
gamma = args.gamma # Discount factor
theta = args.para_target # Target network
n_units = args.n_units # number of units in a hidden layer
closing_path = os.path.join(args.root_path, 'data_closing/' + args.stock + '-closing.json')
states_path = os.path.join(args.root_path, 'data_states/' + args.stock + '-states.json')


# weight initialization!!

In [6]:
data = Data(closing_path, states_path, T)

In [7]:
closing_path
states_path

'./data_states/KRW-ADA_20210320_20220324_84-states.json'

In [8]:
#RunAgent(TradingEnv(Data(closing_path, states_path, T)), Agent()).run(5000, args)


In [9]:
episodes = 5000
device = 'cuda'

In [None]:
agent =RunAgent(TradingEnv(Data(closing_path, states_path, T)), Informer_Agent()).run(5000, args)


  return torch.max_pool1d(input, kernel_size, stride, padding, dilation, ceil_mode)


6056450.0
-4488300.0


In [13]:
state = agent.env.reset() # initial_state


In [14]:
step=2

In [42]:
action = agent.agent.act(state) # select greedy action, exploration is done in step-method
actions, rewards, new_states, state, done = agent.env.step(action, step)

agent.agent.store(state, actions, new_states, rewards, action, step)
print(len(agent.agent.memory))
agent.agent.optimize(step)


57


In [43]:
from memory import Transition, ReplayMemory
import copy

In [57]:
transitions = agent.agent.memory.sample(agent.agent.batch_size)
# Transpose the batch (see https://stackoverflow.com/a/19343/3343043 for
# detailed explanation). This converts batch-array of Transitions
# to Transition of batch-arrays.
batch = Transition(*zip(*transitions))

# Compute a mask of non-final states and concatenate the batch elements
# (a final state would've been the one after which simulation ended)
next_state = torch.FloatTensor(batch.next_state).to(device)
non_final_mask = torch.tensor(tuple(map(lambda s: s is not None, next_state)))
non_final_next_states = torch.cat([s for s in next_state if s is not None])

state_batch = torch.FloatTensor(batch.state).to(device)
action_batch = torch.LongTensor(torch.add(torch.tensor(batch.action), torch.tensor(1))).to(device)
reward_batch = torch.FloatTensor(batch.reward).to(device)

# Compute Q(s_t, a) - the model computes Q(s_t), then we select the
# columns of actions taken. These are the actions which would've been taken
# for each batch state according to policy_net
# chane model lstm to informer
#l = self.policy_net(state_batch).size(0)
batch_x, batch_x_mark, batch_y, batch_y_mark = agent.agent.informer_input(state_batch)
state_action_values = agent.agent.policy_net(batch_x, batch_x_mark, batch_y, batch_y_mark).squeeze().max(1)[0]




# Compute V(s_{t+1}) for all next states.
# Expected values of actions for non_final_next_states are computed based
# on the "older" target_net; selecting their best reward with max(1)[0].
# This is merged based on the mask, such that we'll have either the expected
# state value or 0 in case the state was final.
next_state_values = torch.zeros(agent.agent.batch_size, device=device)
batch_x, batch_x_mark, batch_y, batch_y_mark = agent.agent.informer_input(next_state)
next_state_values[non_final_mask] = agent.agent.target_net(batch_x, batch_x_mark, batch_y, batch_y_mark).squeeze().max(1)[0]
# Compute the expected Q values
expected_state_action_values = (next_state_values * agent.agent.gamma) + reward_batch

# Compute the loss
loss = torch.nn.MSELoss()(expected_state_action_values, state_action_values)

# Optimize the model

loss.backward()
#for param in agent.agent.policy_net.parameters():
#        param.grad.data.clamp_(-1, 1)

agent.agent.optimizer.step()

if step % agent.agent.T == 0:
    # print('soft_update')
    gamma = 0.001
    param_before = copy.deepcopy(agent.agent.target_net)
    target_update = copy.deepcopy(agent.agent.target_net.state_dict())
    for k in target_update.keys():
        target_update[k] = agent.agent.target_net.state_dict()[k] * (1 - gamma) + agent.agent.policy_net.state_dict()[k] * gamma
    agent.agent.target_net.load_state_dict(target_update)





In [54]:
param.grad

In [47]:
agent.agent.informer_input(next_state_values)

IndexError: too many indices for tensor of dimension 1

In [51]:
torch.zeros(agent.agent.batch_size, device=device)

tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       device='cuda:0')

In [35]:
device='cuda'

In [36]:
agent.agent.T

84

In [37]:
# if len(agent.agent.memory) < agent.agent.batch_size * 10:
#     return
transitions = agent.agent.memory.sample(agent.agent.batch_size)
# Transpose the batch (see https://stackoverflow.com/a/19343/3343043 for
# detailed explanation). This converts batch-array of Transitions
# to Transition of batch-arrays.
batch = Transition(*zip(*transitions))

# Compute a mask of non-final states and concatenate the batch elements
# (a final state would've been the one after which simulation ended)
next_state = torch.FloatTensor(batch.next_state).to(device)
non_final_mask = torch.tensor(tuple(map(lambda s: s is not None, next_state)))
non_final_next_states = torch.cat([s for s in next_state if s is not None])

state_batch = torch.FloatTensor(batch.state).to(device)
action_batch = torch.LongTensor(torch.add(torch.tensor(batch.action), torch.tensor(1))).to(device)
reward_batch = torch.FloatTensor(batch.reward).to(device)

# Compute Q(s_t, a) - the model computes Q(s_t), then we select the
# columns of actions taken. These are the actions which would've been taken
# for each batch state according to policy_net
l = agent.agent.policy_net(state_batch).size(0)






state_action_values = agent.agent.policy_net(state_batch)[agent.agent.T-1:l:agent.agent.T].gather(1, action_batch.reshape((agent.agent.batch_size, 1)))
state_action_values = state_action_values.squeeze(-1)


# Compute V(s_{t+1}) for all next states.
# Expected values of actions for non_final_next_states are computed based
# on the "older" target_net; selecting their best reward with max(1)[0].
# This is merged based on the mask, such that we'll have either the expected
# state value or 0 in case the state was final.
next_state_values = torch.zeros(agent.agent.batch_size, device=device)
next_state_values[non_final_mask] = agent.agent.target_net(next_state)[agent.agent.T-1:l:agent.agent.T].max(1)[0].detach()
# Compute the expected Q values
expected_state_action_values = (next_state_values * agent.agent.gamma) + reward_batch

# Compute the loss
loss = torch.nn.MSELoss()(expected_state_action_values, state_action_values)

# Optimize the model

loss.backward()
for param in agent.agent.policy_net.parameters():
    param.grad.data.clamp_(-1, 1)

agent.agent.optimizer.step()

if step % agent.agent.T == 0:
    print('soft_update')
gamma = 0.001
param_before = copy.deepcopy(agent.agent.target_net)
target_update = copy.deepcopy(agent.agent.target_net.state_dict())
for k in target_update.keys():
    target_update[k] = agent.agent.target_net.state_dict()[k] * (1 - gamma) + agent.agent.policy_net.state_dict()[k] * gamma
agent.agent.target_net.load_state_dict(target_update)


<All keys matched successfully>

In [38]:
state_action_values = agent.agent.policy_net(state_batch)[agent.agent.T-1:l:agent.agent.T].gather(1, action_batch.reshape((agent.agent.batch_size, 1)))

In [39]:
state_action_values.shape

torch.Size([16, 1])

In [40]:
state_action_values.squeeze(-1)

tensor([ 1.1752,  8.1532, 14.1446, -4.7372,  9.9657,  1.6826, -0.0201, 10.3614,
        15.9313, -4.4890, -9.1605,  1.9446, -1.1246,  0.4593,  3.1795,  1.4774],
       device='cuda:0', grad_fn=<SqueezeBackward1>)

In [41]:
agent.agent.target_net(next_state)[agent.agent.T-1:l:agent.agent.T].max(1)[0]

tensor([ 0.1589,  0.0453,  0.0434, -0.0080,  0.0396,  0.0083, -0.0106,  0.0420,
         0.1315,  0.0087,  0.1765,  0.1443,  0.1407,  0.1642,  0.0020,  0.0470],
       device='cuda:0', grad_fn=<MaxBackward0>)

In [21]:
from models.model import Informer

label_len=84
pred_len = 3
model = Informer(1, 1, 1, 84, 84, 3, device = device).to(device)

def informer_input(state_batch):
    batch_x = state_batch.float().to(device)
    batch_y = state_batch.float()
    batch_x_mark = state_batch.float().to(device)
    batch_y_mark = state_batch.float().to(device)
    dec_inp = torch.zeros([state_batch.shape[0], pred_len, state_batch.shape[-1]]).float()
    dec_inp = torch.cat([state_batch[:,:label_len,:], state_batch], dim=1).float().to(device)
    batch_x[batch_x < 0] = 0
    batch_x[batch_x > 5] = 0
    return batch_x[:16,:,:1], batch_x_mark[:16,:,:5], dec_inp[:16,:84,:1], batch_y_mark[:16,:84,:5]


In [51]:
batch_x, batch_x_mark, batch_y, batch_y_mark = informer_input(state_batch)
state_action_values = model(batch_x, batch_x_mark, batch_y, batch_y_mark).squeeze().max(1)[0]

  return torch.max_pool1d(input, kernel_size, stride, padding, dilation, ceil_mode)


In [64]:
next_state_values = torch.zeros(agent.agent.batch_size, device=device)
next_state_values[non_final_mask] = agent.agent.target_net(next_state)[agent.agent.T-1:l:agent.agent.T].max(1)[0].detach()
# Compute the expected Q values
expected_state_action_values = (next_state_values * agent.agent.gamma) + reward_batch

# Compute the loss
loss = torch.nn.MSELoss()(expected_state_action_values, state_action_values)

# Optimize the model

loss.backward()
for param in agent.agent.policy_net.parameters():
    param.grad.data.clamp_(-1, 1)

agent.agent.optimizer.step()

if step % agent.agent.T == 0:
    print('soft_update')
gamma = 0.001
param_before = copy.deepcopy(agent.agent.target_net)
target_update = copy.deepcopy(agent.agent.target_net.state_dict())
for k in target_update.keys():
    target_update[k] = agent.agent.target_net.state_dict()[k] * (1 - gamma) + agent.agent.policy_net.state_dict()[k] * gamma
agent.agent.target_net.load_state_dict(target_update)

<All keys matched successfully>

tensor([True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True])

In [59]:
batch_x, batch_x_mark, batch_y, batch_y_mark = informer_input(next_state)


In [60]:
state_action_values = model(batch_x, batch_x_mark, batch_y, batch_y_mark).squeeze().max(1)[0]

In [62]:
state_action_values.shape

torch.Size([16])

In [56]:
next_state_values[non_final_mask]

tensor([ 0.1589,  0.0453,  0.0434, -0.0080,  0.0396,  0.0083, -0.0106,  0.0420,
         0.1315,  0.0087,  0.1765,  0.1443,  0.1407,  0.1642,  0.0020,  0.0470],
       device='cuda:0')

In [57]:
non_final_mask

tensor([True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True])

In [None]:
param_before

In [104]:
agent.agent.target_net

DQN(
  (first_two_layers): Sequential(
    (0): Linear(in_features=14, out_features=256, bias=True)
    (1): ELU(alpha=1.0)
    (2): Linear(in_features=256, out_features=256, bias=True)
    (3): ELU(alpha=1.0)
  )
  (lstm): LSTM(256, 256, batch_first=True)
  (last_linear): Linear(in_features=256, out_features=3, bias=True)
)

In [85]:
agent.agent.policy_net(state_batch)

tensor([[ 7.4809,  3.1719, -2.6508],
        [10.3841,  1.4601, -1.2563],
        [10.7359,  0.8411, -3.1959],
        ...,
        [ 1.5511, 10.1949, -8.1049],
        [ 2.0535,  4.0915,  1.2815],
        [-1.7908,  8.5168,  3.0911]], device='cuda:0', grad_fn=<AddmmBackward>)

In [39]:
state_batch.shape

torch.Size([16, 84, 14])

In [40]:
agent.agent.policy_net

DQN(
  (first_two_layers): Sequential(
    (0): Linear(in_features=14, out_features=256, bias=True)
    (1): ELU(alpha=1.0)
    (2): Linear(in_features=256, out_features=256, bias=True)
    (3): ELU(alpha=1.0)
  )
  (lstm): LSTM(256, 256, batch_first=True)
  (last_linear): Linear(in_features=256, out_features=3, bias=True)
)

In [54]:
state = agent.env.reset() # initial_state

for step in range(episodes):
    action = agent.agent.act(state) # select greedy action, exploration is done in step-method

    actions, rewards, new_states, state, done = agent.env.step(action, step)

    if done:
        break

    agent.agent.store(state, actions, new_states, rewards, action, step)
    agent.agent.optimize(step)



  return array(a, dtype, copy=False, order=order)


TypeError: can't convert np.ndarray of type numpy.object_. The only supported types are: float64, float32, float16, complex64, complex128, int64, int32, int16, int8, uint8, and bool.

In [55]:
agent.env.print_stats(args)

In [59]:
with open('batch.pickle', 'rb') as handle:
    batch = pickle.load(handle)
state_batch = torch.FloatTensor(batch.state).to(device)

batch_x = state_batch.float().to(device)
batch_y = state_batch.float()
batch_x_mark = state_batch.float().to(device)
batch_y_mark = state_batch.float().to(device)
dec_inp = torch.zeros([state_batch.shape[0], pred_len, state_batch.shape[-1]]).float()
dec_inp = torch.cat([state_batch[:,:label_len,:], state_batch], dim=1).float().to(device)


In [60]:
from models.model import Informer
import pickle
import torch

In [72]:
device='cuda'
model = Informer(1, 1, 1, 84, 84, 3, device = device).to(device)
pred_len=84
label_len=84

#with open('batch.pickle', 'wb') as handle:
#    pickle.dump(batch, handle, protocol=pickle.HIGHEST_PROTOCOL)

with open('batch.pickle', 'rb') as handle:
    batch = pickle.load(handle)

state_batch = torch.FloatTensor(batch.state).to(device)

In [76]:
import torch.nn.init as weight_init

for param_p in model.parameters(): 
    weight_init.normal_(param_p)

In [78]:
batch_x = state_batch.float().to(device)
batch_y = state_batch.float()
batch_x_mark = state_batch.float().to(device)
batch_y_mark = state_batch.float().to(device)
dec_inp = torch.zeros([state_batch.shape[0], pred_len, state_batch.shape[-1]]).float()
dec_inp = torch.cat([state_batch[:,:label_len,:], state_batch], dim=1).float().to(device)
batch_x[batch_x < 0] = 0
batch_x[batch_x > 5] = 0

outputs = model(batch_x[:10,:,:1], batch_x_mark[:10,:,:5], dec_inp[:10,:84,:1], batch_y_mark[:10,:84,:5])


In [None]:
`

In [63]:
outputs.squeeze().max(1)[0]

tensor([0.3154, 1.0376, 0.3154, 0.6597, 1.0058, 1.0376, 0.8619, 1.1936, 0.9132,
        0.8046], device='cuda:0', grad_fn=<MaxBackward0>)

In [11]:
outputs.shape

torch.Size([10, 3, 1])

In [None]:
batch_y = batch_y[:,-pred_len:,0:].to(device)

In [206]:
model(state_batch,state_batch,state_batch,state_batch)

RuntimeError: Given groups=1, weight of size [512, 1, 3], expected input[16, 14, 98] to have 1 channels, but got 14 channels instead