In [59]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import random
from collections import deque
import math

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

class Model(nn.Module):
    def __init__(self, state_dim):  
        super(Model, self).__init__()

        h1 = 512
        h2 = 256
        h3 = 128
        
        self.hidden_1 = nn.Linear(state_dim, h1)
        self.hidden_2 = nn.Linear(h1, h2)
        self.hidden_3 = nn.Linear(h2, h3)

        self.output = nn.Linear(h3, 3)

    def forward(self, x):
        x = F.relu(self.hidden_1(x))
        x = F.relu(self.hidden_2(x))
        x = F.relu(self.hidden_3(x))
        return self.output(x)

class DQNAgent():
    def __init__(self, env, gamma=0.99, buffer_size = 1000000,
        epsilon=1.0, epsilon_min=0.01, epsilon_log_decay=0.999, 
        alpha=1e-4, alpha_decay=0.001, batch_size=128, quiet=False):
        
        self.env = env
        self.memory = deque(maxlen = buffer_size)
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_min = epsilon_min
        self.epsilon_decay = epsilon_log_decay
        self.alpha = alpha
        self.alpha_decay = alpha_decay
        self._batch_size = batch_size
        self.quiet = quiet
        self._state_dim = np.prod(np.array(env.observation_space.shape))
        
        self.model = Model(self._state_dim).to(device)
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=self.alpha)

    def store_step(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state, step = None):
        state = torch.from_numpy(state).float().to(device)
        if step is not None:
            epsilon = max(self.epsilon_min, min(self.epsilon, 1.0 - math.log10((step + 1) * self.epsilon_decay)))
            q_values = self.model(state)
            q_values = q_values.detach().cpu().numpy()
            random_action = np.random.choice(3) - 1 # action space: -1, 0, 1
            q_max_actrion = np.argmax(q_values) - 1
            action = random_action if (np.random.random() <= epsilon) else q_max_action
            return action
            print("here")
        else:
            q_values = self.model(state)
            q_values = q_values.detach().cpu().numpy()
            return np.argmax(q_values) - 1

    def train(self):
        batch_size = self._batch_size
        x_batch, y_batch = [], []
        
        batch = random.sample(self.memory, batch_size)
        
        states, actions, rewards, next_states, dones = zip(*batch)

        states = torch.tensor(states).float().to(device)
        actions = torch.tensor(actions).long().to(device)
        rewards = torch.tensor(rewards).float().to(device)
        next_states = torch.tensor(next_states).float().to(device)
        dones = torch.tensor(dones).float().to(device)

        y_preds = self.model(states)
        actions = actions.unsqueeze(1)
        y_preds = y_preds.gather(1, actions)
        
        y_targets = rewards + (1 - dones) * self.gamma * self.model(next_states).detach().max(1)[0]

        loss = torch.mean( (y_preds - y_targets)**2 )
        
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
            
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay
        # self.memory = []
        

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import datetime

import gym
import trading_env

In [4]:
env_trading = gym.make('test_trading-v2')
NUM_EP = 400
date = datetime.datetime(2017, 7, 10, 0, 0)
data = env_trading.historical_data["close"]
env_trading.reset(date=date)
# plt.plot(data[env_trading.start_index:env_trading.start_index + int(env_trading.episode_steps) 
#             if env_trading.start_index + int(env_trading.episode_steps) < data.shape[0]
#             else data.shape[0]])

# plt.show()

Loading historical data file


array([[1.02819104, 0.32829856],
       [1.03334742, 0.6872087 ],
       [1.04243449, 1.04861368],
       [1.04057481, 0.68456713],
       [1.04049028, 0.43732447],
       [1.03964497, 0.53619503],
       [1.03393914, 0.36730053],
       [1.03516484, 0.68563439],
       [1.03989856, 0.41589084],
       [1.04370245, 0.72421409],
       [1.04387151, 0.3563747 ],
       [1.04361792, 0.15185302],
       [1.04357566, 0.32994251],
       [1.04361792, 0.5751208 ],
       [1.04366019, 0.30587477],
       [1.04801352, 0.73526962],
       [1.05418428, 0.92314449],
       [1.04615385, 0.60235922],
       [1.05071851, 0.41567612],
       [1.05879121, 1.14874656],
       [1.0658918 , 0.62194044],
       [1.05350803, 2.02918238],
       [1.05528318, 0.76909247],
       [1.05663567, 0.62731009],
       [1.05164835, 1.0132689 ],
       [1.05126796, 0.44822558],
       [1.04945055, 0.70310004],
       [1.04268808, 0.67092055],
       [1.04556213, 0.38741467],
       [1.04797126, 0.17656399],
       [1.

In [60]:
agentDQN = DQNAgent(env_trading, gamma=0.99, buffer_size = 10000,
                    epsilon=1.0, epsilon_min=0.01, epsilon_log_decay=0.999, 
                    alpha=1e-4, alpha_decay=0.001, batch_size=8, quiet=False)

rewards = []
rewards_test = []
portfolio = []
for i in range( NUM_EP ):
    state = env_trading.reset(date = datetime.datetime( 2017, 7, 15, 0, 0 ))
    state = np.reshape(state,200)
    total_reward = 0

    while(True):
        action = agentDQN.act(state, step=i)
#         print(action)
        next_state, reward, done, _ = env_trading.step(action)
        state = np.reshape(state,200)
        next_state = next_state.reshape(200)
        agentDQN.store_step(state, action, reward, next_state, done)
        total_reward += reward
        if done:
            rewards.append(total_reward)
#             portfolio.append(env_trading.portfolio_value)
            #print("Episode: {}, Total reward: {}".format(i,total_reward))
            break
    if len( agentDQN.memory ) > agentDQN._batch_size:
        agentDQN.train()

    state_test = env_trading.reset( date = datetime.datetime(2017, 8, 15, 0, 0) )
    state_test = np.reshape( state_test, 200 )
    total_reward_test = 0

    while( True ):
        action = agentDQN.act(state_test)
#         print(action)
        state_test, reward_test, done_test, _ = env_trading.step(action)
        state_test = np.reshape(state_test,200)
        total_reward_test += reward_test
        if done_test:
            rewards_test.append(total_reward_test)
            portfolio.append(env_trading.portfolio_value)
            print("Episode: {}, Training reward: {}, Testing reward: {}".format(i, total_reward, total_reward_test))
            break

RuntimeError: CUDA error: device-side assert triggered

In [1]:
with open('dqn_log.txt', 'r') as f:
    lines = f.readlines()

'Episode: 0, Training reward: 32.641549267868896, Testing reward: 7.8558623343574725\n'