### DQN approach for optimizing trading strategy

Macroagent from [arXiv](https://arxiv.org/abs/1812.10252) paper

```Environment``` class contains the following methods:

* ```state_generator```: yields new state;
* ```_reward```: calculates reward for specified action.

In [None]:
class Environment:
    """Candle-driven trading environment used by the DQN agent.

    Candle data is loaded once through ``loader.import_candles`` and
    enriched with rolling z-score and volatility columns. The instance keeps
    a row pointer (``minute``), the open price at that pointer
    (``open_price``) and the entry prices of the currently held lots
    (``assets``, at most ``lots`` of them).
    """

    def __init__(self):
        # Instrument / candle settings handed to the loader.
        self.ticker = "btcusd"
        self.timeframe = 1
        # The training window ends where the validation window begins.
        self.train_start_date = datetime(2018, 11, 15)
        self.train_end_date = datetime(2018, 11, 16)
        self.val_end_date = datetime(2018, 11, 17)
        # Window sizes for the indicators and for the state vector.
        self.ind_lookback = 10
        self.volatility_m = 5
        self.lookback = 5
        self.inds = {name: (name, self.ind_lookback) for name in ('sma', 'ema')}
        self.lots = 10
        self.data = self._calculate_states()
        # The first usable row is the first row without NaNs, shifted so that
        # a full ``lookback`` window of closes precedes it.
        first_complete_row = self.data.dropna().index[0]
        self.start_index = first_complete_row + self.lookback
        self.train_len, self.val_len = self._calculate_lengths()
        # Pointer, open price and position all start from the reset state.
        self.reset()

    def _calculate_lengths(self):
        """Return ``(train_len, val_len)`` measured in rows of ``self.data``.

        ``train_len`` counts the rows timestamped before ``train_end_date``
        minus ``start_index``; ``val_len`` counts the rows at or after it.
        """
        boundary = int(self.train_end_date.strftime("%Y%m%d%H%M%S"))
        rows_before = len(self.data[self.data.time < boundary])
        rows_after = len(self.data[self.data.time >= boundary])
        return rows_before - self.start_index, rows_after

    def state_generator(self):
        """Generator that alternates between receiving an action and emitting a state.

        Each loop iteration has two yields: the first receives the action
        through ``send()`` (``None`` means "no action, no reward"), the second
        yields ``(state, reward)``. Callers therefore alternate ``next(gen)``
        and ``gen.send(action)``.

        The state is the concatenation of the five indicator columns of the
        row just before ``minute``, the ``lookback`` closes preceding
        ``minute``, and the ``assets`` vector. After the state is assembled
        the pointer advances by one row and ``open_price`` is refreshed.
        """
        feature_columns = ['price_z', 'volume_z', 'pc_z', 'vc_z', 'volatility']
        while True:
            action = yield
            if action is None:
                reward = None
            else:
                reward = self._reward(action)
            window_start = self.minute - self.lookback
            close_prices = self.data['close'][window_start:self.minute]
            ind_vec = self.data[feature_columns][self.minute-1:self.minute].stack()
            self.minute += 1
            self.open_price = self.data.open[self.minute]
            yield np.concatenate([ind_vec, close_prices, self.assets]), reward

    def _calculate_states(self):
        """Load the candles and append the feature columns used in the state.

        Added columns: ``price_z``, ``volume_z`` (rolling z-scores of close
        and volume), ``pc_z``, ``vc_z`` (z-scores of the relative deviation
        from the rolling mean) and ``volatility`` (percentage change of the
        EMA over ``volatility_m`` rows).
        """
        window = self.ind_lookback

        def rolling_mean(series):
            return series.rolling(window).sum()/window

        def z(series):
            return (series - rolling_mean(series))/series.rolling(window).std()

        data = loader.import_candles(
            self.ticker,
            self.timeframe,
            self.train_start_date,
            self.val_end_date,
            indicators=self.inds,
            reverse=False)
        data['price_z'] = z(data.close)
        data['volume_z'] = z(data.vol)
        price_change = data.close/rolling_mean(data.close) - 1
        volume_change = data.vol/rolling_mean(data.vol) - 1
        data['pc_z'] = z(price_change)
        data['vc_z'] = z(volume_change)
        lagged_ema = data.ema.shift(self.volatility_m)
        data['volatility'] = 100*(data.ema - lagged_ema)/lagged_ema
        return data

    def _reward(self, action):
        """Apply ``action`` to the position and return its reward.

        * ``1``: nothing happens, reward ``0``.
        * ``2``: if a free lot remains, record ``open_price`` as its entry
          price and increment the global ``trades`` counter; reward ``0``.
        * ``0``: with no lots held the reward is ``-1``; otherwise the summed
          difference between ``open_price`` and the entry prices is added to
          the global ``global_profit``, the position is cleared, and the
          reward is the sign of that sum as ``np.int8``.

        Any other action returns ``None``.
        """
        global global_profit, trades
        if action == 1:
            return 0
        if action == 2:
            if self.lots_used < self.lots:
                self.assets[self.lots_used] = self.open_price
                trades += 1
                self.lots_used += 1
            return 0
        if action != 0:
            return None
        if not self.lots_used:
            return -1
        realized = np.sum(self.open_price - self.assets[:self.lots_used])
        global_profit += realized
        # Clear the position now that the profit has been booked.
        self.assets = np.zeros(self.lots)
        self.lots_used = 0
        return np.sign(realized).astype(np.int8)

    def reset(self):
        """Rewind the pointer to ``start_index`` and drop every held lot."""
        self.minute = self.start_index
        self.open_price = self.data.open[self.minute]
        self.lots_used = 0
        self.assets = np.zeros(self.lots)

```ReplayMemory``` class contains the following methods:

* ```update_M```: stores a new transition (index, s, a, r, s') in the replay memory;
* ```batch```: returns a random batch of the specified ```batch_size``` from the replay memory.

In [None]:
class ReplayMemory:
    """Fixed-capacity store of transitions sampled for training.

    Every row of ``M`` is laid out as
    ``[index, state_t..., action, reward, state_t_next...]``.

    Args:
        gen: a two-phase state generator; it is driven by alternating
            ``next(gen)`` and ``gen.send(action)``, the latter returning
            ``(state, reward)``.
        start_ind: index stored in column 0 for the first warm-up
            transition; later warm-up rows get ``start_ind + i``.
    """

    def __init__(self, gen, start_ind):
        self.warm_up_length = 60
        self.used_memory = 0
        self.capacity = 120
        # Prime the generator and fetch the initial state; no action has been
        # taken yet, so the accompanying reward is discarded.
        next(gen)
        state_t, _ = gen.send(None)
        # Row width follows from the layout above (43 for a 20-element state).
        self.M = np.zeros((self.capacity, 2*len(state_t) + 3))
        # Pre-fill the first rows with transitions produced by random actions.
        # ``used_memory`` is left at 0, so ``update_M`` starts overwriting these
        # rows from row 0 while ``batch`` keeps sampling at least the first
        # ``warm_up_length`` rows.
        for i in range(self.warm_up_length):
            action = np.random.randint(0, 3)
            next(gen)
            # Use the generator that was passed in, not a module-level global.
            state_t_next, reward_t = gen.send(action)
            self.M[i] = np.concatenate(
                ([start_ind+i], state_t, [action, reward_t], state_t_next))
            state_t = state_t_next

    def update_M(self, ind, state_t, action, reward_t, state_t_next):
        """Store one transition.

        While fewer than ``capacity`` transitions have been stored, the row
        at position ``used_memory`` is written; afterwards the oldest row is
        dropped and the new one is appended at the end.
        """
        vec = np.concatenate(([ind], state_t, [action, reward_t], state_t_next))
        if self.used_memory >= self.capacity:
            self.M = np.roll(self.M, -1, axis=0)
            self.M[-1] = vec
        else:
            self.M[self.used_memory] = vec
        self.used_memory += 1

    def batch(self, batch_size):
        """Return ``batch_size`` rows drawn uniformly at random, with replacement.

        Rows are drawn from the first ``max(used_memory, warm_up_length)``
        rows, capped at ``capacity``.
        """
        upper_lim = min(self.capacity, max(self.used_memory, self.warm_up_length))
        return self.M[np.random.randint(0, upper_lim, batch_size)]
        

The ```MacroAgent``` class contains the following methods:

* ```update_eps```: to change exploration-exploitation parameter

In [None]:
class MacroAgent:
    """Holder for the agent's hyper-parameters and its epsilon schedule.

    ``eps`` is the exploration probability; each ``update_eps`` call lowers
    it by ``eps_step`` for as long as it is still above ``eps_end``.
    """

    def __init__(self):
        eps_start, eps_floor = 0.99, 0.1
        self.eps = eps_start
        self.eps_end = eps_floor
        self.gamma = 0.99
        self.epochs = 500
        # Size of one linear decay step from eps_start towards eps_floor.
        self.eps_step = (eps_start - eps_floor)/(1400*150)
        # Number of training steps between target-network refreshes.
        self.update_target_network = 1000

    def update_eps(self):
        """Take one decay step unless ``eps`` has already reached ``eps_end``."""
        if self.eps <= self.eps_end:
            return
        self.eps -= self.eps_step

In [None]:
#for storing profit and the number of trades in Tensorboard
class Logger:
    """Writes individual scalar values to a TensorBoard event directory."""

    def __init__(self, log_dir):
        # One event-file writer per logger, bound to ``log_dir``.
        self.writer = tf.summary.FileWriter(log_dir)

    def log_scalar(self, tag, value, step):
        """Record ``value`` under ``tag`` at global step ``step``."""
        scalar = tf.Summary.Value(tag=tag, simple_value=value)
        self.writer.add_summary(tf.Summary(value=[scalar]), step)

In [None]:
import numpy as np
import pandas as pd
from datetime import datetime
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.optimizers import Adam
import tensorflow as tf
import tradingene.data.load as loader
import ipdb
from time import time


# Module-level accumulators shared with Environment._reward, which adds the
# booked profit to global_profit and counts opened lots in trades; the
# training loop zeroes both at the start of every epoch.
global_profit = trades = 0.


### DQN
def create_network(net_input):
    """Build the online Q-network on top of ``net_input`` and return its output.

    Two ReLU layers followed by a linear layer, using the module-level
    ``weights`` and ``biases`` dictionaries. A histogram summary is
    registered for the input and for the output of every layer.
    """
    tf.summary.histogram('input', net_input)
    activation = net_input
    for layer_key, summary_tag in (('layer1', 'out1'), ('layer2', 'out2')):
        activation = tf.nn.relu(activation@weights[layer_key]+biases[layer_key])
        tf.summary.histogram(summary_tag, activation)
    # The final layer is linear: one unbounded output per action.
    out = activation@weights['layer3']+biases['layer3']
    tf.summary.histogram('out', out)
    return out


# Target Network
def target_network(target_input):
    """Build the target network on top of ``target_input`` and return its output.

    Same shape as the online network (two ReLU layers, then a linear layer)
    but wired to the module-level ``target_weights`` and ``target_biases``;
    no summaries are registered.
    """
    hidden = target_input
    for layer_key in ('layer1', 'layer2'):
        hidden = tf.nn.relu(hidden@target_weights[layer_key]+target_biases[layer_key])
    return hidden@target_weights['layer3']+target_biases['layer3']



def calculate_q(batch, gamma, q_vals, terminal_index=1436):
    """Compute the training targets for a batch of stored transitions.

    Args:
        batch: 2-D array of replay rows; column 0 holds the transition
            index and column 22 the reward.
        gamma: discount factor applied to the bootstrapped value.
        q_vals: 1-D array with one bootstrapped next-state value per row.
        terminal_index: rows whose index column equals this value get no
            bootstrapped term. The default keeps the previously hard-coded
            1436 (presumably the last training step; confirm against the
            data window).

    Returns:
        1-D array: ``reward`` for terminal rows, ``reward + gamma*q_vals``
        for every other row.
    """
    rewards = batch[:, 22]
    is_terminal = batch[:, 0] == terminal_index
    # Select the plain reward for terminal rows instead of adding and then
    # subtracting the bootstrapped term, which would leave rounding residue.
    return np.where(is_terminal, rewards, rewards + gamma*q_vals)

In [None]:
#Initialize everything
# Build the environment (this loads the candle data) and a state generator over it.
env = Environment()
g = env.state_generator()
# The ReplayMemory constructor drives the generator with random actions to
# pre-fill its buffer, which also advances env.minute.
mem = ReplayMemory(g, env.start_index)
agent = MacroAgent()

In [None]:
# Layer sizes: 20 state inputs, two hidden layers of 40 units, 3 actions.
topology = (20, 40, 40, 3)
batch_size = 20
tf.reset_default_graph()

#Weights and biases for the DQN and the target network:
weights = {'layer'+str(i+1):
       tf.Variable(tf.random_normal(
           [topology[i], topology[i+1]], 
           mean=.0, 
           stddev=0.03), name="w"+str(i+1)) 
       for i in range(len(topology)-1)}
biases = {'layer'+str(i+1): 
      tf.Variable(tf.random_normal(
          [topology[i+1]],
          mean=.0, 
          stddev=0.03), name="b"+str(i+1)) 
      for i in range(len(topology)-1)}
# The target network starts as an exact copy of the online network.
target_weights = {'layer'+str(i+1):
       tf.Variable(weights['layer'+str(i+1)].initialized_value()) 
       for i in range(len(topology)-1)}
target_biases = {'layer'+str(i+1): 
      tf.Variable(biases['layer'+str(i+1)].initialized_value()) 
      for i in range(len(topology)-1)}

net_input = tf.placeholder(tf.float32, [None, topology[0]], name="net_input")
actions = tf.placeholder(tf.int64, [None], name="actions_input")
q = tf.placeholder(tf.float32, [None], name="q_input")
Q_values = create_network(net_input)
# Largest target-network output per row; used as the bootstrapped value.
target = tf.reduce_max(target_network(net_input), axis=1)
argmax_Q_values = tf.argmax(Q_values, axis=1)
action_Q_values = tf.reduce_max(Q_values, axis=1)
# Pick, for every row, the online-network output of the action actually taken.
mask = tf.one_hot(actions, depth=3, dtype=tf.bool, on_value=True, off_value=False)
q_pred = tf.boolean_mask(Q_values, mask)
error = q - q_pred
quadratic_loss = tf.reduce_mean(tf.square(error))
loss = tf.reduce_mean(tf.losses.huber_loss(q, q_pred))
optimizer = tf.train.AdamOptimizer(learning_rate=0.001)
train = optimizer.minimize(loss)

# Ops that copy the online parameters into the target network. They are built
# once here: creating tf.assign ops inside the training loop would add new
# nodes to the graph on every synchronisation.
sync_target_ops = [
    tf.assign(target_params[name], params[name])
    for params, target_params in ((weights, target_weights),
                                  (biases, target_biases))
    for name in sorted(params)
]

epochs = 2500

step = 0
init = tf.global_variables_initializer()
tf.summary.scalar("loss",loss)
merged_summary_op = tf.summary.merge_all()
fold = 'fold30/1/'
fold2 = 'fold30/2/'
with tf.Session() as sess:
    sess.run(init)
    tr_summary_writer = tf.summary.FileWriter(fold)
    logger = Logger(fold2)
    # NOTE(review): the environment is only reset at the END of an epoch, so
    # the first epoch starts wherever earlier use of the environment (e.g. the
    # replay-memory warm-up) left env.minute, while the stored index is still
    # env.start_index+i. Confirm this offset is intended.
    for epoch in range(epochs):
        print("START!")
        t = time()
        # Per-epoch statistics; Environment._reward updates the first two.
        global_profit = 0.
        trades = 0
        random_minus_one = random_zero = random_plus_one = 0
        minus_one = zero = plus_one = 0
        g = env.state_generator()
        # Prime the generator and fetch the initial state (no action yet).
        next(g)
        state_t, _ = g.send(None)
        for i in range(env.train_len):
            if agent.eps > np.random.uniform():
                # Explore: uniformly random action.
                action_t = np.random.randint(0, 3)
                if action_t == 0:
                    random_minus_one += 1
                elif action_t == 1:
                    random_zero += 1
                else:
                    random_plus_one += 1
            else:
                # Exploit: greedy action of the online network.
                action_t = sess.run(argmax_Q_values,
                                    feed_dict={net_input:np.array([state_t])})[0]
                if action_t == 2:
                    plus_one += 1
                elif action_t == 1:
                    zero += 1
                elif action_t == 0:
                    minus_one += 1
            next(g)
            state_t_next, reward_t = g.send(action_t)
            mem.update_M(env.start_index+i, state_t, action_t, reward_t, state_t_next)
            batch = mem.batch(batch_size)
            # Replay row layout: [index, state (1:21), action (21), reward (22),
            # next state (23:)]. Bootstrapped values come from the target
            # network evaluated on the next states.
            Q_values_next = sess.run(target, feed_dict={net_input:batch[:,23:]})
            q_ = calculate_q(batch, agent.gamma, Q_values_next)
            _, loss_value, summ = sess.run([train, loss, merged_summary_op], 
                                                       feed_dict={net_input:batch[:,1:21], 
                                                                  actions:batch[:,21],
                                                                  q:q_})

            agent.update_eps()
            state_t = state_t_next
            if step % agent.update_target_network == 0:
                sess.run(sync_target_ops)
            step += 1
        logger.log_scalar("profit", global_profit, epoch)
        logger.log_scalar("trades", trades, epoch)
        # Only the summary of the epoch's last training step is recorded.
        tr_summary_writer.add_summary(summ, epoch)
        print(epoch, global_profit, trades, random_minus_one, random_zero, random_plus_one, minus_one, zero, plus_one)
        print(time() - t)
        if epoch != epochs-1:
            env.reset()
        print("================================================================================")
    # Push any buffered events to disk before the session goes away.
    tr_summary_writer.flush()
    logger.writer.flush()