# In [2]:
import sys
import gym
import pylab
import random
import numpy as np
from collections import deque
from keras.layers import Dense
from keras.optimizers import Adam
from keras.models import Sequential
from keras.backend import clear_session
from keras import backend as K

import os
import logging
import random
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.patches as patches
from colour import Color
from gym import spaces, logger
import time
import warnings
warnings.filterwarnings("ignore")

class trading_env:
    """Tick-level trading environment with a gym-like ``reset``/``step`` API.

    The input frame is split into sections at every row whose
    ``serial_number`` equals 0; ``reset`` picks one section at random and
    ``step`` slides an observation window of ``obs_data_len`` rows over it,
    ``step_len`` rows at a time.

    Actions: 0 = do nothing, 1 = long (or cover a short), 2 = short (or cover
    a long).  One unit is traded per step and the absolute position is capped
    at ``max_position``.

    Parameters
    ----------
    obs_data_len : int
        Number of rows in one observation window.
    step_len : int
        Number of rows the window advances per ``step``.
    df : pandas.DataFrame
        Source data; must contain ``serial_number``, ``deal_col_name`` and
        every column in ``feature_names`` (``render`` also reads ``datetime``).
    fee : float
        Fee charged per traded unit.
    max_position : int
        Maximum absolute position.
    deal_col_name : str
        Column used as the deal price.
    plot_col_name : str
        Stored as ``price_plot``; the code that plotted it is commented out.
    feature_names : list of str
        Columns returned as the market part of the observation.
    return_transaction : bool
        When true, eight bookkeeping columns are appended to every
        observation row.
    fluc_div, gameover_limit
        Stored on the instance; not used by the methods in this class.
    """

    def __init__(self, obs_data_len, step_len,
                 df, fee, max_position=5, deal_col_name='price', plot_col_name='middle',
                 feature_names=['price', 'volume'], 
                 return_transaction=True,
                 fluc_div=100.0, gameover_limit=5,
                 *args, **kwargs):

        self.df = df
        self.action_space = 3
        self.action_describe = {0:'do nothing',  1:'long', 2:'short'}

        self.obs_len = obs_data_len
        # Market features plus the 8 bookkeeping columns built in
        # _assemble_observation.
        self.feature_len = len(feature_names)+8

        # Copy so the shared default list (or the caller's list) is never
        # aliased by the instance.
        self.using_feature = list(feature_names)
        self.price_name = deal_col_name
        self.price_plot = plot_col_name

        self.step_len = step_len
        self.fee = fee
        self.max_position = max_position

        self.fluc_div = fluc_div
        self.gameover = gameover_limit
        self.return_transaction = return_transaction

        is_section_start = (self.df['serial_number'] == 0).values
        self.begin_fs = self.df[is_section_start]
        self.date_leng = len(self.begin_fs)
        # Row *positions* of the section starts.  Slicing with .iloc needs
        # positions; index labels only coincide with them on a default
        # RangeIndex.
        self._section_starts = np.flatnonzero(is_section_start)

        self.render_on = 0
        self.buy_color, self.sell_color = (1, 2)
        self.new_rotation, self.cover_rotation = (1, 2)
        self.transaction_details = pd.DataFrame()

    def _random_choice_section(self):
        """Return one randomly chosen section of ``self.df``.

        A section runs from a ``serial_number == 0`` row up to (excluding)
        the next such row, or to the end of the frame for the last one.

        Raises
        ------
        ValueError
            If the frame has no row with ``serial_number == 0``.
        """
        if self.date_leng == 0:
            raise ValueError("df has no row with serial_number == 0")
        random_int = np.random.randint(self.date_leng)
        begin_point = self._section_starts[random_int]
        if random_int == self.date_leng - 1:
            end_point = None
        else:
            end_point = self._section_starts[random_int + 1]
        return self.df.iloc[begin_point: end_point]

    def _window(self, arr):
        """Return the current observation window of ``arr`` (a view, not a copy)."""
        return arr[self.step_st: self.step_st+self.obs_len]

    def _refresh_observation_views(self):
        """Point every ``obs_*`` attribute at the window starting at ``step_st``.

        The slices are views, so writing through them (or through the
        ``chg_*`` tails taken from them) updates the full-length arrays.
        """
        self.obs_state = self._window(self.obs_features)
        self.obs_posi = self._window(self.posi_arr)
        self.obs_posi_var = self._window(self.posi_variation_arr)
        self.obs_posi_entry_cover = self._window(self.posi_entry_cover_arr)
        self.obs_price = self._window(self.price)
        self.obs_price_mean = self._window(self.price_mean_arr)
        self.obs_reward_fluctuant = self._window(self.reward_fluctuant_arr)
        self.obs_makereal = self._window(self.reward_makereal_arr)
        self.obs_reward = self._window(self.reward_arr)

    def _assemble_observation(self):
        """Build ``self.obs_return`` from the current ``obs_*`` views and return it."""
        if self.return_transaction:
            bookkeeping = (self.obs_posi,                # 1
                           self.obs_posi_var,            # 2
                           self.obs_posi_entry_cover,    # 3
                           self.obs_price,               # 4
                           self.obs_price_mean,          # 5
                           self.obs_reward_fluctuant,    # 6
                           self.obs_makereal,            # 7
                           self.obs_reward)              # 8
            columns = [self.obs_state] + [col[:, np.newaxis] for col in bookkeeping]
            self.obs_return = np.concatenate(columns, axis=1)
        else:
            self.obs_return = self.obs_state
        return self.obs_return

    def reset(self):
        """Start a new episode on a random section and return the first observation."""
        self.df_sample = self._random_choice_section()
        self.step_st = 0

        # Price used to compute rewards.  ``.values`` replaces ``as_matrix()``,
        # which was removed in pandas 1.0.
        self.price = self.df_sample[self.price_name].values
        # Market features exposed in the observation.
        self.obs_features = self.df_sample[self.using_feature].values

        # Position held at each row.
        self.posi_arr = np.zeros_like(self.price)
        # Position variation at each row.
        self.posi_variation_arr = np.zeros_like(self.posi_arr)
        # Entry/cover marker.  The code writes 1 for a new entry, 2 for an
        # increase, -1 for a cover and -2 for the forced close at episode end.
        self.posi_entry_cover_arr = np.zeros_like(self.posi_arr)

        self.price_mean_arr = self.price.copy()
        self.reward_fluctuant_arr = (self.price - self.price_mean_arr)*self.posi_arr
        self.reward_makereal_arr = self.posi_arr.copy()
        self.reward_arr = self.reward_fluctuant_arr*self.reward_makereal_arr

        self.info = None
        self.transaction_details = pd.DataFrame()

        self._refresh_observation_views()
        self._assemble_observation()

        self.t_index = 0
        return self.obs_return

    def _long(self, open_posi, enter_price, current_mkt_position, current_price_mean):
        """Open a long position, or add one unit to an existing long."""
        if open_posi:
            self.chg_price_mean[:] = enter_price
            self.chg_posi[:] = 1
            self.chg_posi_var[:1] = 1
            self.chg_posi_entry_cover[:1] = 1
        else:
            after_act_mkt_position = current_mkt_position + 1
            # New average entry price, weighted by the units held.
            self.chg_price_mean[:] = (current_price_mean*current_mkt_position + \
                                        enter_price)/after_act_mkt_position
            self.chg_posi[:] = after_act_mkt_position
            self.chg_posi_var[:1] = 1
            self.chg_posi_entry_cover[:1] = 2

    def _short(self, open_posi, enter_price, current_mkt_position, current_price_mean):
        """Open a short position, or add one unit to an existing short."""
        if open_posi:
            self.chg_price_mean[:] = enter_price
            self.chg_posi[:] = -1
            self.chg_posi_var[:1] = -1
            self.chg_posi_entry_cover[:1] = 1
        else:
            after_act_mkt_position = current_mkt_position - 1
            # New average entry price, weighted by the units held.
            self.chg_price_mean[:] = (current_price_mean*abs(current_mkt_position) + \
                                      enter_price)/abs(after_act_mkt_position)
            self.chg_posi[:] = after_act_mkt_position
            self.chg_posi_var[:1] = -1
            self.chg_posi_entry_cover[:1] = 2

    def _short_cover(self, current_price_mean, current_mkt_position):
        """Buy back one unit of a short position and realise its reward."""
        self.chg_price_mean[:] = current_price_mean
        self.chg_posi[:] = current_mkt_position + 1
        self.chg_makereal[:1] = 1
        self.chg_reward[:] = ((self.chg_price - self.chg_price_mean)*(-1) - self.fee)*self.chg_makereal
        self.chg_posi_var[:1] = 1
        self.chg_posi_entry_cover[:1] = -1

    def _long_cover(self, current_price_mean, current_mkt_position):
        """Sell one unit of a long position and realise its reward."""
        self.chg_price_mean[:] = current_price_mean
        self.chg_posi[:] = current_mkt_position - 1
        self.chg_makereal[:1] = 1
        self.chg_reward[:] = ((self.chg_price - self.chg_price_mean)*(1) - self.fee)*self.chg_makereal
        self.chg_posi_var[:1] = -1
        self.chg_posi_entry_cover[:1] = -1

    def _stayon(self, current_price_mean, current_mkt_position):
        """Carry the current position and mean price into the new rows."""
        self.chg_posi[:] = current_mkt_position
        self.chg_price_mean[:] = current_price_mean

    def step(self, action):
        """Advance the window by ``step_len`` rows and apply ``action``.

        Returns
        -------
        tuple
            ``(observation, reward, done, reward_fluctuant, info)`` where
            ``reward`` and ``reward_fluctuant`` are the values at the first
            row of the new window and ``info`` stays ``None`` until the
            episode is done, when it becomes the sampled section joined with
            the transaction details.
        """
        current_index = self.step_st + self.obs_len -1
        current_price_mean = self.price_mean_arr[current_index]
        current_mkt_position = self.posi_arr[current_index]

        self.t_index += 1
        self.step_st += self.step_len

        # observation part
        self._refresh_observation_views()

        # change part: the trailing rows of each window, which this step may
        # write to
        self.chg_posi = self.obs_posi[-self.step_len:]
        self.chg_posi_var = self.obs_posi_var[-self.step_len:]
        self.chg_posi_entry_cover = self.obs_posi_entry_cover[-self.step_len:]
        self.chg_price = self.obs_price[-self.step_len:]
        self.chg_price_mean = self.obs_price_mean[-self.step_len:]
        self.chg_reward_fluctuant = self.obs_reward_fluctuant[-self.step_len:]
        self.chg_makereal = self.obs_makereal[-self.step_len:]
        self.chg_reward = self.obs_reward[-self.step_len:]

        done = False
        if self.step_st+self.obs_len+self.step_len >= len(self.price):
            done = True
            # The requested action is ignored on the last step; any open
            # position is closed out instead.
            action = -1
            if current_mkt_position != 0:
                self.chg_price_mean[:] = current_price_mean
                self.chg_posi[:] = 0
                self.chg_posi_var[:1] = -current_mkt_position
                self.chg_posi_entry_cover[:1] = -2
                self.chg_makereal[:1] = 1
                self.chg_reward[:] = ((self.chg_price - self.chg_price_mean)*(current_mkt_position) - abs(current_mkt_position)*self.fee)*self.chg_makereal

            self.transaction_details = pd.DataFrame([self.posi_arr,
                                                     self.posi_variation_arr,
                                                     self.posi_entry_cover_arr,
                                                     self.price_mean_arr,
                                                     self.reward_fluctuant_arr,
                                                     self.reward_makereal_arr,
                                                     self.reward_arr], 
                                                     index=['position', 'position_variation', 'entry_cover',
                                                            'price_mean', 'reward_fluctuant', 'reward_makereal',
                                                            'reward'], 
                                                     columns=self.df_sample.index).T

            self.info = self.df_sample.join(self.transaction_details)

        # use next tick, maybe choice avg in first 10 tick will be better to real backtest
        enter_price = self.chg_price[0]
        if action == 1 and self.max_position > current_mkt_position >= 0:
            open_posi = (current_mkt_position == 0)
            self._long(open_posi, enter_price, current_mkt_position, current_price_mean)

        elif action == 2 and -self.max_position < current_mkt_position <= 0:
            open_posi = (current_mkt_position == 0)
            self._short(open_posi, enter_price, current_mkt_position, current_price_mean)

        elif action == 1 and current_mkt_position < 0:
            self._short_cover(current_price_mean, current_mkt_position)

        elif action == 2 and current_mkt_position>0:
            self._long_cover(current_price_mean, current_mkt_position)

        # Already at the position cap: treat the request as "do nothing".
        elif action == 1 and current_mkt_position==self.max_position:
            action = 0

        elif action == 2 and current_mkt_position==-self.max_position:
            action = 0

        if action == 0:
            if current_mkt_position != 0:
                self._stayon(current_price_mean, current_mkt_position)

        self.chg_reward_fluctuant[:] = (self.chg_price - self.chg_price_mean)*self.chg_posi - np.abs(self.chg_posi)*self.fee

        self._assemble_observation()

        return self.obs_return , self.obs_reward[0], done ,self.obs_reward_fluctuant[0] ,self.info

    def _gen_trade_color(self, ind, long_entry=(0, 1, 0, 0.8), long_cover=(1, 1, 1, 0.5), 
                         short_entry=(1, 0, 0, 0.8), short_cover=(1, 1, 1, 0.5)): 
        """Return the marker colour for the trade at row ``ind``.

        Returns ``None`` when the row matches none of the four cases.
        """
        if self.posi_variation_arr[ind]>0 and self.posi_entry_cover_arr[ind]>0:
            return long_entry
        elif self.posi_variation_arr[ind]>0 and self.posi_entry_cover_arr[ind]<0:
            return long_cover
        elif self.posi_variation_arr[ind]<0 and self.posi_entry_cover_arr[ind]>0:
            return short_entry
        elif self.posi_variation_arr[ind]<0 and self.posi_entry_cover_arr[ind]<0:
            return short_cover 

    def _plot_trading(self):
        """Draw price, observation box, rewards, positions and trade markers."""
        seen = self.step_st+self.obs_len
        price_x = list(range(len(self.price[:seen])))

        # NOTE: this replaces the column name stored in ``price_plot`` by
        # __init__ with the list of plotted lines.
        self.price_plot = self.ax.plot(price_x, self.price[:seen], c=(0, 0.68, 0.95, 0.9),zorder=1)

        rect_high = self.obs_price.max() - self.obs_price.min()
        self.target_box = self.ax.add_patch(
                            patches.Rectangle(
                            (self.step_st, self.obs_price.min()), self.obs_len, rect_high,
                            label='observation',edgecolor=(0.9, 1, 0.2, 0.8),facecolor=(0.95,1,0.1,0.1),
                            linestyle='-',linewidth=1.5,
                            fill=True)
                            )
        fluctuant = self.reward_fluctuant_arr[:seen]
        self.fluc_reward_plot_p = self.ax2.fill_between(price_x, 0, fluctuant,
                                                        where=fluctuant>=0, 
                                                        facecolor=(1, 0.8, 0, 0.2), edgecolor=(1, 0.8, 0, 0.9), linewidth=0.8)
        self.fluc_reward_plot_n = self.ax2.fill_between(price_x, 0, fluctuant,
                                                        where=fluctuant<=0, 
                                                        facecolor=(0, 1, 0.8, 0.2), edgecolor=(0, 1, 0.8, 0.9), linewidth=0.8)
        position = self.posi_arr[:seen]
        self.posi_plot_long = self.ax3.fill_between(price_x, 0, position, 
                                                    where=position>=0, 
                                                    facecolor=(1, 0.5, 0, 0.2), edgecolor=(1, 0.5, 0, 0.9), linewidth=1)
        self.posi_plot_short = self.ax3.fill_between(price_x, 0, position, 
                                                     where=position<=0, 
                                                     facecolor=(0, 0.5, 1, 0.2), edgecolor=(0, 0.5, 1, 0.9), linewidth=1)
        cumulative_reward = self.reward_arr[:seen].cumsum()
        self.reward_plot_p = self.ax2.fill_between(price_x, 0, 
                                                   cumulative_reward,
                                                   where=cumulative_reward>=0,
                                                   facecolor=(0, 1, 0, 0.2), edgecolor=(0, 1, 0, 0.9), linewidth=1)
        self.reward_plot_n = self.ax2.fill_between(price_x, 0, 
                                                   cumulative_reward,
                                                   where=cumulative_reward<=0,
                                                   facecolor=(1, 0, 0, 0.2), edgecolor=(1, 0, 0, 0.9), linewidth=1)

        trade_x = self.posi_variation_arr.nonzero()[0]
        trade_x_buy = [i for i in trade_x if self.posi_variation_arr[i]>0]
        trade_x_sell = [i for i in trade_x if self.posi_variation_arr[i]<0]
        trade_y_buy = [self.price[i] for i in trade_x_buy]
        trade_y_sell =  [self.price[i] for i in trade_x_sell]
        trade_color_buy = [self._gen_trade_color(i) for i in trade_x_buy] 
        trade_color_sell = [self._gen_trade_color(i) for i in trade_x_sell]
        self.trade_plot_buy = self.ax.scatter(x=trade_x_buy, y=trade_y_buy, s=100, marker='^', 
                                              c=trade_color_buy, edgecolors=(0,1,0,0.4), zorder=2)
        self.trade_plot_sell = self.ax.scatter(x=trade_x_sell, y=trade_y_sell, s=100, marker='v', 
                                               c=trade_color_sell, edgecolors=(1,0,0,0.4), zorder=2)

    def _save_frame(self):
        """Write the current figure to ``fig/<t_index>.png``, creating ``fig`` if needed."""
        import os
        os.makedirs('fig', exist_ok=True)
        self.fig.savefig('fig/%s.png' % str(self.t_index))

    def render(self, save=False):
        """Draw (first call) or redraw (later calls) the trading chart.

        Parameters
        ----------
        save : bool
            When true, also save the frame as ``fig/<t_index>.png``.
        """
        if self.render_on == 0:
            matplotlib.style.use('dark_background')
            self.render_on = 1

            left, width = 0.1, 0.8
            rect1 = [left, 0.42, width, 0.55]
            rect2 = [left, 0.12, width, 0.3]
            rect3 = [left, 0.01, width, 0.11]

            self.fig = plt.figure(figsize=(15,8))
            self.fig.suptitle('%s'%self.df_sample['datetime'].iloc[0].date(), fontsize=14, fontweight='bold')
            self.ax = self.fig.add_axes(rect1)  # left, bottom, width, height
            self.ax2 = self.fig.add_axes(rect2, sharex=self.ax)
            self.ax3 = self.fig.add_axes(rect3, sharex=self.ax)
            self.ax.grid(color='gray', linestyle='-', linewidth=0.5)
            self.ax2.grid(color='gray', linestyle='-', linewidth=0.5)
            self.ax3.grid(color='gray', linestyle='-', linewidth=0.5)
            self.features_color = [c.rgb+(0.9,) for c in Color('yellow').range_to(Color('cyan'), self.feature_len)]
            self._plot_trading()

            self.ax.set_xlim(0,len(self.price[:self.step_st+self.obs_len])+200)
            plt.ion()
            plt.show()
            if save:
                self._save_frame()

        elif self.render_on == 1:
            # Artist.remove() works on every Matplotlib version; mutating
            # ``ax.lines`` directly is no longer supported on current ones.
            self.price_plot[0].remove()
            self.fluc_reward_plot_p.remove()
            self.fluc_reward_plot_n.remove()
            self.target_box.remove()
            self.reward_plot_p.remove()
            self.reward_plot_n.remove()
            self.posi_plot_long.remove()
            self.posi_plot_short.remove()
            self.trade_plot_buy.remove()
            self.trade_plot_sell.remove()

            self._plot_trading()

            self.ax.set_xlim(0,len(self.price[:self.step_st+self.obs_len])+200)
            if save:
                self._save_frame()
            plt.pause(0.0001)
  
#######################################################################################################################

# Output: Using TensorFlow backend.


# In [4]:
# Number of training episodes; also sets the epsilon annealing schedule.
EPISODES = 2000
# Capacity of the agent's replay memory.
memory_max = 2000


class DoubleDQNAgent:
    """Double DQN agent with an online network and a target network.

    Parameters
    ----------
    state_size : int
        Length of the flattened state vector fed to the network.
    action_size : int
        Number of discrete actions (network outputs).
    """

    def __init__(self, state_size, action_size):
        self.render = False
        self.load_model = False
        # get size of state and action
        self.state_size = state_size
        self.action_size = action_size

        # hyper-parameters for the Double DQN
        self.discount_factor = 0.95
        self.learning_rate = 0.001
        self.epsilon = 1.0
        self.epsilon_decay = 0.999  # not read by any method of this class
        self.epsilon_min = 0.01
        self.batch_size = 256 #64
        self.train_start = 1000
        # create replay memory using deque
        self.memory = deque(maxlen=memory_max)

        # create main model and target model
        self.model = self.build_model()
        self.target_model = self.build_model()

        # initialize target model
        self.update_target_model()

        if self.load_model:
            self.model.load_weights("./save_model/cartpole_ddqn.h5")

    def build_model(self):
        """Build and compile the Q-network: state in, one Q-value per action out."""
        model = Sequential()
        model.add(Dense(self.state_size+3, input_dim=self.state_size, activation='relu',
                        kernel_initializer='he_uniform'))

        model.add(Dense(self.state_size+3, activation='relu',
                        kernel_initializer='he_uniform'))

        model.add(Dense(self.action_size, activation='linear',
                        kernel_initializer='he_uniform'))

        model.summary()
        model.compile(loss='mse', optimizer=Adam(lr=self.learning_rate))
        return model

    def update_target_model(self):
        """Copy the online network's weights into the target network."""
        self.target_model.set_weights(self.model.get_weights())

    def get_action(self, state):
        """Pick an action with an epsilon-greedy policy."""
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        else:
            q_value = self.model.predict(state)
            return np.argmax(q_value[0])

    def append_sample(self, state, action, reward, next_state, done):
        """Store one transition and anneal epsilon linearly towards ``epsilon_min``."""
        self.memory.append((state, action, reward, next_state, done))
        if self.epsilon > self.epsilon_min :
            # Clamp so the last decrement cannot undershoot the floor.
            self.epsilon = max(self.epsilon_min,
                               self.epsilon - (1.0/EPISODES)/250)

    def train_model(self):
        """Fit the online network on one random mini-batch of stored transitions.

        Does nothing until the memory holds at least ``train_start`` samples.
        """
        if len(self.memory) < self.train_start:
            return
        batch_size = min(self.batch_size, len(self.memory))
        mini_batch = random.sample(self.memory, batch_size)

        update_input = np.asarray([sample[0] for sample in mini_batch],
                                  dtype=float).reshape(batch_size, self.state_size)
        action = np.asarray([sample[1] for sample in mini_batch], dtype=int)
        reward = np.asarray([sample[2] for sample in mini_batch], dtype=float)
        update_target = np.asarray([sample[3] for sample in mini_batch],
                                   dtype=float).reshape(batch_size, self.state_size)
        done = np.asarray([sample[4] for sample in mini_batch], dtype=bool)

        target = self.model.predict(update_input)
        target_next = self.model.predict(update_target)
        target_val = self.target_model.predict(update_target)

        # The key point of Double DQN: the online model selects the next
        # action, the target model supplies its value.  Terminal transitions
        # get no bootstrap term.
        rows = np.arange(batch_size)
        best_next = np.argmax(target_next, axis=1)
        bootstrap = np.where(done, 0.0, target_val[rows, best_next])
        target[rows, action] = reward + self.discount_factor * bootstrap

        # Use the actual batch size; the sampled batch may be smaller than
        # ``self.batch_size``.
        self.model.fit(update_input, target, batch_size=batch_size, epochs=1, verbose=0)

        

def feature(fename) :
    """Return the list of feature column names for the preset ``fename``.

    Recognised presets are ``'fullRelative'``, ``'fullPrice'`` and
    ``'fullRelativeShift'``; any other value selects the complete set
    (price columns, indicators and relative features).
    """
    # Price-level columns up to the moving averages.
    ohlc_and_ma = ['Open', 'High', 'Low', 'Close',
                   'ma2', 'ma3', 'ma4', 'ma5', 'ma10', 'ma50', 'ma200']
    oscillators = ['macd', 'macdsignal', 'macdhist', 'rsi']
    bands_and_extremes = ['upper', 'middle', 'lower',
                          'smin', 'smax', 'lmin', 'lmax']
    relative = ['rcloseopen', 'rclosehigh', 'rcloselow',
                'rma5ma10', 'rma10ma50', 'rma50ma200',
                'rma5upper', 'rma5middle', 'rma5lowwer',
                'rma5smin', 'rma5smax', 'rma5lmin', 'rma5lmax', 'rrsi']

    if fename == 'fullRelative':
        return relative
    if fename == 'fullPrice':
        return ohlc_and_ma + bands_and_extremes
    if fename == 'fullRelativeShift':
        # The shifted columns carry the same names with a '1' suffix.
        return relative + [name + '1' for name in relative]
    # Default: everything except the shifted relative columns.
    return ohlc_and_ma + oscillators + bands_and_extremes + relative

# In [1]:
if __name__ == "__main__":
    # Train the Double DQN agent on the trading environment, logging scores
    # and saving learning-curve plots after every episode.

    # os.path.join avoids the invalid "\S" escape of the original literal and
    # resolves to the same path on Windows.
    df = pd.read_csv(os.path.join('dataset', 'SET_wINDICATOR.csv'))
    df['datetime'] = pd.to_datetime(df['datetime'])

    # Feature preset: 'fullRelative', 'fullPrice', 'fullRelativeShift' or 'full'.
    fename, action_size = 'full', 3

    Obs_data_len , Step_len, Max_position = 1,5,5

    env = trading_env(obs_data_len=Obs_data_len, step_len=Step_len, max_position=Max_position,
                        df=df, fee=0.1, deal_col_name='Close', plot_col_name='middle',
                        feature_names=feature(fename=fename) ,return_transaction = True)

    # Derive the flattened state length from the environment instead of
    # hard-coding it: a value that disagrees with the chosen feature preset
    # makes np.reshape fail on the first observation.
    state_size = Obs_data_len * env.feature_len

    agent = DoubleDQNAgent(state_size, action_size)

    graph_dir = "cartpole/2-double-dqn/save_graph"
    model_dir = "cartpole/2-double-dqn/save_model"
    os.makedirs(graph_dir, exist_ok=True)
    os.makedirs(model_dir, exist_ok=True)

    scores, episodes, mean_scores = [], [], []

    for e in range(EPISODES):
        t3 = time.time()

        done = False
        score = 0
        # Per-step traces: realised score, score plus open-position
        # fluctuation, and the fluctuation itself.
        sc, nr, rf = [], [], []

        state = env.reset()
        state = np.reshape(state, [1, state_size])

        while not done:
            # get action for the current state and go one step in environment
            action = agent.get_action(state)
            next_state, reward, done, reward_fluc, info = env.step(action)
            next_state = np.reshape(next_state, [1, state_size])

            # Training reward: a flat -10 penalty whenever the open position
            # is under water, otherwise the environment reward.
            reward1 = reward
            if reward_fluc < 0 :
                reward1 = -10

            # save the sample <s, a, r, s'> to the replay memory
            agent.append_sample(state, action, reward1, next_state, done )
            agent.train_model()   # every time step do the training
            state = next_state

            # Each step, including the last one, is counted exactly once.
            score += reward
            sc.append(score)
            nr.append(score + reward_fluc)
            rf.append(reward_fluc)

        # every episode update the target model to be same with model
        agent.update_target_model()

        episodes.append(e)
        scores.append(score)
        mean_scores.append(np.mean(scores[-100:]) if len(scores) > 100 else 0)

        # Keep a per-step plot and a weight snapshot for the last episodes.
        if e > (EPISODES-50) :
            steps = range(len(sc))
            fig ,ax1 = plt.subplots()
            ax1.plot(steps, sc, 'b')
            ax1.plot(steps, nr, 'r')
            ax1.plot(steps, rf, 'g')
            ax1.grid(True)
            fig.savefig(graph_dir + "/ddqnVAL_FP256_{},{:0.0f}.png".format(e,score))
            plt.close('all')
            agent.model.save_weights(model_dir + "/ddqnVAL_FP256_{},{:0.0f}.h5".format(e,score))

        # Plot Graph Learning Progress
        pylab.plot(episodes, scores, 'b')
        pylab.plot(episodes, mean_scores, 'r')
        pylab.grid(True)
        pylab.savefig(graph_dir + "/a_DDQN_FP256_{:.0f}.png".format(EPISODES))
        plt.close('all')

        # Estimated finish time, extrapolated from this episode's duration.
        t4 = (time.time()-t3)
        t5 = time.ctime(time.time()+((EPISODES-e)*t4))
        print("Epoch: %s    Reward: %0.2f    Epsilon: %0.2f     %0.2f sec    %s" % (e,score,agent.epsilon,t4,t5))
        print()

      

                                
# Q-table:
#
#        left     right
# 0  0.000000  0.004320
# 1  0.000000  0.025005
# 2  0.000030  0.111241
# 3  0.000000  0.368750
# 4  0.027621  0.745813
# 5  0.000000  0.000000
