In [34]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import torch.optim as optim
from torch.nn.utils import clip_grad_norm_
import random
import math
from torch.utils.tensorboard import SummaryWriter
from collections import deque, namedtuple
import time
import gym
import os
from stable_baselines3.common.vec_env import DummyVecEnv
def weight_init(layers):
    for layer in layers:
        torch.nn.init.kaiming_normal_(layer.weight, nonlinearity='relu')

In [35]:
from stable_baselines3.common.vec_env.base_vec_env import VecEnv, VecEnvStepReturn, VecEnvWrapper


In [36]:
import numpy as np
import pandas as pd
from stockstats import StockDataFrame as Sdf

import datetime

import datetime
import os

TRAINING_DATA_FILE = "dataprocessing/Yfinance_Data.csv"

now = datetime.datetime.now()
TRAINED_MODEL_DIR = f"trained_models/{now}"
os.makedirs(TRAINED_MODEL_DIR)

TESTING_DATA_FILE = "test.csv"

def load_dataset(*, file_name: str) -> pd.DataFrame:
    """
    load csv dataset from path
    :return: (df) pandas dataframe
    """
    # _data = pd.read_csv(f"{config.DATASET_DIR}/{file_name}")
    _data = pd.read_csv(file_name)

    return _data


def data_split(df, start, end):
    """
    split the dataset into training or testing using date
    :param data: (df) pandas dataframe, start, end
    :return: (df) pandas dataframe
    """
    data = df[(df.datadate >= start) & (df.datadate < end)]
    data = data.sort_values(['datadate', 'tic'], ignore_index=True)


    # data  = data[final_columns]
    data.index = data.datadate.factorize()[0]


    return data


def calculate_price(df):
    """
    calcualte adjusted close price, open-high-low price and volume
    :param data: (df) pandas dataframe
    :return: (df) pandas dataframe
    """
    data = df.copy()

    data = data[['Date', 'tic', 'Close', 'Open', 'High', 'Low', 'Volume','datadate']]
    data = data.sort_values(['tic', 'datadate'], ignore_index=True)
    return data


def add_technical_indicator(df):
    """
    calcualte technical indicators
    use stockstats package to add technical inidactors
    :param data: (df) pandas dataframe
    :return: (df) pandas dataframe
    """
    stock = Sdf.retype(df.copy())

    #print(stock)

    unique_ticker = stock.tic.unique()

    macd = pd.DataFrame()
    rsi = pd.DataFrame()
    cci = pd.DataFrame()
    dx = pd.DataFrame()

    # temp = stock[stock.tic == unique_ticker[0]]['macd']
    for i in range(len(unique_ticker)):
        ## macd
        temp_macd = stock[stock.tic == unique_ticker[i]]['macd']
        temp_macd = pd.DataFrame(temp_macd)
        macd = macd.append(temp_macd, ignore_index=True)
        ## rsi
        temp_rsi = stock[stock.tic == unique_ticker[i]]['rsi_30']
        temp_rsi = pd.DataFrame(temp_rsi)
        rsi = rsi.append(temp_rsi, ignore_index=True)
        ## cci
        temp_cci = stock[stock.tic == unique_ticker[i]]['cci_30']
        temp_cci = pd.DataFrame(temp_cci)
        cci = cci.append(temp_cci, ignore_index=True)
        ## adx
        temp_dx = stock[stock.tic == unique_ticker[i]]['dx_30']
        temp_dx = pd.DataFrame(temp_dx)
        dx = dx.append(temp_dx, ignore_index=True)

    df['macd'] = macd
    df['rsi'] = rsi
    df['cci'] = cci
    df['adx'] = dx

    return df


def preprocess_data():
    """data preprocessing pipeline"""
    start = datetime.datetime(2010, 12, 1)
    df = load_dataset(file_name=TRAINING_DATA_FILE)
    # get data after 2010
    # df = df[df.Date >= start]
    # calcualte adjusted price
    df_preprocess = calculate_price(df)
    # add technical indicators using stockstats
    df_final = add_technical_indicator(df_preprocess)
    # fill the missing values at the beginning
    df_final.fillna(method='bfill', inplace=True)
    return df_final



In [37]:
class IQN(nn.Module):
    def __init__(self, state_size, action_size,layer_size, n_step, seed, layer_type="ff"):
        super(IQN, self).__init__()
        self.seed = torch.manual_seed(seed)
        self.input_shape = state_size
        self.action_size = action_size
        self.K = 32
        self.N = 8
        self.n_cos = 64
        self.layer_size = layer_size
        self.pis = torch.FloatTensor([np.pi*i for i in range(self.n_cos)]).view(1,1,self.n_cos).to(device) # Starting from 0 as in the paper 

        self.head = nn.Linear(self.input_shape, layer_size) # cound be a cnn 
        self.cos_embedding = nn.Linear(self.n_cos, layer_size)
        self.ff_1 = nn.Linear(layer_size, layer_size)
        self.ff_2 = nn.Linear(layer_size, action_size)
        #weight_init([self.head_1, self.ff_1])


        
    def calc_cos(self, batch_size, n_tau=8):
        """
        Calculating the cosinus values depending on the number of tau samples
        """
        taus = torch.rand(batch_size, n_tau).to(device).unsqueeze(-1) #(batch_size, n_tau, 1)
        cos = torch.cos(taus*self.pis)

        assert cos.shape == (batch_size,n_tau,self.n_cos), "cos shape is incorrect"
        return cos, taus
    
    def forward(self, input, num_tau=8):
        """
        Quantile Calculation depending on the number of tau
        
        Return:
        quantiles [ shape of (batch_size, num_tau, action_size)]
        taus [shape of ((batch_size, num_tau, 1))]
        
        """
        batch_size = input.shape[0]
        
        #print("self.head(input):{}".format(self.head(input).shape))
        
        x = torch.relu(self.head(input))
    
        #print("batch_size:{}".format(batch_size))
        #print("X:{}".format(x.shape))
        
        cos, taus = self.calc_cos(batch_size, num_tau) # cos shape (batch, num_tau, layer_size)
        
        #print("cos:{}".format(cos.shape))
        #print("taus:{}".format(taus.shape))
        
        
        cos = cos.view(batch_size*num_tau, self.n_cos)
        cos_x = torch.relu(self.cos_embedding(cos)).view(batch_size, num_tau, self.layer_size) # (batch, n_tau, layer)
        
        # x has shape (batch, layer_size) for multiplication –> reshape to (batch, 1, layer)
        #x = (x.unsqueeze(1)*cos_x).view(batch_size*num_tau, self.layer_size)
        #print("x:{},cos_x Shape:{},batch_size:{},layer_size:{}".format(x.shape,cos_x.shape,batch_size,self.layer_size))
        x = (x.unsqueeze(1) * cos_x).view(batch_size * num_tau, self.layer_size)
        #print("---------°°°°°°°°°------X----------°°°°°°°°°°-------:{}".format(x.shape))
        
        x = torch.relu(self.ff_1(x))
        out = self.ff_2(x)
        #print("---------°°°°°°°°°------out----------°°°°°°°°°°-------:{}".format(out.shape))
        
        return out.view(batch_size, num_tau, self.action_size), taus
    
    def get_action(self, inputs):
        quantiles, _ = self.forward(inputs, self.K)
        #print("quantiles:{}".format(quantiles.shape))
        actions = quantiles.mean(dim=1) #TODO: actions space= torch.Size([1, 32, 30])
        #print("action space quantile:{}".format(actions))
        return actions

In [38]:
class ReplayBuffer:
    """Fixed-size buffer to store experience tuples."""

    def __init__(self, buffer_size, batch_size, device, seed, gamma, n_step=1):
        """Initialize a ReplayBuffer object.
        Params
        ======
            buffer_size (int): maximum size of buffer
            batch_size (int): size of each training batch
            seed (int): random seed
        """
        self.device = device
        self.memory = deque(maxlen=buffer_size)  
        self.batch_size = batch_size
        self.experience = namedtuple("Experience", field_names=["state", "action", "reward", "next_state", "done"])
        self.seed = random.seed(seed)
        self.gamma = gamma
        self.n_step = n_step
        self.n_step_buffer = deque(maxlen=self.n_step)
    
    def add(self, state, action, reward, next_state, done):
        """Add a new experience to memory."""
        #print("before:", state,action,reward,next_state, done)
        self.n_step_buffer.append((state, action, reward, next_state, done))
        if len(self.n_step_buffer) == self.n_step:
            state, action, reward, next_state, done = self.calc_multistep_return()
            #print("after:",state,action,reward,next_state, done)
            e = self.experience(state, action, reward, next_state, done)
            self.memory.append(e)
    
    def calc_multistep_return(self):
        Return = 0
        for idx in range(self.n_step):
            Return += self.gamma**idx * self.n_step_buffer[idx][2]
        
        return self.n_step_buffer[0][0], self.n_step_buffer[0][1], Return, self.n_step_buffer[-1][3], self.n_step_buffer[-1][4]
        
    
    
    def sample(self):
        """Randomly sample a batch of experiences from memory."""
        experiences = random.sample(self.memory, k=self.batch_size)

        states = torch.from_numpy(np.stack([e.state for e in experiences if e is not None])).float().to(self.device)
        actions = torch.from_numpy(np.vstack([e.action for e in experiences if e is not None])).long().to(self.device)
        rewards = torch.from_numpy(np.vstack([e.reward for e in experiences if e is not None])).float().to(self.device)
        next_states = torch.from_numpy(np.stack([e.next_state for e in experiences if e is not None])).float().to(self.device)
        dones = torch.from_numpy(np.vstack([e.done for e in experiences if e is not None]).astype(np.uint8)).float().to(self.device)
  
        return (states, actions, rewards, next_states, dones)

    def __len__(self):
        """Return the current size of internal memory."""
        return len(self.memory)

In [39]:


class DQN_Agent():
    """Interacts with and learns from the environment."""

    def __init__(self,
                 state_size,
                 action_size,
                 layer_size,
                 n_step,
                 BATCH_SIZE,
                 BUFFER_SIZE,
                 LR,
                 TAU,
                 GAMMA,
                 UPDATE_EVERY,
                 device,
                 seed):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            layer_size (int): size of the hidden layer
            BATCH_SIZE (int): size of the training batch
            BUFFER_SIZE (int): size of the replay memory
            LR (float): learning rate
            TAU (float): tau for soft updating the network weights
            GAMMA (float): discount factor
            UPDATE_EVERY (int): update frequency
            device (str): device that is used for the compute
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        self.device = device
        self.TAU = TAU
        self.GAMMA = GAMMA
        self.UPDATE_EVERY = UPDATE_EVERY
        self.BATCH_SIZE = BATCH_SIZE
        self.Q_updates = 0
        self.n_step = n_step
        self.action = []

        self.action_step = 30

        # IQN-Network
        self.qnetwork_local = IQN(state_size, action_size,layer_size, n_step, seed).to(device)
        self.qnetwork_target = IQN(state_size, action_size,layer_size, n_step, seed).to(device)

        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)
        #print(self.qnetwork_local)
        
        # Replay memory
        self.memory = ReplayBuffer(BUFFER_SIZE, BATCH_SIZE, self.device, seed, self.GAMMA, n_step)
        
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0
    
    def step(self, state, action, reward, next_state, done, writer):
        # Save experience in replay memory
        #print("to memory action:{},state:{},next_state".format(action,state.shape,next_state.shape))
        self.memory.add(state, action, reward, next_state, done)
        
        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % self.UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > self.BATCH_SIZE:
                
                experiences = self.memory.sample()
                #print("experiences:{}".format(experiences))
                loss = self.learn(experiences)
                self.Q_updates += 1
                writer.add_scalar("Q_loss", loss, self.Q_updates)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy. Acting only every 4 frames!
        
        Params
        ======
            frame: to adjust epsilon
            state (array_like): current state
            
        """
        #print("without np.array:{}".format(state.shape))

        state = np.array(state)

        #print("this is the state space before torch.from_numpy:{}".format(state.shape))
        
        state = torch.from_numpy(state).float().unsqueeze(0).to(self.device) #WHY?
        
        #print("this is the state space after torch.from_numpy:{}".format(state.shape))
        
        
        self.qnetwork_local.eval() #WHY?
        
        
        with torch.no_grad():
            #print("this is the state space:{}".format(state.shape))
            action_values = self.qnetwork_local.get_action(state) # 30 dimensions are coming back.
            #print('action_value:{}'.format(action_values))
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps: # select greedy action if random number is higher than epsilon or noisy network is used!
            action = np.argmax(action_values.cpu().data.numpy())
            self.last_action = action
            return action
        else:
            action = random.choice(np.arange(self.action_size))
            self.last_action = action 
            return action



    def learn(self, experiences):
        """Update value parameters using given batch of experience tuples.
        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        self.optimizer.zero_grad()
        states, actions, rewards, next_states, dones = experiences
        #print("learning states:{}, next_states:{}".format(states.shape, next_states.shape))
        # Get max predicted Q values (for next states) from target model
        Q_targets_next, _ = self.qnetwork_target(next_states)
        
        #print("--Q_targets_next :{}".format(Q_targets_next.shape))
        
        #print("---->Q_targets_next:{}".format(Q_targets_next))
        #print('------------------------------------')
        #print("--Q_targets_next detach max:{}".format(Q_targets_next.detach().max(2)))
        
        
        
        #print("--Q_targets_next.detach().max(2)[0].unsqueeze(1):{}".format(Q_targets_next.detach().max(2)[0].unsqueeze(1)))
        Q_targets_next = Q_targets_next.detach().max(2)[0].unsqueeze(1) # (batch_size, 1, N)
        
        # Compute Q targets for current states 
        Q_targets = rewards.unsqueeze(-1) + (self.GAMMA**self.n_step * Q_targets_next * (1. - dones.unsqueeze(-1)))
        # Get expected Q values from local model
        Q_expected, taus = self.qnetwork_local(states)
        
        #print("rewards:{}".format(rewards.shape))
        #print("Q_targets_Shape:{}".format(Q_targets.shape))
        #print("actions shape:{}".format(actions.shape))
        #print("Q_expected shape:{}".format(Q_expected.shape))
        #print("actions.unsqueeze(-1).shape:{}".format(actions.unsqueeze(-1).shape))
        #print("actions:{}".format(actions))
        #print("Q_expected:{}".format(Q_expected))
        #print("actions.unsqueeze(-1){}".format(actions.unsqueeze(-1)))
        Q_expected_2 = Q_expected.gather(2, actions.unsqueeze(-1))

        #print("Q_expected.gather(2, actions.unsqueeze(-1):{}".format(Q_expected_2.shape))
        
        Q_expected = Q_expected.gather(2, actions[0].unsqueeze(-1).expand(self.BATCH_SIZE, 8, 1))
        #print("Final what we need Q_expected-----:{}".format(Q_expected.shape))

        # Quantile Huber loss
        td_error = Q_targets - Q_expected
        #print("td_error.shape:{}".format(td_error.shape))
        #print("Q_expected.shape:{}".format(Q_expected.shape))
        #print("td_error:{}".format(td_error.shape))
        assert td_error.shape == (self.BATCH_SIZE, 8, 8), "wrong td error shape"
        huber_l = calculate_huber_loss(td_error, 1.0)
        quantil_l = abs(taus -(td_error.detach() < 0).float()) * huber_l / 1.0
        
        loss = quantil_l.sum(dim=1).mean(dim=1) # , keepdim=True if per weights get multipl
        loss = loss.mean()


        # Minimize the loss
        loss.backward()
        #clip_grad_norm_(self.qnetwork_local.parameters(),1)
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target)
        return loss.detach().cpu().numpy()            

    def soft_update(self, local_model, target_model):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target
        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(self.TAU*local_param.data + (1.0-self.TAU)*target_param.data)


def calculate_huber_loss(td_errors, k=1.0):
    """
    Calculate huber loss element-wisely depending on kappa k.
    """
    loss = torch.where(td_errors.abs() <= k, 0.5 * td_errors.pow(2), k * (td_errors.abs() - 0.5 * k))
    #print('this is huber loss: {}'.format(loss.shape))
    assert loss.shape == (td_errors.shape[0], 8, 8), "huber loss has wrong shape"
    return loss
    
def eval_runs(eps, frame):
    """
    Makes an evaluation run with the current epsilon
    """
    print("-----------------------------------------evaluating-----------------------------------------")
    env = gym.make("Acrobot-v1") # TODO:
    reward_batch = []
    for i in range(5):
        state = env.reset()
        rewards = 0
        while True:
            action = agent.act(state, eps)
            state, reward, done, _ = env.step(action)
            rewards += reward
            if done:
                break
        reward_batch.append(rewards)
        
    writer.add_scalar("Reward", np.mean(reward_batch), frame)

In [57]:
import matplotlib.pyplot as plt
%matplotlib inline
from IPython import display

import numpy as np
import pandas as pd
from gym.utils import seeding
import gym
import os
from gym import spaces
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import pickle

# shares normalization factor
# 100 shares per trade
HMAX_NORMALIZE = 100
# initial amount of money we have in our account
INITIAL_ACCOUNT_BALANCE= 1000
# total number of stocks in our portfolio
STOCK_DIM = 3
# transaction fee: 1/1000 reasonable percentage
TRANSACTION_FEE_PERCENT = 0.001
REWARD_SCALING = 1e-4

class StockEnvTrain(gym.Env):
    """A stock trading environment for OpenAI gym"""
    metadata = {'render.modes': ['human']}

    def __init__(self, df,day = 0):
        #super(StockEnv, self).__init__()
        #money = 10 , scope = 1
        self.day = day
        self.df = df
        self.agent_stock_iteration_index = 0

        # action_space normalization and shape is STOCK_DIM
        self.action_space = spaces.Box(low = -1, high = 1,shape = (STOCK_DIM,)) 
        # Shape = 181: [Current Balance]+[prices 1-30]+[owned shares 1-30] 
        # +[macd 1-30]+ [rsi 1-30] + [cci 1-30] + [adx 1-30]
        self.observation_space = spaces.Box(low=0, high=np.inf, shape = (19,))
        # load data from a pandas dataframe
        #print('df: {}'.format(self.df))
        #print('day: {}'.format(self.day))
        self.data = self.df.loc[self.day,:]
        #print(self.data.Close)
        self.terminal = False



        # initalize state
        self.state = [INITIAL_ACCOUNT_BALANCE] + \
                      self.data.Close.values.tolist() + \
                      [0]*STOCK_DIM + \
                      self.data.macd.values.tolist() + \
                      self.data.rsi.values.tolist() + \
                      self.data.cci.values.tolist() + \
                      self.data.adx.values.tolist()
        # initialize reward
        self.reward = 0
        self.cost = 0
        # memorize all the total balance change
        self.asset_memory = [INITIAL_ACCOUNT_BALANCE]
        self.rewards_memory = []
        self.final_asset_value = 0
        self.trades = 0
        #self.reset()
        self._seed()


    def _sell_stock(self, index, action):
        # perform sell action based on the sign of the action
        if self.state[index+STOCK_DIM+1] > 0:
            #update balance
            self.state[0] += \
            self.state[index+1]*min(abs(action),self.state[index+STOCK_DIM+1]) * \
             (1- TRANSACTION_FEE_PERCENT)

            self.state[index+STOCK_DIM+1] -= min(abs(action), self.state[index+STOCK_DIM+1])
            self.cost +=self.state[index+1]*min(abs(action),self.state[index+STOCK_DIM+1]) * \
             TRANSACTION_FEE_PERCENT
            self.trades+=1
        else:
            pass

    
    def _buy_stock(self, index, action):
        # perform buy action based on the sign of the action
        available_amount = self.state[0] // self.state[index+1]
        # print('available_amount:{}'.format(available_amount))

        #update balance
        self.state[0] -= self.state[index+1]*min(available_amount, action)* \
                          (1+ TRANSACTION_FEE_PERCENT)

        self.state[index+STOCK_DIM+1] += min(available_amount, action)

        self.cost+=self.state[index+1]*min(available_amount, action)* \
                          TRANSACTION_FEE_PERCENT
        self.trades+=1
        
    def step(self, actions):
        # print(self.day)
        self.terminal = self.day >= len(self.df.index.unique())-1
        #print(actions)
        self.actions = actions
        if self.terminal:
            print("Finished")
            print(self.state)
            end_total_asset = self.state[0]+ \
            sum(np.array(self.state[1:(STOCK_DIM+1)])*np.array(self.state[(STOCK_DIM+1):(STOCK_DIM*2+1)]))

            print("end_total_asset:{}".format(end_total_asset))
            df_total_value = pd.DataFrame(self.asset_memory)
            #df_total_value.to_csv('results/account_value_train.csv')
            #print("total_reward:{}".format(self.state[0]+sum(np.array(self.state[1:(STOCK_DIM+1)])*np.array(self.state[(STOCK_DIM+1):61]))- INITIAL_ACCOUNT_BALANCE ))
            #print("total_cost: ", self.cost)
            #print("total_trades: ", self.trades)
            df_total_value.columns = ['account_value']
            df_total_value['daily_return']=df_total_value.pct_change(1)
            sharpe = (252**0.5)*df_total_value['daily_return'].mean()/ \
                  df_total_value['daily_return'].std()
            print("Sharpe: ",sharpe)
            #print("=================================")
            df_rewards = pd.DataFrame(self.rewards_memory)
            #df_rewards.to_csv('results/account_rewards_train.csv')

            # print('total asset: {}'.format(self.state[0]+ sum(np.array(self.state[1:29])*np.array(self.state[29:]))))
            #with open('obs.pkl', 'wb') as f:  
            #    pickle.dump(self.state, f)

            return self.state, self.reward, self.terminal,{}

        else:
            # print(np.array(self.state[1:29]))
            #print("The actions is: {}".format(self.actions))

            #action = np.array([4,4,5])
            #actions = np.array([4,0,0,0,0,0,0,0,4,0,4,0,-3,0,0,0,0,0,0,0,4,0,0,0,0,0,0,0,0,0])

            #actions = self.actions * HMAX_NORMALIZE #WHY??
            #print("actions-index------:{}".format(actions))
            #actions = (actions.astype(int))

            begin_total_asset = self.state[0]+ \
            sum(np.array(self.state[1:(STOCK_DIM+1)])*np.array(self.state[(STOCK_DIM+1):(STOCK_DIM*2+1)]))
            #print("begin_total_asset:{}".format(begin_total_asset))

            argsort_actions = np.argsort(actions) #TODO: this may not be touched.
            #print("The actions is: {}".format(actions))

            sell_index = argsort_actions[:np.where(actions == 0)[0].shape[0]]
            #sell_index = argsort_actions[4,0,0,0,0,0,0,0,4,0,4,0,-3,0,0,0,0,0,0,0,4,0,0,0,0,0,0,0,0,0]
            #print("sell-index------:{}".format(sell_index))
            buy_index = argsort_actions[::-1][:np.where(actions == 2)[0].shape[0]]
            #buy_index = argsort_actions[::-1][4,0,0,0,0,0,0,0,4,0,4,0,-3,0,0,0,0,0,0,0,4,0,0,0,0,0,0,0,0,0]
            #print("buy-index------:{}".format(buy_index))

            for index in sell_index:
            # print('take sell action'.format(actions[index]))
                #print("--------Action Shape:{}".format(actions.shape))
                self._sell_stock(index+ self.agent_stock_iteration_index, 1)

            for index in buy_index:
                #print("--------Action Shape:{}".format(actions.shape))
            # print('take buy action: {}'.format(actions[index]))
                self._buy_stock(index+ self.agent_stock_iteration_index, 1)
                
            
            #print("self.day:{}".format(self.day))
            
            

                
            #load next state
            # print("stock_shares:{}".format(self.state[29:]))
            self.state =  [self.state[0]] + \
                self.data.Close.values.tolist() + \
                list(self.state[(STOCK_DIM+1):(STOCK_DIM*2+1)]) + \
                self.data.macd.values.tolist() + \
                self.data.rsi.values.tolist() + \
                self.data.cci.values.tolist() + \
                self.data.adx.values.tolist()

            end_total_asset = self.state[0]+ \
            sum(np.array(self.state[1:(STOCK_DIM+1)])*np.array(self.state[(STOCK_DIM+1):(STOCK_DIM*2+1)]))
            self.asset_memory.append(end_total_asset)
            #print("end_total_asset:{}".format(end_total_asset))

            self.reward = end_total_asset - begin_total_asset            
            # print("step_reward:{}".format(self.reward))
            self.rewards_memory.append(self.reward)

            self.reward = self.reward*REWARD_SCALING
            
            self.agent_stock_iteration_index += 1 
            if self.agent_stock_iteration_index ==3:
                self.day += 1
                self.data = self.df.loc[self.day,:]
                self.agent_stock_iteration_index = 0
            
            

        return self.state, self.reward, self.terminal, {}

    def reset(self):
        self.final_asset_value = 0
        self.trades = 0
        self.asset_memory = [INITIAL_ACCOUNT_BALANCE]
        self.day = 0
        self.data = self.df.loc[self.day,:]
        self.cost = 0
        self.trades = 0
        self.terminal = False 
        self.rewards_memory = []
        self.agent_stock_iteration_index = 0
        #initiate state
        self.state = [INITIAL_ACCOUNT_BALANCE] + \
                      self.data.Close.values.tolist() + \
                      [0]*STOCK_DIM + \
                      self.data.macd.values.tolist() + \
                      self.data.rsi.values.tolist() + \
                      self.data.cci.values.tolist() + \
                      self.data.adx.values.tolist() 
        # iteration += 1 
        #print("[0]*STOCK_DIM:{}".format([0]*STOCK_DIM))
        #print("self.state:{}".format(len(self.state)))
        print(np.array(self.state))
        return np.array(self.state)
    
    def render(self, mode='human'):
        return self.state

    def _seed(self, seed=None):
        self.np_random, seed = seeding.np_random(seed)
        return [seed]


def run(env,frames=1000, eps_fixed=False, eps_frames=1e6, min_eps=0.01):
    """Deep Q-Learning.
    
    Params
    ======
        n_episodes (int): maximum number of training episodes
        max_t (int): maximum number of timesteps per episode
        eps_start (float): starting value of epsilon, for epsilon-greedy action selection
        eps_end (float): minimum value of epsilon
        eps_decay (float): multiplicative factor (per episode) for decreasing epsilon
    """
    scores = []                        # list containing scores from each episode
    scores_window = deque(maxlen=100)  # last 100 scores
    output_history = []
    frame = 0
    if eps_fixed:
        eps = 0
    else:
        eps = 1
    eps_start = 1
    i_episode = 1
    state = env.reset()
    state = state[0,:]
    #print("state space:{}".format(state[0,:].shape))
    score = 0                  
    for frame in range(1, frames+1):
        
        if frame  == 0:
            # inital state

            initial = True
        else:
            # previous state
            initial = False
        #print('initial state:{}'.format(initial))
        
        
        
        
        action = agent.act(state, eps) #TODO: getting one dimension back.
        
        
        action = np.array([action])
        
        
        next_state, reward, done, info = env_train.step([action]) #TODO: Wants a list of actions of size a

        #print("env_trainNext State: {}".format(next_state.shape))

        next_state = next_state[0,:]
        
        agent.step(state, action, reward, next_state, done, writer)
        

        state = next_state
        score += reward
        # linear annealing to the min epsilon value until eps_frames and from there slowly decease epsilon to 0 until the end of training
        if eps_fixed == False:
            if frame < eps_frames:
                eps = max(eps_start - (frame*(1/eps_frames)), min_eps)
            else:
                eps = max(min_eps - min_eps*((frame-eps_frames)/(frames-eps_frames)), 0.001)

        # evaluation runs
        if frame % 100000 == 0:
            print("score: {}".format(state))
            print("score: {}".format(score))
            #print("state: {}".format(state))
            print("action:{}, Number:{}".format(action,frame))
            print("-------------------------")
        
        if done:
            scores_window.append(score)       # save most recent score
            scores.append(score)              # save most recent score
            writer.add_scalar("Average100", np.mean(scores_window), frame)
            output_history.append(np.mean(scores_window))
            print('\rEpisode {}\tFrame {} \tAverage Score: {:.2f}'.format(i_episode, frame, np.mean(scores_window)), end="")
            if i_episode % 100 == 0:
                print('\rEpisode {}\tFrame {}\tAverage Score: {:.2f}'.format(i_episode,frame, np.mean(scores_window)))
            i_episode +=1 

            state = env.reset()
            state = state[0,:]
            score = 0              

    return output_history


if __name__ == "__main__":
    
        # read and preprocess data
    preprocessed_path = "done_3stocks.csv"
    if os.path.exists(preprocessed_path):
        data = pd.read_csv(preprocessed_path, index_col=0)

    unique_trade_date = data[(data.datadate > 20151001)&(data.datadate <= 20200707)].datadate.unique()
    #print(unique_trade_date)

    
    train = data_split(data, start=20100101, end=20160101)
    
    env_train = DummyVecEnv([lambda: StockEnvTrain(train)])
    
    writer = SummaryWriter("runs/"+"IQN_CP_5")
    seed = 1
    BUFFER_SIZE = 10000
    BATCH_SIZE = 8
    GAMMA = 0.99
    TAU = 1e-2
    LR = 1e-3
    UPDATE_EVERY = 1
    n_step = 1
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    print("Using ", device)


    action_size     = env_train.action_space.shape[0]

    
    
    print('Action Space: {}'.format(action_size))
    state_size = env_train.observation_space.shape[0]
    print('State Space: {}'.format(state_size))

    agent = DQN_Agent(state_size=19,    #181 #4
                        action_size=3, #30 #7
                        layer_size=512, #512, #512
                        n_step=n_step,
                        BATCH_SIZE=BATCH_SIZE, 
                        BUFFER_SIZE=BUFFER_SIZE, 
                        LR=LR, 
                        TAU=TAU, 
                        GAMMA=GAMMA, 
                        UPDATE_EVERY=UPDATE_EVERY, 
                        device=device, 
                        seed=seed)



    # set epsilon frames to 0 so no epsilon exploration
    eps_fixed = False
    t0 = time.time()
    final_average100 = run(env=env_train, frames = 600000, eps_fixed=eps_fixed, eps_frames=5000, min_eps=0.025)
    t1 = time.time()
    
    print("Training time: {}min".format(round((t1-t0)/60,2)))
    torch.save(agent.qnetwork_local.state_dict(), "IQN"+".pth")


Using  cpu
Action Space: 3
State Space: 19
[1.00000000e+03 3.05728571e+01 4.09200000e+01 5.61800000e+01
 0.00000000e+00 0.00000000e+00 0.00000000e+00 5.60928564e-01
 3.58722215e-01 6.24669814e-01 6.21337044e+01 5.67090572e+01
 5.85111632e+01 1.68824972e+02 2.09000651e+00 8.31548524e+01
 3.37911464e+01 1.15370605e+01 1.08349581e+01]
Finished
[1315.2506514799938, 107.32, 70.16, 146.41, 1.0, 0.0, 1.0, -2.466788775, -0.5672052238, -0.1475155644, 42.30524752, 44.89929352, 52.58918069, -98.67248174, -15.74083296, 29.12710167, 29.43194077, 7.763541354, 9.869031232]
end_total_asset:1568.9806514799939
Sharpe:  0.38604505994930216
[1.00000000e+03 3.05728571e+01 4.09200000e+01 5.61800000e+01
 0.00000000e+00 0.00000000e+00 0.00000000e+00 5.60928564e-01
 3.58722215e-01 6.24669814e-01 6.21337044e+01 5.67090572e+01
 5.85111632e+01 1.68824972e+02 2.09000651e+00 8.31548524e+01
 3.37911464e+01 1.15370605e+01 1.08349581e+01]
Episode 1	Frame 4528 	Average Score: 0.06[1.00000000e+03 3.05728571e+01 4.092000

KeyboardInterrupt: 