In [64]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import torch.optim as optim
from torch.nn.utils import clip_grad_norm_
import random
import math
from torch.utils.tensorboard import SummaryWriter
from collections import deque, namedtuple
import time
import gym
import os
from stable_baselines3.common.vec_env import DummyVecEnv
def weight_init(layers):
    for layer in layers:
        torch.nn.init.kaiming_normal_(layer.weight, nonlinearity='relu')

In [65]:
from stable_baselines3.common.vec_env.base_vec_env import VecEnv, VecEnvStepReturn, VecEnvWrapper


In [264]:

class Actor(nn.Module):
    """Actor (Policy) Model."""

    def __init__(self, state_size, action_size, seed, hidden_size=256):
        """Initialize parameters and build model.
        Params
        ======
            state_size (int): Dimension of each state
            action_size (int): Dimension of each action
            seed (int): Random seed
            fc1_units (int): Number of nodes in first hidden layer
            fc2_units (int): Number of nodes in second hidden layer
        """
        super(Actor, self).__init__()
        self.seed = torch.manual_seed(seed)
        self.fc1 = nn.Linear(state_size, hidden_size)
        self.batch_norm = nn.BatchNorm1d(hidden_size) ## seems to improve the final performance a lot
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, action_size)
        self.reset_parameters()

    def reset_parameters(self):
        self.fc1.weight.data.uniform_(*hidden_init(self.fc1))
        self.fc2.weight.data.uniform_(*hidden_init(self.fc2))
        self.fc3.weight.data.uniform_(-3e-3, 3e-3)

    def forward(self, state):
        """Build an actor (policy) network that maps states -> actions."""
        x = torch.relu(self.fc1(state)) #self.batch_norm
        x = torch.relu(self.fc2(x))
        return torch.tanh(self.fc3(x))


class Critic(nn.Module):
    """Critic (Value) Model."""

    def __init__(self, state_size, action_size, seed, hidden_size=256):
        """Initialize parameters and build model.
        Params
        ======
            state_size (int): Dimension of each state
            action_size (int): Dimension of each action
            seed (int): Random seed
            fcs1_units (int): Number of nodes in the first hidden layer
            fc2_units (int): Number of nodes in the second hidden layer
        """
        super(Critic, self).__init__()
        self.seed = torch.manual_seed(seed)
        self.fcs1 = nn.Linear(state_size, hidden_size)
        self.batch_norm = nn.BatchNorm1d(hidden_size)
        self.fc2 = nn.Linear(hidden_size+action_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, 1)
        self.reset_parameters()

    def reset_parameters(self):
        self.fcs1.weight.data.uniform_(*hidden_init(self.fcs1))
        self.fc2.weight.data.uniform_(*hidden_init(self.fc2))
        self.fc3.weight.data.uniform_(-3e-3, 3e-3)

    def forward(self, state, action):
        """Build a critic (value) network that maps (state, action) pairs -> Q-values."""
        xs = F.relu(self.fcs1(state)) # self.batch_norm
        x = torch.cat((xs, action), dim=1)
        x = F.relu(self.fc2(x))
        return self.fc3(x)

class IQN(nn.Module):
    def __init__(self, state_size, action_size, layer_size, seed, N, dueling=False, device="cuda:0"):
        super(IQN, self).__init__()
        self.seed = torch.manual_seed(seed)
        self.input_shape = state_size
        self.action_size = action_size
        self.N = N  
        self.n_cos = 64
        self.layer_size = layer_size
        self.pis = torch.FloatTensor([np.pi*i for i in range(1,self.n_cos+1)]).view(1,1,self.n_cos).to(device) # Starting from 0 as in the paper 
        self.dueling = dueling
        self.device = device

        # Network Architecture

        self.head = nn.Linear(self.action_size+self.input_shape, layer_size) 
        self.cos_embedding = nn.Linear(self.n_cos, layer_size)
        self.ff_1 = nn.Linear(layer_size, layer_size)
        self.ff_2 = nn.Linear(layer_size, 1)    
        #weight_init([self.head_1, self.ff_1])

    def calc_input_layer(self):
        x = torch.zeros(self.input_shape).unsqueeze(0)
        x = self.head(x)
        return x.flatten().shape[0]
        
    def calc_cos(self, batch_size, n_tau=32):
        """
        Calculating the cosinus values depending on the number of tau samples
        """
        taus = torch.rand(batch_size, n_tau).unsqueeze(-1).to(self.device) #(batch_size, n_tau, 1)  .to(self.device)
        cos = torch.cos(taus*self.pis)

        assert cos.shape == (batch_size,n_tau,self.n_cos), "cos shape is incorrect"
        return cos, taus
    
    def forward(self, input, action, num_tau=32):
        """
        Quantile Calculation depending on the number of tau
        
        Return:
        quantiles [ shape of (batch_size, num_tau, action_size)]
        taus [shape of ((batch_size, num_tau, 1))]
        
        """
        batch_size = input.shape[0]

        x = torch.cat((input, action), dim=1)
        x = torch.relu(self.head(x  ))
        
        cos, taus = self.calc_cos(batch_size, num_tau) # cos shape (batch, num_tau, layer_size)
        cos = cos.view(batch_size*num_tau, self.n_cos)
        cos_x = torch.relu(self.cos_embedding(cos)).view(batch_size, num_tau, self.layer_size) # (batch, n_tau, layer)
        
        # x has shape (batch, layer_size) for multiplication –> reshape to (batch, 1, layer)
        x = (x.unsqueeze(1)*cos_x).view(batch_size*num_tau, self.layer_size)  #batch_size*num_tau, self.cos_layer_out
        # Following reshape and transpose is done to bring the action in the same shape as batch*tau:
        # first 32 entries are tau for each action -> thats why each action one needs to be repeated 32 times 
        # x = [[tau1   action = [[a1
        #       tau1              a1   
        #        ..               ..
        #       tau2              a2
        #       tau2              a2
        #       ..]]              ..]]  
        #action = action.repeat(num_tau,1).reshape(num_tau,batch_size*self.action_size).transpose(0,1).reshape(batch_size*num_tau,self.action_size)
        #x = torch.cat((x,action),dim=1)
        x = torch.relu(self.ff_1(x))

        out = self.ff_2(x)
        
        return out.view(batch_size, num_tau, 1), taus
    
    def get_qvalues(self, inputs, action):
        quantiles, _ = self.forward(inputs, action, self.N)
        actions = quantiles.mean(dim=1)
        return actions  



class DeepActor(nn.Module):
    """Actor (Policy) Model."""

    def __init__(self, state_size, action_size, seed, hidden_size=256):
        """Initialize parameters and build model.
        Params
        ======
            state_size (int): Dimension of each state
            action_size (int): Dimension of each action
            seed (int): Random seed
            fc1_units (int): Number of nodes in first hidden layer
            fc2_units (int): Number of nodes in second hidden layer
        """
        super(DeepActor, self).__init__()
        self.seed = torch.manual_seed(seed)
        self.input_size = hidden_size+state_size
        self.fc1 = nn.Linear(state_size, hidden_size)
        #self.batch_norm = nn.BatchNorm1d(fc1_units)
        self.fc2 = nn.Linear(self.input_size, hidden_size)
        self.fc3 = nn.Linear(self.input_size, hidden_size)
        self.fc4 = nn.Linear(self.input_size, hidden_size)
        self.fc5 = nn.Linear(hidden_size, action_size)
        self.reset_parameters()

    def reset_parameters(self):
        self.fc1.weight.data.uniform_(*hidden_init(self.fc1))
        self.fc2.weight.data.uniform_(*hidden_init(self.fc2))
        self.fc3.weight.data.uniform_(*hidden_init(self.fc3)) 
        self.fc4.weight.data.uniform_(*hidden_init(self.fc4))
        self.fc5.weight.data.uniform_(-3e-3, 3e-3)

    def forward(self, state):
        """Build an actor (policy) network that maps states -> actions."""
        x = torch.relu(self.fc1(state))
        x = torch.cat((x,state), dim=1)
        x = torch.relu(self.fc2(x))
        x = torch.cat((x,state), dim=1)
        x = torch.relu(self.fc3(x))
        x = torch.cat((x,state), dim=1)
        x = torch.relu(self.fc4(x))
        return torch.tanh(self.fc5(x))


class DeepCritic(nn.Module):
    """Critic (Value) Model."""

    def __init__(self, state_size, action_size, seed, hidden_size=256):
        """Initialize parameters and build model.
        Params
        ======
            state_size (int): Dimension of each state
            action_size (int): Dimension of each action
            seed (int): Random seed
            fcs1_units (int): Number of nodes in the first hidden layer
            fc2_units (int): Number of nodes in the second hidden layer
        """
        super(DeepCritic, self).__init__()
        self.seed = torch.manual_seed(seed)
        self.input_dim = hidden_size+action_size+state_size
        self.fc1 = nn.Linear(state_size+action_size, hidden_size)
        #.batch_norm = nn.BatchNorm1d(fcs1_units)
        self.fc2 = nn.Linear(self.input_dim, hidden_size)
        self.fc3 = nn.Linear(self.input_dim, hidden_size)
        self.fc4 = nn.Linear(self.input_dim, hidden_size)
        self.fc5 = nn.Linear(hidden_size, 1)
        self.reset_parameters()

    def reset_parameters(self):
        self.fc1.weight.data.uniform_(*hidden_init(self.fc1))
        self.fc2.weight.data.uniform_(*hidden_init(self.fc2))
        self.fc3.weight.data.uniform_(*hidden_init(self.fc3))
        self.fc4.weight.data.uniform_(*hidden_init(self.fc4))
        self.fc5.weight.data.uniform_(-3e-3, 3e-3)

    def forward(self, state, action):
        """Build a critic (value) network that maps (state, action) pairs -> Q-values."""
        xu = torch.cat((state, action), dim=1)
        x = F.relu(self.fc1(xu))
        x = torch.cat((x, xu), dim=1)
        x = F.relu(self.fc2(x))
        x = torch.cat((x, xu), dim=1)
        x = F.relu(self.fc3(x))
        x = torch.cat((x, xu), dim=1)
        x = F.relu(self.fc4(x))
        return self.fc5(x)

class DeepIQN(nn.Module):
    def __init__(self, state_size, action_size, layer_size, seed, N, dueling=False, device="cuda:0"):
        super(DeepIQN, self).__init__()
        self.seed = torch.manual_seed(seed)
        self.input_shape = state_size
        self.action_size = action_size
        self.input_dim = action_size+state_size+layer_size
        self.N = N  
        self.n_cos = 64
        self.layer_size = layer_size
        self.pis = torch.FloatTensor([np.pi*i for i in range(1,self.n_cos+1)]).view(1,1,self.n_cos).to(device) # Starting from 0 as in the paper 
        self.dueling = dueling
        self.device = device

        # Network Architecture

        self.head = nn.Linear(self.action_size+self.input_shape, layer_size) 
        self.ff_1 = nn.Linear(self.input_dim, layer_size)
        self.ff_2 = nn.Linear(self.input_dim, layer_size)
        self.cos_embedding = nn.Linear(self.n_cos, layer_size)
        self.ff_3 = nn.Linear(self.input_dim, layer_size)
        self.ff_4 = nn.Linear(self.layer_size, 1)    
        #weight_init([self.head_1, self.ff_1])  

    def calc_input_layer(self):
        x = torch.zeros(self.input_shape).unsqueeze(0)
        x = self.head(x)
        return x.flatten().shape[0]
        
    def calc_cos(self, batch_size, n_tau=32):
        """
        Calculating the cosinus values depending on the number of tau samples
        """
        taus = torch.rand(batch_size, n_tau).unsqueeze(-1).to(self.device) #(batch_size, n_tau, 1)  .to(self.device)
        cos = torch.cos(taus*self.pis)

        assert cos.shape == (batch_size,n_tau,self.n_cos), "cos shape is incorrect"
        return cos, taus
    
    def forward(self, input, action, num_tau=32):
        """
        Quantile Calculation depending on the number of tau
        
        Return:
        quantiles [ shape of (batch_size, num_tau, action_size)]
        taus [shape of ((batch_size, num_tau, 1))]
        
        """
        batch_size = input.shape[0]
        xs = torch.cat((input, action), dim=1)
        x = torch.relu(self.head(xs))
        x = torch.cat((x, xs), dim=1)
        x = torch.relu(self.ff_1(x))
        x = torch.cat((x, xs), dim=1)
        x = torch.relu(self.ff_2(x))

        cos, taus = self.calc_cos(batch_size, num_tau) # cos shape (batch, num_tau, layer_size)
        cos = cos.view(batch_size*num_tau, self.n_cos)
        cos_x = torch.relu(self.cos_embedding(cos)).view(batch_size, num_tau, self.layer_size) # (batch, n_tau, layer)
        
        # x has shape (batch, layer_size) for multiplication –> reshape to (batch, 1, layer)
        x = (x.unsqueeze(1)*cos_x).view(batch_size*num_tau, self.layer_size)  #batch_size*num_tau, self.cos_layer_out
        # Following reshape and transpose is done to bring the action in the same shape as batch*tau:
        # first 32 entries are tau for each action -> thats why each action one needs to be repeated 32 times 
        # x = [[tau1   action = [[a1
        #       tau1              a1   
        #        ..               ..
        #       tau2              a2
        #       tau2              a2
        #       ..]]              ..]]  
        action = action.repeat(num_tau,1).reshape(num_tau,batch_size*self.action_size).transpose(0,1).reshape(batch_size*num_tau,self.action_size)
        state = input.repeat(num_tau,1).reshape(num_tau,batch_size*self.input_shape).transpose(0,1).reshape(batch_size*num_tau,self.input_shape)
        
        x = torch.cat((x,action,state),dim=1)
        x = torch.relu(self.ff_3(x))

        out = self.ff_4(x)
        
        return out.view(batch_size, num_tau, 1), taus
    
    def get_qvalues(self, inputs, action):
        quantiles, _ = self.forward(inputs, action, self.N)
        actions = quantiles.mean(dim=1)
        return actions  


In [265]:
import numpy as np
import pandas as pd
from stockstats import StockDataFrame as Sdf

import datetime

import datetime
import os

TRAINING_DATA_FILE = "dataprocessing/Yfinance_Data.csv"

now = datetime.datetime.now()
TRAINED_MODEL_DIR = f"trained_models/{now}"
os.makedirs(TRAINED_MODEL_DIR)

TESTING_DATA_FILE = "test.csv"

def load_dataset(*, file_name: str) -> pd.DataFrame:
    """
    load csv dataset from path
    :return: (df) pandas dataframe
    """
    # _data = pd.read_csv(f"{config.DATASET_DIR}/{file_name}")
    _data = pd.read_csv(file_name)

    return _data


def data_split(df, start, end):
    """
    split the dataset into training or testing using date
    :param data: (df) pandas dataframe, start, end
    :return: (df) pandas dataframe
    """
    data = df[(df.datadate >= start) & (df.datadate < end)]
    data = data.sort_values(['datadate', 'tic'], ignore_index=True)


    # data  = data[final_columns]
    data.index = data.datadate.factorize()[0]


    return data


def calculate_price(df):
    """
    calcualte adjusted close price, open-high-low price and volume
    :param data: (df) pandas dataframe
    :return: (df) pandas dataframe
    """
    data = df.copy()

    data = data[['Date', 'tic', 'Close', 'Open', 'High', 'Low', 'Volume','datadate']]
    data = data.sort_values(['tic', 'datadate'], ignore_index=True)
    return data


def add_technical_indicator(df):
    """
    calcualte technical indicators
    use stockstats package to add technical inidactors
    :param data: (df) pandas dataframe
    :return: (df) pandas dataframe
    """
    stock = Sdf.retype(df.copy())

    #print(stock)

    unique_ticker = stock.tic.unique()

    macd = pd.DataFrame()
    rsi = pd.DataFrame()
    cci = pd.DataFrame()
    dx = pd.DataFrame()

    # temp = stock[stock.tic == unique_ticker[0]]['macd']
    for i in range(len(unique_ticker)):
        ## macd
        temp_macd = stock[stock.tic == unique_ticker[i]]['macd']
        temp_macd = pd.DataFrame(temp_macd)
        macd = macd.append(temp_macd, ignore_index=True)
        ## rsi
        temp_rsi = stock[stock.tic == unique_ticker[i]]['rsi_30']
        temp_rsi = pd.DataFrame(temp_rsi)
        rsi = rsi.append(temp_rsi, ignore_index=True)
        ## cci
        temp_cci = stock[stock.tic == unique_ticker[i]]['cci_30']
        temp_cci = pd.DataFrame(temp_cci)
        cci = cci.append(temp_cci, ignore_index=True)
        ## adx
        temp_dx = stock[stock.tic == unique_ticker[i]]['dx_30']
        temp_dx = pd.DataFrame(temp_dx)
        dx = dx.append(temp_dx, ignore_index=True)

    df['macd'] = macd
    df['rsi'] = rsi
    df['cci'] = cci
    df['adx'] = dx

    return df


def preprocess_data():
    """data preprocessing pipeline"""
    start = datetime.datetime(2010, 12, 1)
    df = load_dataset(file_name=TRAINING_DATA_FILE)
    # get data after 2010
    # df = df[df.Date >= start]
    # calcualte adjusted price
    df_preprocess = calculate_price(df)
    # add technical indicators using stockstats
    df_final = add_technical_indicator(df_preprocess)
    # fill the missing values at the beginning
    df_final.fillna(method='bfill', inplace=True)
    return df_final

In [274]:
import torch.nn.functional as F
import torch.optim as optim
from torch.nn.utils import clip_grad_norm_

class Agent():
    """Interacts with and learns from the environment."""
    
    def __init__(self, state_size,
                      action_size,
                      n_step,
                      per, 
                      munchausen,
                      distributional,
                      D2RL,
                      random_seed=2,
                      hidden_size=400,
                      BUFFER_SIZE = int(1e6),  # replay buffer size
                      BATCH_SIZE = 32,        # minibatch size
                      GAMMA = 0.99,            # discount factor
                      TAU = 1e-3,              # for soft update of target parameters
                      LR_ACTOR = 1e-3,         # learning rate of the actor 
                      LR_CRITIC = 1e-3,        # learning rate of the critic
                      WEIGHT_DECAY = 1e-2,        # L2 weight decay
                      LEARN_EVERY = 1,
                      LEARN_NUMBER = 1,
                      EPSILON = 1.0,
                      EPSILON_DECAY = 1,
                      device = "cuda",
                      frames = 100000,
                      worker=1
                      ):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.BUFFER_SIZE = BUFFER_SIZE
        self.BATCH_SIZE = BATCH_SIZE
        self.per = per
        self.munchausen = munchausen
        self.n_step = n_step
        self.distributional = distributional
        self.D2RL = D2RL
        self.GAMMA = GAMMA
        self.TAU = TAU
        self.LEARN_EVERY = LEARN_EVERY
        self.LEARN_NUMBER = LEARN_NUMBER
        self.EPSILON_DECAY = EPSILON_DECAY
        self.epsilon = 0.04
        self.device = device
        self.seed = random.seed(random_seed)
        # distributional Values
        self.N = 32
        self.entropy_coeff = 0.001
        # munchausen values
        self.entropy_tau = 0.03
        self.lo = -1
        self.alpha = 0.9
        self.last_action = []
        self.action = []
        self.eta = torch.FloatTensor([.1]).to(device)
        
        print("Using: ", device)
        print("seed agent: ", self.seed)
        
        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size, random_seed, hidden_size=hidden_size).to(device)
        self.actor_target = Actor(state_size, action_size, random_seed, hidden_size=hidden_size).to(device)

        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

        # Critic Network (w/ Target Network)

        #self.critic_local = DeepIQN(state_size, action_size, layer_size=hidden_size, device=device, seed=random_seed, dueling=None, N=self.N).to(device)
        #self.critic_target = DeepIQN(state_size, action_size, layer_size=hidden_size, device=device, seed=random_seed, dueling=None, N=self.N).to(device)
        
        self.critic_local = Critic(state_size, action_size, random_seed).to(device)
        self.critic_target = Critic(state_size, action_size, random_seed).to(device)
        
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY)

        print("Actor: \n", self.actor_local)
        print("\nCritic: \n", self.critic_local)

        #self.memory = ReplayBuffer(BUFFER_SIZE, BATCH_SIZE, n_step=n_step, parallel_env=worker, device=device, seed=random_seed, gamma=GAMMA)
        #self.memory = PrioritizedReplay(device, BUFFER_SIZE, self.BATCH_SIZE, gamma=self.GAMMA, n_step=n_step, parallel_env=1)
        self.memory = PrioritizedReplay(BUFFER_SIZE, BATCH_SIZE, device=device, seed=random_seed, gamma=GAMMA, n_step=n_step, parallel_env=worker, beta_frames=frames)
        self.noise = OUNoise(action_size, random_seed)
        self.learn = self.learn_
        
        

        
    def step(self, state, action, reward, next_state, done, timestamp, writer):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        
        #rint('agent step reward:{}'.format(reward))
        self.memory.add(state, action, reward, next_state, done)

        
        
        # Learn, if enough samples are available in memory
        if len(self.memory) > self.BATCH_SIZE and timestamp % self.LEARN_EVERY == 0:
            for _ in range(self.LEARN_NUMBER):
                
                experiences = self.memory.sample()
                #print(experiences)
                losses = self.learn(experiences, self.GAMMA)
            writer.add_scalar("Critic_loss", losses[0], timestamp)
            writer.add_scalar("Actor_loss", losses[1], timestamp)

    def act(self, state):
        """Returns actions for given state as per current policy."""
        #state = torch.from_numpy(state).float().to(self.device)
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        #print(state)
        
        print('state.shape[0]:{}'.format(state.shape))
        #print('self.state_size:{}'.format(self.state_size))
    

        assert state.shape == (state.shape[0],self.state_size), "shape: {}".format(state.shape)
        self.actor_local.eval()
        with torch.no_grad():
                action = self.actor_local(state).cpu().data.numpy().squeeze(0)
        self.actor_local.train()
        
        # Epsilon-greedy action selection

        # Epsilon-greedy action selection
        """
        if random.random() > eps: # select greedy action if random number is higher than epsilon or noisy network is used!
            action = np.argmax(action_values.cpu().data.numpy())
            self.last_action = action
            return action
        else:
            action = random.choice(np.arange(self.action_size))
            self.last_action = action """
        action += self.noise.sample() * self.epsilon
        
        return action #np.clip(action, -1, 1)

    def reset(self):
        #print('self.seed',self.seed)
        self.noise.reset()

    
    def soft_update(self, local_model, target_model):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(self.TAU*local_param.data + (1.0-self.TAU)*target_param.data)


    def learn_(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value
        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones, idx, weights = experiences
        icm_loss = 0


        # ---------------------------- update critic ---------------------------- #

        with torch.no_grad():
            actions_next = self.actor_target(next_states.to(self.device))
            Q_targets_next = self.critic_target(next_states.to(self.device), actions_next.to(self.device))
            # Compute Q targets for current states (y_i)
            Q_targets = rewards + (gamma**self.n_step * Q_targets_next * (1 - dones))
       
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        td_error =  Q_targets - Q_expected
        critic_loss = (td_error.pow(2)*weights.to(self.device)).mean().to(self.device)

        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target)
        self.soft_update(self.actor_local, self.actor_target)                     

        self.memory.update_priorities(idx, np.clip(abs(td_error.data.cpu().numpy()),-1,1))
        # ----------------------- update epsilon and noise ----------------------- #
        
        self.epsilon *= self.EPSILON_DECAY
        
        self.noise.reset()
        return critic_loss.detach().cpu().numpy(), actor_loss.detach().cpu().numpy(), icm_loss
            
    def learn_distribution(self, experiences, gamma):
            """Update policy and value parameters using given batch of experience tuples.
            Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
            where:
                actor_target(state) -> action
                critic_target(state, action) -> Q-value

            Params
            ======
                experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
                gamma (float): discount factor
            """
            states, actions, rewards, next_states, dones, idx, weights = experiences
            
            #print('states:{}'.format(states.shape))
            #print('rewards:{}'.format(rewards))
            # ---------------------------- update critic ---------------------------- #
            # Get predicted next-state actions and Q values from target models

            # Get max predicted Q values (for next states) from target model
            with torch.no_grad():
                next_actions = self.actor_local(next_states)
                Q_targets_next, _ = self.critic_target(next_states, next_actions, self.N)
                Q_targets_next = Q_targets_next.transpose(1,2)
            # Compute Q targets for current states 
            Q_targets = rewards.unsqueeze(-1) + (self.GAMMA**self.n_step * Q_targets_next.to(self.device) * (1. - dones.unsqueeze(-1)))
                
            # Get expected Q values from local model
            Q_expected, taus = self.critic_local(states, actions, self.N)
            assert Q_targets.shape == (self.BATCH_SIZE, 1, self.N)
            assert Q_expected.shape == (self.BATCH_SIZE, self.N, 1)
    
            # Quantile Huber loss
            td_error = Q_targets - Q_expected
            assert td_error.shape == (self.BATCH_SIZE, self.N, self.N), "wrong td error shape"
            huber_l = calculate_huber_loss(td_error, 1.0)
            quantil_l = abs(taus -(td_error.detach() < 0).float()) * huber_l / 1.0

            critic_loss = quantil_l.sum(dim=1).mean(dim=1).mean()
            # Minimize the loss
            self.critic_optimizer.zero_grad()
            critic_loss.backward()
            clip_grad_norm_(self.critic_local.parameters(), 1)
            self.critic_optimizer.step()

            # ---------------------------- update actor ---------------------------- #
            # Compute actor loss
            actions_pred = self.actor_local(states)
            actor_loss = -self.critic_local.get_qvalues(states, actions_pred).mean()
            # Minimize the loss
            self.actor_optimizer.zero_grad()
            actor_loss.backward()
            self.actor_optimizer.step()

            # ----------------------- update target networks ----------------------- #
            self.soft_update(self.critic_local, self.critic_target)
            self.soft_update(self.actor_local, self.actor_target)
            
            self.epsilon *= self.EPSILON_DECAY
            self.noise.reset()

            return critic_loss.detach().cpu().numpy(), actor_loss.detach().cpu().numpy()


    
def calculate_huber_loss(td_errors, k=1.0):
    """
    Calculate huber loss element-wisely depending on kappa k.
    """
    loss = torch.where(td_errors.abs() <= k, 0.5 * td_errors.pow(2), k * (td_errors.abs() - 0.5 * k))
    assert loss.shape == (td_errors.shape[0], 32, 32), "huber loss has wrong shape"
    return loss

In [275]:


class DeepActor(nn.Module):
    """Actor (Policy) Model."""

    def __init__(self, state_size, action_size, seed, hidden_size=256):
        """Initialize parameters and build model.
        Params
        ======
            state_size (int): Dimension of each state
            action_size (int): Dimension of each action
            seed (int): Random seed
            fc1_units (int): Number of nodes in first hidden layer
            fc2_units (int): Number of nodes in second hidden layer
        """
        super(DeepActor, self).__init__()
        self.seed = torch.manual_seed(seed)
        self.input_size = hidden_size+state_size
        self.fc1 = nn.Linear(state_size, hidden_size)
        #self.batch_norm = nn.BatchNorm1d(fc1_units)
        self.fc2 = nn.Linear(self.input_size, hidden_size)
        self.fc3 = nn.Linear(self.input_size, hidden_size)
        self.fc4 = nn.Linear(self.input_size, hidden_size)
        self.fc5 = nn.Linear(hidden_size, action_size)
        self.reset_parameters()

    def reset_parameters(self):
        self.fc1.weight.data.uniform_(*hidden_init(self.fc1))
        self.fc2.weight.data.uniform_(*hidden_init(self.fc2))
        self.fc3.weight.data.uniform_(*hidden_init(self.fc3)) 
        self.fc4.weight.data.uniform_(*hidden_init(self.fc4))
        self.fc5.weight.data.uniform_(-3e-3, 3e-3)

    def forward(self, state):
        """Build an actor (policy) network that maps states -> actions."""
        x = torch.relu(self.fc1(state))
        x = torch.cat((x,state), dim=1)
        x = torch.relu(self.fc2(x))
        x = torch.cat((x,state), dim=1)
        x = torch.relu(self.fc3(x))
        x = torch.cat((x,state), dim=1)
        x = torch.relu(self.fc4(x))
        return torch.tanh(self.fc5(x))

In [276]:
import copy
class OUNoise:
    """Ornstein-Uhlenbeck process."""

    def __init__(self, size, seed, mu=0., theta=0.15, sigma=0.2):
        """Initialize parameters and noise process."""
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.seed = random.seed(seed)
        self.reset()

    def reset(self):
        """Reset the internal state (= noise) to mean (mu)."""
        self.state = copy.copy(self.mu)

    def sample(self):
        """Update internal state and return it as a noise sample."""
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.array([random.random() for i in range(len(x))])
        self.state = x + dx
        return self.state


def hidden_init(layer):
    fan_in = layer.weight.data.size()[0]
    lim = 1. / np.sqrt(fan_in)
    return (-lim, lim)

def weight_init(layers):
    for layer in layers:
        torch.nn.init.kaiming_normal_(layer.weight, nonlinearity='relu')

def weight_init_xavier(layers):
    for layer in layers:
        torch.nn.init.xavier_uniform_(layer.weight, gain=0.01)


class IQN(nn.Module):
    def __init__(self, state_size, action_size, layer_size, seed, N, dueling=False, device="cuda:0"):
        super(IQN, self).__init__()
        self.seed = torch.manual_seed(seed)
        self.input_shape = state_size
        self.action_size = action_size
        self.N = N  
        self.n_cos = 64
        self.layer_size = layer_size
        self.pis = torch.FloatTensor([np.pi*i for i in range(1,self.n_cos+1)]).view(1,1,self.n_cos).to(device) # Starting from 0 as in the paper 
        self.dueling = dueling
        self.device = device

        # Network Architecture

        self.head = nn.Linear(self.action_size+self.input_shape, layer_size) 
        self.cos_embedding = nn.Linear(self.n_cos, layer_size)
        self.ff_1 = nn.Linear(layer_size, layer_size)
        self.ff_2 = nn.Linear(layer_size, 1)    
        #weight_init([self.head_1, self.ff_1])

    def calc_input_layer(self):
        x = torch.zeros(self.input_shape).unsqueeze(0)
        x = self.head(x)
        return x.flatten().shape[0]
        
    def calc_cos(self, batch_size, n_tau=32):
        """
        Calculating the cosinus values depending on the number of tau samples
        """
        taus = torch.rand(batch_size, n_tau).unsqueeze(-1).to(self.device) #(batch_size, n_tau, 1)  .to(self.device)
        cos = torch.cos(taus*self.pis)

        assert cos.shape == (batch_size,n_tau,self.n_cos), "cos shape is incorrect"
        return cos, taus
    
    def forward(self, input, action, num_tau=32):
        """
        Quantile Calculation depending on the number of tau
        
        Return:
        quantiles [ shape of (batch_size, num_tau, action_size)]
        taus [shape of ((batch_size, num_tau, 1))]
        
        """
        batch_size = input.shape[0]

        print('input', input.shape)
        x = torch.cat((input, action), dim=1)
        x = torch.relu(self.head(x  ))
        
        cos, taus = self.calc_cos(batch_size, num_tau) # cos shape (batch, num_tau, layer_size)
        cos = cos.view(batch_size*num_tau, self.n_cos)
        cos_x = torch.relu(self.cos_embedding(cos)).view(batch_size, num_tau, self.layer_size) # (batch, n_tau, layer)
        
        # x has shape (batch, layer_size) for multiplication –> reshape to (batch, 1, layer)
        x = (x.unsqueeze(1)*cos_x).view(batch_size*num_tau, self.layer_size)  #batch_size*num_tau, self.cos_layer_out
        # Following reshape and transpose is done to bring the action in the same shape as batch*tau:
        # first 32 entries are tau for each action -> thats why each action one needs to be repeated 32 times 
        # x = [[tau1   action = [[a1
        #       tau1              a1   
        #        ..               ..
        #       tau2              a2
        #       tau2              a2
        #       ..]]              ..]]  
        #action = action.repeat(num_tau,1).reshape(num_tau,batch_size*self.action_size).transpose(0,1).reshape(batch_size*num_tau,self.action_size)
        #x = torch.cat((x,action),dim=1)
        x = torch.relu(self.ff_1(x))

        out = self.ff_2(x)
        
        return out.view(batch_size, num_tau, 1), taus
    
    def get_qvalues(self, inputs, action):
        quantiles, _ = self.forward(inputs, action, self.N)
        actions = quantiles.mean(dim=1)
        return actions  

In [277]:

class PrioritizedReplay(object):
    """
    Proportional Prioritization
    """
    def __init__(self, capacity, batch_size, device, seed, gamma=0.99, n_step=1, parallel_env=1, alpha=0.6, beta_start = 0.4, beta_frames=100000):
        self.alpha = alpha
        self.beta_start = beta_start
        self.beta_frames = beta_frames
        self.device = device
        self.frame = 1 #for beta calculation
        self.batch_size = batch_size
        self.capacity   = capacity
        self.buffer     = deque(maxlen=capacity)
        self.experience = namedtuple("Experience", field_names=["state", "action", "reward", "next_state", "done"])
        self.pos        = 0
        self.priorities = deque(maxlen=capacity)
        self.seed = np.random.seed(seed)
        self.parallel_env = parallel_env
        self.n_step = n_step
        self.n_step_buffer = [deque(maxlen=self.n_step) for i in range(parallel_env)]
        self.iter_ = 0
        self.memory = deque(maxlen=capacity) 
        self.gamma = gamma

    def calc_multistep_return(self,n_step_buffer):
        Return = 0
        for idx in range(self.n_step):
            Return += self.gamma**idx * n_step_buffer[idx][2]
        
        return n_step_buffer[0][0], n_step_buffer[0][1], Return, n_step_buffer[-1][3], n_step_buffer[-1][4]

    def beta_by_frame(self, frame_idx):
        """
        Linearly increases beta from beta_start to 1 over time from 1 to beta_frames.
        
        3.4 ANNEALING THE BIAS (Paper: PER)
        We therefore exploit the flexibility of annealing the amount of importance-sampling
        correction over time, by defining a schedule on the exponent 
        that reaches 1 only at the end of learning. In practice, we linearly anneal from its initial value 0 to 1
        """
        return min(1.0, self.beta_start + frame_idx * (1.0 - self.beta_start) / self.beta_frames)
    
    def add(self, state, action, reward, next_state, done):
        if self.iter_ == self.parallel_env:
            self.iter_ = 0
        assert state.ndim == next_state.ndim
        state      = np.expand_dims(state, 0)
        next_state = np.expand_dims(next_state, 0)
        action = torch.from_numpy(action).unsqueeze(0)

        # n_step calc
        self.n_step_buffer[self.iter_].append((state, action, reward, next_state, done))
        if len(self.n_step_buffer[self.iter_]) == self.n_step:
            state, action, reward, next_state, done = self.calc_multistep_return(self.n_step_buffer[self.iter_])
            e = self.experience(state, action, reward, next_state, done)
            self.memory.append(e)

        max_prio = np.array(self.priorities, dtype=float).max() if self.buffer else 1.0 # gives max priority if buffer is not empty else 1
        

        self.buffer.append((state, action, reward, next_state, done))
        self.priorities.append(max_prio)
        self.iter_ += 1

        
    def sample(self):
        N = len(self.buffer)
        prios = np.array(self.priorities, dtype=float)
        assert N == len(prios)
            
        # calc P = p^a/sum(p^a)
        probs  = prios ** self.alpha
        P = probs/probs.sum()
        
        #gets the indices depending on the probability p
        indices = np.random.choice(N, self.batch_size, p=P) 
        samples = [self.buffer[idx] for idx in indices]
        
        beta = self.beta_by_frame(self.frame)
        #print(beta)
        self.frame+=1
                
        #Compute importance-sampling weight
        weights  = (N * P[indices])**(-beta)
        # normalize weights
        weights /= weights.max() 
        weights  = np.array(weights, dtype=np.float32) 
        
        
        CurrentSequence = indices[0]
        
        if CurrentSequence < 8:
            if len(self.memory) < 16:
                indices = np.random.choice(N, 8, p=P)
                #print('second indices:{}'.format(indices))
                SequenceOfSampling=indices
            else: 
                SequenceOfSampling = [CurrentSequence, CurrentSequence+1,CurrentSequence+2,CurrentSequence+3,CurrentSequence+4,CurrentSequence+5,CurrentSequence+6,CurrentSequence+7]
        else:
            SequenceOfSampling = [CurrentSequence-7,CurrentSequence-6,CurrentSequence-5,CurrentSequence-4,CurrentSequence-3,CurrentSequence-2,CurrentSequence-1,CurrentSequence]
        
        #print(SequenceOfSampling)
        #print(len(self.memory))
        experiences = [self.memory[SequenceOfSampling[0]],self.memory[SequenceOfSampling[1]],self.memory[SequenceOfSampling[2]],self.memory[SequenceOfSampling[3]],self.memory[SequenceOfSampling[4]],self.memory[SequenceOfSampling[5]],self.memory[SequenceOfSampling[6]],self.memory[SequenceOfSampling[7]]]
        #print(experiences)
        
        
        states, actions, rewards, next_states, dones = zip(*samples) 

        states      = torch.FloatTensor(np.float32(np.concatenate(states))).to(self.device)
        next_states = torch.FloatTensor(np.float32(np.concatenate(next_states))).to(self.device)
        actions     = torch.cat(actions).to(self.device)
        rewards     = torch.FloatTensor(rewards).to(self.device).unsqueeze(1) 
        dones       = torch.FloatTensor(dones).to(self.device).unsqueeze(1)
        weights    = torch.FloatTensor(weights).unsqueeze(1)
        #print("s",states.shape)
        #print("ns", next_states.shape)
        #print("a", actions.shape)
        
        #print("r:{}".format(rewards))
        #print("d", dones.shape)
        #print("w", weights.shape)
        
        return states, actions, rewards, next_states, dones, indices, weights
    

    def update_priorities(self, batch_indices, batch_priorities):
        for idx, prio in zip(batch_indices, batch_priorities):
            self.priorities[idx] = prio 

    def __len__(self):
        return len(self.buffer)

In [278]:
import matplotlib.pyplot as plt
%matplotlib inline
from IPython import display

import numpy as np
import pandas as pd
from gym.utils import seeding
import gym
import os
from gym import spaces
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import pickle

# shares normalization factor
# 100 shares per trade
HMAX_NORMALIZE = 10
# initial amount of money we have in our account
INITIAL_ACCOUNT_BALANCE= 1000
# total number of stocks in our portfolio
STOCK_DIM = 3
# transaction fee: 1/1000 reasonable percentage
TRANSACTION_FEE_PERCENT = 0.001
REWARD_SCALING = 1e-4

class StockEnvTrain(gym.Env):
    """A stock trading environment for OpenAI gym"""
    metadata = {'render.modes': ['human']}

    def __init__(self, df,day = 0):
        #super(StockEnv, self).__init__()
        #money = 10 , scope = 1
        self.day = day
        self.df = df
        self.agent_stock_iteration_index = 0
        self.penalty = 0

        # action_space normalization and shape is STOCK_DIM
        self.action_space = spaces.Box(low = -1, high = 1,shape = (STOCK_DIM,)) 
        # Shape = 181: [Current Balance]+[prices 1-30]+[owned shares 1-30] 
        # +[macd 1-30]+ [rsi 1-30] + [cci 1-30] + [adx 1-30]
        self.observation_space = spaces.Box(low=0, high=np.inf, shape = (7,))
        # load data from a pandas dataframe
        #print('df: {}'.format(self.df))
        #print('day: {}'.format(self.day))
        self.data = self.df.loc[self.day,:]
        
        self.terminal = False

        # initalize state
        self.state = [INITIAL_ACCOUNT_BALANCE] + \
                      self.data.Close.values.tolist() + \
                      [0]*STOCK_DIM 
        # initialize reward
        self.reward = 0
        self.cost = 0
        # memorize all the total balance change
        self.asset_memory = [INITIAL_ACCOUNT_BALANCE]
        self.rewards_memory = []
        self.final_asset_value = 0
        self.trades = 0
        self.previous_trades = 0 
        #self.reset()
        self._seed()


    def _sell_stock(self, index, action):
        action = action
        #print('index:{}'.format(index))
        #print('selling:{}'.format(action))
        # perform sell action based on the sign of the action
        if self.state[index+STOCK_DIM+1] > 0:
            #update balance
            self.state[0] += \
            self.state[index+1]*min(abs(action),self.state[index+STOCK_DIM+1]) * \
             (1- TRANSACTION_FEE_PERCENT)

            self.state[index+STOCK_DIM+1] -= min(abs(action), self.state[index+STOCK_DIM+1])
            self.cost +=self.state[index+1]*min(abs(action),self.state[index+STOCK_DIM+1]) * \
             TRANSACTION_FEE_PERCENT
            self.trades+=1
        else:
            pass

    
    def _buy_stock(self, index, action):
        #print('index:{}'.format(index))
        #print('buying:{}'.format(action))
        action = action
        # perform buy action based on the sign of the action
        available_amount = self.state[0] // self.state[index+1]
        # print('available_amount:{}'.format(available_amount))

        #update balance
        self.state[0] -= self.state[index+1]*min(available_amount, action)* \
                          (1+ TRANSACTION_FEE_PERCENT)

        self.state[index+STOCK_DIM+1] += min(available_amount, action)

        self.cost+=self.state[index+1]*min(available_amount, action)* \
                          TRANSACTION_FEE_PERCENT
        
        if available_amount>0:
            self.trades+=1
    
        
    def step(self, actions):
        # print(self.day)
        #print(self.day)
        self.terminal = self.day >= len(self.df.index.unique())-1
        #print(actions)
        self.actions = actions
        if self.terminal:
            print("Finished")
            print(self.state)
            end_total_asset = self.state[0]+ \
            sum(np.array(self.state[1:(STOCK_DIM+1)])*np.array(self.state[(STOCK_DIM+1):(STOCK_DIM*2+1)]))

            print("end_total_asset:{}".format(end_total_asset))
            df_total_value = pd.DataFrame(self.asset_memory)
            #df_total_value.to_csv('results/account_value_train.csv')
            #print("total_reward:{}".format(self.state[0]+sum(np.array(self.state[1:(STOCK_DIM+1)])*np.array(self.state[(STOCK_DIM+1):61]))- INITIAL_ACCOUNT_BALANCE ))
            #print("total_cost: ", self.cost)
            #print("total_trades: ", self.trades)
            df_total_value.columns = ['account_value']
            df_total_value['daily_return']=df_total_value.pct_change(1)
            sharpe = (252**0.5)*df_total_value['daily_return'].mean()/ \
                  df_total_value['daily_return'].std()
            print("Sharpe: ",sharpe)
            #print("=================================")
            df_rewards = pd.DataFrame(self.rewards_memory)
            #df_rewards.to_csv('results/account_rewards_train.csv')

            # print('total asset: {}'.format(self.state[0]+ sum(np.array(self.state[1:29])*np.array(self.state[29:]))))
            #with open('obs.pkl', 'wb') as f:  
            #    pickle.dump(self.state, f)

            return self.state, self.reward, self.terminal,{}

        else:
            # print(np.array(self.state[1:29]))
            #print("The actions is: {}".format(self.actions))

            #action = np.array([4,4,5])
            #actions = np.array([4,0,0,0,0,0,0,0,4,0,4,0,-3,0,0,0,0,0,0,0,4,0,0,0,0,0,0,0,0,0])

            #actions = self.actions * HMAX_NORMALIZE #WHY??
            #print("actions-index------:{}".format(actions))
            #actions = (actions.astype(int))
            print('here:',actions)
            actions = actions * HMAX_NORMALIZE
            #print('all-actions:{}'.format(actions))

            begin_total_asset = self.state[0]+ \
            sum(np.array(self.state[1:(STOCK_DIM+1)])*np.array(self.state[(STOCK_DIM+1):(STOCK_DIM*2+1)]))
            #print("begin_total_asset:{}".format(begin_total_asset))

            argsort_actions = np.argsort(actions) #TODO: this may not be touched.
            #print("The actions is: {}".format(actions))
            
            

            sell_index = argsort_actions[:np.where(actions < 0.5 )[0].shape[0]]
            buy_index = argsort_actions[::-1][:np.where(actions > -0.5 )[0].shape[0]]
            
            
            for index in sell_index:
                # print('take sell action'.format(actions[index]))
                self._sell_stock(index, actions[index])

            for index in buy_index:
                # print('take buy action: {}'.format(actions[index]))
                self._buy_stock(index, actions[index])
                
            
            #print("self.day:{}".format(self.day))
            #--print('trades:{}'.format(self.trades))
            
            
            
            if self.previous_trades == self.trades:
                self.penalty = 10
                #self.reward = -1
            else: 
                self.penalty = 0
                #self.reward = self.trades
                
            self.previous_trades = float(self.trades)
                
            #load next state
            # print("stock_shares:{}".format(self.state[29:]))
            self.state =  [self.state[0]] + \
                self.data.Close.values.tolist() + \
                list(self.state[(STOCK_DIM+1):(STOCK_DIM*2+1)]) 

            end_total_asset = self.state[0]+ \
            sum(np.array(self.state[1:(STOCK_DIM+1)])*np.array(self.state[(STOCK_DIM+1):(STOCK_DIM*2+1)]))
            self.asset_memory.append(end_total_asset)
            #print("end_total_asset:{}".format(end_total_asset))
            
            

            self.reward = end_total_asset - begin_total_asset - self.penalty
            
            print("trades:{}".format(self.trades))
            print('previous:{}'.format(self.previous_trades))
            print("penalty:{}".format(self.penalty))
            print("step_reward:{}".format(self.reward))
        
            self.rewards_memory.append(self.reward)
            #self.reward = self.reward*REWARD_SCALING
            print("step_reward:{}".format(self.reward))
            
            self.day += 1
            self.data = self.df.loc[self.day,:]

        return self.state, self.reward, self.terminal, {}

    def reset(self):
        self.final_asset_value = 0
        self.trades = 0
        self.previous_trades = 0
        self.penalty = 0 
        self.asset_memory = [INITIAL_ACCOUNT_BALANCE]
        self.day = 0
        self.data = self.df.loc[self.day,:]
        self.cost = 0
        self.trades = 0
        self.terminal = False 
        self.rewards_memory = []
        self.agent_stock_iteration_index = 0
        #initiate state
        self.state = [INITIAL_ACCOUNT_BALANCE] + \
                      self.data.Close.values.tolist() + \
                      [0]*STOCK_DIM 
        # iteration += 1 
        #print("[0]*STOCK_DIM:{}".format([0]*STOCK_DIM))
        #print("self.state:{}".format(len(self.state)))
        #print(np.array(self.state))
        return np.array(self.state)
    
    def render(self, mode='human'):
        return self.state

    def _seed(self, seed=None):
        self.np_random, seed = seeding.np_random(seed)
        return [seed]



In [279]:
from torch.utils.tensorboard import SummaryWriter

def run(env_train,frames=100000, eps_fixed=False, eps_frames=1e6, min_eps=0.001):
    """Deep Q-Learning.
    
    Params
    ======
        n_episodes (int): maximum number of training episodes
        max_t (int): maximum number of timesteps per episode
        eps_start (float): starting value of epsilon, for epsilon-greedy action selection
        eps_end (float): minimum value of epsilon
        eps_decay (float): multiplicative factor (per epaisode) for decreasing epsilon
    """
    scores = []                        # list containing scores from each episode
    scores_window = deque(maxlen=100)  # last 100 scores
    output_history = []
    frame = 0
    if eps_fixed:
        eps = 0
    else:
        eps = 1
    eps_start = 1
    i_episode = 1
    state = env_train.reset()
   # state = state[0,:]
    #print("state space:{}".format(state[0,:].shape))
    score = 0 
    
    action_high = env_train.action_space.high[0]
    action_low = env_train.action_space.low[0]
    state_size = env_train.observation_space.shape[0]
    action_size = env_train.action_space.shape[0]
    
    for frame in range(1, frames+1):
        
        
        
        if frame  == 0:
            # inital state

            initial = True
        else:
            # previous state
            initial = False
        #print('initial state:{}'.format(initial))
        
        if eps_fixed == False:
            if frame < eps_frames:
                eps = max(eps_start - (frame*(1/eps_frames)), min_eps)
            else:
                eps = max(min_eps - min_eps*((frame-eps_frames)/(frames-eps_frames)), 0.001)
        

        action = agent.act(state) #TODO: getting one dimension back.
        #
        action = np.array([action])

        #--print('My Action_V: {}'.format(action_v))
        
        next_state, reward, done, info = env_train.step(action) #TODO: Wants a list of actions of size a

        
        #print("env_trainNext State: {}".format(next_state.shape))
        
        for s, a, r, ns, d in zip(state, action, reward, next_state, done):
            agent.step(s, a, r, ns, d, frame, writer)

        print('agent seed', agent.seed)
        state = next_state
        score += reward
        # linear annealing to the min epsilon value until eps_frames and from there slowly decease epsilon to 0 until the end of training

                
                
                
        # evaluation runs
        if frame % 1 == 0:
            print('My Action: {}'.format(action))
            print("state: {}".format(state))
            print("score: {}".format(score))
            print("state: {}".format(state))
            print("action:{}, Number:{}".format(action,frame))
            print("-------------------------")
        
        if done:
            
            scores_window.append(score)       # save most recent score
            scores.append(score)              # save most recent score
            writer.add_scalar("Average100", np.mean(scores_window), frame*worker)
            
            print('\rEpisode {}\tFrame {} \tAverage100 Score: {:.2f}'.format(i_episode*worker, frame*worker, np.mean(scores_window)), end="")
            #if i_episode % 100 == 0:
            #    print('\rEpisode {}\tFrame \tReward: {}\tAverage100 Score: {:.2f}'.format(i_episode*worker, frame*worker, round(eval_reward,2), np.mean(scores_window)), end="", flush=True)
            i_episode +=1 
            state = env_train.reset()
            score = 0



In [280]:

if __name__ == "__main__":
    seed = 4
    frames = 30000
    worker = 1
    GAMMA = 0.99
    TAU = 1e-3
    HIDDEN_SIZE = 400
    BUFFER_SIZE = int(1e6)
    BATCH_SIZE = 32
    LR_ACTOR = 3e-3         # learning rate of the actor 
    
    LR_CRITIC = 3e-3     # learning rate of the critic
    saved_model = None #'D4PG.pth'
    D2RL = 0

    writer = SummaryWriter("")
    
    preprocessed_path = "done_3stocks.csv"
    if os.path.exists(preprocessed_path):
        data = pd.read_csv(preprocessed_path, index_col=0)

    unique_trade_date = data[(data.datadate > 20151001)&(data.datadate <= 20200707)].datadate.unique()
    #print(unique_trade_date)

    
    train = data_split(data, start=20100101, end=20100202)
    
    eval_env = DummyVecEnv([lambda: StockEnvTrain(train)])
    
    eval_env.seed(seed+1)
    
    torch.manual_seed(seed)
    
    np.random.seed(seed)
    
    
    
    device = torch.device("cpu")
    print("Using device: {}".format(device))
    
    action_high = eval_env.action_space.high[0]
    print(action_high)
    action_low = eval_env.action_space.low[0]
    print(action_low)
    state_size = eval_env.observation_space.shape[0]
    action_size = eval_env.action_space.shape[0]
    print('run seed', seed)
    agent = Agent(state_size=state_size, action_size=action_size, n_step=1, per=0, munchausen=0,distributional=1,
                 D2RL=D2RL, random_seed=seed, hidden_size=HIDDEN_SIZE, BATCH_SIZE=BATCH_SIZE, BUFFER_SIZE=BUFFER_SIZE, GAMMA=GAMMA,
                 LR_ACTOR=LR_ACTOR, LR_CRITIC=LR_CRITIC, TAU=TAU, LEARN_EVERY=1, LEARN_NUMBER=1, device=device, frames=frames, worker=1) 
    
    t0 = time.time()
    if saved_model != None:
        agent.actor_local.load_state_dict(torch.load(saved_model))
        #evaluate(1)
        run(eval_env,frames=frames, eps_fixed=False, eps_frames=1000, min_eps=0.025)
    else:    
        run(eval_env,frames=frames, eps_fixed=False, eps_frames=1000, min_eps=0.025)

    eval_env.close()
    # save trained model 
    torch.save(agent.actor_local.state_dict(), 'D4PG_2.pth')
    #writer.export_scalars_to_json('./scalars.json')
    # save parameter


Using device: cpu
1.0
-1.0
run seed 4
Using:  cpu
seed agent:  None
Actor: 
 Actor(
  (fc1): Linear(in_features=7, out_features=400, bias=True)
  (batch_norm): BatchNorm1d(400, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (fc2): Linear(in_features=400, out_features=400, bias=True)
  (fc3): Linear(in_features=400, out_features=3, bias=True)
)

Critic: 
 Critic(
  (fcs1): Linear(in_features=7, out_features=256, bias=True)
  (batch_norm): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (fc2): Linear(in_features=259, out_features=256, bias=True)
  (fc3): Linear(in_features=256, out_features=1, bias=True)
)
state.shape[0]:torch.Size([1, 1, 7])


AssertionError: shape: torch.Size([1, 1, 7])

In [273]:
"""
Evaluation part 
1) Reading JSON and PTH
2) Setting up trading env
3) Running test

"""



def evaluate(eval_runs=5):
    """
    Makes an evaluation run 
    
    """
    seed = 4
    frames = 10000
    worker = 1
    GAMMA = 0.99
    TAU = 1e-3
    HIDDEN_SIZE = 256
    BUFFER_SIZE = int(1e6)
    BATCH_SIZE = 256
    LR_ACTOR = 3e-4         # learning rate of the actor 
    LR_CRITIC = 3e-4     # learning rate of the critic
    saved_model = 'D4PG.pth'
    D2RL = 0

    writer = SummaryWriter("")
    
    preprocessed_path = "done_3stocks.csv"
    if os.path.exists(preprocessed_path):
        data = pd.read_csv(preprocessed_path, index_col=0)

    unique_trade_date = data[(data.datadate > 20151001)&(data.datadate <= 20200707)].datadate.unique()
    #print(unique_trade_date)

    
    train = data_split(data, start=20150101, end=20180101)
    
    env_train = DummyVecEnv([lambda: StockEnvTrain(train)])
    
    env_train.seed(seed+1)
    
    torch.manual_seed(seed)
    np.random.seed(seed)
    
    device = torch.device("cpu")
    print("Using device: {}".format(device))
    
    scores = []                        # list containing scores from each episode
    scores_window = deque(maxlen=100)  # last 100 scores
    output_history = []
    frame = 0

    eps = 1
    eps_start = 1
    i_episode = 1
    state = env_train.reset()
   # state = state[0,:]
    #print("state space:{}".format(state[0,:].shape))
    score = 0 
    
    action_high = env_train.action_space.high[0]
    action_low = env_train.action_space.low[0]
    state_size = env_train.observation_space.shape[0]
    action_size = env_train.action_space.shape[0]

    for _ in range(eval_runs):
        state = env_train.reset()

        rewards = 0
        while True:
            action = agent.act(state) #TODO: getting one dimension back.
            print(action)
            print(state)
            action_v = np.clip(action, action_low, action_high)

            next_state, reward, done, info = env_train.step(action_v) #TODO: Wants a list of actions of size a
            state = next_state
            rewards += reward
            if done:
                print("Episode Rewards: {}".format(rewards))
                break

class dotdict(dict):
    def __getattr__(self, name):
        return self[name]


In [254]:
print(1e-4)

0.0001


In [255]:
np.random.normal(0, scale=1)

0.05056170714293955

In [256]:
a = np.random.seed(4)
print(a)

None


In [257]:
env = gym.make("Pendulum-v0")
state_size = env.observation_space.shape[0]
action_size = env.action_space.shape[0]

In [267]:
from torch.utils.tensorboard import SummaryWriter

def ddpg(n_episodes=200, max_t=1000, print_every=10):
    writer = SummaryWriter("")
    scores_deque = deque(maxlen=100)
    scores = []
    minmax_scores = []
    average_100_scores = []
    time_stamp = 0
    for i_episode in range(1, n_episodes+1):

        state = env.reset()                     
        agent.reset()
        score = 0
        for t in range(max_t):
            
            action = agent.act(state)
            
            next_state, reward, done, _ = env.step(action) 
            agent.step(state, action, reward, next_state, done, time_stamp, writer)
            
            
            state = next_state
            score += reward
            time_stamp += 1
            
            if done:
                break 
        
        scores_deque.append(np.mean(score))
        scores.append(np.mean(score))
        minmax_scores.append((np.min(score),np.max(score)))
        average_100_scores.append(np.mean(scores_deque))
        
        print('\rEpisode {}  Min_reward: {:.2f}  Max_reward: {:.2f}  Mean_reward: {:.2f}  Average100 Score: {:.2f}'.format(i_episode,np.min(score),np.max(score),np.mean(score), np.mean(scores_deque)), end="")
        if i_episode % 25 == 0:
            torch.save(agent.actor_local.state_dict(), "checkpoint_actor"+str(i_episode)+".pth")
            torch.save(agent.critic_local.state_dict(), "checkpoint_critic"+str(i_episode)+".pth")
        if i_episode % print_every == 0:
            print('\rEpisode {}  Min_reward: {:.2f}  Max_reward: {:.2f}  Mean_reward: {:.2f}  Average100 Score: {:.2f}'.format(i_episode,np.min(score),np.max(score),np.mean(score), np.mean(scores_deque)))
        if np.mean(scores_deque) >= 30:
            print("\nSolved Environment!")
            torch.save(agent.actor_local.state_dict(), 'final_actor.pth')
            torch.save(agent.critic_local.state_dict(), 'final_critic.pth')
            break
    return scores, minmax_scores, average_100_scores

agent = Agent(state_size=state_size, action_size=action_size, n_step=1, per=0, munchausen=0,distributional=1,
                 D2RL=D2RL, random_seed=seed, hidden_size=HIDDEN_SIZE, BATCH_SIZE=BATCH_SIZE, BUFFER_SIZE=BUFFER_SIZE, GAMMA=GAMMA,
                 LR_ACTOR=LR_ACTOR, LR_CRITIC=LR_CRITIC, TAU=TAU, LEARN_EVERY=1, LEARN_NUMBER=1, device=device, frames=frames, worker=1)


scores, minmax, average_100 = ddpg()

Using:  cpu
seed agent:  None
Actor: 
 Actor(
  (fc1): Linear(in_features=3, out_features=400, bias=True)
  (batch_norm): BatchNorm1d(400, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (fc2): Linear(in_features=400, out_features=400, bias=True)
  (fc3): Linear(in_features=400, out_features=1, bias=True)
)

Critic: 
 Critic(
  (fcs1): Linear(in_features=3, out_features=256, bias=True)
  (batch_norm): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (fc2): Linear(in_features=257, out_features=256, bias=True)
  (fc3): Linear(in_features=256, out_features=1, bias=True)
)


  max_prio = np.array(self.priorities, dtype=float).max() if self.buffer else 1.0 # gives max priority if buffer is not empty else 1
  prios = np.array(self.priorities, dtype=float)


Episode 10  Min_reward: -1369.81  Max_reward: -1369.81  Mean_reward: -1369.81  Average100 Score: -1363.84
Episode 20  Min_reward: -1359.51  Max_reward: -1359.51  Mean_reward: -1359.51  Average100 Score: -1336.87
Episode 30  Min_reward: -1390.69  Max_reward: -1390.69  Mean_reward: -1390.69  Average100 Score: -1365.07
Episode 40  Min_reward: -1332.02  Max_reward: -1332.02  Mean_reward: -1332.02  Average100 Score: -1376.11
Episode 50  Min_reward: -1714.50  Max_reward: -1714.50  Mean_reward: -1714.50  Average100 Score: -1354.81
Episode 60  Min_reward: -1687.15  Max_reward: -1687.15  Mean_reward: -1687.15  Average100 Score: -1346.71
Episode 70  Min_reward: -1167.88  Max_reward: -1167.88  Mean_reward: -1167.88  Average100 Score: -1358.94
Episode 80  Min_reward: -950.33  Max_reward: -950.33  Mean_reward: -950.33  Average100 Score: -1352.70.80
Episode 90  Min_reward: -720.12  Max_reward: -720.12  Mean_reward: -720.12  Average100 Score: -1334.49.39
Episode 98  Min_reward: -1366.84  Max_reward: 

KeyboardInterrupt: 