In [40]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import torch.optim as optim
from torch.nn.utils import clip_grad_norm_
import random
import math
from torch.utils.tensorboard import SummaryWriter
from collections import deque, namedtuple
import time
import gym
import os
from stable_baselines3.common.vec_env import DummyVecEnv
def weight_init(layers):
    for layer in layers:
        torch.nn.init.kaiming_normal_(layer.weight, nonlinearity='relu')

In [41]:
from stable_baselines3.common.vec_env.base_vec_env import VecEnv, VecEnvStepReturn, VecEnvWrapper


In [42]:
import numpy as np
import pandas as pd
from stockstats import StockDataFrame as Sdf

import datetime

import datetime
import os

TRAINING_DATA_FILE = "dataprocessing/Yfinance_Data.csv"

now = datetime.datetime.now()
TRAINED_MODEL_DIR = f"trained_models/{now}"
os.makedirs(TRAINED_MODEL_DIR)

TESTING_DATA_FILE = "test.csv"

def load_dataset(*, file_name: str) -> pd.DataFrame:
    """
    load csv dataset from path
    :return: (df) pandas dataframe
    """
    # _data = pd.read_csv(f"{config.DATASET_DIR}/{file_name}")
    _data = pd.read_csv(file_name)

    return _data


def data_split(df, start, end):
    """
    split the dataset into training or testing using date
    :param data: (df) pandas dataframe, start, end
    :return: (df) pandas dataframe
    """
    data = df[(df.datadate >= start) & (df.datadate < end)]
    data = data.sort_values(['datadate', 'tic'], ignore_index=True)


    # data  = data[final_columns]
    data.index = data.datadate.factorize()[0]


    return data


def calculate_price(df):
    """
    calcualte adjusted close price, open-high-low price and volume
    :param data: (df) pandas dataframe
    :return: (df) pandas dataframe
    """
    data = df.copy()

    data = data[['Date', 'tic', 'Close', 'Open', 'High', 'Low', 'Volume','datadate']]
    data = data.sort_values(['tic', 'datadate'], ignore_index=True)
    return data


def add_technical_indicator(df):
    """
    calcualte technical indicators
    use stockstats package to add technical inidactors
    :param data: (df) pandas dataframe
    :return: (df) pandas dataframe
    """
    stock = Sdf.retype(df.copy())

    #print(stock)

    unique_ticker = stock.tic.unique()

    macd = pd.DataFrame()
    rsi = pd.DataFrame()
    cci = pd.DataFrame()
    dx = pd.DataFrame()

    # temp = stock[stock.tic == unique_ticker[0]]['macd']
    for i in range(len(unique_ticker)):
        ## macd
        temp_macd = stock[stock.tic == unique_ticker[i]]['macd']
        temp_macd = pd.DataFrame(temp_macd)
        macd = macd.append(temp_macd, ignore_index=True)
        ## rsi
        temp_rsi = stock[stock.tic == unique_ticker[i]]['rsi_30']
        temp_rsi = pd.DataFrame(temp_rsi)
        rsi = rsi.append(temp_rsi, ignore_index=True)
        ## cci
        temp_cci = stock[stock.tic == unique_ticker[i]]['cci_30']
        temp_cci = pd.DataFrame(temp_cci)
        cci = cci.append(temp_cci, ignore_index=True)
        ## adx
        temp_dx = stock[stock.tic == unique_ticker[i]]['dx_30']
        temp_dx = pd.DataFrame(temp_dx)
        dx = dx.append(temp_dx, ignore_index=True)

    df['macd'] = macd
    df['rsi'] = rsi
    df['cci'] = cci
    df['adx'] = dx

    return df


def preprocess_data():
    """data preprocessing pipeline"""
    start = datetime.datetime(2010, 12, 1)
    df = load_dataset(file_name=TRAINING_DATA_FILE)
    # get data after 2010
    # df = df[df.Date >= start]
    # calcualte adjusted price
    df_preprocess = calculate_price(df)
    # add technical indicators using stockstats
    df_final = add_technical_indicator(df_preprocess)
    # fill the missing values at the beginning
    df_final.fillna(method='bfill', inplace=True)
    return df_final



In [130]:
class QR_DQN(nn.Module):
    def __init__(self, state_size, action_size,layer_size, n_step, seed, N, layer_type="ff"):
        super(QR_DQN, self).__init__()
        self.seed = torch.manual_seed(seed)
        self.input_shape = state_size
        self.action_size = action_size
        self.N = N

        self.head_1 = nn.Linear(self.input_shape, layer_size)
        self.ff_1 = nn.Linear(layer_size, layer_size)
        self.ff_2 = nn.Linear(layer_size, action_size*N)
        weight_init([self.head_1, self.ff_1])


    
    def forward(self, input):
        """
        
        """
        x = torch.relu(self.head_1(input))
        x = torch.relu(self.ff_1(x))
        out = self.ff_2(x)
        
        return out.view(input.shape[0], self.N, self.action_size)
    def get_action(self,input):
        x = self.forward(input)
        #print(x)
        return x.mean(dim=1)
    
    def get_det_action(self,input):
        x = self.forward(input)
        #print(x)
        return x.mean(dim=1)

In [131]:
class ReplayBuffer:
    """Fixed-size buffer to store experience tuples."""

    def __init__(self, buffer_size, batch_size, device, seed, gamma, n_step=1):
        """Initialize a ReplayBuffer object.
        Params
        ======
            buffer_size (int): maximum size of buffer
            batch_size (int): size of each training batch
            seed (int): random seed
        """
        self.device = device
        self.memory = deque(maxlen=buffer_size)  
        self.batch_size = batch_size
        self.experience = namedtuple("Experience", field_names=["state", "action", "reward", "next_state", "done"])
        self.seed = random.seed(seed)
        self.gamma = gamma
        self.n_step = n_step
        self.n_step_buffer = deque(maxlen=self.n_step)
    
    def add(self, state, action, reward, next_state, done):
        """Add a new experience to memory."""
        #print("before:", state,action,reward,next_state, done)
        self.n_step_buffer.append((state, action, reward, next_state, done))
        if len(self.n_step_buffer) == self.n_step:
            state, action, reward, next_state, done = self.calc_multistep_return()
            #print("after:",state,action,reward,next_state, done)
            e = self.experience(state, action, reward, next_state, done)
            self.memory.append(e)
    
    def calc_multistep_return(self):
        Return = 0
        for idx in range(self.n_step):
            Return += self.gamma**idx * self.n_step_buffer[idx][2]
        
        return self.n_step_buffer[0][0], self.n_step_buffer[0][1], Return, self.n_step_buffer[-1][3], self.n_step_buffer[-1][4]
        
    
    
    def sample(self):
        """Randomly sample a batch of experiences from memory."""
        
        
        
        experiences = random.sample(self.memory, k=8)
        
        CurrentSequence = random.choice(list(enumerate(self.memory)))
        
        SequenceOfSampling = [] 
        
        #print(CurrentSequence)
        
        CurrentSequence = CurrentSequence[0]
        
        if CurrentSequence < 8:
            if len(self.memory) < 16:
                SequenceOfSampling=[0,1,2,3,4,5,6,7]
            else: 
                SequenceOfSampling = [CurrentSequence, CurrentSequence+1,CurrentSequence+2,CurrentSequence+3,CurrentSequence+4,CurrentSequence+5,CurrentSequence+6,CurrentSequence+7]
        else:
            SequenceOfSampling = [CurrentSequence-7,CurrentSequence-6,CurrentSequence-5,CurrentSequence-4,CurrentSequence-3,CurrentSequence-2,CurrentSequence-1,CurrentSequence]
        
        #print(SequenceOfSampling)
        #print(len(self.memory))
        experiences = [self.memory[SequenceOfSampling[0]],self.memory[SequenceOfSampling[1]],self.memory[SequenceOfSampling[2]],self.memory[SequenceOfSampling[3]],self.memory[SequenceOfSampling[4]],self.memory[SequenceOfSampling[5]],self.memory[SequenceOfSampling[6]],self.memory[SequenceOfSampling[7]]]
        #print(experiences)
    
        states = torch.from_numpy(np.stack([e.state for e in experiences if e is not None])).float().to(self.device)
        actions = torch.from_numpy(np.vstack([e.action for e in experiences if e is not None])).long().to(self.device)
        rewards = torch.from_numpy(np.vstack([e.reward for e in experiences if e is not None])).float().to(self.device)
        next_states = torch.from_numpy(np.stack([e.next_state for e in experiences if e is not None])).float().to(self.device)
        dones = torch.from_numpy(np.vstack([e.done for e in experiences if e is not None]).astype(np.uint8)).float().to(self.device)
  
        return (states, actions, rewards, next_states, dones)

    def __len__(self):
        """Return the current size of internal memory."""
        return len(self.memory)

In [152]:
class DQN_Agent():
    """Interacts with and learns from the environment."""

    def __init__(self,
                 state_size,
                 action_size,
                 Network,
                 layer_size,
                 n_step,
                 BATCH_SIZE,
                 BUFFER_SIZE,
                 LR,
                 TAU,
                 GAMMA,
                 UPDATE_EVERY,
                 device,
                 seed):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            Network (str): dqn network type
            layer_size (int): size of the hidden layer
            BATCH_SIZE (int): size of the training batch
            BUFFER_SIZE (int): size of the replay memory
            LR (float): learning rate
            TAU (float): tau for soft updating the network weights
            GAMMA (float): discount factor
            UPDATE_EVERY (int): update frequency
            device (str): device that is used for the compute
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        self.device = device
        self.TAU = TAU
        self.GAMMA = GAMMA
        self.UPDATE_EVERY = UPDATE_EVERY
        self.BATCH_SIZE = BATCH_SIZE
        self.Q_updates = 0
        self.n_step = n_step
        self.N = 32
        self.quantile_tau = torch.FloatTensor([i/self.N for i in range(1,self.N+1)]).to(device)

        self.action_step = 4
        self.last_action = None

        # Q-Network
        
        self.qnetwork_local = QR_DQN(state_size, action_size,layer_size, n_step, seed, self.N).to(device)
        self.qnetwork_target = QR_DQN(state_size, action_size,layer_size, n_step, seed, self.N).to(device)
        
        self.qnetwork_local.load_state_dict(torch.load('IQN1.pth'))
        self.qnetwork_target.load_state_dict(torch.load('IQN1.pth'))
        
        
        print('self.qnetwork_local.parameters():{}'.format(self.qnetwork_local.parameters()))
        for k, v in agent.qnetwork_local.named_parameters():
                if k=='ff_2.bias':
                    print(k, v)
        
        
        
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)
        print(self.qnetwork_local)
        
        # Replay memory
        self.memory = ReplayBuffer(BUFFER_SIZE, BATCH_SIZE, self.device, seed, self.GAMMA, n_step)
        
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0
    
    def step(self, state, action, reward, next_state, done, writer):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)
        
        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % self.UPDATE_EVERY
        
                    
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > self.BATCH_SIZE:
                experiences = self.memory.sample()
                loss = self.learn(experiences)
                self.Q_updates += 1
                writer.add_scalar("Q_loss", loss, self.Q_updates)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy. Acting only every 4 frames!
        
        Params
        ======
            frame: to adjust epsilon
            state (array_like): current state
            
        """
        state = np.array(state)

        state = torch.from_numpy(state).float().unsqueeze(0).to(self.device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local.get_action(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        #if random.random() > eps: # select greedy action if random number is higher than epsilon or noisy network is used!
        action = np.argmax(action_values.cpu().data.numpy())
        self.last_action = action
        return action
        #self.action_step = 0


    def learn(self, experiences):
        """Update value parameters using given batch of experience tuples.
        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        self.optimizer.zero_grad()
        states, actions, rewards, next_states, dones = experiences
        # Get max predicted Q values (for next states) from target model
        Q_targets_next = self.qnetwork_target(next_states).detach().cpu() #.max(2)[0].unsqueeze(1) #(batch_size, 1, N)
        action_indx = torch.argmax(Q_targets_next.mean(dim=1), dim=1, keepdim=True)

        Q_targets_next = Q_targets_next.gather(2, action_indx.unsqueeze(-1).expand(self.BATCH_SIZE, self.N, 1)).transpose(1,2)

        assert Q_targets_next.shape == (self.BATCH_SIZE,1, self.N)
        # Compute Q targets for current states 
        Q_targets = rewards.unsqueeze(-1) + (self.GAMMA**self.n_step * Q_targets_next.to(self.device) * (1 - dones.unsqueeze(-1)))
        # Get expected Q values from local model
        Q_expected = self.qnetwork_local(states).gather(2, actions.unsqueeze(-1).expand(self.BATCH_SIZE, self.N, 1))
        # Compute loss
        td_error = Q_targets - Q_expected
        assert td_error.shape == (self.BATCH_SIZE, self.N, self.N), "wrong td error shape"
        huber_l = calculate_huber_loss(td_error, 1.0)
        quantil_l = abs(self.quantile_tau -(td_error.detach() < 0).float()) * huber_l / 1.0

        loss = quantil_l.sum(dim=1).mean(dim=1) # , keepdim=True if per weights get multipl
        loss = loss.mean()
        # Minimize the loss
        loss.backward()
        #clip_grad_norm_(self.qnetwork_local.parameters(),1)
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target)
        return loss.detach().cpu().numpy()            

    def soft_update(self, local_model, target_model):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target
        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(self.TAU*local_param.data + (1.0-self.TAU)*target_param.data)
            
def calculate_huber_loss(td_errors, k=1.0):
    """
    Calculate huber loss element-wisely depending on kappa k.
    """
    loss = torch.where(td_errors.abs() <= k, 0.5 * td_errors.pow(2), k * (td_errors.abs() - 0.5 * k))
    assert loss.shape == (td_errors.shape[0], 32, 32), "huber loss has wrong shape"
    return loss

def eval_runs(eps, frame):
    """
    Makes an evaluation run with the current epsilon
    """
    env = gym.make("CartPole-v0")
    reward_batch = []
    for i in range(5):
        state = env.reset()
        rewards = 0
        while True:
            action = agent.act(state, eps)
            state, reward, done, _ = env.step(action)
            rewards += reward
            if done:
                break
        reward_batch.append(rewards)
        
    writer.add_scalar("Reward", np.mean(reward_batch), frame)

In [153]:
import matplotlib.pyplot as plt
%matplotlib inline
from IPython import display

import numpy as np
import pandas as pd
from gym.utils import seeding
import gym
import os
from gym import spaces
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import pickle

# shares normalization factor
# 100 shares per trade
HMAX_NORMALIZE = 100
# initial amount of money we have in our account
INITIAL_ACCOUNT_BALANCE= 1000
# total number of stocks in our portfolio
STOCK_DIM = 3
# transaction fee: 1/1000 reasonable percentage
TRANSACTION_FEE_PERCENT = 0.001
REWARD_SCALING = 1e-4

class StockEnvTrain(gym.Env):
    """A stock trading environment for OpenAI gym"""
    metadata = {'render.modes': ['human']}

    def __init__(self, df,day = 0):
        #super(StockEnv, self).__init__()
        #money = 10 , scope = 1
        self.day = day
        self.df = df
        self.agent_stock_iteration_index = 0

        # action_space normalization and shape is STOCK_DIM
        self.action_space = spaces.Box(low = -1, high = 1,shape = (STOCK_DIM,)) 
        # Shape = 181: [Current Balance]+[prices 1-30]+[owned shares 1-30] 
        # +[macd 1-30]+ [rsi 1-30] + [cci 1-30] + [adx 1-30]
        self.observation_space = spaces.Box(low=0, high=np.inf, shape = (19,))
        # load data from a pandas dataframe
        #print('df: {}'.format(self.df))
        #print('day: {}'.format(self.day))
        self.data = self.df.loc[self.day,:]
        #print(self.data.Close)
        self.terminal = False
        # initalize state
        self.state = [INITIAL_ACCOUNT_BALANCE] + \
                      self.data.Close.values.tolist() + \
                      [0]*STOCK_DIM + \
                      self.data.macd.values.tolist() + \
                      self.data.rsi.values.tolist() + \
                      self.data.cci.values.tolist() + \
                      self.data.adx.values.tolist()
        # initialize reward
        self.reward = 0
        self.cost = 0
        # memorize all the total balance change
        self.asset_memory = [INITIAL_ACCOUNT_BALANCE]
        self.rewards_memory = []
        self.final_asset_value = 0
        self.trades = 0
        #self.reset()
        self._seed()


    def _sell_stock(self, index, action):
        # perform sell action based on the sign of the action
        if self.state[index+STOCK_DIM+1] > 0:
            #update balance
            self.state[0] += \
            self.state[index+1]*min(abs(action),self.state[index+STOCK_DIM+1]) * \
             (1- TRANSACTION_FEE_PERCENT)
            self.state[index+STOCK_DIM+1] -= min(abs(action), self.state[index+STOCK_DIM+1])
            self.cost +=self.state[index+1]*min(abs(action),self.state[index+STOCK_DIM+1]) * \
             TRANSACTION_FEE_PERCENT
            self.trades+=1
        else:
            pass

    
    def _buy_stock(self, index, action):
        # perform buy action based on the sign of the action
        available_amount = self.state[0] // self.state[index+1]
        # print('available_amount:{}'.format(available_amount))

        #update balance
        self.state[0] -= self.state[index+1]*min(available_amount, action)* \
                          (1+ TRANSACTION_FEE_PERCENT)
        self.state[index+STOCK_DIM+1] += min(available_amount, action)
        self.cost+=self.state[index+1]*min(available_amount, action)* \
                          TRANSACTION_FEE_PERCENT
        self.trades+=1
        
        
    def step(self, actions):
        # print(self.day)
        self.terminal = self.day >= len(self.df.index.unique())-1
        #print(actions)
        self.actions = actions
        if self.terminal:
            print("Finished")
            print(self.state)
            end_total_asset = self.state[0]+ \
            sum(np.array(self.state[1:(STOCK_DIM+1)])*np.array(self.state[(STOCK_DIM+1):(STOCK_DIM*2+1)]))

            print("end_total_asset:{}".format(end_total_asset))
            df_total_value = pd.DataFrame(self.asset_memory)
            #df_total_value.to_csv('results/account_value_train.csv')
            #print("total_reward:{}".format(self.state[0]+sum(np.array(self.state[1:(STOCK_DIM+1)])*np.array(self.state[(STOCK_DIM+1):61]))- INITIAL_ACCOUNT_BALANCE ))
            #print("total_cost: ", self.cost)
            #print("total_trades: ", self.trades)
            df_total_value.columns = ['account_value']
            df_total_value['daily_return']=df_total_value.pct_change(1)
            sharpe = (252**0.5)*df_total_value['daily_return'].mean()/ \
                  df_total_value['daily_return'].std()
            print("Sharpe: ",sharpe)
            #print("=================================")
            df_rewards = pd.DataFrame(self.rewards_memory)
            #df_rewards.to_csv('results/account_rewards_train.csv')

            # print('total asset: {}'.format(self.state[0]+ sum(np.array(self.state[1:29])*np.array(self.state[29:]))))
            #with open('obs.pkl', 'wb') as f:  
            #    pickle.dump(self.state, 
            return self.state, self.reward, self.terminal,{}

        else:
            # print(np.array(self.state[1:29]))
            #print("The actions is: {}".format(self.actions))

            #action = np.array([4,4,5])
            #actions = np.array([4,0,0,0,0,0,0,0,4,0,4,0,-3,0,0,0,0,0,0,0,4,0,0,0,0,0,0,0,0,0])

            #actions = self.actions * HMAX_NORMALIZE #WHY??
            #print("actions-index------:{}".format(actions))
            #actions = (actions.astype(int))

            begin_total_asset = self.state[0]+ \
            sum(np.array(self.state[1:(STOCK_DIM+1)])*np.array(self.state[(STOCK_DIM+1):(STOCK_DIM*2+1)]))
            #print("begin_total_asset:{}".format(begin_total_asset))

            argsort_actions = np.argsort(actions) #TODO: this may not be touched.
            #print("The actions is: {}".format(actions))

            sell_index = argsort_actions[:np.where(actions == 0)[0].shape[0]]
            #sell_index = argsort_actions[4,0,0,0,0,0,0,0,4,0,4,0,-3,0,0,0,0,0,0,0,4,0,0,0,0,0,0,0,0,0]
            #print("sell-index------:{}".format(sell_index))
            buy_index = argsort_actions[::-1][:np.where(actions == 2)[0].shape[0]]
            #buy_index = argsort_actions[::-1][4,0,0,0,0,0,0,0,4,0,4,0,-3,0,0,0,0,0,0,0,4,0,0,0,0,0,0,0,0,0]
            #print("buy-index------:{}".format(buy_index))

            for index in sell_index:
            # print('take sell action'.format(actions[index]))
                #print("--------Action Shape:{}".format(actions.shape))
                self._sell_stock(index+ self.agent_stock_iteration_index, 1)

            for index in buy_index:
                #print("--------Action Shape:{}".format(actions.shape))
            # print('take buy action: {}'.format(actions[index]))
                self._buy_stock(index+ self.agent_stock_iteration_index, 1)
            
            #print("self.day:{}".format(self.day))
            #load next state
            # print("stock_shares:{}".format(self.state[29:]))
            self.state =  [self.state[0]] + \
                self.data.Close.values.tolist() + \
                list(self.state[(STOCK_DIM+1):(STOCK_DIM*2+1)]) + \
                self.data.macd.values.tolist() + \
                self.data.rsi.values.tolist() + \
                self.data.cci.values.tolist() + \
                self.data.adx.values.tolist()

            end_total_asset = self.state[0]+ \
            sum(np.array(self.state[1:(STOCK_DIM+1)])*np.array(self.state[(STOCK_DIM+1):(STOCK_DIM*2+1)]))
            self.asset_memory.append(end_total_asset)
            #print("end_total_asset:{}".format(end_total_asset))

            self.reward = end_total_asset - begin_total_asset            
            self.rewards_memory.append(self.reward)
            self.reward = self.reward*REWARD_SCALING
            self.agent_stock_iteration_index += 1 
            if self.agent_stock_iteration_index ==3:
                self.day += 1
                self.data = self.df.loc[self.day,:]
                self.agent_stock_iteration_index = 0
            
        return self.state, self.reward, self.terminal, {}

    def reset(self):
        self.final_asset_value = 0
        self.trades = 0
        self.asset_memory = [INITIAL_ACCOUNT_BALANCE]
        self.day = 0
        self.data = self.df.loc[self.day,:]
        self.cost = 0
        self.trades = 0
        self.terminal = False 
        self.rewards_memory = []
        self.agent_stock_iteration_index = 0
        #initiate state
        self.state = [INITIAL_ACCOUNT_BALANCE] + \
                      self.data.Close.values.tolist() + \
                      [0]*STOCK_DIM + \
                      self.data.macd.values.tolist() + \
                      self.data.rsi.values.tolist() + \
                      self.data.cci.values.tolist() + \
                      self.data.adx.values.tolist() 
        # iteration += 1 
        #print("[0]*STOCK_DIM:{}".format([0]*STOCK_DIM))
        #print("self.state:{}".format(len(self.state)))
        print(np.array(self.state))
        return np.array(self.state)
    
    def render(self, mode='human'):
        return self.state

    def _seed(self, seed=None):
        self.np_random, seed = seeding.np_random(seed)
        return [seed]

In [162]:
def run(env,frames=1000, eps_fixed=False, eps_frames=1e6, min_eps=0.01):
    """Deep Q-Learning.
    
    Params
    ======
        n_episodes (int): maximum number of training episodes
        max_t (int): maximum number of timesteps per episode
        eps_start (float): starting value of epsilon, for epsilon-greedy action selection
        eps_end (float): minimum value of epsilon
        eps_decay (float): multiplicative factor (per episode) for decreasing epsilon
    """
    scores = []                        # list containing scores from each episode
    scores_window = deque(maxlen=100)  # last 100 scores
    output_history = []
    frame = 0
    if eps_fixed:
        eps = 0
    else:
        eps = 1
    eps_start = 1
    i_episode = 1
    state = env.reset()
    state = state[0,:]
    #print("state space:{}".format(state[0,:].shape))
    score = 0                  
    for frame in range(1, frames+1):
        
        if frame  == 0:
            # inital state
            initial = True
        else:
            # previous state
            initial = False

        action = agent.act(state, eps) #TODO: getting one dimension back.
        action = np.array([action])
        next_state, reward, done, info = env_train.step([action]) #TODO: Wants a list of actions of size 
        #print("env_trainNext State: {}".format(next_state.shape))
        next_state = next_state[0,:]
        #agent.step(state, action, reward, next_state, done, writer)

        state = next_state
        score += reward
        # linear annealing to the min epsilon value until eps_frames and from there slowly decease epsilon to 0 until the end of training
        if eps_fixed == False:
            if frame < eps_frames:
                eps = max(eps_start - (frame*(1/eps_frames)), min_eps)
            else:
                eps = max(min_eps - min_eps*((frame-eps_frames)/(frames-eps_frames)), 0.001)

        # evaluation runs
        if frame % 100000 == 0:
            
            print("score: {}".format(state))
            print("score: {}".format(score))
            #print("state: {}".format(state))
            print("action:{}, Number:{}".format(action,frame))
            print("-------------------------")
        
        if done:
            
            for k, v in agent.qnetwork_local.named_parameters():
                if k=='ff_2.bias':
                    print(k, v)
            
            scores_window.append(score)       # save most recent score
            scores.append(score)              # save most recent score
            writer.add_scalar("Average100", np.mean(scores_window), frame)
            output_history.append(np.mean(scores_window))
            print('\rEpisode {}\tFrame {} \tAverage Score: {:.2f}'.format(i_episode, frame, np.mean(scores_window)), end="")
            if i_episode % 100 == 0:
                print('\rEpisode {}\tFrame {}\tAverage Score: {:.2f}'.format(i_episode,frame, np.mean(scores_window)))
            i_episode +=1 

            state = env.reset()
            print('state:{}'.format(state))
            state = state[0,:]
            score = 0              
    return output_history


if __name__ == "__main__":
    
        # read and preprocess data
    preprocessed_path = "done_3stocks.csv"
    if os.path.exists(preprocessed_path):
        data = pd.read_csv(preprocessed_path, index_col=0)

    unique_trade_date = data[(data.datadate > 20101001)&(data.datadate <= 20200707)].datadate.unique()
    #print(unique_trade_date)

    
    train = data_split(data, start=20100101, end=20200101)
    
    env_train = DummyVecEnv([lambda: StockEnvTrain(train)])
    
    writer = SummaryWriter("runs/"+"IQN_CP_5")
    seed = 1
    BUFFER_SIZE = 100
    BATCH_SIZE = 8
    GAMMA = 0.99
    TAU = 1e-2
    LR = 1e-3
    UPDATE_EVERY = 1
    n_step = 1
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    print("Using ", device)


    action_size     = env_train.action_space.shape[0]

    
    
    #print('Action Space: {}'.format(action_size))
    state_size = env_train.observation_space.shape[0]
    #print('State Space: {}'.format(state_size))

    

    agent = DQN_Agent(state_size=19,    
                    action_size=3,
                    Network="DDQN",
                    layer_size=512,
                    n_step=n_step,
                    BATCH_SIZE=BATCH_SIZE, 
                    BUFFER_SIZE=BUFFER_SIZE, 
                    LR=LR, 
                    TAU=TAU, 
                    GAMMA=GAMMA, 
                    UPDATE_EVERY=UPDATE_EVERY, 
                    device=device, 
                    seed=seed)



    # set epsilon frames to 0 so no epsilon exploration
    eps_fixed = False
    t0 = time.time()
    final_average100 = run(env=env_train, frames = 500000, eps_fixed=eps_fixed, eps_frames=5000, min_eps=0.025)
    t1 = time.time()
    
    print("Training time: {}min".format(round((t1-t0)/60,2)))
    torch.save(agent.qnetwork_local.state_dict(), "IQN3"+".pth")
    print('this is the final IQN.pth'.format(agent.qnetwork_local.state_dict()))
    print('self.qnetwork_local.parameters():{}'.format(agent.qnetwork_local.parameters()))
        
    for k, v in agent.qnetwork_local.named_parameters():
        if k=='ff_2.bias':
            print(k, v)


Using  cpu
self.qnetwork_local.parameters():<generator object Module.parameters at 0x159b5a3b0>
head_1.weight Parameter containing:
tensor([[-0.2463, -0.2630, -0.1614,  ..., -0.3611, -0.3910, -0.4960],
        [-0.3770, -0.1720, -0.2778,  ..., -0.6713, -0.1004, -0.0239],
        [-0.1456, -0.7444, -0.1435,  ...,  0.0897, -0.0536, -0.4889],
        ...,
        [-0.6994,  0.0897, -0.1608,  ..., -0.1087, -0.2199,  0.1190],
        [-0.4599,  0.4351, -0.2418,  ..., -0.3472, -0.3592, -0.0515],
        [-0.2331,  0.1188, -0.2957,  ...,  0.1661,  0.0593, -0.2946]],
       requires_grad=True)
head_1.bias Parameter containing:
tensor([-3.3902e-05, -2.7106e-01,  1.8301e-02, -1.2399e-01, -3.5248e-01,
        -1.8112e-01, -1.5797e-01, -2.6200e-01, -2.9386e-01, -4.5188e-01,
        -3.8487e-01, -1.4696e-01, -3.9047e-01, -2.1903e-01, -3.4676e-01,
        -3.2200e-01, -1.5252e-01, -4.3159e-01, -4.5385e-01, -1.8346e-01,
        -5.6958e-01, -1.9914e-01, -1.7511e-01,  1.3883e-03, -2.0238e-01,
        

action_values:tensor([[0.0089, 0.0068, 0.0070]])
0
tensor([[ 1.0000e+03,  2.9057e+01,  3.9620e+01,  6.1260e+01,  0.0000e+00,
          0.0000e+00,  0.0000e+00, -2.3104e-01, -5.9731e-01,  5.4456e-01,
          5.2001e+01,  5.1379e+01,  5.7204e+01, -1.0027e+01, -1.7170e+01,
          4.5612e+01,  5.3674e+00,  5.8389e+00,  1.7408e+01]])
action_values:tensor([[0.0089, 0.0068, 0.0070]])
0
tensor([[ 1.0000e+03,  2.8936e+01,  3.9050e+01,  6.1820e+01,  0.0000e+00,
          0.0000e+00,  0.0000e+00, -1.7251e-01, -5.0579e-01,  6.4453e-01,
          5.1515e+01,  4.9781e+01,  5.8091e+01, -9.7500e+00, -2.0921e+01,
          8.5137e+01,  3.2947e+00,  5.8389e+00,  2.2331e+01]])
action_values:tensor([[0.0089, 0.0068, 0.0070]])
0
tensor([[ 1.0000e+03,  2.8936e+01,  3.9050e+01,  6.1820e+01,  0.0000e+00,
          0.0000e+00,  0.0000e+00, -1.7251e-01, -5.0579e-01,  6.4453e-01,
          5.1515e+01,  4.9781e+01,  5.8091e+01, -9.7500e+00, -2.0921e+01,
          8.5137e+01,  3.2947e+00,  5.8389e+00,  2.2331

tensor([[1.0000e+03, 3.3710e+01, 4.1700e+01, 7.2990e+01, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 9.4381e-01, 5.8207e-01, 2.4016e+00, 6.5594e+01, 5.7371e+01,
         6.7380e+01, 1.1084e+02, 1.0213e+02, 9.1094e+01, 4.4892e+01, 1.8068e+01,
         5.4694e+01]])
action_values:tensor([[0.0089, 0.0068, 0.0070]])
0
tensor([[1.0000e+03, 3.3710e+01, 4.1700e+01, 7.2990e+01, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 9.4381e-01, 5.8207e-01, 2.4016e+00, 6.5594e+01, 5.7371e+01,
         6.7380e+01, 1.1084e+02, 1.0213e+02, 9.1094e+01, 4.4892e+01, 1.8068e+01,
         5.4694e+01]])
action_values:tensor([[0.0089, 0.0068, 0.0070]])
0
tensor([[1.0000e+03, 3.4070e+01, 4.2670e+01, 7.2040e+01, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 9.7166e-01, 6.6448e-01, 2.2296e+00, 6.6799e+01, 6.0495e+01,
         6.4616e+01, 1.1583e+02, 1.3358e+02, 6.8779e+01, 4.4892e+01, 3.0696e+01,
         3.5498e+01]])
action_values:tensor([[0.0089, 0.0068, 0.0070]])
0
tensor([[1.0000e+03, 3.4070e+01, 4.2670e+01, 7.20

action_values:tensor([[0.0089, 0.0068, 0.0070]])
0
tensor([[ 1.0000e+03,  3.6193e+01,  4.0330e+01,  6.5130e+01,  0.0000e+00,
          0.0000e+00,  0.0000e+00, -9.1285e-02, -1.3756e+00, -2.1232e+00,
          5.4756e+01,  4.6790e+01,  4.5397e+01, -1.3310e+01, -8.5278e+01,
         -1.0499e+02,  2.9033e+01,  3.6247e+01,  3.9493e+01]])
action_values:tensor([[0.0089, 0.0068, 0.0070]])
0
tensor([[ 1.0000e+03,  3.6193e+01,  4.0330e+01,  6.5130e+01,  0.0000e+00,
          0.0000e+00,  0.0000e+00, -9.1285e-02, -1.3756e+00, -2.1232e+00,
          5.4756e+01,  4.6790e+01,  4.5397e+01, -1.3310e+01, -8.5278e+01,
         -1.0499e+02,  2.9033e+01,  3.6247e+01,  3.9493e+01]])
action_values:tensor([[0.0089, 0.0068, 0.0070]])
0
tensor([[ 1.0000e+03,  3.6697e+01,  3.9870e+01,  6.4180e+01,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  5.9269e-03, -1.3109e+00, -2.1102e+00,
          5.6027e+01,  4.5991e+01,  4.4176e+01,  2.1098e+01, -7.6833e+01,
         -1.0319e+02,  2.0647e+01,  3.2732e+01,  4.0733

action_values:tensor([[0.0089, 0.0068, 0.0070]])
0
tensor([[ 1.0000e+03,  3.7134e+01,  4.4790e+01,  6.7930e+01,  0.0000e+00,
          0.0000e+00,  0.0000e+00, -1.0527e-01,  5.4845e-01, -1.0813e-02,
          5.3226e+01,  5.6075e+01,  5.3552e+01,  1.6291e+00,  1.5557e+02,
          8.9496e+01,  4.3637e-01,  1.9605e+01,  1.3067e+01]])
action_values:tensor([[0.0089, 0.0068, 0.0070]])
0
tensor([[ 1.0000e+03,  3.7134e+01,  4.4790e+01,  6.7930e+01,  0.0000e+00,
          0.0000e+00,  0.0000e+00, -1.0527e-01,  5.4845e-01, -1.0813e-02,
          5.3226e+01,  5.6075e+01,  5.3552e+01,  1.6291e+00,  1.5557e+02,
          8.9496e+01,  4.3637e-01,  1.9605e+01,  1.3067e+01]])
action_values:tensor([[0.0089, 0.0068, 0.0070]])
0
tensor([[ 1.0000e+03,  3.7134e+01,  4.4790e+01,  6.7930e+01,  0.0000e+00,
          0.0000e+00,  0.0000e+00, -1.0527e-01,  5.4845e-01, -1.0813e-02,
          5.3226e+01,  5.6075e+01,  5.3552e+01,  1.6291e+00,  1.5557e+02,
          8.9496e+01,  4.3637e-01,  1.9605e+01,  1.3067

tensor([[ 1.0000e+03,  3.7560e+01,  4.0070e+01,  6.4500e+01,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  4.5251e-02, -5.6574e-01, -6.8503e-01,
          5.5636e+01,  4.5759e+01,  4.9051e+01,  9.6559e+01, -7.9037e+01,
         -2.6178e+01,  1.5883e+01,  2.5141e+01,  3.9158e+00]])
action_values:tensor([[0.0089, 0.0068, 0.0070]])
0
tensor([[ 1.0000e+03,  3.7560e+01,  4.0070e+01,  6.4500e+01,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  4.5251e-02, -5.6574e-01, -6.8503e-01,
          5.5636e+01,  4.5759e+01,  4.9051e+01,  9.6559e+01, -7.9037e+01,
         -2.6178e+01,  1.5883e+01,  2.5141e+01,  3.9158e+00]])
action_values:tensor([[0.0089, 0.0068, 0.0070]])
0
tensor([[ 1.0000e+03,  3.7581e+01,  4.0690e+01,  6.3440e+01,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  1.6485e-01, -5.2262e-01, -6.3675e-01,
          5.5708e+01,  4.7478e+01,  4.7336e+01,  1.1611e+02, -5.6720e+01,
         -3.3929e+01,  1.9743e+01,  1.8567e+01,  3.9158e+00]])
action_values:tensor([[0.0089, 0.0068, 0.00

action_values:tensor([[0.0089, 0.0068, 0.0070]])
0
tensor([[ 1.0000e+03,  4.3976e+01,  4.0680e+01,  7.0860e+01,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  1.2412e+00, -2.6577e-01,  1.5538e+00,
          6.4762e+01,  5.0096e+01,  5.7869e+01,  8.4692e+01, -5.1524e+00,
          6.5929e+01,  1.5163e+01,  6.5035e+00,  1.1163e+01]])
action_values:tensor([[0.0089, 0.0068, 0.0070]])
0
tensor([[ 1.0000e+03,  4.3606e+01,  4.1220e+01,  7.1270e+01,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  1.1436e+00, -1.2747e-01,  1.4994e+00,
          6.2872e+01,  5.1610e+01,  5.8608e+01,  6.0039e+01,  2.9128e+01,
          7.5905e+01,  4.6061e+00,  1.6303e+01,  1.6337e+01]])
action_values:tensor([[0.0089, 0.0068, 0.0070]])
0
tensor([[ 1.0000e+03,  4.3606e+01,  4.1220e+01,  7.1270e+01,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  1.1436e+00, -1.2747e-01,  1.4994e+00,
          6.2872e+01,  5.1610e+01,  5.8608e+01,  6.0039e+01,  2.9128e+01,
          7.5905e+01,  4.6061e+00,  1.6303e+01,  1.6337

tensor([[ 1.0000e+03,  4.5756e+01,  4.6200e+01,  6.4490e+01,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  5.3394e-01,  1.1260e+00, -6.3712e-01,
          5.9495e+01,  6.1530e+01,  4.6340e+01,  8.7585e+01,  1.5307e+02,
         -4.4973e+01,  2.8507e+01,  5.4393e+01,  6.5314e+00]])
action_values:tensor([[0.0089, 0.0068, 0.0070]])
0
tensor([[ 1.0000e+03,  4.5766e+01,  4.6120e+01,  6.4240e+01,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  5.1710e-01,  1.1207e+00, -6.4578e-01,
          5.9528e+01,  6.1197e+01,  4.5853e+01,  8.6718e+01,  1.3246e+02,
         -4.4924e+01,  2.9479e+01,  4.6639e+01,  6.5314e+00]])
action_values:tensor([[0.0089, 0.0068, 0.0070]])
0
tensor([[ 1.0000e+03,  4.5766e+01,  4.6120e+01,  6.4240e+01,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  5.1710e-01,  1.1207e+00, -6.4578e-01,
          5.9528e+01,  6.1197e+01,  4.5853e+01,  8.6718e+01,  1.3246e+02,
         -4.4924e+01,  2.9479e+01,  4.6639e+01,  6.5314e+00]])
action_values:tensor([[0.0089, 0.0068, 0.00

action_values:tensor([[0.0089, 0.0068, 0.0070]])
0
tensor([[ 1.0000e+03,  4.9500e+01,  4.3820e+01,  7.1380e+01,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  5.9481e-01, -1.5150e-01,  9.5748e-01,
          6.0963e+01,  4.9835e+01,  5.7411e+01,  9.5621e+01, -4.6473e+01,
          7.1236e+01,  3.1845e+00,  9.7951e+00,  2.2040e+01]])
action_values:tensor([[0.0089, 0.0068, 0.0070]])
0
tensor([[ 1.0000e+03,  4.9500e+01,  4.3820e+01,  7.1380e+01,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  5.9481e-01, -1.5150e-01,  9.5748e-01,
          6.0963e+01,  4.9835e+01,  5.7411e+01,  9.5621e+01, -4.6473e+01,
          7.1236e+01,  3.1845e+00,  9.7951e+00,  2.2040e+01]])
action_values:tensor([[0.0089, 0.0068, 0.0070]])
0
tensor([[ 1.0000e+03,  5.0269e+01,  4.4820e+01,  7.1930e+01,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  6.6732e-01, -8.7111e-02,  9.9256e-01,
          6.3187e+01,  5.3375e+01,  5.8496e+01,  1.4038e+02,  2.1011e+01,
          8.4671e+01,  9.1653e+00,  1.0238e+01,  2.6069

tensor([[1.0000e+03, 4.9787e+01, 4.5200e+01, 7.3930e+01, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 9.3221e-03, 3.2312e-01, 7.3280e-01, 5.2745e+01, 5.2575e+01,
         5.7509e+01, 6.3660e+00, 8.7660e+01, 1.4034e+02, 1.3361e+01, 7.2439e+00,
         2.3970e+01]])
action_values:tensor([[0.0089, 0.0068, 0.0070]])
0
tensor([[ 1.0000e+03,  4.9223e+01,  4.5360e+01,  7.4010e+01,  0.0000e+00,
          0.0000e+00,  0.0000e+00, -1.5253e-02,  3.2142e-01,  7.9445e-01,
          5.0938e+01,  5.3111e+01,  5.7670e+01, -7.4108e+00,  9.2180e+01,
          1.4514e+02,  1.7766e+01,  7.2439e+00,  2.7269e+01]])
action_values:tensor([[0.0089, 0.0068, 0.0070]])
0
tensor([[ 1.0000e+03,  4.9223e+01,  4.5360e+01,  7.4010e+01,  0.0000e+00,
          0.0000e+00,  0.0000e+00, -1.5253e-02,  3.2142e-01,  7.9445e-01,
          5.0938e+01,  5.3111e+01,  5.7670e+01, -7.4108e+00,  9.2180e+01,
          1.4514e+02,  1.7766e+01,  7.2439e+00,  2.7269e+01]])
action_values:tensor([[0.0089, 0.0068, 0.0070]])
0
tensor([[ 1

tensor([[ 1.0000e+03,  4.8111e+01,  5.0410e+01,  7.6320e+01,  0.0000e+00,
          0.0000e+00,  0.0000e+00, -3.4345e-01,  9.8427e-01,  2.9638e-01,
          4.7479e+01,  6.1969e+01,  5.2963e+01, -6.2876e+01,  5.7113e+01,
         -2.5479e+01,  2.0651e+01,  1.5769e+01,  1.9341e+01]])
action_values:tensor([[0.0089, 0.0068, 0.0070]])
0
tensor([[ 1.0000e+03,  4.7857e+01,  5.0680e+01,  7.6660e+01,  0.0000e+00,
          0.0000e+00,  0.0000e+00, -3.5783e-01,  9.2948e-01,  2.3038e-01,
          4.6600e+01,  6.2880e+01,  5.3743e+01, -6.9531e+01,  6.5638e+01,
         -1.7394e+01,  2.0651e+01,  1.8956e+01,  1.7474e+01]])
action_values:tensor([[0.0089, 0.0068, 0.0070]])
0
tensor([[ 1.0000e+03,  4.7857e+01,  5.0680e+01,  7.6660e+01,  0.0000e+00,
          0.0000e+00,  0.0000e+00, -3.5783e-01,  9.2948e-01,  2.3038e-01,
          4.6600e+01,  6.2880e+01,  5.3743e+01, -6.9531e+01,  6.5638e+01,
         -1.7394e+01,  2.0651e+01,  1.8956e+01,  1.7474e+01]])
action_values:tensor([[0.0089, 0.0068, 0.00

tensor([[ 1.0000e+03,  5.2131e+01,  5.1810e+01,  7.1280e+01,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  1.0268e+00,  7.7034e-01, -6.5442e-01,
          6.0646e+01,  5.7107e+01,  4.3451e+01,  1.4721e+02,  6.2860e+01,
         -1.3059e+02,  3.0345e+01,  7.6134e+00,  2.2989e+01]])
action_values:tensor([[0.0089, 0.0068, 0.0070]])
0
tensor([[ 1.0000e+03,  5.3400e+01,  5.1330e+01,  6.9550e+01,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  1.1980e+00,  6.8044e-01, -8.6638e-01,
          6.3569e+01,  5.5297e+01,  4.0123e+01,  1.7307e+02,  4.1468e+01,
         -1.9351e+02,  4.2442e+01,  6.3719e-01,  3.5489e+01]])
action_values:tensor([[0.0089, 0.0068, 0.0070]])
0
tensor([[ 1.0000e+03,  5.3400e+01,  5.1330e+01,  6.9550e+01,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  1.1980e+00,  6.8044e-01, -8.6638e-01,
          6.3569e+01,  5.5297e+01,  4.0123e+01,  1.7307e+02,  4.1468e+01,
         -1.9351e+02,  4.2442e+01,  6.3719e-01,  3.5489e+01]])
action_values:tensor([[0.0089, 0.0068, 0.00

tensor([[ 1.0000e+03,  5.4433e+01,  4.9500e+01,  6.6050e+01,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  4.7142e-01,  1.1939e-01, -6.0837e-01,
          5.4007e+01,  5.2525e+01,  4.8311e+01,  1.6991e+01,  6.0171e+01,
          3.7035e+01,  6.3052e-01,  1.0418e+01,  1.0713e+01]])
action_values:tensor([[0.0089, 0.0068, 0.0070]])
0
tensor([[ 1.0000e+03,  5.3436e+01,  4.8510e+01,  6.4030e+01,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  3.7237e-01,  1.5948e-01, -5.3241e-01,
          5.1870e+01,  5.0354e+01,  4.5770e+01, -3.4917e+01,  3.6711e+01,
          4.2331e+00,  1.1372e+01,  1.6842e+00,  3.3302e+00]])
action_values:tensor([[0.0089, 0.0068, 0.0070]])
0
tensor([[ 1.0000e+03,  5.3436e+01,  4.8510e+01,  6.4030e+01,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  3.7237e-01,  1.5948e-01, -5.3241e-01,
          5.1870e+01,  5.0354e+01,  4.5770e+01, -3.4917e+01,  3.6711e+01,
          4.2331e+00,  1.1372e+01,  1.6842e+00,  3.3302e+00]])
action_values:tensor([[0.0089, 0.0068, 0.00

tensor([[1.0000e+03, 5.7229e+01, 5.0450e+01, 6.6560e+01, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 4.9771e-01, 6.2249e-01, 6.2259e-01, 5.3345e+01, 5.6030e+01,
         5.3537e+01, 6.4850e+00, 1.4258e+02, 1.7182e+02, 2.7980e+00, 2.2637e+01,
         1.7978e+01]])
action_values:tensor([[0.0089, 0.0068, 0.0070]])
0
tensor([[1.0000e+03, 5.7229e+01, 5.0450e+01, 6.6560e+01, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 4.9771e-01, 6.2249e-01, 6.2259e-01, 5.3345e+01, 5.6030e+01,
         5.3537e+01, 6.4850e+00, 1.4258e+02, 1.7182e+02, 2.7980e+00, 2.2637e+01,
         1.7978e+01]])
action_values:tensor([[0.0089, 0.0068, 0.0070]])
0
tensor([[1.0000e+03, 5.7813e+01, 5.2060e+01, 6.7490e+01, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 5.0704e-01, 9.2080e-01, 8.6789e-01, 5.4365e+01, 5.8632e+01,
         5.4728e+01, 4.4095e+01, 1.9267e+02, 1.9741e+02, 4.1681e+00, 3.1119e+01,
         2.2271e+01]])
action_values:tensor([[0.0089, 0.0068, 0.0070]])
0
tensor([[1.0000e+03, 5.7813e+01, 5.2060e+01, 6.74

action_values:tensor([[0.0089, 0.0068, 0.0070]])
0
tensor([[ 1.0000e+03,  5.4134e+01,  4.6420e+01,  7.0610e+01,  0.0000e+00,
          0.0000e+00,  0.0000e+00, -1.0692e-01, -2.0518e-01,  1.4798e+00,
          4.7308e+01,  4.6859e+01,  5.5899e+01, -4.4451e+01, -6.2087e+01,
          7.7954e+01,  1.1495e+01,  1.6267e+01,  2.0654e+01]])
action_values:tensor([[0.0089, 0.0068, 0.0070]])
0
tensor([[ 1.0000e+03,  5.4134e+01,  4.6420e+01,  7.0610e+01,  0.0000e+00,
          0.0000e+00,  0.0000e+00, -1.0692e-01, -2.0518e-01,  1.4798e+00,
          4.7308e+01,  4.6859e+01,  5.5899e+01, -4.4451e+01, -6.2087e+01,
          7.7954e+01,  1.1495e+01,  1.6267e+01,  2.0654e+01]])
action_values:tensor([[0.0089, 0.0068, 0.0070]])
0
tensor([[ 1.0000e+03,  5.4431e+01,  4.6880e+01,  7.1010e+01,  0.0000e+00,
          0.0000e+00,  0.0000e+00, -1.5008e-01, -2.4532e-01,  1.4567e+00,
          4.8038e+01,  4.7870e+01,  5.6442e+01, -3.0225e+01, -5.1600e+01,
          9.1729e+01,  1.0698e+01,  1.6267e+01,  2.8709

tensor([[1.0000e+03, 6.6281e+01, 5.1810e+01, 7.5460e+01, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 2.0386e+00, 7.8646e-01, 7.2350e-01, 6.8049e+01, 5.7733e+01,
         5.7526e+01, 1.4846e+02, 1.5456e+02, 7.0310e+01, 6.2542e+01, 2.9569e+01,
         5.6469e+00]])
action_values:tensor([[0.0089, 0.0068, 0.0070]])
0
tensor([[1.0000e+03, 6.6281e+01, 5.1810e+01, 7.5460e+01, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 2.0386e+00, 7.8646e-01, 7.2350e-01, 6.8049e+01, 5.7733e+01,
         5.7526e+01, 1.4846e+02, 1.5456e+02, 7.0310e+01, 6.2542e+01, 2.9569e+01,
         5.6469e+00]])
action_values:tensor([[0.0089, 0.0068, 0.0070]])
0
tensor([[1.0000e+03, 6.6281e+01, 5.1810e+01, 7.5460e+01, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 2.0386e+00, 7.8646e-01, 7.2350e-01, 6.8049e+01, 5.7733e+01,
         5.7526e+01, 1.4846e+02, 1.5456e+02, 7.0310e+01, 6.2542e+01, 2.9569e+01,
         5.6469e+00]])
action_values:tensor([[0.0089, 0.0068, 0.0070]])
0
tensor([[1.0000e+03, 6.6976e+01, 5.2130e+01, 7.52

action_values:tensor([[0.0089, 0.0068, 0.0070]])
0
tensor([[ 1.0000e+03,  8.6711e+01,  5.8660e+01,  7.5180e+01,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  4.0213e+00,  1.5421e+00, -2.4678e-02,
          7.7563e+01,  6.7291e+01,  5.3600e+01,  1.2300e+02,  1.6436e+02,
          1.1981e+01,  5.1515e+01,  5.3734e+01,  1.8137e+00]])
action_values:tensor([[0.0089, 0.0068, 0.0070]])
0
tensor([[ 1.0000e+03,  8.6711e+01,  5.8660e+01,  7.5180e+01,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  4.0213e+00,  1.5421e+00, -2.4678e-02,
          7.7563e+01,  6.7291e+01,  5.3600e+01,  1.2300e+02,  1.6436e+02,
          1.1981e+01,  5.1515e+01,  5.3734e+01,  1.8137e+00]])
action_values:tensor([[0.0089, 0.0068, 0.0070]])
0
tensor([[ 1.0000e+03,  8.6711e+01,  5.8660e+01,  7.5180e+01,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  4.0213e+00,  1.5421e+00, -2.4678e-02,
          7.7563e+01,  6.7291e+01,  5.3600e+01,  1.2300e+02,  1.6436e+02,
          1.1981e+01,  5.1515e+01,  5.3734e+01,  1.8137

tensor([[ 1.0000e+03,  7.8011e+01,  5.7390e+01,  7.2350e+01,  0.0000e+00,
          0.0000e+00,  0.0000e+00, -1.4100e+00,  2.4650e-01, -3.5822e-01,
          4.6229e+01,  5.1280e+01,  4.4695e+01, -1.3154e+02, -4.5775e+01,
         -6.8365e+01,  3.3210e+01,  1.3239e+01,  2.2984e+01]])
action_values:tensor([[0.0089, 0.0068, 0.0070]])
0
tensor([[ 1.0000e+03,  7.5731e+01,  5.5650e+01,  6.9730e+01,  0.0000e+00,
          0.0000e+00,  0.0000e+00, -1.7233e+00, -3.4724e-02, -6.7842e-01,
          4.3295e+01,  4.6003e+01,  3.9077e+01, -1.5733e+02, -1.3139e+02,
         -1.5234e+02,  3.9021e+01,  3.2080e+01,  4.2261e+01]])
action_values:tensor([[0.0089, 0.0068, 0.0070]])
0
tensor([[ 1.0000e+03,  7.5731e+01,  5.5650e+01,  6.9730e+01,  0.0000e+00,
          0.0000e+00,  0.0000e+00, -1.7233e+00, -3.4724e-02, -6.7842e-01,
          4.3295e+01,  4.6003e+01,  3.9077e+01, -1.5733e+02, -1.3139e+02,
         -1.5234e+02,  3.9021e+01,  3.2080e+01,  4.2261e+01]])
action_values:tensor([[0.0089, 0.0068, 0.00

tensor([[1.0000e+03, 8.7134e+01, 5.9250e+01, 7.4440e+01, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 8.7632e-01, 6.2712e-01, 5.9604e-01, 5.8934e+01, 5.6929e+01,
         5.4651e+01, 2.8726e+02, 2.0923e+02, 1.5113e+02, 3.2477e+01, 2.9867e+01,
         2.0464e+01]])
action_values:tensor([[0.0089, 0.0068, 0.0070]])
0
tensor([[1.0000e+03, 8.7134e+01, 5.9250e+01, 7.4440e+01, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 8.7632e-01, 6.2712e-01, 5.9604e-01, 5.8934e+01, 5.6929e+01,
         5.4651e+01, 2.8726e+02, 2.0923e+02, 1.5113e+02, 3.2477e+01, 2.9867e+01,
         2.0464e+01]])
action_values:tensor([[0.0089, 0.0068, 0.0070]])
0
tensor([[1.0000e+03, 8.6554e+01, 5.8630e+01, 7.3690e+01, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 1.0351e+00, 6.5794e-01, 6.2567e-01, 5.7816e+01, 5.5112e+01,
         5.2868e+01, 2.2504e+02, 1.5200e+02, 1.0208e+02, 3.2477e+01, 2.4695e+01,
         9.0153e+00]])
action_values:tensor([[0.0089, 0.0068, 0.0070]])
0
tensor([[1.0000e+03, 8.6554e+01, 5.8630e+01, 7.36

action_values:tensor([[0.0089, 0.0068, 0.0070]])
0
tensor([[ 1.0000e+03,  9.4661e+01,  5.6420e+01,  7.0360e+01,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  2.3771e+00, -1.3753e-01, -2.1210e-01,
          6.4124e+01,  4.8453e+01,  4.3560e+01,  1.7116e+02, -4.3448e+01,
         -2.4012e+02,  5.4528e+01,  5.2490e+00,  3.1198e+01]])
action_values:tensor([[0.0089, 0.0068, 0.0070]])
0
tensor([[ 1.0000e+03,  9.4746e+01,  5.7490e+01,  7.1090e+01,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  2.4156e+00, -7.3538e-02, -3.5394e-01,
          6.4233e+01,  5.1527e+01,  4.5420e+01,  1.4887e+02,  9.7947e+00,
         -2.1621e+02,  4.5235e+01,  5.7784e+00,  3.1198e+01]])
action_values:tensor([[0.0089, 0.0068, 0.0070]])
0
tensor([[ 1.0000e+03,  9.4746e+01,  5.7490e+01,  7.1090e+01,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  2.4156e+00, -7.3538e-02, -3.5394e-01,
          6.4233e+01,  5.1527e+01,  4.5420e+01,  1.4887e+02,  9.7947e+00,
         -2.1621e+02,  4.5235e+01,  5.7784e+00,  3.1198

action_values:tensor([[0.0089, 0.0068, 0.0070]])
0
tensor([[ 1.0000e+03,  9.1559e+01,  5.7970e+01,  7.0340e+01,  0.0000e+00,
          0.0000e+00,  0.0000e+00, -8.0916e-01,  1.5200e-01, -3.1902e-01,
          4.7937e+01,  5.1457e+01,  4.6726e+01, -1.6121e+02,  3.0017e+01,
         -5.4773e+00,  3.2081e+01,  1.0252e+01,  9.1821e+00]])
action_values:tensor([[0.0089, 0.0068, 0.0070]])
0
tensor([[ 1.0000e+03,  9.1559e+01,  5.7970e+01,  7.0340e+01,  0.0000e+00,
          0.0000e+00,  0.0000e+00, -8.0916e-01,  1.5200e-01, -3.1902e-01,
          4.7937e+01,  5.1457e+01,  4.6726e+01, -1.6121e+02,  3.0017e+01,
         -5.4773e+00,  3.2081e+01,  1.0252e+01,  9.1821e+00]])
action_values:tensor([[0.0089, 0.0068, 0.0070]])
0
tensor([[ 1.0000e+03,  8.9729e+01,  5.8470e+01,  7.0830e+01,  0.0000e+00,
          0.0000e+00,  0.0000e+00, -1.0846e+00,  1.8245e-01, -2.5319e-01,
          4.5232e+01,  5.3069e+01,  4.8310e+01, -1.7411e+02,  5.5418e+01,
          3.7454e+01,  3.7138e+01,  1.0252e+01,  4.7376

action_values:tensor([[0.0089, 0.0068, 0.0070]])
0
tensor([[ 1.0000e+03,  8.3611e+01,  5.5900e+01,  7.4280e+01,  0.0000e+00,
          0.0000e+00,  0.0000e+00, -9.2495e-01, -2.6261e-01,  6.9878e-01,
          4.6777e+01,  4.8505e+01,  5.5640e+01,  1.1391e+01,  2.7660e+00,
          9.7033e+01,  8.4203e+00,  1.0336e+01,  2.6624e+01]])
action_values:tensor([[0.0089, 0.0068, 0.0070]])
0
tensor([[ 1.0000e+03,  8.3741e+01,  5.5990e+01,  7.4020e+01,  0.0000e+00,
          0.0000e+00,  0.0000e+00, -7.2555e-01, -2.0833e-01,  6.9669e-01,
          4.6964e+01,  4.8783e+01,  5.4861e+01,  2.6497e+01,  3.8308e+01,
          8.1605e+01,  3.0879e+00,  4.2597e+00,  2.2531e+01]])
action_values:tensor([[0.0089, 0.0068, 0.0070]])
0
tensor([[ 1.0000e+03,  8.3741e+01,  5.5990e+01,  7.4020e+01,  0.0000e+00,
          0.0000e+00,  0.0000e+00, -7.2555e-01, -2.0833e-01,  6.9669e-01,
          4.6964e+01,  4.8783e+01,  5.4861e+01,  2.6497e+01,  3.8308e+01,
          8.1605e+01,  3.0879e+00,  4.2597e+00,  2.2531

tensor([[ 1.0000e+03,  7.2110e+01,  5.9400e+01,  7.4160e+01,  0.0000e+00,
          0.0000e+00,  0.0000e+00, -1.5969e+00,  8.7775e-01,  7.0582e-02,
          4.2038e+01,  5.5047e+01,  4.9246e+01, -1.0358e+02,  4.2139e+01,
         -1.1756e+02,  2.4213e+01,  3.3060e+00,  2.3301e+01]])
action_values:tensor([[0.0089, 0.0068, 0.0070]])
0
tensor([[ 1.0000e+03,  7.3429e+01,  5.8930e+01,  7.4290e+01,  0.0000e+00,
          0.0000e+00,  0.0000e+00, -1.4315e+00,  7.3165e-01, -2.8738e-02,
          4.4005e+01,  5.3211e+01,  4.9531e+01, -5.1880e+01,  1.9146e+01,
         -1.1255e+02,  1.7349e+01,  3.7455e-01,  2.0049e+01]])
action_values:tensor([[0.0089, 0.0068, 0.0070]])
0
tensor([[ 1.0000e+03,  7.3429e+01,  5.8930e+01,  7.4290e+01,  0.0000e+00,
          0.0000e+00,  0.0000e+00, -1.4315e+00,  7.3165e-01, -2.8738e-02,
          4.4005e+01,  5.3211e+01,  4.9531e+01, -5.1880e+01,  1.9146e+01,
         -1.1255e+02,  1.7349e+01,  3.7455e-01,  2.0049e+01]])
action_values:tensor([[0.0089, 0.0068, 0.00

action_values:tensor([[0.0089, 0.0068, 0.0070]])
0
tensor([[1000.0000,   61.1929,   65.3000,   84.7500,    0.0000,    0.0000,
            0.0000,   -1.6993,    1.2151,    2.0713,   38.9975,   65.6610,
           67.5741, -101.6896,  169.6402,  241.0353,   33.4343,   39.4536,
           56.9312]])
action_values:tensor([[0.0089, 0.0068, 0.0070]])
0
tensor([[1000.0000,   61.1929,   65.3000,   84.7500,    0.0000,    0.0000,
            0.0000,   -1.6993,    1.2151,    2.0713,   38.9975,   65.6610,
           67.5741, -101.6896,  169.6402,  241.0353,   33.4343,   39.4536,
           56.9312]])
action_values:tensor([[0.0089, 0.0068, 0.0070]])
0
tensor([[1000.0000,   61.7857,   65.3800,   84.6200,    0.0000,    0.0000,
            0.0000,   -1.5994,    1.2261,    2.2459,   40.1230,   65.8658,
           67.1954,  -79.9178,  155.3208,  205.5602,   33.1723,   40.9234,
           57.8744]])
action_values:tensor([[0.0089, 0.0068, 0.0070]])
0
tensor([[1000.0000,   61.7857,   65.3800,   84.6200,   

action_values:tensor([[0.0089, 0.0068, 0.0070]])
0
tensor([[ 1.0000e+03,  5.6007e+01,  6.5040e+01,  8.6120e+01,  0.0000e+00,
          0.0000e+00,  0.0000e+00, -1.4294e+00,  2.7171e-02,  1.3804e+00,
          3.5439e+01,  5.4158e+01,  5.9475e+01, -2.2152e+02, -7.9092e+01,
          4.6908e+01,  4.9200e+01,  1.8113e+00,  2.4201e+01]])
action_values:tensor([[0.0089, 0.0068, 0.0070]])
0
tensor([[ 1.0000e+03,  5.6007e+01,  6.5040e+01,  8.6120e+01,  0.0000e+00,
          0.0000e+00,  0.0000e+00, -1.4294e+00,  2.7171e-02,  1.3804e+00,
          3.5439e+01,  5.4158e+01,  5.9475e+01, -2.2152e+02, -7.9092e+01,
          4.6908e+01,  4.9200e+01,  1.8113e+00,  2.4201e+01]])
action_values:tensor([[0.0089, 0.0068, 0.0070]])
0
tensor([[ 1.0000e+03,  5.6007e+01,  6.5040e+01,  8.6120e+01,  0.0000e+00,
          0.0000e+00,  0.0000e+00, -1.4294e+00,  2.7171e-02,  1.3804e+00,
          3.5439e+01,  5.4158e+01,  5.9475e+01, -2.2152e+02, -7.9092e+01,
          4.6908e+01,  4.9200e+01,  1.8113e+00,  2.4201

tensor([[ 1.0000e+03,  6.2637e+01,  7.6240e+01,  9.9760e+01,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  3.2811e-01,  1.8783e+00,  2.1103e+00,
          4.8125e+01,  6.7166e+01,  6.4544e+01, -2.6707e+01,  7.9558e+01,
          6.9746e+01,  1.4382e+01,  4.3958e+01,  2.7044e+01]])
action_values:tensor([[0.0089, 0.0068, 0.0070]])
0
tensor([[ 1.0000e+03,  6.2637e+01,  7.6240e+01,  9.9760e+01,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  3.2811e-01,  1.8783e+00,  2.1103e+00,
          4.8125e+01,  6.7166e+01,  6.4544e+01, -2.6707e+01,  7.9558e+01,
          6.9746e+01,  1.4382e+01,  4.3958e+01,  2.7044e+01]])
action_values:tensor([[0.0089, 0.0068, 0.0070]])
0
tensor([[ 1.0000e+03,  6.3116e+01,  7.8040e+01,  1.0249e+02,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  2.6943e-01,  1.9694e+00,  2.2300e+00,
          4.9172e+01,  7.0106e+01,  6.7860e+01, -4.3480e+01,  1.1456e+02,
          1.1879e+02,  1.5941e+01,  5.3870e+01,  4.4410e+01]])
action_values:tensor([[0.0089, 0.0068, 0.00

action_values:tensor([[0.0089, 0.0068, 0.0070]])
0
tensor([[ 1.0000e+03,  6.2999e+01,  7.5340e+01,  1.0560e+02,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  4.5298e-01,  1.3357e-01,  1.5269e+00,
          5.3705e+01,  5.3247e+01,  5.7414e+01,  1.1869e+02, -5.5575e+00,
          6.4092e+01,  8.2475e+00,  7.5386e+00,  6.3887e+00]])
action_values:tensor([[0.0089, 0.0068, 0.0070]])
0
tensor([[ 1.0000e+03,  6.2999e+01,  7.5340e+01,  1.0560e+02,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  4.5298e-01,  1.3357e-01,  1.5269e+00,
          5.3705e+01,  5.3247e+01,  5.7414e+01,  1.1869e+02, -5.5575e+00,
          6.4092e+01,  8.2475e+00,  7.5386e+00,  6.3887e+00]])
action_values:tensor([[0.0089, 0.0068, 0.0070]])
0
tensor([[ 1.0000e+03,  6.2999e+01,  7.5340e+01,  1.0560e+02,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  4.5298e-01,  1.3357e-01,  1.5269e+00,
          5.3705e+01,  5.3247e+01,  5.7414e+01,  1.1869e+02, -5.5575e+00,
          6.4092e+01,  8.2475e+00,  7.5386e+00,  6.3887

action_values:tensor([[0.0089, 0.0068, 0.0070]])
0
tensor([[ 1.0000e+03,  6.7527e+01,  7.5290e+01,  1.0980e+02,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  6.3978e-01, -9.5659e-02,  9.6694e-01,
          5.1330e+01,  5.3464e+01,  5.9735e+01, -6.5388e+01,  5.7251e+01,
          2.0494e+02,  1.7666e+01,  1.4917e+01,  2.8851e+01]])
action_values:tensor([[0.0089, 0.0068, 0.0070]])
0
tensor([[ 1.0000e+03,  6.7527e+01,  7.5290e+01,  1.0980e+02,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  6.3978e-01, -9.5659e-02,  9.6694e-01,
          5.1330e+01,  5.3464e+01,  5.9735e+01, -6.5388e+01,  5.7251e+01,
          2.0494e+02,  1.7666e+01,  1.4917e+01,  2.8851e+01]])
action_values:tensor([[0.0089, 0.0068, 0.0070]])
0
tensor([[ 1.0000e+03,  6.7527e+01,  7.5290e+01,  1.0980e+02,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  6.3978e-01, -9.5659e-02,  9.6694e-01,
          5.1330e+01,  5.3464e+01,  5.9735e+01, -6.5388e+01,  5.7251e+01,
          2.0494e+02,  1.7666e+01,  1.4917e+01,  2.8851

action_values:tensor([[0.0089, 0.0068, 0.0070]])
0
tensor([[1000.0000,   75.0641,   81.9000,  133.5700,    0.0000,    0.0000,
            0.0000,    1.4334,    1.8269,    4.4889,   59.5420,   61.4401,
           70.7562,   89.3183,   74.3600,  125.7225,   18.3552,   25.5610,
           48.5368]])
action_values:tensor([[0.0089, 0.0068, 0.0070]])
0
tensor([[1000.0000,   74.4171,   82.1200,  133.0900,    0.0000,    0.0000,
            0.0000,    1.3337,    1.7535,    4.4563,   57.9469,   61.8654,
           69.8559,   66.4865,   77.9147,  121.7133,    9.2548,   29.4003,
           51.0383]])
action_values:tensor([[0.0089, 0.0068, 0.0070]])
0
tensor([[1000.0000,   74.4171,   82.1200,  133.0900,    0.0000,    0.0000,
            0.0000,    1.3337,    1.7535,    4.4563,   57.9469,   61.8654,
           69.8559,   66.4865,   77.9147,  121.7133,    9.2548,   29.4003,
           51.0383]])
action_values:tensor([[0.0089, 0.0068, 0.0070]])
0
tensor([[1000.0000,   74.4171,   82.1200,  133.0900,   

action_values:tensor([[0.0089, 0.0068, 0.0070]])
0
tensor([[1000.0000,   80.5571,   89.0000,  138.2700,    0.0000,    0.0000,
            0.0000,    1.1278,    1.4321,    1.4127,   59.9828,   67.1136,
           62.7161,   74.4911,  194.6561,  151.6069,   24.6766,   48.3720,
           20.9284]])
action_values:tensor([[0.0089, 0.0068, 0.0070]])
0
tensor([[1000.0000,   80.0129,   89.1900,  136.9000,    0.0000,    0.0000,
            0.0000,    1.0650,    1.5303,    1.3997,   58.6024,   67.4454,
           60.2927,   54.2802,  185.0245,  119.8344,   17.6857,   50.0327,
           18.3915]])
action_values:tensor([[0.0089, 0.0068, 0.0070]])
0
tensor([[1000.0000,   80.0129,   89.1900,  136.9000,    0.0000,    0.0000,
            0.0000,    1.0650,    1.5303,    1.3997,   58.6024,   67.4454,
           60.2927,   54.2802,  185.0245,  119.8344,   17.6857,   50.0327,
           18.3915]])
action_values:tensor([[0.0089, 0.0068, 0.0070]])
0
tensor([[1000.0000,   80.0129,   89.1900,  136.9000,   

tensor([[ 1.0000e+03,  7.6767e+01,  8.8840e+01,  1.2839e+02,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  4.5549e-02,  4.2462e-01, -1.9294e+00,
          5.1296e+01,  5.5582e+01,  4.6311e+01,  2.9853e+01,  6.2149e+01,
         -4.4062e+01,  3.1967e+00,  2.0575e+01,  1.6618e+01]])
action_values:tensor([[0.0089, 0.0068, 0.0070]])
0
tensor([[ 1.0000e+03,  7.6767e+01,  8.8840e+01,  1.2839e+02,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  4.5549e-02,  4.2462e-01, -1.9294e+00,
          5.1296e+01,  5.5582e+01,  4.6311e+01,  2.9853e+01,  6.2149e+01,
         -4.4062e+01,  3.1967e+00,  2.0575e+01,  1.6618e+01]])
action_values:tensor([[0.0089, 0.0068, 0.0070]])
0
tensor([[ 1.0000e+03,  7.5879e+01,  8.9010e+01,  1.2956e+02,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  1.8422e-02,  4.7738e-01, -1.7398e+00,
          4.9550e+01,  5.5906e+01,  4.7705e+01,  8.5424e-01,  4.3342e+01,
         -4.1653e+01,  3.1626e+00,  1.4077e+01,  1.9852e+01]])
action_values:tensor([[0.0089, 0.0068, 0.00

tensor([[ 1.0000e+03,  7.5760e+01,  8.8720e+01,  1.2688e+02,  0.0000e+00,
          0.0000e+00,  0.0000e+00, -6.0104e-02, -5.9761e-01, -1.6358e-01,
          4.9126e+01,  4.9100e+01,  4.9024e+01, -7.0879e+01, -1.6690e+02,
          2.2192e+00,  1.0765e+01,  1.2234e+01,  1.0248e+01]])
action_values:tensor([[0.0089, 0.0068, 0.0070]])
0
tensor([[ 1.0000e+03,  7.5760e+01,  8.8720e+01,  1.2688e+02,  0.0000e+00,
          0.0000e+00,  0.0000e+00, -6.0104e-02, -5.9761e-01, -1.6358e-01,
          4.9126e+01,  4.9100e+01,  4.9024e+01, -7.0879e+01, -1.6690e+02,
          2.2192e+00,  1.0765e+01,  1.2234e+01,  1.0248e+01]])
action_values:tensor([[0.0089, 0.0068, 0.0070]])
0
tensor([[ 1.0000e+03,  7.5760e+01,  8.8720e+01,  1.2688e+02,  0.0000e+00,
          0.0000e+00,  0.0000e+00, -6.0104e-02, -5.9761e-01, -1.6358e-01,
          4.9126e+01,  4.9100e+01,  4.9024e+01, -7.0879e+01, -1.6690e+02,
          2.2192e+00,  1.0765e+01,  1.2234e+01,  1.0248e+01]])
action_values:tensor([[0.0089, 0.0068, 0.00

action_values:tensor([[0.0089, 0.0068, 0.0070]])
0
tensor([[1.0000e+03, 8.9807e+01, 9.1890e+01, 1.3590e+02, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 2.4923e+00, 8.9170e-01, 1.7024e+00, 6.8030e+01, 5.7590e+01,
         5.9084e+01, 1.3672e+02, 1.8946e+02, 1.6780e+02, 5.0725e+01, 2.9419e+01,
         2.1685e+01]])
action_values:tensor([[0.0089, 0.0068, 0.0070]])
0
tensor([[1.0000e+03, 9.1077e+01, 9.1730e+01, 1.3588e+02, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 2.5456e+00, 9.6466e-01, 1.7725e+00, 6.9824e+01, 5.7162e+01,
         5.9047e+01, 1.5084e+02, 1.5884e+02, 1.6540e+02, 5.3171e+01, 2.6946e+01,
         2.4920e+01]])
action_values:tensor([[0.0089, 0.0068, 0.0070]])
0
tensor([[1.0000e+03, 9.1077e+01, 9.1730e+01, 1.3588e+02, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 2.5456e+00, 9.6466e-01, 1.7725e+00, 6.9824e+01, 5.7162e+01,
         5.9047e+01, 1.5084e+02, 1.5884e+02, 1.6540e+02, 5.3171e+01, 2.6946e+01,
         2.4920e+01]])
action_values:tensor([[0.0089, 0.0068, 0.0070]])


tensor([[ 1.0000e+03,  9.3939e+01,  9.2880e+01,  1.2830e+02,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  1.0546e+00,  5.8362e-02, -1.0246e+00,
          5.9560e+01,  5.0734e+01,  4.7467e+01,  4.1770e+01, -2.6019e+02,
         -5.4349e+01,  6.6276e+00,  2.3905e+01,  2.1937e+01]])
action_values:tensor([[0.0089, 0.0068, 0.0070]])
0
tensor([[ 1.0000e+03,  9.3939e+01,  9.2880e+01,  1.2830e+02,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  1.0546e+00,  5.8362e-02, -1.0246e+00,
          5.9560e+01,  5.0734e+01,  4.7467e+01,  4.1770e+01, -2.6019e+02,
         -5.4349e+01,  6.6276e+00,  2.3905e+01,  2.1937e+01]])
action_values:tensor([[0.0089, 0.0068, 0.0070]])
0
tensor([[ 1.0000e+03,  9.3939e+01,  9.2880e+01,  1.2830e+02,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  1.0546e+00,  5.8362e-02, -1.0246e+00,
          5.9560e+01,  5.0734e+01,  4.7467e+01,  4.1770e+01, -2.6019e+02,
         -5.4349e+01,  6.6276e+00,  2.3905e+01,  2.1937e+01]])
action_values:tensor([[0.0089, 0.0068, 0.00

action_values:tensor([[0.0089, 0.0068, 0.0070]])
0
tensor([[ 1.0000e+03,  9.8360e+01,  8.8930e+01,  1.2798e+02,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  8.7845e-01, -8.2661e-02,  5.0845e-01,
          5.5037e+01,  4.6197e+01,  5.2675e+01,  1.2064e+00,  2.9902e+01,
          8.4181e+01,  4.6938e+00,  1.1056e+01,  1.1078e+01]])
action_values:tensor([[0.0089, 0.0068, 0.0070]])
0
tensor([[ 1.0000e+03,  9.7990e+01,  8.7900e+01,  1.2821e+02,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  6.6710e-01, -1.8187e-01,  6.3137e-01,
          5.4178e+01,  4.3238e+01,  5.3034e+01,  1.6050e+01, -1.2269e+01,
          9.6159e+01,  2.1786e+01,  2.0941e+01,  1.3791e+01]])
action_values:tensor([[0.0089, 0.0068, 0.0070]])
0
tensor([[ 1.0000e+03,  9.7990e+01,  8.7900e+01,  1.2821e+02,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  6.6710e-01, -1.8187e-01,  6.3137e-01,
          5.4178e+01,  4.3238e+01,  5.3034e+01,  1.6050e+01, -1.2269e+01,
          9.6159e+01,  2.1786e+01,  2.0941e+01,  1.3791

action_values:tensor([[0.0089, 0.0068, 0.0070]])
0
tensor([[ 1.0000e+03,  1.0511e+02,  8.6630e+01,  1.2212e+02,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  1.0509e+00, -6.1307e-01, -9.9732e-01,
          6.0020e+01,  4.9029e+01,  4.6011e+01,  1.8572e+02,  6.5990e+00,
         -8.2619e+01,  1.9620e+01,  6.7655e+00,  2.2319e+01]])
action_values:tensor([[0.0089, 0.0068, 0.0070]])
0
tensor([[ 1.0000e+03,  1.0511e+02,  8.6630e+01,  1.2212e+02,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  1.0509e+00, -6.1307e-01, -9.9732e-01,
          6.0020e+01,  4.9029e+01,  4.6011e+01,  1.8572e+02,  6.5990e+00,
         -8.2619e+01,  1.9620e+01,  6.7655e+00,  2.2319e+01]])
action_values:tensor([[0.0089, 0.0068, 0.0070]])
0
tensor([[ 1.0000e+03,  1.0674e+02,  8.8010e+01,  1.2361e+02,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  1.3477e+00, -3.2608e-01, -8.6698e-01,
          6.2167e+01,  5.1543e+01,  4.8067e+01,  2.0445e+02,  5.1736e+01,
         -4.6766e+01,  2.6216e+01,  3.2691e+00,  1.1444

tensor([[ 1.0000e+03,  1.0675e+02,  8.8240e+01,  1.2425e+02,  0.0000e+00,
          0.0000e+00,  0.0000e+00, -2.5500e-01,  2.4546e-01, -1.1497e+00,
          4.7142e+01,  4.5976e+01,  4.5715e+01, -1.2410e+02, -2.0039e+02,
         -7.7527e+01,  3.4449e+01,  1.5915e+01,  1.7877e+01]])
action_values:tensor([[0.0089, 0.0068, 0.0070]])
0
tensor([[ 1.0000e+03,  1.0941e+02,  9.0440e+01,  1.2506e+02,  0.0000e+00,
          0.0000e+00,  0.0000e+00, -4.1478e-01,  1.5812e-01, -1.1875e+00,
          5.1078e+01,  5.0719e+01,  4.6918e+01, -1.0221e+02, -1.3225e+02,
         -7.4770e+01,  3.4449e+01,  1.5915e+01,  2.0014e+01]])
action_values:tensor([[0.0089, 0.0068, 0.0070]])
0
tensor([[ 1.0000e+03,  1.0941e+02,  9.0440e+01,  1.2506e+02,  0.0000e+00,
          0.0000e+00,  0.0000e+00, -4.1478e-01,  1.5812e-01, -1.1875e+00,
          5.1078e+01,  5.0719e+01,  4.6918e+01, -1.0221e+02, -1.3225e+02,
         -7.4770e+01,  3.4449e+01,  1.5915e+01,  2.0014e+01]])
action_values:tensor([[0.0089, 0.0068, 0.00

tensor([[1000.0000,  119.9400,   84.7300,  148.6000,    0.0000,    0.0000,
            0.0000,    2.2940,   -2.0013,    4.7630,   59.4401,   44.0418,
           67.9698,  157.2787,  -56.4929,  175.5622,   22.3576,   29.9667,
           45.7644]])
action_values:tensor([[0.0089, 0.0068, 0.0070]])
0
tensor([[1000.0000,  118.9300,   85.0100,  148.0000,    0.0000,    0.0000,
            0.0000,    2.3496,   -1.7713,    4.8731,   58.1252,   44.6091,
           67.0399,  130.5236,  -41.6929,  154.4481,   17.6409,   19.3011,
           45.8097]])
action_values:tensor([[0.0089, 0.0068, 0.0070]])
0
tensor([[1000.0000,  118.9300,   85.0100,  148.0000,    0.0000,    0.0000,
            0.0000,    2.3496,   -1.7713,    4.8731,   58.1252,   44.6091,
           67.0399,  130.5236,  -41.6929,  154.4481,   17.6409,   19.3011,
           45.8097]])
action_values:tensor([[0.0089, 0.0068, 0.0070]])
0
tensor([[1000.0000,  118.9300,   85.0100,  148.0000,    0.0000,    0.0000,
            0.0000,    2.3496, 

tensor([[ 1.0000e+03,  1.2359e+02,  8.0600e+01,  1.5157e+02,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  9.9779e-01, -9.2216e-01,  2.1509e+00,
          5.3894e+01,  4.4063e+01,  5.9122e+01, -2.6955e+01, -3.2131e+01,
          5.5811e+00,  1.0775e+01,  1.2781e+01,  1.8338e+01]])
action_values:tensor([[0.0089, 0.0068, 0.0070]])
0
tensor([[ 1.0000e+03,  1.2495e+02,  8.1500e+01,  1.5367e+02,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  8.6053e-01, -7.9392e-01,  2.0828e+00,
          5.5305e+01,  4.5737e+01,  6.1109e+01, -2.5107e+01, -1.0001e+01,
          5.3212e+01,  1.0775e+01,  1.2747e+01,  2.5846e+01]])
action_values:tensor([[0.0089, 0.0068, 0.0070]])
0
tensor([[ 1.0000e+03,  1.2495e+02,  8.1500e+01,  1.5367e+02,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  8.6053e-01, -7.9392e-01,  2.0828e+00,
          5.5305e+01,  4.5737e+01,  6.1109e+01, -2.5107e+01, -1.0001e+01,
          5.3212e+01,  1.0775e+01,  1.2747e+01,  2.5846e+01]])
action_values:tensor([[0.0089, 0.0068, 0.00

action_values:tensor([[0.0089, 0.0068, 0.0070]])
0
tensor([[ 1.0000e+03,  1.3265e+02,  7.7490e+01,  1.4780e+02,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  1.2509e+00, -7.1638e-01, -4.5465e-01,
          6.0464e+01,  4.2609e+01,  4.8727e+01,  2.9103e+02, -1.0078e+02,
         -1.2738e+02,  3.4489e+01,  1.8088e+01,  1.7416e+01]])
action_values:tensor([[0.0089, 0.0068, 0.0070]])
0
tensor([[ 1.0000e+03,  1.3265e+02,  7.7490e+01,  1.4780e+02,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  1.2509e+00, -7.1638e-01, -4.5465e-01,
          6.0464e+01,  4.2609e+01,  4.8727e+01,  2.9103e+02, -1.0078e+02,
         -1.2738e+02,  3.4489e+01,  1.8088e+01,  1.7416e+01]])
action_values:tensor([[0.0089, 0.0068, 0.0070]])
0
tensor([[ 1.0000e+03,  1.3056e+02,  7.7490e+01,  1.4751e+02,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  1.3302e+00, -7.3124e-01, -6.7491e-01,
          5.7381e+01,  4.2609e+01,  4.8374e+01,  2.2101e+02, -1.0038e+02,
         -1.4255e+02,  2.1320e+01,  1.9095e+01,  2.4544

action_values:tensor([[0.0089, 0.0068, 0.0070]])
0
tensor([[ 1.0000e+03,  1.2780e+02,  7.8810e+01,  1.4064e+02,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  3.8169e-01,  6.5677e-02, -1.6522e+00,
          5.0574e+01,  4.6341e+01,  4.2933e+01, -4.0805e+01, -1.4616e+01,
         -1.2120e+02,  2.7377e+01,  8.4330e+00,  1.6489e+01]])
action_values:tensor([[0.0089, 0.0068, 0.0070]])
0
tensor([[ 1.0000e+03,  1.2742e+02,  7.9170e+01,  1.4010e+02,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  1.8535e-01,  1.6722e-02, -1.7243e+00,
          5.0041e+01,  4.7430e+01,  4.2294e+01, -6.9091e+01, -2.0007e+01,
         -1.2626e+02,  3.3480e+01,  8.4330e+00,  1.8098e+01]])
action_values:tensor([[0.0089, 0.0068, 0.0070]])
0
tensor([[ 1.0000e+03,  1.2742e+02,  7.9170e+01,  1.4010e+02,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  1.8535e-01,  1.6722e-02, -1.7243e+00,
          5.0041e+01,  4.7430e+01,  4.2294e+01, -6.9091e+01, -2.0007e+01,
         -1.2626e+02,  3.3480e+01,  8.4330e+00,  1.8098

tensor([[ 1.0000e+03,  1.3207e+02,  7.9300e+01,  1.4672e+02,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  2.8580e-01, -1.9987e-01,  1.0810e+00,
          5.7083e+01,  4.9950e+01,  5.3271e+01,  2.3159e+02,  1.7172e+01,
          1.0314e+02,  1.7218e+01,  4.4218e+00,  1.9802e+01]])
action_values:tensor([[0.0089, 0.0068, 0.0070]])
0
tensor([[ 1.0000e+03,  1.3207e+02,  7.9300e+01,  1.4672e+02,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  2.8580e-01, -1.9987e-01,  1.0810e+00,
          5.7083e+01,  4.9950e+01,  5.3271e+01,  2.3159e+02,  1.7172e+01,
          1.0314e+02,  1.7218e+01,  4.4218e+00,  1.9802e+01]])
action_values:tensor([[0.0089, 0.0068, 0.0070]])
0
tensor([[ 1.0000e+03,  1.3075e+02,  7.8950e+01,  1.4500e+02,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  5.4466e-01, -1.5572e-01,  9.6976e-01,
          5.5084e+01,  4.8940e+01,  5.0709e+01,  1.8796e+02, -1.0439e+01,
          5.6185e+01,  1.4627e+01,  5.7576e+00,  7.1439e+00]])
action_values:tensor([[0.0089, 0.0068, 0.00

action_values:tensor([[0.0089, 0.0068, 0.0070]])
0
tensor([[ 1.0000e+03,  1.0927e+02,  7.4080e+01,  1.2976e+02,  0.0000e+00,
          0.0000e+00,  0.0000e+00, -2.6483e+00, -9.6511e-01, -3.6737e+00,
          4.2047e+01,  4.2414e+01,  4.0567e+01, -8.4569e+01, -9.2775e+01,
         -9.2369e+01,  4.8072e+01,  3.2681e+01,  4.3059e+01]])
action_values:tensor([[0.0089, 0.0068, 0.0070]])
0
tensor([[ 1.0000e+03,  1.1231e+02,  7.5490e+01,  1.3382e+02,  0.0000e+00,
          0.0000e+00,  0.0000e+00, -2.3432e+00, -9.2733e-01, -3.2646e+00,
          4.5022e+01,  4.5549e+01,  4.4857e+01, -4.1510e+01, -5.3724e+01,
         -4.9996e+01,  3.8687e+01,  2.1900e+01,  2.8799e+01]])
action_values:tensor([[0.0089, 0.0068, 0.0070]])
0
tensor([[ 1.0000e+03,  1.1231e+02,  7.5490e+01,  1.3382e+02,  0.0000e+00,
          0.0000e+00,  0.0000e+00, -2.3432e+00, -9.2733e-01, -3.2646e+00,
          4.5022e+01,  4.5549e+01,  4.4857e+01, -4.1510e+01, -5.3724e+01,
         -4.9996e+01,  3.8687e+01,  2.1900e+01,  2.8799

KeyboardInterrupt: 

In [None]:
agent.qnetwork_local.load_state_dict(torch.load('IQN1.pth'))

for k, v in agent.qnetwork_local.named_parameters():
        if k=='ff_2.bias':
            print(k, v)

In [163]:
class DQN_Agent():
    """Interacts with and learns from the environment."""

    def __init__(self,
                 state_size,
                 action_size,
                 Network,
                 layer_size,
                 n_step,
                 BATCH_SIZE,
                 BUFFER_SIZE,
                 LR,
                 TAU,
                 GAMMA,
                 UPDATE_EVERY,
                 device,
                 seed):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            Network (str): dqn network type
            layer_size (int): size of the hidden layer
            BATCH_SIZE (int): size of the training batch
            BUFFER_SIZE (int): size of the replay memory
            LR (float): learning rate
            TAU (float): tau for soft updating the network weights
            GAMMA (float): discount factor
            UPDATE_EVERY (int): update frequency
            device (str): device that is used for the compute
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        self.device = device
        self.TAU = TAU
        self.GAMMA = GAMMA
        self.UPDATE_EVERY = UPDATE_EVERY
        self.BATCH_SIZE = BATCH_SIZE
        self.Q_updates = 0
        self.n_step = n_step
        self.N = 32
        self.quantile_tau = torch.FloatTensor([i/self.N for i in range(1,self.N+1)]).to(device)

        self.action_step = 4
        self.last_action = None

        # Q-Network
        
        self.qnetwork_local = QR_DQN(state_size, action_size,layer_size, n_step, seed, self.N).to(device)
        self.qnetwork_target = QR_DQN(state_size, action_size,layer_size, n_step, seed, self.N).to(device)

        self.qnetwork_local.load_state_dict(torch.load('IQN1.pth'))
        self.qnetwork_target.load_state_dict(torch.load('IQN1.pth'))
        
        print('self.qnetwork_local.parameters():{}'.format(self.qnetwork_local.parameters()))
        
        for k, v in self.qnetwork_local.named_parameters():
            print(k, v)
        
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)
        print(self.qnetwork_local)
        
        # Replay memory
        self.memory = ReplayBuffer(BUFFER_SIZE, BATCH_SIZE, self.device, seed, self.GAMMA, n_step)
        
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0
    

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy. Acting only every 4 frames!
        
        Params
        ======
            frame: to adjust epsilon
            state (array_like): current state
            
        """
        state = np.array(state)
        state = torch.from_numpy(state).float().unsqueeze(0).to(self.device)
        self.qnetwork_local.eval()
        
        
        
        with torch.no_grad():
            action_values = self.qnetwork_local.get_action(state)
            #print('action_values:{}'.format(action_values))
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        action = np.argmax(action_values.cpu().data.numpy())
        #print('action:{}'.format(action))
        self.last_action = action
        #print(action)
        #print(state)
        return action

            

In [164]:

def run(env,frames=1000, eps_fixed=False, eps_frames=1e6, min_eps=0.01):
    """Deep Q-Learning.
    
    Params
    ======
        n_episodes (int): maximum number of training episodes
        max_t (int): maximum number of timesteps per episode
        eps_start (float): starting value of epsilon, for epsilon-greedy action selection
        eps_end (float): minimum value of epsilon
        eps_decay (float): multiplicative factor (per episode) for decreasing epsilon
    """
    scores = []                        # list containing scores from each episode
    scores_window = deque(maxlen=100)  # last 100 scores
    output_history = []
    frame = 0
    if eps_fixed:
        eps = 0
    else:
        eps = 1
    eps_start = 1
    i_episode = 1
    state = env.reset()
    state = state[0,:]
    #print("state space:{}".format(state[0,:].shape))
    score = 0                  
    for frame in range(1, frames+1):
        
        if frame  == 0:
            # inital state

            initial = True
        else:
            # previous state
            initial = False
        #print('initial state:{}'.format(initial))

        action = agent.act(state, eps) #TODO: getting one dimension back.
        
        
        action = np.array([action])
        #print(action)
        next_state, reward, done, info = env_train.step([action]) #TODO: Wants a list of actions of size
        
        
        next_state = next_state[0,:]

        state = next_state
        #print("State: {}".format(state))
        score += reward
        # linear annealing to the min epsilon value until eps_frames and from there slowly decease epsilon to 0 until the end of training
        if eps_fixed == False:
            if frame < eps_frames:
                eps = max(eps_start - (frame*(1/eps_frames)), min_eps)
            else:
                eps = max(min_eps - min_eps*((frame-eps_frames)/(frames-eps_frames)), 0.001)

        # evaluation runs
        if frame % 100000 == 0:
            print("state: {}".format(state))
            print("score: {}".format(score))
            #print("state: {}".format(state))
            print("action:{}, Number:{}".format(action,frame))
            print("-------------------------")
        
        if done:
            scores_window.append(score)       # save most recent score
            scores.append(score)              # save most recent score
            writer.add_scalar("Average100", np.mean(scores_window), frame)
            output_history.append(np.mean(scores_window))
            print('\rEpisode {}\tFrame {} \tAverage Score: {:.2f}'.format(i_episode, frame, np.mean(scores_window)), end="")
            if i_episode % 100 == 0:
                print('\rEpisode {}\tFrame {}\tAverage Score: {:.2f}'.format(i_episode,frame, np.mean(scores_window)))
            i_episode +=1 

            state = env.reset()
            state = state[0,:]
            score = 0              

    return output_history


if __name__ == "__main__":
    # read and preprocess data
    preprocessed_path = "done_3stocks.csv"
    if os.path.exists(preprocessed_path):
        data = pd.read_csv(preprocessed_path, index_col=0)

    unique_trade_date = data[(data.datadate > 20101001)&(data.datadate <= 20200707)].datadate.unique()
    #print(unique_trade_date)

    train = data_split(data, start=20100101, end=20200101)
    env_train = DummyVecEnv([lambda: StockEnvTrain(train)])
    
    writer = SummaryWriter("runs/"+"IQN_CP_5")
    seed = 1
    BUFFER_SIZE = 100
    BATCH_SIZE = 8
    GAMMA = 0.99
    TAU = 1e-2
    LR = 1e-3
    UPDATE_EVERY = 1
    n_step = 1
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    
    action_size = env_train.action_space.shape[0]
    state_size = env_train.observation_space.shape[0]
    agent = DQN_Agent(state_size=19,    
                    action_size=3,
                    Network="DDQN",
                    layer_size=512,
                    n_step=n_step,
                    BATCH_SIZE=BATCH_SIZE, 
                    BUFFER_SIZE=BUFFER_SIZE, 
                    LR=LR, 
                    TAU=TAU, 
                    GAMMA=GAMMA, 
                    UPDATE_EVERY=UPDATE_EVERY, 
                    device=device, 
                    seed=seed)

    
    # set epsilon frames to 0 so no epsilon exploration
    eps_fixed = False
    t0 = time.time()
    final_average100 = run(env=env_train, frames = 100000, eps_fixed=eps_fixed, eps_frames=5000, min_eps=0.025)
    t1 = time.time()


self.qnetwork_local.parameters():<generator object Module.parameters at 0x159b5a260>
head_1.weight Parameter containing:
tensor([[-0.2463, -0.2630, -0.1614,  ..., -0.3611, -0.3910, -0.4960],
        [-0.3770, -0.1720, -0.2778,  ..., -0.6713, -0.1004, -0.0239],
        [-0.1456, -0.7444, -0.1435,  ...,  0.0897, -0.0536, -0.4889],
        ...,
        [-0.6994,  0.0897, -0.1608,  ..., -0.1087, -0.2199,  0.1190],
        [-0.4599,  0.4351, -0.2418,  ..., -0.3472, -0.3592, -0.0515],
        [-0.2331,  0.1188, -0.2957,  ...,  0.1661,  0.0593, -0.2946]],
       requires_grad=True)
head_1.bias Parameter containing:
tensor([-3.3902e-05, -2.7106e-01,  1.8301e-02, -1.2399e-01, -3.5248e-01,
        -1.8112e-01, -1.5797e-01, -2.6200e-01, -2.9386e-01, -4.5188e-01,
        -3.8487e-01, -1.4696e-01, -3.9047e-01, -2.1903e-01, -3.4676e-01,
        -3.2200e-01, -1.5252e-01, -4.3159e-01, -4.5385e-01, -1.8346e-01,
        -5.6958e-01, -1.9914e-01, -1.7511e-01,  1.3883e-03, -2.0238e-01,
        -2.9055e-01

Finished
[1000, 291.52, 124.3, 326.4, 0, 0, 0, 7.716996917, 1.489069302, -7.871754813, 73.17316442, 56.03029524, 40.94551713, 150.7355356, 87.1548076, -108.6209868, 48.15525942, 21.00079147, 41.59561359]
end_total_asset:1000.0
Sharpe:  nan
[1.00000000e+03 3.05728571e+01 4.09200000e+01 5.61800000e+01
 0.00000000e+00 0.00000000e+00 0.00000000e+00 5.60928564e-01
 3.58722215e-01 6.24669814e-01 6.21337044e+01 5.67090572e+01
 5.85111632e+01 1.68824972e+02 2.09000651e+00 8.31548524e+01
 3.37911464e+01 1.15370605e+01 1.08349581e+01]
Episode 1	Frame 7546 	Average Score: 0.00[1.00000000e+03 3.05728571e+01 4.09200000e+01 5.61800000e+01
 0.00000000e+00 0.00000000e+00 0.00000000e+00 5.60928564e-01
 3.58722215e-01 6.24669814e-01 6.21337044e+01 5.67090572e+01
 5.85111632e+01 1.68824972e+02 2.09000651e+00 8.31548524e+01
 3.37911464e+01 1.15370605e+01 1.08349581e+01]


  sharpe = (252**0.5)*df_total_value['daily_return'].mean()/ \


Finished
[1000, 291.52, 124.3, 326.4, 0, 0, 0, 7.716996917, 1.489069302, -7.871754813, 73.17316442, 56.03029524, 40.94551713, 150.7355356, 87.1548076, -108.6209868, 48.15525942, 21.00079147, 41.59561359]
end_total_asset:1000.0
Sharpe:  nan
[1.00000000e+03 3.05728571e+01 4.09200000e+01 5.61800000e+01
 0.00000000e+00 0.00000000e+00 0.00000000e+00 5.60928564e-01
 3.58722215e-01 6.24669814e-01 6.21337044e+01 5.67090572e+01
 5.85111632e+01 1.68824972e+02 2.09000651e+00 8.31548524e+01
 3.37911464e+01 1.15370605e+01 1.08349581e+01]
Episode 2	Frame 15092 	Average Score: 0.00[1.00000000e+03 3.05728571e+01 4.09200000e+01 5.61800000e+01
 0.00000000e+00 0.00000000e+00 0.00000000e+00 5.60928564e-01
 3.58722215e-01 6.24669814e-01 6.21337044e+01 5.67090572e+01
 5.85111632e+01 1.68824972e+02 2.09000651e+00 8.31548524e+01
 3.37911464e+01 1.15370605e+01 1.08349581e+01]
Finished
[1000, 291.52, 124.3, 326.4, 0, 0, 0, 7.716996917, 1.489069302, -7.871754813, 73.17316442, 56.03029524, 40.94551713, 150.735535

KeyboardInterrupt: 