In [3]:
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import gym
from gym import spaces
import copy
import os


import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import gym
from gym import spaces
import copy
import os
import torch
import torch.nn as nn
from torch.distributions.categorical import Categorical
import torch.nn.functional as F
from torch.autograd import Variable
from tqdm import tqdm

In [23]:
#Technical indicators list. TODO: put these in a config file


TECHNICAL_INDICATORS_LIST = ["macd", "macds",
                             "boll_ub","boll_lb", 
                             "rsi_14", "cci_14", "dx_14",
                             "open_14_sma", "pdi", "mdi",
                            "dx", "adx", "vr", "wr_14"]

In [24]:
top_dir = 'perc_changes/'

In [25]:
tickers = ['JNJ', 'JPM', 'DIS', 'HD']

In [39]:
#Helper function to perform softmax
def softmax(x):
    return np.exp(x)/sum(np.exp(x))


INITIAL_BALANCE = 1000.0 #Start with balance of 1000 dollars
NUM_PAST_STATES = 15 #use past 14 days of price data as part of observation
EPISODE_LENGTH = 30 #each episode goes for 30 trading days
TRADE_FREQ = 1 #Trade every x amount of days

#OpenAI Gym style environment for RL
class TradeEnv(gym.Env):
    def __init__(self, tickers):
        super(TradeEnv, self).__init__()

        self.tickers = tickers
        
        data_idx = np.random.randint(0, 9)
        self.data_file = pickle.load(open('fake_data/train_ta_{}.p'.format(data_idx), 'rb'))
        
        self.feature_names = list(self.data_file[list(self.data_file.keys())[0]].keys())
        self.feature_names.remove('date')
        self.feature_names.remove('tic')
        self.feature_names.remove('price')

        self.features = {}
        self.means = {}
        self.stds = {}
        self.prices = {}
        for key, value in self.data_file.items():

            self.means[key] = np.mean(value[self.feature_names], axis = 0)
            self.stds[key] = np.std(value[self.feature_names], axis = 0)
            
            #Normalize features to have zero mean and unit standard deviation
            self.features[key] = np.divide(value[self.feature_names] - self.means[key],
                                          self.stds[key])
            
            self.prices[key] = value['price'].values
        
        pickle.dump(self.means, open(top_dir + 'feature_means', 'wb'))
        pickle.dump(self.stds, open(top_dir + 'feature_stds', 'wb'))
        
        
        self.prices = pd.DataFrame.from_dict(self.prices)
        
        #self.prices is a dataframe with each ticker being a key 
        #and the corresponding series representing the stock prices
        
        #Will be used later for normalization
        
        perc_changes = np.divide(self.prices[1:], self.prices[:-1]).values
        
        self.perc_means = np.mean(perc_changes, axis = 0)
        self.perc_stds = np.std(perc_changes, axis = 0)
        
        pickle.dump(self.perc_means, open(top_dir + 'perc_means', 'wb'))
        pickle.dump(self.perc_stds, open(top_dir + 'perc_stds', 'wb'))
        

        self.episode_length = EPISODE_LENGTH #number of trading minutes in episode

        self.num_past_states = NUM_PAST_STATES #number of past days that are used in state

        self.action_space = spaces.Box(low=-1, high=1, shape=(len(self.tickers) + 1,))
                                            

        obs_length = len(self.tickers) #observation due to past stacked states
        obs_length += 1 #balance
        obs_length += len(self.tickers) #holdings
        obs_length += len(self.tickers)*len(self.feature_names) #number of technical analysis features
        obs_length += len(self.tickers)
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf,
                                            shape=(obs_length,))
        
        self.trade_pen_frac = .00002

    def step(self, action_):

        
        #Apply softmax to RL output so that actions sum to 1

        action = softmax(10*action_.numpy()[0])

        #Liquidate past holdings
        self.balance += np.sum(self.holdings)
        
        trade_vol = np.sum(np.abs(self.last_action - action[:len(self.tickers)])*self.balance)
        trade_pen = self.trade_pen_frac*trade_vol


        self.last_action = action[:len(self.tickers)]
        
        self.balance = self.balance - trade_pen
        
        #New Portfolio at end of day
        self.holdings = self.balance*action[:-1]
        self.balance = self.balance*action[-1]
        

        
        #Step into next day
        self.index += TRADE_FREQ
        #Get stock prices at next day
        stock_obs = self.get_stock_obs(self.index)
        self.next_prices = stock_obs[-1]
        
        #Update value of current holdings
        perc_change = np.divide(self.next_prices, self.curr_prices)
        self.holdings = np.multiply(self.holdings, perc_change)

        self.curr_prices = self.next_prices
        
        self.net_worth = self.balance + np.sum(self.holdings)
        

        rew = self.net_worth - self.last_net_worth # reward is the delta between last net worth and current net worth

        self.last_net_worth = self.net_worth
        self.steps += TRADE_FREQ
        done = (self.net_worth <= 0) or (self.steps >= self.episode_length)

        obs = self.get_obs(stock_obs, self.balance, self.holdings, self.index)
        self.cum_rew += rew

        return obs, rew, done, {}
    
    
    def get_stock_obs(self, index):

        
        ret= self.prices[index - self.num_past_states:index][self.tickers].values #stack data
        return ret

    def get_obs(self, stock_obs, balance, holdings, index):
        #Normalize stock prices for inclusion in observations
        perc_changes = np.divide(stock_obs[1:], stock_obs[:-1])[-1:]
        perc_norm = np.divide(perc_changes - self.perc_means,
                             self.perc_stds).reshape(-1,)

        
        feature_vals = np.array([])
        ix = index - 1
        #Add in features at current timestep, for each ticker
        for tic in self.tickers:
            feature_vals = np.append(feature_vals, (self.features[tic].iloc[ix][self.feature_names].values))
        
        #Form observation and normalize balance and holdings
#         balance_norm = (balance - 1000.0/(len(self.tickers) + 1))/50.0
#         holdings_norm = (holdings - 1000.0/(len(self.tickers) + 1))/50.0
        
        net_worth = balance + np.sum(holdings)
        net_worth_norm = (net_worth - 1000.0)
        holding_frac_norm = (holdings/net_worth - .5)*20
        act_norm = (self.last_action - .5)*20
        return np.concatenate([perc_norm, [net_worth_norm], holding_frac_norm, act_norm, feature_vals])

        
    def reset(self):
        
        data_idx = np.random.randint(0, 10)
        self.data_file = pickle.load(open('fake_data/train_ta_{}.p'.format(data_idx), 'rb'))
        
        for key, df in self.data_file.items():
            df['macd'] = np.divide(df['macd'], df['price'])
            df['macds'] = np.divide(df['macds'], df['price'])


            df['boll_ub'] = np.divide(df['boll_ub'], df['price'])
            df['boll_lb'] = np.divide(df['boll_lb'], df['price'])

            df['open_14_sma'] = np.divide(df['open_14_sma'], df['price'])

        self.features = {}

        self.prices = {}
        for key, value in self.data_file.items():
 
            #Normalize features to have zero mean and unit standard deviation
            self.features[key] = np.divide(value[self.feature_names] - self.means[key],
                                          self.stds[key])
            
            self.prices[key] = value['price'].values

        self.prices = pd.DataFrame.from_dict(self.prices)
        
        
        self.cum_rew = 0.0
        self.steps = 0
        self.index = np.random.randint(2*NUM_PAST_STATES, len(self.prices) - EPISODE_LENGTH - 10)

        #self.init_prices = self.prices[self.index-1:self.index + EPISODE_LENGTH]
        stock_obs = self.get_stock_obs(self.index)
        self.holdings = np.zeros(len(self.tickers)) #holdings of each stock in number of shares
        self.balance = INITIAL_BALANCE
        self.last_net_worth = INITIAL_BALANCE
        self.net_worth = INITIAL_BALANCE
        
        self.curr_prices = stock_obs[-1]
        
        self.last_action = np.zeros(len(self.tickers))
        
        

        obs = self.get_obs(stock_obs, self.balance, self.holdings, self.index)
        return obs  # reward, done, info can't be included



In [40]:
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import gym
from gym import spaces
import copy
import os
import torch
import torch.nn as nn
from torch.distributions.categorical import Categorical
import torch.nn.functional as F
from torch.autograd import Variable
from tqdm import tqdm

In [41]:
# these following functions were coded with reference to
# https://github.com/ikostrikov/pytorch-a2c-ppo-acktr-gail

def init_params(m):
    classname = m.__class__.__name__
    if classname.find("Linear") != -1:
        m.weight.data.normal_(0, 1)
        m.weight.data *= 1 / torch.sqrt(m.weight.data.pow(2).sum(1, keepdim=True))
        if m.bias is not None:
            m.bias.data.fill_(0)

def init_gru_params(gru):
    for name, param in gru.named_parameters():
        if 'bias' in name:
            nn.init.constant_(param, 0)
        elif 'weight' in name:
            nn.init.orthogonal_(param)

# this is from https://github.com/p-morais/deep-rl/blob/master/rl/distributions/gaussian.py
class DiagonalGaussian(nn.Module):
    def __init__(self, num_outputs, init_std=1, learn_std=True):
        super(DiagonalGaussian, self).__init__()

        self.logstd = nn.Parameter(
            torch.ones(1, num_outputs) * np.log(init_std),
            requires_grad=learn_std
        )

        self.learn_std = learn_std

    def forward(self, x):
        mean = x
        
#         print(self.logstd.sum())
        std = self.logstd.exp()
        
        return mean, std

    def sample(self, x, deterministic):
        if deterministic is False:
            action = self.evaluate(x).sample()
        else:
            action, _ = self(x)

        return action

    def evaluate(self, x):
        mean, std = self(x)
        output = torch.distributions.Normal(mean, std)
        return output

    
class ACModel(nn.Module):
    def __init__(self, obs_dim, act_dim, num_layers_1=1, num_layers_2 = 2, hidden_size=64, learn_std=True):
        super().__init__()
        
        # num tickers, time horizon, and num ta used to compute the number of inputs
        # for recurrent network, the input size to GRU is just the num tickers (prices at each timestep)
        # and input size to the actor/critic is the current cash, holdings, and technical analysis for each ticker
        # for feedforward, the input size is num tickers * time horizon
        # and input size to the actor/critic is the current cash, holdings, and technical analysis for each ticker

        # TODO changed for cartpole
        self.act_dim = act_dim
        self.obs_dim = obs_dim

        
        self.hidden_size = hidden_size
        #self.num_layers = num_layers
        



        seq_1 = [nn.Linear(obs_dim, hidden_size), nn.Tanh()]
        for i in range(num_layers_1-1):
            seq_1.extend([nn.Linear(hidden_size, hidden_size), nn.Tanh()])

        self.fwd_actor = nn.Sequential(*seq_1)
        self.fwd_critic = nn.Sequential(*seq_1)
        

        mid_layer = int(hidden_size)

        seq_2_act = [nn.Linear(hidden_size, mid_layer), nn.Tanh()]
        seq_2_obs = [nn.Linear(hidden_size, mid_layer), nn.Tanh()]
        for i in range(num_layers_2-2):
            seq_2_act.extend([nn.Linear(mid_layer, mid_layer), nn.Tanh()])
            seq_2_obs.extend([nn.Linear(mid_layer, mid_layer), nn.Tanh()])
            
        seq_2_act.extend([nn.Linear(mid_layer, self.act_dim), nn.Tanh()])
        seq_2_obs.extend([nn.Linear(mid_layer, 1)])

        self.actor = nn.Sequential(*seq_2_act)
        self.critic = nn.Sequential(*seq_2_obs)
        
        
        
        
        self.dist = DiagonalGaussian(self.act_dim, learn_std=learn_std)

        #self.apply(init_params)
    
    def init_hidden(self, batch_size):
        # h0 should be of shape (num_layers * num_directions, batch size, hidden_size)
        # num_layers is 1, and RNN is not bidirectional, so num_dir = 1
        # (1, batch_size, hidden size)
        h = torch.zeros(self.num_layers, batch_size, self.hidden_size)
        return nn.Parameter(h, requires_grad=True)

    def forward(self, obs, rnn_h_a=None, rnn_h_c=None):
        # suppose obs is just a vector of previous prices
        #price_obs = obs[:,:self.num_tickers * self.time_horizon]
        #other_obs = obs[:,self.num_tickers * self.time_horizon:]
        price_obs = obs

        obs_actor = self.fwd_actor(price_obs)
        obs_critic = self.fwd_critic(price_obs)
            
    
        
        forward_actor = self.actor(obs_actor)
        action_dist = self.dist.evaluate(forward_actor)
        
        forward_critic = self.critic(obs_critic)
        
        return action_dist, forward_critic, rnn_h_a, rnn_h_c

In [29]:
tv = torch.FloatTensor(np.random.uniform(size = (10, 4)))

In [44]:
trade_env.data_file.keys()

dict_keys(['JNJ', 'JPM', 'DIS', 'HD'])

In [43]:
trade_env = TradeEnv(tickers=tickers)



In [10]:
class RACModel(nn.Module):
    def __init__(self, obs_dim, act_dim, num_tickers, num_layers_1=1, num_layers_2 = 2, hidden_size=64, learn_std=True):
        super().__init__()
        
        # num tickers, time horizon, and num ta used to compute the number of inputs
        # for recurrent network, the input size to GRU is just the num tickers (prices at each timestep)
        # and input size to the actor/critic is the current cash, holdings, and technical analysis for each ticker
        # for feedforward, the input size is num tickers * time horizon
        # and input size to the actor/critic is the current cash, holdings, and technical analysis for each ticker

        # TODO changed for cartpole
        self.act_dim = act_dim
        self.obs_dim = obs_dim
        self.num_tickers = num_tickers
        self.num_layers_1 = num_layers_1

        
        self.hidden_size = hidden_size
        #self.num_layers = num_layers
        


        
        self.gru_actor = nn.GRU(self.num_tickers, hidden_size, num_layers=num_layers_1, batch_first=True)
        self.gru_critic = nn.GRU(self.num_tickers, hidden_size, num_layers=num_layers_1, batch_first=True)
            
        

        mid_layer = int(hidden_size)

        seq_2_act = [nn.Linear(hidden_size, mid_layer), nn.Tanh()]
        seq_2_obs = [nn.Linear(hidden_size, mid_layer), nn.Tanh()]
        for i in range(num_layers_2-2):
            seq_2_act.extend([nn.Linear(mid_layer, mid_layer), nn.Tanh()])
            seq_2_obs.extend([nn.Linear(mid_layer, mid_layer), nn.Tanh()])
            
        seq_2_act.extend([nn.Linear(mid_layer, self.act_dim), nn.Tanh()])
        seq_2_obs.extend([nn.Linear(mid_layer, 1)])

        self.actor = nn.Sequential(*seq_2_act)
        self.critic = nn.Sequential(*seq_2_obs)


        
        
        
        self.dist = DiagonalGaussian(self.act_dim, learn_std=learn_std)

        #self.apply(init_params)
    
    def init_hidden(self, batch_size):
        # h0 should be of shape (num_layers * num_directions, batch size, hidden_size)
        # num_layers is 1, and RNN is not bidirectional, so num_dir = 1
        # (1, batch_size, hidden size)
        h = torch.zeros(self.num_layers_1, batch_size, self.hidden_size)
        return nn.Parameter(h, requires_grad=True)

    def forward(self, obs, rnn_h_a=None, rnn_h_c=None):
        # suppose obs is just a vector of previous prices
        
        price_obs = obs[:,:self.num_tickers]
        other_obs = obs[:,self.num_tickers:]
        

        if rnn_h_a is None:
            rnn_h_a = self.init_hidden(obs.size(0))
        if rnn_h_c is None:
            rnn_h_c = self.init_hidden(obs.size(0))

        price_obs = torch.reshape(price_obs, (-1, self.num_tickers))
        
        
        gru_obs = price_obs.unsqueeze(1)
        


        obs_actor, rnn_h_a = self.gru_actor(gru_obs, rnn_h_a)

        obs_critic, rnn_h_c = self.gru_critic(gru_obs, rnn_h_c)

        obs_actor = torch.squeeze(obs_actor, 1)
        obs_critic = torch.squeeze(obs_critic, 1)
        forward_actor = self.actor(obs_actor)
        action_dist = self.dist.evaluate(forward_actor)
        
        forward_critic = self.critic(obs_critic)

        
        return action_dist, forward_critic, rnn_h_a, rnn_h_c

In [21]:
class RolloutBuffer:
    def __init__(self, acmodel, env, discount=0.9998, gae_lambda=0.95, device=None):
        # TODO changed for cartpole
        self.episode_length = env.episode_length
        self.env = env
#         self.episode_length = 200 - 1
        self.device = device
        self.acmodel = acmodel
        self.discount = discount
        self.gae_lambda = gae_lambda
        
        self.actions = None
        self.values = None
        self.rewards = None
        self.log_probs = None
        self.obss = None
        self.gaes = None
        self.returns = None
        
        self.num_rollouts = 40
        self.reset()
        
        
    def reset(self):
        self.actions = torch.zeros((self.num_rollouts*self.episode_length, len(self.env.tickers)+1), device=self.device)
        self.values = torch.zeros(self.num_rollouts*self.episode_length, device=self.device)
        self.rewards = torch.zeros(self.num_rollouts*self.episode_length, device=self.device)
        self.returns = torch.zeros(self.num_rollouts*self.episode_length, device=self.device)
        
        self.log_probs = torch.zeros((self.num_rollouts*self.episode_length, len(self.env.tickers)+1), device=self.device)
        self.obss = [None] * (self.episode_length*self.num_rollouts)
        
        self.gaes = torch.zeros(self.num_rollouts*self.episode_length, device=self.device)
        
        self.hidden_as = torch.zeros((self.num_rollouts*self.episode_length, 1, 1, 64), device=self.device)
        self.hidden_cs = torch.zeros((self.num_rollouts*self.episode_length, 1, 1, 64), device=self.device)
                                     
                                     
                                     
    
    def process_obs(self, obs):
        # TODO: formatting stuff
        if isinstance(obs, list):
            obs = np.stack(obs)

        if len(obs.shape) == 1: # 1 dimensional
            obs = np.expand_dims(obs, axis=0)
            
        return torch.FloatTensor(obs)
    
    def collect_experience(self):
        
        total_return = 0
        self.returns = []
        
        for ep in range(self.num_rollouts):
            obs = env.reset()
            hidden_a, hidden_c = None, None
            
            self.actions_ = torch.zeros((self.episode_length, len(self.env.tickers)+1), device=self.device)
            self.values_ = torch.zeros(self.episode_length, device=self.device)
            self.rewards_ = torch.zeros(self.episode_length, device=self.device)
            self.obss_ = [None] * (self.episode_length)
            self.log_probs_ = torch.zeros((self.episode_length, len(self.env.tickers)+1), device=self.device)
        
            self.hidden_as_ = torch.zeros((self.episode_length, 1, 1, 64), device=self.device)
            self.hidden_cs_ = torch.zeros((self.episode_length, 1, 1, 64), device=self.device)
            
            
            
            T = 0
        
            while True:
                with torch.no_grad():
                    dist, value, hidden_a, hidden_c = self.acmodel(self.process_obs(obs),
                                                                  rnn_h_a = hidden_a,
                                                                  rnn_h_c = hidden_c)

                action = dist.sample()

                self.obss_[T] = obs

                obs, reward, done, _ = env.step(action)
                
                total_return += reward
                
                self.actions_[T] = action[0]
                self.values_[T] = value
                self.rewards_[T] = float(reward)
                self.log_probs_[T] = dist.log_prob(action)[0]

                self.hidden_as_[T] = hidden_a
                self.hidden_cs_[T] = hidden_c


                T += 1
                if done:
                    break
                    
            self.actions[ep*self.episode_length:(ep+1)*self.episode_length] = self.actions_[:T]
            self.values[ep*self.episode_length:(ep+1)*self.episode_length] = self.values_[:T]
            self.rewards[ep*self.episode_length:(ep+1)*self.episode_length] = self.rewards_[:T]
            
            self.hidden_as[ep*self.episode_length:(ep+1)*self.episode_length] = self.hidden_as_[:T]
            self.hidden_cs[ep*self.episode_length:(ep+1)*self.episode_length] = self.hidden_cs_[:T]
            
            
            discounted_reward = 0.0
            self.returns_ = []
            for r in reversed(self.rewards_):
                discounted_reward = r + self.discount*discounted_reward
                self.returns_.insert(0, discounted_reward)
            self.returns[ep*self.episode_length:(ep+1)*self.episode_length] = self.returns_[:T]
            

            self.log_probs[ep*self.episode_length:(ep+1)*self.episode_length] = self.log_probs_[:T]
            
            self.obss[ep*self.episode_length:(ep+1)*self.episode_length] = self.process_obs(self.obss_[:T])
            self.gaes_ = self.compute_advantage_gae(self.rewards_, self.values_, T)
            self.gaes[ep*self.episode_length:(ep+1)*self.episode_length] = self.gaes_[:T]
            
        self.obss = torch.FloatTensor(np.stack(self.obss))
        self.returns = torch.FloatTensor(self.returns)
        
        return total_return/self.num_rollouts, T
            
    def compute_advantage_gae(self, rewards, values, T):

        deltas = torch.cat((rewards[:-1] + self.discount*values[1:] - values[:-1], rewards[-1:] - values[-1:]))
        deltas_flip = torch.flip(deltas, [0])
        A = 0.0*deltas_flip[0]
        advantages = torch.zeros_like(values)
        for i in range(len(deltas_flip)):
            A = self.discount*self.gae_lambda*A + deltas_flip[i]
            advantages[i] = A
        advantages = torch.flip(advantages, [0])
        return advantages
            
        

In [22]:
class PPO:
    def __init__(self,
                 acmodel,
                 clip_ratio=0.2,
                 entropy_coef=0.01,
                 lr=2e-4,
                 target_kl=0.01,
                 train_iters=10):
        
        self.acmodel = acmodel
        self.clip_ratio = clip_ratio
        self.entropy_coef = entropy_coef
        self.target_kl=target_kl
        self.train_iters = train_iters
        
        self.optimizer = torch.optim.Adam(acmodel.parameters(), lr=lr)
        
    def update(self, rollouts):
        # rollouts should be RolloutBuffer object

        dist, _, _, _ = self.acmodel(rollouts.obss) # TODO may need to process these observations
        old_logp = dist.log_prob(rollouts.actions.view(-1,self.acmodel.act_dim)).detach()
        

#         policy_loss, _ = self._compute_policy_loss_ppo(rollouts.obss, old_logp, rollouts.actions, rollouts.gaes)
#         value_loss = self._compute_value_loss(rollouts.obss, rollouts.returns)
        
        batch_size = 256
        avg_policy_loss, avg_value_loss = 0, 0
        for i in range(self.train_iters):
            rand_idxs = np.random.choice(np.arange(len(rollouts.obss)), batch_size , replace = False)
            
            self.optimizer.zero_grad()
            pi_loss, approx_kl = self._compute_policy_loss_ppo(rollouts.obss[rand_idxs], old_logp[rand_idxs], 
                                                               rollouts.actions[rand_idxs], rollouts.gaes[rand_idxs],
                                                              rollouts.hidden_as[rand_idxs],
                                                              rollouts.hidden_cs[rand_idxs])

            v_loss = self._compute_value_loss(rollouts.obss[rand_idxs], rollouts.returns[rand_idxs],
                                             rollouts.hidden_as[rand_idxs],
                                                              rollouts.hidden_cs[rand_idxs])
            #print ('vloss:', v_loss, 'pi_loss:', pi_loss)
            loss = .01*v_loss + pi_loss
            
            avg_policy_loss += pi_loss.item()
            avg_value_loss += v_loss.item()
   
            
            if approx_kl > 1.5 * self.target_kl:
                break
            
            loss.backward(retain_graph=True) # lol todo are we supposed to retain graph?
            #loss.backward()
            
#             torch.nn.utils.clip_grad_norm_(acmodel.parameters(), .5)
            
            self.optimizer.step()
            
        return avg_policy_loss/self.train_iters, avg_value_loss/self.train_iters
        
    def _compute_policy_loss_ppo(self, obs, old_logp, actions, advantages, hidden_as, hidden_cs):
        policy_loss, approx_kl = 0, 0
        

        dist, _, _, _ = self.acmodel(obs, 
                                     rnn_h_a = torch.swapaxes(torch.squeeze(hidden_as, 2), 0, 1),
                                    rnn_h_c = torch.swapaxes(torch.squeeze(hidden_cs, 2), 0, 1))
        
        new_logp = dist.log_prob(actions.view(-1,self.acmodel.act_dim))
        
        entropy = torch.mean(dist.entropy())
        entropy = entropy * self.entropy_coef


        r = torch.exp(new_logp - old_logp.detach())

        clamp_adv = torch.clamp(r, 1-self.clip_ratio, 1+self.clip_ratio)*advantages.view(-1,1)
        
        min_advs = torch.minimum(r*advantages.view(-1,1), clamp_adv)
        

        policy_loss = -torch.mean(min_advs) - entropy
        approx_kl = (old_logp - new_logp).mean()
        
        return policy_loss, approx_kl
    
    def _compute_value_loss(self, obs, returns, hidden_as, hidden_cs):
        _, values, _, _ = self.acmodel(obs, 
                                       rnn_h_a = torch.swapaxes(torch.squeeze(hidden_as, 2), 0, 1),
                                    rnn_h_c = torch.swapaxes(torch.squeeze(hidden_cs, 2), 0, 1))
        
        value_loss = torch.mean((returns.view(-1,1) - values.view(-1,1))**2)

        return value_loss



In [1]:
# Adapted from 6.884 HW4

def run_experiment(acmodel, env, ppo_kwargs, rollout_kwargs = {}, max_episodes=200000, score_threshold=0.8):
    # acmodel_args should be dictionary corresponding to inputs of acmodel
    # ie {num_tickers: 4, time_horizon: 5, etc..}
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    acmodel.to(device)

    is_solved = False
    
    SMOOTH_REWARD_WINDOW = 30

    pd_logs, rewards = [], [0]*SMOOTH_REWARD_WINDOW
    
    num_frames = 0
    rollouts = RolloutBuffer(acmodel, env, **rollout_kwargs)
    ppo = PPO(acmodel, **ppo_kwargs)

    pbar = tqdm(range(max_episodes))
    for update in pbar:
        rollouts.reset() # resetting the buffer
        total_return, T = rollouts.collect_experience()
        policy_loss, value_loss = ppo.update(rollouts)
        
        num_frames += T
        rewards.append(total_return)
        
        smooth_reward = np.mean(rewards[-SMOOTH_REWARD_WINDOW:])

        data = {'episode':update, 'num_frames':num_frames, 'smooth_reward':smooth_reward,
                'reward':total_return, 'policy_loss': policy_loss, 'value_loss': value_loss}

        pd_logs.append(data)

        pbar.set_postfix(data)

        # Early terminate
        if smooth_reward >= score_threshold:
            is_solved = True
            break
#         if update % 20 == 0 and update != 0:
#             pickle.dump(pd.DataFrame(pd_logs).set_index('episode'),
#                        open('train_df.p', 'wb'))

    if is_solved:
        print('Solved!')
    else:
        print('Unsolved. Check your implementation.')
    
    return pd.DataFrame(pd_logs).set_index('episode')

In [2]:
env = TradeEnv(tickers=tickers)

NameError: name 'TradeEnv' is not defined

In [27]:
num_ind = len(TECHNICAL_INDICATORS_LIST) + 1 # plus volume

acmodel_kwargs = {'hidden_size': 64, 'num_layers_1': 1, 'num_layers_2': 2}
acmodel = RACModel(env.observation_space.shape[0],
                  env.action_space.shape[0],
                   len(tickers),
                  **acmodel_kwargs)
ppo_kwargs = {'lr': 1e-3, 'entropy_coef': 0.01, 'train_iters': 10}
fwd_df = run_experiment(acmodel, env, ppo_kwargs, max_episodes=70000, score_threshold=10000)

  1%|          | 754/70000 [1:03:00<96:27:18,  5.01s/it, episode=753, num_frames=22620, smooth_reward=4.78, reward=10, policy_loss=-1.76, value_loss=3.51e+3]   


KeyboardInterrupt: 

In [26]:
torch.save(acmodel.state_dict(), 'rnn_model')

In [134]:
num_ind = len(TECHNICAL_INDICATORS_LIST) + 1 # plus volume
acmodel_kwargs = {'hidden_size': 128, 'num_layers_1': 1, 'num_layers_2': 2}
acmodel = ACModel(env.observation_space.shape[0],
                  env.action_space.shape[0],
                  **acmodel_kwargs)
ppo_kwargs = {'lr': 2e-3, 'entropy_coef': 0.01, 'train_iters': 10}
fwd_df = run_experiment(acmodel, env, ppo_kwargs, max_episodes=70000, score_threshold=10000)

  1%|          | 577/70000 [33:59<68:09:07,  3.53s/it, episode=576, num_frames=17310, smooth_reward=24.9, reward=42.4, policy_loss=-10.2, value_loss=2.96e+3]   


KeyboardInterrupt: 

<All keys matched successfully>