<a href="https://colab.research.google.com/github/interritus141/COMP0031-Group-Research-Project/blob/master/COMP0031.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

In [None]:
import gym
from stable_baselines3 import A2C, DDPG, DQN, PPO
from ta import add_all_ta_features
import yfinance as yf
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import torch

import functools
import os
import csv
import json
from math import prod

# Helpers

In [None]:
def plot_line_graph(env, x_vals, y_dict, title):
  """Draw one labelled line per agent and display the figure.

  Args:
    env: object exposing ``possible_agents``, the agent names to plot.
    x_vals: x coordinates shared by every line.
    y_dict: mapping from agent name to that agent's y values.
    title: title shown above the plot.
  """
  agent_names = env.possible_agents
  for name in agent_names:
    series = y_dict[name]
    plt.plot(x_vals, series, label=name)

  plt.title(title)
  plt.legend()
  plt.show()

In [None]:
def calculate_sharpe(portfolio_return_mem):
  """Return the annualised Sharpe ratio of a sequence of daily returns.

  The ratio is ``sqrt(252) * mean / std`` using the sample standard
  deviation.

  Args:
    portfolio_return_mem: sequence of daily portfolio returns.

  Returns:
    The Sharpe ratio, or -1 when it cannot be computed: the sequence is
    empty, has a single element (sample std is undefined) or has zero spread.
  """
  daily_returns = pd.Series(portfolio_return_mem, dtype="float64")
  spread = daily_returns.std()
  # NaN spread (fewer than two samples) must not leak out as a NaN ratio:
  # the results are later written to JSON, where NaN is not valid.
  if pd.isna(spread) or spread == 0:
    return -1 # -1 means error
  return (252**0.5) * daily_returns.mean() / spread

In [None]:
def save_to_json(filename, data_dict):
  """Serialise ``data_dict`` as JSON into ``filename``, replacing the file.

  Args:
    filename: path of the file to (over)write.
    data_dict: JSON-serialisable object to store.
  """
  with open(filename, "w") as json_file:
    json.dump(data_dict, fp=json_file)

# Data

## Tech Indicators

In [None]:
def add_ta(df):
  """Add technical indicators to an OHLCV frame and keep a fixed column set.

  Runs ``add_all_ta_features`` on ``df``, keeps the price/volume columns plus
  seven indicator columns, and fills missing values with the column mean.

  Args:
    df: frame with "Open", "High", "Low", "Close", "Volume" and "Adj Close"
      columns.

  Returns:
    A frame restricted to the kept columns with no missing values left where
    a column mean exists.
  """
  kept_columns = [
      "Open", "High", "Low", "Close", "Volume", "Adj Close",
      "volume_obv", "volume_adi", "trend_adx", "momentum_ao",
      "trend_macd", "momentum_rsi", "momentum_stoch",
  ]
  with_indicators = add_all_ta_features(
      df, open="Open", high="High", low="Low", close="Close", volume="Volume"
  )
  selected = with_indicators[kept_columns]
  column_means = selected.mean()
  return selected.fillna(column_means)

## Stocks

1. Apple Inc. (AAPL)
2. Microsoft Corp. (MSFT)
3. Amazon.com, Inc. (AMZN)
4. Tesla, Inc. (TSLA)
5. Nvidia Corp. (NVDA)

In [None]:
# yfinance options for reference:
# interval = 1m,2m,5m,15m,30m,60m,90m,1h,1d,5d,1wk,1mo,3mo
# prepost = T/F

# ticker -> prepared price/indicator frame (filled in by the loop below)
stocks = dict.fromkeys(["AAPL", "MSFT", "AMZN", "TSLA", "NVDA", "CAAS"])
# ticker -> annualised volatility (computed later by add_volatility)
stock_volatilities = {}

for stock in stocks:
  stock_df = yf.download(stock, start="2018-01-01", end="2022-12-31", keepna=True)
  # replace gaps with the column mean before computing indicators
  column_means = stock_df.mean()
  stock_df = add_ta(stock_df.fillna(column_means))
  stocks[stock] = stock_df
  stock_volatilities[stock] = None

## Volatility

In [None]:
def add_volatility(df, df_name):
  """Add a "Log returns" column to ``df`` and record its annualised volatility.

  The volatility (standard deviation of the daily log returns scaled by
  ``sqrt(252)``) is stored in the module-level ``stock_volatilities`` under
  ``df_name``. ``df`` is modified in place; nothing is returned.
  """
  closing = df['Close']
  previous_closing = closing.shift()
  df["Log returns"] = np.log(closing / previous_closing)

  daily_spread = df['Log returns'].std()
  stock_volatilities[df_name] = daily_spread * 252 ** .5

In [None]:
def visualise_volatility(df, df_name, volatility):
  """Plot a histogram of the "Log returns" column of ``df``.

  Args:
    df: frame that already contains a "Log returns" column.
    df_name: name shown in the plot title.
    volatility: volatility as a fraction; shown in the title as a percentage.
  """
  _, axes = plt.subplots()
  log_returns = df['Log returns']
  log_returns.hist(ax=axes, bins=50, alpha=0.6, color='b')

  percent = volatility*100
  axes.set_xlabel("Log return")
  axes.set_ylabel("Freq of log return")
  axes.set_title("{:s} volatility: {:.2f}%".format(df_name, percent))

# Compute the volatility of every downloaded stock, then show the results.
for stock in stocks:
  stock_df = stocks[stock]
  add_volatility(stock_df, stock)
  # visualise_volatility(stock_df, stock, stock_volatilities[stock]) # use to generate graphs

print(stock_volatilities)

## Cov

In [None]:
def add_cov(df, lookback=252):
  """Attach rolling return/covariance data to a price frame.

  For every row position ``i >= lookback`` the ``lookback`` rows before it
  (row ``i`` itself excluded) are turned into per-date mean "Close" prices;
  their percentage changes and the covariance matrix of those changes are
  stored next to the row's date.

  Args:
    df: frame whose index becomes a "Date" column after ``reset_index`` and
      which has a "Close" column.
    lookback: number of preceding rows in each window. Defaults to 252
      (one year, per the original comment).

  Returns:
    A new frame, sorted by "Date" with a fresh integer index, holding the
    rows of ``df`` that have a full window plus the columns "cov_list"
    (covariance matrix as an array) and "return_list" (frame of returns).
    Rows within the first ``lookback`` positions are dropped by the merge.
  """
  df = df.reset_index()

  cov_list = []
  return_list = []

  for window_end in range(lookback, len(df.index.unique())):
    window = df.iloc[window_end - lookback:window_end, :]
    prices = window.pivot_table(index='Date', values='Close')
    returns = prices.pct_change().dropna()
    return_list.append(returns)
    cov_list.append(returns.cov().values)

  df_cov = pd.DataFrame({
      'Date': df["Date"].unique()[lookback:],
      'cov_list': cov_list,
      'return_list': return_list,
  })
  # inner merge: only dates that received a window survive
  merged = df.merge(df_cov, on='Date')
  return merged.sort_values(['Date']).reset_index(drop=True)

In [None]:
# Attach rolling covariance data to the two stocks used by the environments.
# NOTE(review): the original comments labelled AAPL "high volatility" and
# TSLA "low volatility" - confirm against the printed stock_volatilities.
new_aapl_df, new_tsla_df = (
    add_cov(stocks[ticker]) for ticker in ("AAPL", "TSLA")
)

In [None]:
# Tag a copy of each stock's frame with its ticker, then stack the copies
# into the single frame consumed by the environments.
data_aapl_df = new_aapl_df.copy()
data_tsla_df = new_tsla_df.copy()

data_aapl_df["tic"] = "AAPL"
data_tsla_df["tic"] = "TSLA"

mixed_df = pd.concat([data_aapl_df, data_tsla_df])

# Environment

## Agent

In [None]:
# Number of training rounds passed to each environment's learn().
training_timesteps = 5
# Policy identifier handed to every stable-baselines3 model constructor.
policy = "MlpPolicy"

In [None]:
# Algorithm name -> stable-baselines3 model class.
AGENT_STR_TO_OBJECT = dict(
    A2C=A2C,
    DDPG=DDPG,
    PPO=PPO,
)

In [None]:
# Environment type -> names of its agents. Single-algorithm types get three
# numbered agents; "Mixed" gets one agent per algorithm.
TYPE_TO_NAMES = {
    algo: [f"{algo}{number}" for number in range(1, 4)]
    for algo in ("A2C", "DDPG", "PPO")
}
TYPE_TO_NAMES["Mixed"] = ["A2C", "DDPG", "PPO"]

In [None]:
# configurations

# number of distinct tickers in the combined frame
stock_dimension = len(mixed_df["tic"].unique())
# The environments build a state with one column per stock (one covariance
# row plus one row per indicator), so the observation width has to follow the
# number of tickers instead of being hard-coded.
state_space = stock_dimension

# keyword arguments shared by every environment constructor
env_kwargs = {
  "hmax": 100, 
  "initial_amount": 1000000, 
  "transaction_cost_pct": 0.001, 
  "state_space": state_space, 
  "stock_dim": stock_dimension, 
  "tech_indicator_list": [
    "volume_obv",
    "volume_adi", 
    "trend_adx", 
    "momentum_ao", 
    "trend_macd", 
    "momentum_rsi", 
    "momentum_stoch"
  ], 
  "action_space": stock_dimension, 
  "reward_scaling": 1e-4,
}

## Competitive

In [None]:
class CompetitivePMEnv(gym.Env):
  """Gym environment where several RL agents manage portfolios and compete.

  Every agent trades the same stocks over the same days. One agent at a time
  (the "training agent") receives its action through ``step``; the other
  agents replay actions pre-computed by ``collect_individual_preds``. An
  agent's reward is its total value multiplied by the product of the ratios
  between its own value and every agent's value, so doing better than the
  others increases the reward.

  Relies on the module-level names ``TYPE_TO_NAMES``, ``AGENT_STR_TO_OBJECT``,
  ``policy`` and ``calculate_sharpe``.
  """
  metadata = {"render_modes": ["human"], "name": "marlpm_v1"}

  def __init__(
      self, 
      df,
      stock_dim,
      hmax,
      initial_amount,
      transaction_cost_pct,
      reward_scaling,
      state_space,
      action_space,
      tech_indicator_list,
      turbulence_threshold=None,
      lookback=252,
      day=0,
      render_mode=None,
      algo_type="Mixed", # default=mixed
  ):
    """Store the configuration, create the agents and load the first day.

    Args:
      df: market data looked up with ``df.loc[day]``; needs "Date", "Close",
        "cov_list" and every column of ``tech_indicator_list``. Assumes each
        day label selects one row per stock - TODO confirm with the caller.
      stock_dim: number of stocks (length of the initial weight vector).
      hmax, transaction_cost_pct, reward_scaling, turbulence_threshold,
        lookback: stored on the instance; not used by this class.
      initial_amount: starting cash and total value of every agent.
      state_space: number of columns of an observation (one per stock).
      action_space: length of an action vector.
      tech_indicator_list: indicator columns appended to the state.
      day: day index the environment starts on (``reset`` restarts at 0).
      render_mode: stored on the instance.
      algo_type: "A2C", "DDPG", "PPO" or "Mixed"; selects agent names and
        the algorithm behind each agent.
    """
    assert algo_type in ["A2C", "DDPG", "PPO", "Mixed"]

    # attributes
    self.lookback = lookback
    self.df = df
    self.stock_dim = stock_dim
    self.hmax = hmax
    self.initial_amount = initial_amount
    self.transaction_cost_pct = transaction_cost_pct
    self.reward_scaling = reward_scaling
    self.state_space = state_space
    self.action_dim = action_space
    self.tech_indicator_list = tech_indicator_list
    self.turbulence_threshold = turbulence_threshold
    self.render_mode = render_mode
    self.possible_agents = TYPE_TO_NAMES[algo_type]

    # buy/sell ratio reference, to explore
    self.end_day = len(self.df.index.unique()) - 1
    self.stock_volume_reference = 10000

    # spaces
    self.action_space = gym.spaces.Box(low=-1, high=1, shape=(self.action_dim,))
    # indicator values can be negative, so the lower bound is unbounded too
    self.observation_space = gym.spaces.Box(
        low=-np.inf,
        high=np.inf,
        shape=(1 + len(self.tech_indicator_list), self.state_space),
    )

    # agents
    self.agent_name_mapping = {
        agent: self._build_agent(agent, algo_type) for agent in self.possible_agents
    }
    self.training_agent = None

    # individual actions collection (kept across resets)
    self.individual_preds = {
        agent: [] for agent in self.possible_agents
    }

    # data + memory
    self._start_episode(day)

  def _build_agent(self, agent, algo_type):
    """Create the stable-baselines3 model that backs ``agent``."""
    # In "Mixed" mode each agent is named after the algorithm it uses.
    algo_cls = AGENT_STR_TO_OBJECT[agent if algo_type == "Mixed" else algo_type]
    if algo_cls is DDPG:
      # DDPG's constructor does not take an n_steps argument
      return algo_cls(policy, self)
    return algo_cls(policy, self, n_steps=self.end_day)

  def _market_state(self, day_data):
    """Return ``(covs, state)`` built from the rows of a single day."""
    covs = [[x[0][0] for x in day_data['cov_list']]]
    indicators = [day_data[tech].values.tolist() for tech in self.tech_indicator_list]
    return covs, np.append(np.array(covs), indicators, axis=0)

  def _load_day(self, agent):
    """Refresh data, covariances and state of ``agent`` for its current day."""
    self.data[agent] = self.df.loc[self.day[agent], :]
    self.covs[agent], self.state[agent] = self._market_state(self.data[agent])

  def _start_episode(self, day):
    """Put every agent on ``day`` and clear all per-episode memory."""
    agents = self.possible_agents
    start = self.initial_amount

    self.day = {agent: day for agent in agents}
    self.data, self.covs, self.state = {}, {}, {}
    for agent in agents:
      self._load_day(agent)
    self.terminal = False

    # memory
    self.portfolio_value = {agent: start for agent in agents}
    self.asset_memory = {agent: [start] for agent in agents}
    self.portfolio_return_memory = {agent: [0] for agent in agents}
    self.cum_portfolio_return_memory = {agent: [0] for agent in agents}

    # stock ratio
    self.actions_memory = {agent: [[0]*self.stock_dim] for agent in agents}
    self.date_memory = {
        agent: [self.data[agent]["Date"].unique()[0]] for agent in agents
    }

    # free cash
    self.money_memory = {agent: [start] for agent in agents}
    # cash + stock value
    self.total_value_memory = {agent: [start] for agent in agents}

    # rewards
    self.reward = {agent: None for agent in agents}

  def collect_individual_preds(self):
    """Pre-compute every agent's action for each day from 0 to ``end_day``.

    The results are stored in ``individual_preds`` and replayed by ``step``
    for the agents that are not currently training.
    """
    self.individual_preds = {
        agent: [] for agent in self.possible_agents
    }
    for i in range(self.end_day+1):
      # states are somewhat static
      _, curr_state = self._market_state(self.df.loc[i,:])
      for agent in self.possible_agents:
        action, _states = self.agent_name_mapping[agent].predict(curr_state)
        self.individual_preds[agent].append(action)

  def _print_episode_summary(self, agent):
    """Print begin/end total value and the Sharpe ratio of ``agent``."""
    print("=================================")
    print("begin_total_asset:{}".format(self.asset_memory[agent][0]))
    # the last entry is the value at the end of the episode
    print("end_total_asset:{}".format(self.total_value_memory[agent][-1]))

    daily_returns = pd.Series(self.portfolio_return_memory[agent])
    spread = daily_returns.std()
    if spread != 0:
      sharpe = (252**0.5) * daily_returns.mean() / spread
      print("Sharpe: ",sharpe)
    print("=================================")

  def _trade(self, agent, action):
    """Apply ``action`` for ``agent``, move it to the next day, log results."""
    # normalisation
    weights = self.softmax_normalization(action)
    close = self.data[agent]["Close"].values

    # stock ratio - buy/sell/hold
    diff_stock_ratio = self.actions_memory[agent][-1] - weights

    # money - increase if sell, decrease if buy, no changes if hold
    curr_money = self.money_memory[agent][-1] + sum(diff_stock_ratio * self.stock_volume_reference * close)
    self.money_memory[agent].append(curr_money)

    # total value - money + currently held stock value
    curr_total = curr_money + sum(weights * self.stock_volume_reference * close)
    self.total_value_memory[agent].append(curr_total)
    self.actions_memory[agent].append(weights)

    # load next state
    self.day[agent] += 1
    self._load_day(agent)

    # portfolio return: individual stocks' return * weight
    portfolio_return = sum(((self.data[agent]["Close"].values / close)-1)*weights)

    # save into memory
    self.portfolio_return_memory[agent].append(portfolio_return)
    self.cum_portfolio_return_memory[agent].append(self.cum_portfolio_return_memory[agent][-1] + portfolio_return)
    self.date_memory[agent].append(self.data[agent]["Date"].unique()[0])
    self.asset_memory[agent].append(curr_total)

    # the raw reward is the new total value; shaped afterwards
    self.reward[agent] = curr_total

  def _shape_rewards(self):
    """Scale each raw reward by how the agent compares with all agents."""
    all_rewards = np.asarray(list(self.reward.values()))
    for agent in self.possible_agents:
      # ratio = current agent / other agent
      # if reward of current agent > other agent, ratio > 1, reward is increased
      # else ratio < 1, reward is penalised
      self.reward[agent] *= prod(self.reward[agent] / all_rewards)

      # if money on hand is negative, large penalty is applied as this is unwanted
      if self.money_memory[agent][-1] < 0:
        self.reward[agent] *= -1

  def step(self, actions):
    """Advance every agent by one day.

    Args:
      actions: action of the current training agent; the other agents use
        the actions stored by ``collect_individual_preds``.

    Returns:
      ``(observation, reward, done, info)`` for the training agent. Once the
      training agent has reached ``end_day`` a summary is printed and the
      last state and reward are returned with ``done`` set to True.
    """
    trainee = self.training_agent
    self.terminal = self.day[trainee] >= self.end_day

    if self.terminal:
      self._print_episode_summary(trainee)
      return self.state[trainee], self.reward[trainee], self.terminal, {}

    # every agent acts (portfolio weights) on the current day
    for agent in self.possible_agents:
      if agent == trainee:
        action = actions
      else:
        action = self.individual_preds[agent][self.day[agent]]
      self._trade(agent, action)

    # penalise or reward each agent based on the result of all other agents
    self._shape_rewards()

    return self.state[trainee], self.reward[trainee], self.terminal, {}

  def reset(self, seed=None, return_info=False, options=None):
    """Restart the episode at day 0 and return the training agent's state.

    ``seed``, ``return_info`` and ``options`` are accepted but not used.
    """
    # agents
    self.agents = self.possible_agents[:]

    # attributes + memory + rewards
    self._start_episode(0)

    return self.state[self.training_agent] 

  def render(self):
    """Return the current state of the training agent."""
    return self.state[self.training_agent]

  def seed(self, seed=None):
    """Seed the environment's random generator and return ``[seed]``."""
    # the file never imports seeding at module level
    from gym.utils import seeding
    self.np_random, seed = seeding.np_random(seed)
    return [seed]
  
  def softmax_normalization(self, actions):
    """Turn raw actions into positive weights that sum to one."""
    exponentials = np.exp(actions)
    return exponentials / np.sum(exponentials)

  def set_training_agent(self, agent):
    """Select the agent whose actions arrive through ``step``."""
    self.training_agent = agent

  def _snapshot_memory(self):
    """Return per-agent (total value, cash, daily return, cumulative return)."""
    agents = self.possible_agents
    return (
        {agent: self.total_value_memory[agent] for agent in agents},
        {agent: self.money_memory[agent] for agent in agents},
        {agent: self.portfolio_return_memory[agent] for agent in agents},
        {agent: self.cum_portfolio_return_memory[agent] for agent in agents},
    )

  def learn(self, total_timesteps=1000):
    """Train all agents in turn for ``total_timesteps`` rounds.

    In each round the actions of all agents are pre-computed, then every
    agent in turn becomes the training agent and its model's ``learn`` is
    called with ``total_timesteps=1``.

    Returns:
      ``(init_pv, final_pv, init_cash, final_cash, init_daily_pr,
      final_daily_pr, init_cum_pr, final_cum_pr, sharpe_ratio)``. The
      ``init_*`` dicts are taken after the first round (empty when no round
      runs), the ``final_*`` dicts after the last one; ``sharpe_ratio`` maps
      each agent to one Sharpe ratio per round.
    """
    init_pv, init_cash, init_daily_pr, init_cum_pr = {}, {}, {}, {}
    sharpe_ratio = {
        agent: [] for agent in self.possible_agents
    }

    # run till terminal in each timestep
    for n in range(total_timesteps):
      print("Step:", n+1)
      self.collect_individual_preds()
      for agent in self.possible_agents:
        self.set_training_agent(agent)
        self.agent_name_mapping[agent] = self.agent_name_mapping[agent].learn(total_timesteps=1)
        sharpe_ratio[agent].append(calculate_sharpe(self.portfolio_return_memory[agent]))
        
      if n == 0:
        # save init for plot
        init_pv, init_cash, init_daily_pr, init_cum_pr = self._snapshot_memory()
    
    # save final for plot
    final_pv, final_cash, final_daily_pr, final_cum_pr = self._snapshot_memory()
    
    return init_pv, final_pv, init_cash, final_cash, init_daily_pr, final_daily_pr, init_cum_pr, final_cum_pr, sharpe_ratio


### A2C

In [None]:
# Competitive environment whose agents are all A2C models.
a2c_comp_env = CompetitivePMEnv(
    df=mixed_df,
    algo_type="A2C",
    **env_kwargs,
)

In [None]:
# Train the competitive A2C agents, then unpack the metrics learn() returns.
_a2c_comp_results = a2c_comp_env.learn(total_timesteps=training_timesteps)
(
    a2c_comp_init_pv,
    a2c_comp_final_pv,
    a2c_comp_init_cash,
    a2c_comp_final_cash,
    a2c_comp_init_daily_pr,
    a2c_comp_final_daily_pr,
    a2c_comp_init_cum_pr,
    a2c_comp_final_cum_pr,
    a2c_comp_sr,
) = _a2c_comp_results

In [None]:
# Write every competitive A2C metric to its own JSON file; the number of
# training rounds is part of each file name.
_a2c_comp_outputs = [
    ("a2c_comp_init_pv_{}.json", a2c_comp_init_pv),
    ("a2c_comp_final_pv_{}.json", a2c_comp_final_pv),
    ("a2c_comp_init_cash_{}.json", a2c_comp_init_cash),
    ("a2c_comp_final_cash_{}.json", a2c_comp_final_cash),
    ("a2c_comp_init_daily_pr_{}.json", a2c_comp_init_daily_pr),
    ("a2c_comp_final_daily_pr_{}.json", a2c_comp_final_daily_pr),
    ("a2c_comp_init_cum_pr_{}.json", a2c_comp_init_cum_pr),
    ("a2c_comp_final_cum_pr_{}.json", a2c_comp_final_cum_pr),
    ("a2c_comp_sr_{}.json", a2c_comp_sr),
]
for _name_template, _payload in _a2c_comp_outputs:
  save_to_json(_name_template.format(training_timesteps), _payload)

### PPO

In [None]:
# Competitive environment whose agents are all PPO models.
ppo_comp_env = CompetitivePMEnv(
    df=mixed_df,
    algo_type="PPO",
    **env_kwargs,
)

In [None]:
# Train the competitive PPO agents, then unpack the metrics learn() returns.
_ppo_comp_results = ppo_comp_env.learn(total_timesteps=training_timesteps)
(
    ppo_comp_init_pv,
    ppo_comp_final_pv,
    ppo_comp_init_cash,
    ppo_comp_final_cash,
    ppo_comp_init_daily_pr,
    ppo_comp_final_daily_pr,
    ppo_comp_init_cum_pr,
    ppo_comp_final_cum_pr,
    ppo_comp_sr,
) = _ppo_comp_results

In [None]:
# Write every competitive PPO metric to its own JSON file; the number of
# training rounds is part of each file name.
_ppo_comp_outputs = [
    ("ppo_comp_init_pv_{}.json", ppo_comp_init_pv),
    ("ppo_comp_final_pv_{}.json", ppo_comp_final_pv),
    ("ppo_comp_init_cash_{}.json", ppo_comp_init_cash),
    ("ppo_comp_final_cash_{}.json", ppo_comp_final_cash),
    ("ppo_comp_init_daily_pr_{}.json", ppo_comp_init_daily_pr),
    ("ppo_comp_final_daily_pr_{}.json", ppo_comp_final_daily_pr),
    ("ppo_comp_init_cum_pr_{}.json", ppo_comp_init_cum_pr),
    ("ppo_comp_final_cum_pr_{}.json", ppo_comp_final_cum_pr),
    ("ppo_comp_sr_{}.json", ppo_comp_sr),
]
for _name_template, _payload in _ppo_comp_outputs:
  save_to_json(_name_template.format(training_timesteps), _payload)

## Cooperative

In [None]:
class CooperativePMEnv(gym.Env):
  """Gym environment where several RL agents manage portfolios and cooperate.

  Every agent trades the same stocks over the same days. One agent at a time
  (the "training agent") receives its action through ``step``; the other
  agents replay actions pre-computed by ``collect_individual_preds``. An
  agent's reward is its total value multiplied, for every agent, by the ratio
  of the smaller to the larger of the two total values, so rewards shrink as
  the agents' results drift apart.

  Relies on the module-level names ``TYPE_TO_NAMES``, ``AGENT_STR_TO_OBJECT``,
  ``policy`` and ``calculate_sharpe``.
  """
  metadata = {"render_modes": ["human"], "name": "marlpm_v1"}

  def __init__(
      self, 
      df,
      stock_dim,
      hmax,
      initial_amount,
      transaction_cost_pct,
      reward_scaling,
      state_space,
      action_space,
      tech_indicator_list,
      turbulence_threshold=None,
      lookback=252,
      day=0,
      render_mode=None,
      algo_type="Mixed", # default=mixed
  ):
    """Store the configuration, create the agents and load the first day.

    Args:
      df: market data looked up with ``df.loc[day]``; needs "Date", "Close",
        "cov_list" and every column of ``tech_indicator_list``. Assumes each
        day label selects one row per stock - TODO confirm with the caller.
      stock_dim: number of stocks (length of the initial weight vector).
      hmax, transaction_cost_pct, reward_scaling, turbulence_threshold,
        lookback: stored on the instance; not used by this class.
      initial_amount: starting cash and total value of every agent.
      state_space: number of columns of an observation (one per stock).
      action_space: length of an action vector.
      tech_indicator_list: indicator columns appended to the state.
      day: day index the environment starts on (``reset`` restarts at 0).
      render_mode: stored on the instance.
      algo_type: "A2C", "DDPG", "PPO" or "Mixed"; selects agent names and
        the algorithm behind each agent.
    """
    assert algo_type in ["A2C", "DDPG", "PPO", "Mixed"]

    # attributes
    self.lookback = lookback
    self.df = df
    self.stock_dim = stock_dim
    self.hmax = hmax
    self.initial_amount = initial_amount
    self.transaction_cost_pct = transaction_cost_pct
    self.reward_scaling = reward_scaling
    self.state_space = state_space
    self.action_dim = action_space
    self.tech_indicator_list = tech_indicator_list
    self.turbulence_threshold = turbulence_threshold
    self.render_mode = render_mode
    self.possible_agents = TYPE_TO_NAMES[algo_type]

    # buy/sell ratio reference, to explore
    self.end_day = len(self.df.index.unique()) - 1
    self.stock_volume_reference = 10000

    # spaces
    self.action_space = gym.spaces.Box(low=-1, high=1, shape=(self.action_dim,))
    # indicator values can be negative, so the lower bound is unbounded too
    self.observation_space = gym.spaces.Box(
        low=-np.inf,
        high=np.inf,
        shape=(1 + len(self.tech_indicator_list), self.state_space),
    )

    # agents
    self.agent_name_mapping = {
        agent: self._build_agent(agent, algo_type) for agent in self.possible_agents
    }
    self.training_agent = None

    # individual actions collection (kept across resets)
    self.individual_preds = {
        agent: [] for agent in self.possible_agents
    }

    # data + memory
    self._start_episode(day)

  def _build_agent(self, agent, algo_type):
    """Create the stable-baselines3 model that backs ``agent``."""
    # In "Mixed" mode each agent is named after the algorithm it uses.
    algo_cls = AGENT_STR_TO_OBJECT[agent if algo_type == "Mixed" else algo_type]
    if algo_cls is DDPG:
      # DDPG's constructor does not take an n_steps argument
      return algo_cls(policy, self)
    return algo_cls(policy, self, n_steps=self.end_day)

  def _market_state(self, day_data):
    """Return ``(covs, state)`` built from the rows of a single day."""
    covs = [[x[0][0] for x in day_data['cov_list']]]
    indicators = [day_data[tech].values.tolist() for tech in self.tech_indicator_list]
    return covs, np.append(np.array(covs), indicators, axis=0)

  def _load_day(self, agent):
    """Refresh data, covariances and state of ``agent`` for its current day."""
    self.data[agent] = self.df.loc[self.day[agent], :]
    self.covs[agent], self.state[agent] = self._market_state(self.data[agent])

  def _start_episode(self, day):
    """Put every agent on ``day`` and clear all per-episode memory."""
    agents = self.possible_agents
    start = self.initial_amount

    self.day = {agent: day for agent in agents}
    self.data, self.covs, self.state = {}, {}, {}
    for agent in agents:
      self._load_day(agent)
    self.terminal = False

    # memory
    self.portfolio_value = {agent: start for agent in agents}
    self.asset_memory = {agent: [start] for agent in agents}
    self.portfolio_return_memory = {agent: [0] for agent in agents}
    self.cum_portfolio_return_memory = {agent: [0] for agent in agents}

    # stock ratio
    self.actions_memory = {agent: [[0]*self.stock_dim] for agent in agents}
    self.date_memory = {
        agent: [self.data[agent]["Date"].unique()[0]] for agent in agents
    }

    # free cash
    self.money_memory = {agent: [start] for agent in agents}
    # cash + stock value
    self.total_value_memory = {agent: [start] for agent in agents}

    # rewards
    self.reward = {agent: None for agent in agents}

  def collect_individual_preds(self):
    """Pre-compute every agent's action for each day from 0 to ``end_day``.

    The results are stored in ``individual_preds`` and replayed by ``step``
    for the agents that are not currently training.
    """
    self.individual_preds = {
        agent: [] for agent in self.possible_agents
    }
    for i in range(self.end_day+1):
      # states are somewhat static
      _, curr_state = self._market_state(self.df.loc[i,:])
      for agent in self.possible_agents:
        action, _states = self.agent_name_mapping[agent].predict(curr_state)
        self.individual_preds[agent].append(action)

  def _print_episode_summary(self, agent):
    """Print begin/end total value and the Sharpe ratio of ``agent``."""
    print("=================================")
    print("begin_total_asset:{}".format(self.asset_memory[agent][0]))
    # the last entry is the value at the end of the episode
    print("end_total_asset:{}".format(self.total_value_memory[agent][-1]))

    daily_returns = pd.Series(self.portfolio_return_memory[agent])
    spread = daily_returns.std()
    if spread != 0:
      sharpe = (252**0.5) * daily_returns.mean() / spread
      print("Sharpe: ",sharpe)
    print("=================================")

  def _trade(self, agent, action):
    """Apply ``action`` for ``agent``, move it to the next day, log results."""
    # normalisation
    weights = self.softmax_normalization(action)
    close = self.data[agent]["Close"].values

    # stock ratio - buy/sell/hold
    diff_stock_ratio = self.actions_memory[agent][-1] - weights

    # money - increase if sell, decrease if buy, no changes if hold
    curr_money = self.money_memory[agent][-1] + sum(diff_stock_ratio * self.stock_volume_reference * close)
    self.money_memory[agent].append(curr_money)

    # total value - money + currently held stock value
    curr_total = curr_money + sum(weights * self.stock_volume_reference * close)
    self.total_value_memory[agent].append(curr_total)
    self.actions_memory[agent].append(weights)

    # load next state
    self.day[agent] += 1
    self._load_day(agent)

    # portfolio return: individual stocks' return * weight
    portfolio_return = sum(((self.data[agent]["Close"].values / close)-1)*weights)

    # save into memory
    self.portfolio_return_memory[agent].append(portfolio_return)
    self.cum_portfolio_return_memory[agent].append(self.cum_portfolio_return_memory[agent][-1] + portfolio_return)
    self.date_memory[agent].append(self.data[agent]["Date"].unique()[0])
    self.asset_memory[agent].append(curr_total)

    # the raw reward is the new total value; shaped afterwards
    self.reward[agent] = curr_total

  def _shape_rewards(self):
    """Scale each raw reward by how close the agent is to all agents."""
    all_rewards = list(self.reward.values())
    for agent in self.possible_agents:
      own = self.reward[agent]
      # ratio = smaller reward / greater reward
      # rewards have to be similar across all agents
      # the greater the difference, the greater the penalty
      self.reward[agent] *= prod([
          pv / own if own > pv else own / pv
          for pv in all_rewards
      ])

      # if money on hand is negative, large penalty is applied as this is unwanted
      if self.money_memory[agent][-1] < 0:
        self.reward[agent] *= -1

  def step(self, actions):
    """Advance every agent by one day.

    Args:
      actions: action of the current training agent; the other agents use
        the actions stored by ``collect_individual_preds``.

    Returns:
      ``(observation, reward, done, info)`` for the training agent. Once the
      training agent has reached ``end_day`` a summary is printed and the
      last state and reward are returned with ``done`` set to True.
    """
    trainee = self.training_agent
    self.terminal = self.day[trainee] >= self.end_day

    if self.terminal:
      self._print_episode_summary(trainee)
      return self.state[trainee], self.reward[trainee], self.terminal, {}

    # every agent acts (portfolio weights) on the current day
    for agent in self.possible_agents:
      if agent == trainee:
        action = actions
      else:
        action = self.individual_preds[agent][self.day[agent]]
      self._trade(agent, action)

    # penalise each agent based on the result of all other agents
    self._shape_rewards()

    return self.state[trainee], self.reward[trainee], self.terminal, {}

  def reset(self, seed=None, return_info=False, options=None):
    """Restart the episode at day 0 and return the training agent's state.

    ``seed``, ``return_info`` and ``options`` are accepted but not used.
    """
    # agents
    self.agents = self.possible_agents[:]

    # attributes + memory + rewards
    self._start_episode(0)

    return self.state[self.training_agent] 

  def render(self):
    """Return the current state of the training agent."""
    return self.state[self.training_agent]

  def seed(self, seed=None):
    """Seed the environment's random generator and return ``[seed]``."""
    # the file never imports seeding at module level
    from gym.utils import seeding
    self.np_random, seed = seeding.np_random(seed)
    return [seed]
  
  def softmax_normalization(self, actions):
    """Turn raw actions into positive weights that sum to one."""
    exponentials = np.exp(actions)
    return exponentials / np.sum(exponentials)

  def set_training_agent(self, agent):
    """Select the agent whose actions arrive through ``step``."""
    self.training_agent = agent

  def _snapshot_memory(self):
    """Return per-agent (total value, cash, daily return, cumulative return)."""
    agents = self.possible_agents
    return (
        {agent: self.total_value_memory[agent] for agent in agents},
        {agent: self.money_memory[agent] for agent in agents},
        {agent: self.portfolio_return_memory[agent] for agent in agents},
        {agent: self.cum_portfolio_return_memory[agent] for agent in agents},
    )

  def learn(self, total_timesteps=1000):
    """Train all agents in turn for ``total_timesteps`` rounds.

    In each round the actions of all agents are pre-computed, then every
    agent in turn becomes the training agent and its model's ``learn`` is
    called with ``total_timesteps=1``.

    Returns:
      ``(init_pv, final_pv, init_cash, final_cash, init_daily_pr,
      final_daily_pr, init_cum_pr, final_cum_pr, sharpe_ratio)``. The
      ``init_*`` dicts are taken after the first round (empty when no round
      runs), the ``final_*`` dicts after the last one; ``sharpe_ratio`` maps
      each agent to one Sharpe ratio per round.
    """
    init_pv, init_cash, init_daily_pr, init_cum_pr = {}, {}, {}, {}
    sharpe_ratio = {
        agent: [] for agent in self.possible_agents
    }

    # run till terminal in each timestep
    for n in range(total_timesteps):
      print("Step:", n+1)
      self.collect_individual_preds()
      for agent in self.possible_agents:
        self.set_training_agent(agent)
        self.agent_name_mapping[agent] = self.agent_name_mapping[agent].learn(total_timesteps=1)
        sharpe_ratio[agent].append(calculate_sharpe(self.portfolio_return_memory[agent]))
        
      if n == 0:
        # save init for plot
        init_pv, init_cash, init_daily_pr, init_cum_pr = self._snapshot_memory()
    
    # save final for plot
    final_pv, final_cash, final_daily_pr, final_cum_pr = self._snapshot_memory()
    
    return init_pv, final_pv, init_cash, final_cash, init_daily_pr, final_daily_pr, init_cum_pr, final_cum_pr, sharpe_ratio


### A2C

In [None]:
# Cooperative environment for the A2C run, built from mixed_df and the shared env_kwargs.
# NOTE(review): algo_type="A2C" presumably selects the algorithm used for each agent's model;
# confirm in CooperativePMEnv.__init__.
a2c_coop_env = CooperativePMEnv(df=mixed_df, algo_type="A2C", **env_kwargs)

In [None]:
# Run training_timesteps rounds of training and unpack the nine agent-keyed dicts
# that learn() returns: portfolio value, cash, daily return and cumulative return
# (each as an "init" snapshot after the first round and a "final" snapshot after
# the last), followed by the per-round Sharpe ratio history.
(
    a2c_coop_init_pv, 
    a2c_coop_final_pv, 
    a2c_coop_init_cash, 
    a2c_coop_final_cash, 
    a2c_coop_init_daily_pr, 
    a2c_coop_final_daily_pr, 
    a2c_coop_init_cum_pr, 
    a2c_coop_final_cum_pr, 
    a2c_coop_sr
) = a2c_coop_env.learn(
    total_timesteps=training_timesteps,
)

In [None]:
# Write each cooperative A2C result dict to its own JSON file; the placeholder in
# every filename is filled with the number of training steps.
for _filename_template, _payload in (
    ("a2c_coop_init_pv_{}.json", a2c_coop_init_pv),
    ("a2c_coop_final_pv_{}.json", a2c_coop_final_pv),
    ("a2c_coop_init_cash_{}.json", a2c_coop_init_cash),
    ("a2c_coop_final_cash_{}.json", a2c_coop_final_cash),
    ("a2c_coop_init_daily_pr_{}.json", a2c_coop_init_daily_pr),
    ("a2c_coop_final_daily_pr_{}.json", a2c_coop_final_daily_pr),
    ("a2c_coop_init_cum_pr_{}.json", a2c_coop_init_cum_pr),
    ("a2c_coop_final_cum_pr_{}.json", a2c_coop_final_cum_pr),
    ("a2c_coop_sr_{}.json", a2c_coop_sr),
):
  save_to_json(_filename_template.format(training_timesteps), _payload)

### PPO

In [None]:
# Cooperative environment for the PPO run, built from mixed_df and the shared env_kwargs.
# NOTE(review): algo_type="PPO" presumably selects the algorithm used for each agent's model;
# confirm in CooperativePMEnv.__init__.
ppo_coop_env = CooperativePMEnv(df=mixed_df, algo_type="PPO", **env_kwargs)

In [None]:
# Run training_timesteps rounds of training and unpack the nine agent-keyed dicts
# that learn() returns: portfolio value, cash, daily return and cumulative return
# (each as an "init" snapshot after the first round and a "final" snapshot after
# the last), followed by the per-round Sharpe ratio history.
(
    ppo_coop_init_pv, 
    ppo_coop_final_pv, 
    ppo_coop_init_cash, 
    ppo_coop_final_cash, 
    ppo_coop_init_daily_pr, 
    ppo_coop_final_daily_pr, 
    ppo_coop_init_cum_pr, 
    ppo_coop_final_cum_pr, 
    ppo_coop_sr
) = ppo_coop_env.learn(
    total_timesteps=training_timesteps,
)

In [None]:
# Write each cooperative PPO result dict to its own JSON file; the placeholder in
# every filename is filled with the number of training steps.
for _filename_template, _payload in (
    ("ppo_coop_init_pv_{}.json", ppo_coop_init_pv),
    ("ppo_coop_final_pv_{}.json", ppo_coop_final_pv),
    ("ppo_coop_init_cash_{}.json", ppo_coop_init_cash),
    ("ppo_coop_final_cash_{}.json", ppo_coop_final_cash),
    ("ppo_coop_init_daily_pr_{}.json", ppo_coop_init_daily_pr),
    ("ppo_coop_final_daily_pr_{}.json", ppo_coop_final_daily_pr),
    ("ppo_coop_init_cum_pr_{}.json", ppo_coop_init_cum_pr),
    ("ppo_coop_final_cum_pr_{}.json", ppo_coop_final_cum_pr),
    ("ppo_coop_sr_{}.json", ppo_coop_sr),
):
  save_to_json(_filename_template.format(training_timesteps), _payload)

# Plots

## Market Trend

In [None]:
# Draw the "Close" column of each selected ticker against its own index on one chart.
for _ticker in ("AAPL", "TSLA"):
  _close = stocks[_ticker]["Close"]
  plt.plot(_close.index, _close, label=_ticker)
plt.title("Daily Closing Value")
plt.legend()
plt.show()

## A2C

### Competitive

Portfolio Value

In [None]:
# Per-agent portfolio value, before vs. after training, against the first agent's recorded dates.
_dates = a2c_comp_env.date_memory[a2c_comp_env.possible_agents[0]]
for _series, _title in (
    (a2c_comp_init_pv, "Portfolio Value before Training"),
    (a2c_comp_final_pv, "Portfolio Value after Training"),
):
  plot_line_graph(a2c_comp_env, _dates, _series, _title)

Cash Movement

In [None]:
# Per-agent cash, before vs. after training, against the first agent's recorded dates.
_dates = a2c_comp_env.date_memory[a2c_comp_env.possible_agents[0]]
for _series, _title in (
    (a2c_comp_init_cash, "Cash Movement before Training"),
    (a2c_comp_final_cash, "Cash Movement after Training"),
):
  plot_line_graph(a2c_comp_env, _dates, _series, _title)

Daily Portfolio Return

In [None]:
# Per-agent daily portfolio return, before vs. after training, plotted against the
# first agent's recorded dates. Titles spell "Portfolio" the same way as the PPO plots.
plot_line_graph(a2c_comp_env, a2c_comp_env.date_memory[a2c_comp_env.possible_agents[0]], a2c_comp_init_daily_pr, "Daily Portfolio Return before Training")
plot_line_graph(a2c_comp_env, a2c_comp_env.date_memory[a2c_comp_env.possible_agents[0]], a2c_comp_final_daily_pr, "Daily Portfolio Return after Training")

Cumulative Portfolio Return

In [None]:
# Per-agent cumulative portfolio return, before vs. after training, plotted against the
# first agent's recorded dates. Titles spell "Portfolio" the same way as the PPO plots.
plot_line_graph(a2c_comp_env, a2c_comp_env.date_memory[a2c_comp_env.possible_agents[0]], a2c_comp_init_cum_pr, "Cumulative Portfolio Return before Training")
plot_line_graph(a2c_comp_env, a2c_comp_env.date_memory[a2c_comp_env.possible_agents[0]], a2c_comp_final_cum_pr, "Cumulative Portfolio Return after Training")

Sharpe Ratio

In [None]:
# Sharpe ratio per agent over training; the x-axis is the 1-based training step.
# NOTE(review): assumes a2c_comp_sr holds exactly training_timesteps values per agent — confirm
# against the competitive environment's learn().
plot_line_graph(a2c_comp_env, range(1, training_timesteps+1), a2c_comp_sr, "Competitive Sharpe Ratio Trend")

### Cooperative

Portfolio Value

In [None]:
# Per-agent portfolio value, before vs. after training, against the first agent's recorded dates.
_dates = a2c_coop_env.date_memory[a2c_coop_env.possible_agents[0]]
for _series, _title in (
    (a2c_coop_init_pv, "Portfolio Value before Training"),
    (a2c_coop_final_pv, "Portfolio Value after Training"),
):
  plot_line_graph(a2c_coop_env, _dates, _series, _title)

Cash Movement

In [None]:
# Per-agent cash, before vs. after training, against the first agent's recorded dates.
_dates = a2c_coop_env.date_memory[a2c_coop_env.possible_agents[0]]
for _series, _title in (
    (a2c_coop_init_cash, "Cash Movement before Training"),
    (a2c_coop_final_cash, "Cash Movement after Training"),
):
  plot_line_graph(a2c_coop_env, _dates, _series, _title)

Daily Portfolio Return

In [None]:
# Per-agent daily portfolio return, before vs. after training, plotted against the
# first agent's recorded dates. Titles spell "Portfolio" the same way as the PPO plots.
plot_line_graph(a2c_coop_env, a2c_coop_env.date_memory[a2c_coop_env.possible_agents[0]], a2c_coop_init_daily_pr, "Daily Portfolio Return before Training")
plot_line_graph(a2c_coop_env, a2c_coop_env.date_memory[a2c_coop_env.possible_agents[0]], a2c_coop_final_daily_pr, "Daily Portfolio Return after Training")

Cumulative Portfolio Return

In [None]:
# Per-agent cumulative portfolio return, before vs. after training, against the first agent's recorded dates.
_dates = a2c_coop_env.date_memory[a2c_coop_env.possible_agents[0]]
for _series, _title in (
    (a2c_coop_init_cum_pr, "Cumulative Portfolio Return before Training"),
    (a2c_coop_final_cum_pr, "Cumulative Portfolio Return after Training"),
):
  plot_line_graph(a2c_coop_env, _dates, _series, _title)

Sharpe Ratio

In [None]:
# Sharpe ratio per agent over training. learn() appends one value per agent per round,
# so the x-axis is the 1-based round number up to training_timesteps.
plot_line_graph(a2c_coop_env, range(1, training_timesteps+1), a2c_coop_sr, "Cooperative Sharpe Ratio Trend")

## PPO

### Competitive

Portfolio Value

In [None]:
# Per-agent portfolio value, before vs. after training, against the first agent's recorded dates.
_dates = ppo_comp_env.date_memory[ppo_comp_env.possible_agents[0]]
for _series, _title in (
    (ppo_comp_init_pv, "Portfolio Value before Training"),
    (ppo_comp_final_pv, "Portfolio Value after Training"),
):
  plot_line_graph(ppo_comp_env, _dates, _series, _title)

Cash Movement

In [None]:
# Per-agent cash, before vs. after training, against the first agent's recorded dates.
_dates = ppo_comp_env.date_memory[ppo_comp_env.possible_agents[0]]
for _series, _title in (
    (ppo_comp_init_cash, "Cash Movement before Training"),
    (ppo_comp_final_cash, "Cash Movement after Training"),
):
  plot_line_graph(ppo_comp_env, _dates, _series, _title)

Daily Portfolio Return

In [None]:
# Per-agent daily portfolio return, before vs. after training, against the first agent's recorded dates.
_dates = ppo_comp_env.date_memory[ppo_comp_env.possible_agents[0]]
for _series, _title in (
    (ppo_comp_init_daily_pr, "Daily Portfolio Return before Training"),
    (ppo_comp_final_daily_pr, "Daily Portfolio Return after Training"),
):
  plot_line_graph(ppo_comp_env, _dates, _series, _title)

Cumulative Portfolio Return

In [None]:
# Per-agent cumulative portfolio return, before vs. after training, against the first agent's recorded dates.
_dates = ppo_comp_env.date_memory[ppo_comp_env.possible_agents[0]]
for _series, _title in (
    (ppo_comp_init_cum_pr, "Cumulative Portfolio Return before Training"),
    (ppo_comp_final_cum_pr, "Cumulative Portfolio Return after Training"),
):
  plot_line_graph(ppo_comp_env, _dates, _series, _title)

Sharpe Ratio

In [None]:
# Sharpe ratio per agent over training; the x-axis is the 1-based training step.
# NOTE(review): assumes ppo_comp_sr holds exactly training_timesteps values per agent — confirm
# against the competitive environment's learn().
plot_line_graph(ppo_comp_env, range(1, training_timesteps+1), ppo_comp_sr, "Competitive Sharpe Ratio Trend")

### Cooperative

Portfolio Value

In [None]:
# Per-agent portfolio value, before vs. after training, against the first agent's recorded dates.
_dates = ppo_coop_env.date_memory[ppo_coop_env.possible_agents[0]]
for _series, _title in (
    (ppo_coop_init_pv, "Portfolio Value before Training"),
    (ppo_coop_final_pv, "Portfolio Value after Training"),
):
  plot_line_graph(ppo_coop_env, _dates, _series, _title)

Cash Movement

In [None]:
# Per-agent cash, before vs. after training, against the first agent's recorded dates.
_dates = ppo_coop_env.date_memory[ppo_coop_env.possible_agents[0]]
for _series, _title in (
    (ppo_coop_init_cash, "Cash Movement before Training"),
    (ppo_coop_final_cash, "Cash Movement after Training"),
):
  plot_line_graph(ppo_coop_env, _dates, _series, _title)

Daily Portfolio Return

In [None]:
# Per-agent daily portfolio return, before vs. after training, against the first agent's recorded dates.
_dates = ppo_coop_env.date_memory[ppo_coop_env.possible_agents[0]]
for _series, _title in (
    (ppo_coop_init_daily_pr, "Daily Portfolio Return before Training"),
    (ppo_coop_final_daily_pr, "Daily Portfolio Return after Training"),
):
  plot_line_graph(ppo_coop_env, _dates, _series, _title)

Cumulative Portfolio Return

In [None]:
# Per-agent cumulative portfolio return, before vs. after training, against the first agent's recorded dates.
_dates = ppo_coop_env.date_memory[ppo_coop_env.possible_agents[0]]
for _series, _title in (
    (ppo_coop_init_cum_pr, "Cumulative Portfolio Return before Training"),
    (ppo_coop_final_cum_pr, "Cumulative Portfolio Return after Training"),
):
  plot_line_graph(ppo_coop_env, _dates, _series, _title)

Sharpe Ratio

In [None]:
# Sharpe ratio per agent over training. learn() appends one value per agent per round,
# so the x-axis is the 1-based round number up to training_timesteps.
plot_line_graph(ppo_coop_env, range(1, training_timesteps+1), ppo_coop_sr, "Cooperative Sharpe Ratio Trend")