### Summary
This workbook includes all experiments for trading bot algorithm

##### Import

In [1]:
import pandas as pd
import numpy as np
import random, typing, os, cv2
import utils
from collections import deque, namedtuple
from typing import Any

import matplotlib.pyplot as plt
from mplfinance.original_flavor import candlestick_ohlc
import matplotlib.dates as mpl_dates
from datetime import datetime

from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Input, Lambda
from tensorflow.keras import backend as K




##### HYPER PARAMETERS and Common assumption

In [2]:
# Define of action space
action_space = [0, 1, 2]

# Common assumption
initial_balance=20
min_trading = 10
trading_fee_rate = 0.001
lr = 0.0001
epochs = 1
normalize_value = 100000
# optimizer = Adam


##### Crypto price file input

In [47]:
input = pd.read_pickle(r'C:\Users\Dell\OneDrive\Documents\projects\trading_bot_v0\Processed_Dataset_from_1Jan22_17Feb24.pkl') # Importing dataset from pickle
print(input.columns)
criteria = ['closeTime', 'open', 'high', 'low', 'close','volume']
mapping = {name:num for num, name in enumerate(criteria)}
actions = {0: 'sell', 1: 'hold', 2: 'buy'}
print(mapping)
series = np.array(input[criteria][-1200:])

Index(['open', 'high', 'low', 'close', 'volume', 'closeTime',
       'baseAssetVolume', 'numberOfTrade', 'takerBuyVolume',
       'takerBuyBaseVolume',
       ...
       'RSI', 'STOCH_0', 'STOCH_1', 'STOCHF_0', 'STOCHF_1', 'STOCHRSI_0',
       'STOCHRSI_1', 'TRIX', 'ULTOSC', 'WILLR'],
      dtype='object', length=114)
{'closeTime': 0, 'open': 1, 'high': 2, 'low': 3, 'close': 4, 'volume': 5}


### Algorithm 1: PPO, Actor-Critic Style

#### Define Actor-Critic Network

In [48]:
# class Actor:
#     def __init__(self, input_shape, action_space, lr=None, optimizer=None) -> None:
#         self.action_space = action_space
#         self.model = Sequential(
#             Input(input_shape=input_shape),
#             Dense(64, activation=r'relu'),
#             Dense(32, activation=r'relu'),
#             Dense(action_shape, activation=r'softmax')
#         )

#     def loss(self, y_true, y_pred):
#         # Defined in https://arxiv.org/abs/1707.06347
#         advantages, prediction_picks, actions = y_true[:, :1], y_true[:, 1:1+self.action_space], y_true[:, 1+self.action_space:]
#         LOSS_CLIPPING = 0.2
#         ENTROPY_LOSS = 0.001

#         prob = actions * y_pred
#         old_prob = actions * prediction_picks

#         ratio = K.exp(K.log(prob) - K.log(old_prob))

#         p1 = ratio * advantages
#         p2 = K.clip(ratio, min_value=1 - LOSS_CLIPPING, max_value=1 + LOSS_CLIPPING) * advantages
        
#         actor_loss = -K.mean(K.minimum(p1, p2))

#         entropy = -(y_pred * K.log(y_pred + 1e-10))
#         entropy = ENTROPY_LOSS * K.mean(entropy)
        
#         total_loss = actor_loss - entropy

#         return total_loss
    
#     def predict(self, state):
#         return self.model.predict(state)



# class Critic:
#     def __init__(self, input_shape, action_space, lr, optimizer):
#         self.model = Sequential(
#             Input(input_shape=input_shape),
#             Dense(64, activation=r'relu'),
#             Dense(32, activation=r'relu'),
#             Dense(action_shape, activation=r'softmax')
#         )

#         self.model.compile(loss=self.critic_PPO2_loss, optimizer=optimizer(lr=lr))

#     def loss(self, y_true, y_pred):
#         value_loss = K.mean((y_true - y_pred) ** 2) # standard PPO loss
#         return value_loss

#     def predict(self, state):
#         return self.model.predict([state, np.zeros((state.shape[0], 1))])

#### State and Account

In [49]:
class State:
    def __init__(
            self, 
            # Market state
            timestamp=0,
            series= np.zeros([1,len(mapping)]),
            mapping:dict=mapping
        ):
        market_state = series[timestamp]
        self.values = market_state
        self.timestamp=0,
        self.open = market_state[mapping['open']]
        self.high = market_state[mapping['high']]
        self.low = market_state[mapping['low']]
        self.close = market_state[mapping['close']]
        self.volume = market_state[mapping['volume']]
        self.shape = len(mapping)
    def __repr__(self):
        fixed_width = 15
        message = '    |'.join(f'{i}: {j:.2f}'.ljust(fixed_width) for i, j in zip(mapping.keys(), self.values))
        return message
    
    def __call__(self):
        # return {i:j for i, j in zip(mapping.keys(), self.market_state)}
        return list(self.values)

class Account:
    
    def __init__(self, 
                 state=None,
                 initial_balance=initial_balance,
                 min_trading=min_trading,
                 trading_fee_rate=trading_fee_rate
                 ):

        price                = state.close

        self.initial_balance = initial_balance
        self.min_trading     = min_trading
        self.trading_fee_rate= trading_fee_rate
        self.coin_qty        = 0
        self.coin_cost       = 0
        self.cash_balance    = initial_balance 
        self.net_worth       = self.coin_qty * price + self.cash_balance

    def _buy(self, price):
        if self.cash_balance < self.min_trading:
            print("Insufficient cash balance to perform buy action.")
            return False

        else:
            hold_qty     = self.coin_qty
            avg_price    = self.coin_cost
            buy_price    = price * (1 + self.trading_fee_rate)
            buy_limit    = self.cash_balance / buy_price
            buy_qty      = buy_limit  # Example: buy half of the limit

            self.coin_cost    = (hold_qty * avg_price + buy_price * buy_qty) / (hold_qty + buy_qty)
            self.coin_qty    += buy_qty
            self.cash_balance-= buy_qty * buy_price
            self.net_worth    = self.coin_qty * price + self.cash_balance

            print(f'Buy {buy_qty:.5f} at {buy_price}')

            return True

    def _sell(self, price):
        # hold_qty     = self.coin_qty
        avg_price    = self.coin_cost 

        sell_price   = price * (1 - self.trading_fee_rate)
        sell_qty     = self.coin_qty  # Example: sell half of the holdings

        if sell_qty == 0:
            return False
        else:
            self.cash_balance += sell_qty * sell_price
            self.coin_qty     -= sell_qty
            self.coin_cost     = 0 
            self.net_worth     = self.coin_qty * self.coin_cost + self.cash_balance

            print(f'Sell {sell_qty:.5f} at {sell_price}')


    def __repr__(self):
        return f'coin_qty: {self.coin_qty:.5f} |coin_cost: {self.coin_cost:.2f} |cash_balance: {self.cash_balance:.2f} |net_worth: {self.net_worth:.2f}' 
    
    def __call__(self, *args: Any, **kwds: Any) -> Any:
        return [i for i in self.__dict__.values()]

In [50]:
def display_frames_as_gif(frames, episode):
    import pylab
    from matplotlib import animation
    try:
        pylab.figure(figsize=(frames[0].shape[1] / 72.0, frames[0].shape[0] / 72.0), dpi = 72)
        patch = pylab.imshow(frames[0])
        pylab.axis('off')
        pylab.subplots_adjust(left=0, right=1, top=1, bottom=0)
        def animate(i):
            patch.set_data(frames[i])
        anim = animation.FuncAnimation(pylab.gcf(), animate, frames = len(frames), interval=33)
        anim.save(str(episode)+'_gameplay.gif')
    except:
        pylab.figure(figsize=(frames[0].shape[1] / 72.0, frames[0].shape[0] / 72.0), dpi = 72)
        patch = pylab.imshow(frames[0])
        pylab.axis('off')
        pylab.subplots_adjust(left=0, right=1, top=1, bottom=0)
        def animate(i):
            patch.set_data(frames[i])
        anim = animation.FuncAnimation(pylab.gcf(), animate, frames = len(frames), interval=33)
        anim.save(str(episode)+'_gameplay.gif', writer=animation.PillowWriter(fps=10))

class TradingGraph:
    # A crypto trading visualization using matplotlib made to render custom prices which come in following way:
    # Date, Open, High, Low, Close, Volume, net_worth, trades
    # call render every step
    def __init__(self, render_range):
        self.Volume = deque(maxlen=render_range)
        self.net_worth = deque(maxlen=render_range)
        self.render_data = deque(maxlen=render_range)
        self.Render_range = render_range

        # We are using the style ‘ggplot’
        plt.style.use('ggplot')
        # close all plots if there are open
        plt.close('all')
        # figsize attribute allows us to specify the width and height of a figure in unit inches
        self.fig = plt.figure(figsize=(16,8)) 

        # Create top subplot for price axis
        self.ax1 = plt.subplot2grid((6,1), (0,0), rowspan=5, colspan=1)
        
        # Create bottom subplot for volume which shares its x-axis
        self.ax2 = plt.subplot2grid((6,1), (5,0), rowspan=1, colspan=1, sharex=self.ax1)
        
        # Create a new axis for net worth which shares its x-axis with price
        self.ax3 = self.ax1.twinx()

        # Formatting Date
        self.date_format = mpl_dates.DateFormatter('%d-%m-%Y')
        #self.date_format = mpl_dates.DateFormatter('%d-%m-%Y')
        
        # Add paddings to make graph easier to view
        #plt.subplots_adjust(left=0.07, bottom=-0.1, right=0.93, top=0.97, wspace=0, hspace=0)

        # we need to set layers
        self.ax2.set_xlabel('Date')
        self.ax1.set_ylabel('Price')
        self.ax3.set_ylabel('Balance')

        # I use tight_layout to replace plt.subplots_adjust
        self.fig.tight_layout()

        # Show the graph with matplotlib
        plt.show()

    # Render the environment to the screen
    def render(self, 
               state=np.zeros(len(mapping))
               ):
        # before appending to deque list, need to convert Date to special format
        print(state)
        Date, Open, High, Low, Close, Volume = state().values()
        Date = mpl_dates.date2num([pd.to_datetime(Date)])[0]
        print(Date)
        self.render_data.append([Date, Open, High, Low, Close])
        
        # Clear the frame rendered last step
        self.ax1.clear()
        candlestick_ohlc(self.ax1, self.render_data, width=0.8/24, colorup='green', colordown='red', alpha=0.8)

        # we need to set layers every step, because we are clearing subplots every step
        self.ax2.set_xlabel('Date')
        self.ax1.set_ylabel('Price')
        self.ax3.set_ylabel('Balance')

        """Display image with matplotlib - interrupting other tasks"""
        # Show the graph without blocking the rest of the program
        plt.show(block=False)
        # Necessary to view frames before they are unrendered
        plt.pause(0.001)

#### Trading environment

In [62]:
class TradingEnv:
    def __init__(self, 
                 data=None, 
                 window_size=50,
                 mapping = mapping,
                 action_space=action_space,
                 initial_balance:float=initial_balance,
                 trading_fee_rate=trading_fee_rate):

        # Environment parameters
        self.data               = data
        self.series             = data
        self.length             = len(data)
        self.window_size        = int(window_size)
        self.action_space       = np.array(action_space)
        self.mapping            = mapping
        self.initial_balance    = initial_balance
        self.min_trading        = 10
        self.trading_fee_rate   = trading_fee_rate
        self.timestep = 1


        # # Create Actor-Critic network model
        # self.Actor = Actor(input_shape=self.state_size, action_space = self.action_space.shape[0], lr=self.lr, optimizer = self.optimizer)
        # self.Critic = Critic(input_shape=self.state_size, action_space = self.action_space.shape[0], lr=self.lr, optimizer = self.optimizer)


        # Orders history contains the balance, net_worth, crypto_bought, crypto_sold, crypto_held values for the last lookback_window_size steps
        self.reset()

    def reset(self, start_index=0, max_timesteps=10):

        # self.visualization = TradingGraph(render_range=self.render_range) # init visualization
        self.max_timesteps      = max_timesteps
        self.timestamp          = 0
        self.series             = self.data[start_index:start_index + max_timesteps]
        self.state              = State(timestamp=self.timestamp,
                                        series=self.series)
        self.prev_state         = State(timestamp=self.timestamp,
                                        series=self.series)
        self.account            = Account(initial_balance=self.initial_balance,
                                          state=self.state)
        self.goal               = tuple(self.series[-1])
        self.done               = len(self.series) <= 1

        self.orders_history     = deque(maxlen=self.window_size)
        self.market_history     = deque(maxlen=self.window_size)

        reward = 0
        return self.state.values, reward, self.done
    
    # create Tensor board writer
    def create_writer(self):
        self.replay_count = 0
        self.writer = SummaryWriter(comment="Trader")


    def update_state(self, timestep=1):
        # ADD 1 TO TIMESTAMP AFTER EACH STEP
        self.timestamp +=  timestep

        # UPDATE THE STATE ACCORDINGLY
        self.state      =  State(
            self.timestamp,
            series=self.series
        )

        # HISTORY IS THE n LATEST STEP
        self.orders_history.append(self.account())
        self.market_history.append(self.state())
        
    def get_reward(self):
        # SIMPLE REWARD CALCULATION 
        prev_net_worth = self.prev_account.net_worth
        net_worth = self.account.net_worth
        reward = net_worth - prev_net_worth
        return reward

    def step(self, action, trading_log=False):
        price=self.state.close

        # UPDATE ACCOUNT CURRENT STATE
        self.prev_account = self.account
        if action == 0:
            pass
        elif action == 1 and self.account.cash_balance > self.initial_balance/10000:
            self.account._buy(price=price)
        elif action == 2 and self.account.coin_qty > 0.000001:
            self.account._sell(price=price)

        # UPDATE MARKET STATE
        self.update_state()

        # CALCULATE REWARD
        reward = self.get_reward()

        # CHECK IF SCENARIO IS DONE
        self.done = len(self.series) <= self.max_timesteps

        if trading_log:
            print(f'Step {self.timestamp}: {actions[action]}\n {self.account}\n reward:{reward:.2f}')

        return self.state.values, reward, self.done

    def render(self, visualize=False):
        if visualize:
            img = self.visualization.render(state=self.state)
        return img
    
    def random_game(self, 
                    num_steps=200, 
                    ):
        # average_net_worth = 0
        self.reset()
        for i in range(num_steps):
            action = np.random.randint(3, size=1)[0]
            state, reward, done = self.step(action)
            if done:
                break

    def get_advanatage(self, 
                       rewards, 
                       dones, 
                       values, 
                       next_values, 
                       gamma = 0.99, 
                       lamda = 0.95, 
                       normalize=True):
        """calculates the Generalized Advantage Estimation (GAE) for a reinforcement learning agent. 
        GAE is used to reduce the variance of the advantage estimates, 
        which helps in stabilizing the training process.

        Args:
            rewards (_type_): _description_
            dones (_type_): _description_
            values (_type_): _description_
            next_values (_type_): _description_
            gamma (float, optional): _description_. Defaults to 0.99.
            lamda (float, optional): _description_. Defaults to 0.95.
            normalize (bool, optional): _description_. Defaults to True.

        Returns:
            _type_: _description_
        """
        
        deltas = [r + gamma * (1 - d) * nv - v for r, d, nv, v in zip(rewards, dones, next_values, values)]
        deltas = np.stack(deltas)
        gaes = copy.deepcopy(deltas)
        for t in reversed(range(len(deltas) - 1)):
            gaes[t] = gaes[t] + (1 - dones[t]) * gamma * lamda * gaes[t + 1]

        target = gaes + values
        if normalize:
            gaes = (gaes - gaes.mean()) / (gaes.std() + 1e-8)
        return np.vstack(gaes), np.vstack(target)
    

#### Experience/Q-table

In [63]:
# Store experiences as named tuples
# experience = namedtuple("Experience", field_names=["state", "action", "reward", "next_state", "done"])

In [64]:
MEMORY_SIZE = 100_000     # size of memory buffer
GAMMA = 0.996             # discount factor
ALPHA = 1e-3              # learning rate
NUM_STEPS_FOR_UPDATE = 4  # perform a learning update every C time steps

In [65]:
import tensorflow as tf
from tensorflow.keras import layers
import numpy as np

class CriticNetwork(tf.keras.Model):
    def __init__(self, state_dim):
        super(CriticNetwork, self).__init__()
        self.dense1 = layers.Dense(128, activation='relu')
        self.dense2 = layers.Dense(128, activation='relu')
        self.value = layers.Dense(1, activation=None)
    
    def call(self, inputs):
        x = self.dense1(inputs)
        x = self.dense2(x)
        return self.value(x)

class PolicyNetwork(tf.keras.Model):
    def __init__(self, state_dim, action_dim):
        super(PolicyNetwork, self).__init__()
        self.dense1 = layers.Dense(128, activation='relu')
        self.dense2 = layers.Dense(128, activation='relu')
        self.logits = layers.Dense(action_dim, activation=None)
    
    def call(self, inputs):
        x = self.dense1(inputs)
        x = self.dense2(x)
        return tf.nn.softmax(self.logits(x))

class Memory:
    def __init__(self):
        self.states = []
        self.actions = []
        self.logprobs = []
        self.rewards = []
        self.is_terminals = []
    
    def clear_memory(self):
        del self.states[:]
        del self.actions[:]
        del self.logprobs[:]
        del self.rewards[:]
        del self.is_terminals[:]

def compute_gae(rewards, values, done_vals, gamma=0.99, lam=0.95):
    advantages = []
    gae = 0
    for i in reversed(range(len(rewards))):
        delta = rewards[i] + gamma * values[i + 1] * (1 - done_vals[i]) - values[i]
        gae = delta + gamma * lam * (1 - done_vals[i]) * gae
        advantages.insert(0, gae)
    return advantages

class PPO:
    def __init__(self, state_dim, action_dim, lr=1e-3, gamma=0.99, eps_clip=0.2, lam=0.95):
        self.policy = PolicyNetwork(state_dim, action_dim)
        self.critic = CriticNetwork(state_dim)
        self.optimizer = tf.keras.optimizers.Adam(learning_rate=lr)
        self.gamma = gamma
        self.eps_clip = eps_clip
        self.lam = lam
        self.policy_old = PolicyNetwork(state_dim, action_dim)
        self.policy_old.set_weights(self.policy.get_weights())
    
    def select_action(self, state):
        state = tf.expand_dims(state, axis=0)
        probs = self.policy_old(state)
        dist = tf.random.categorical(tf.math.log(probs), 1)
        action = tf.squeeze(dist, axis=-1).numpy()[0]

        return action, tf.math.log(probs[0, action])
    
    def update(self, experiences):
        states, actions, logprobs, rewards, _, done_vals = experiences

        # Calculate discounted rewards
        discounted_rewards = []
        discounted_reward = 0
        for reward, is_done in zip(reversed(rewards), reversed(done_vals)):
            if is_done:
                discounted_reward = 0
            discounted_reward = reward + (self.gamma * discounted_reward)
            discounted_rewards.insert(0, discounted_reward)

        # Convert to tensors
        rewards = tf.convert_to_tensor(discounted_rewards, dtype=tf.float32)
        old_states = tf.convert_to_tensor(states, dtype=tf.float32)
        old_actions = actions
        old_actions = tf.cast(old_actions, tf.int32)
        old_logprobs = tf.convert_to_tensor(logprobs, dtype=tf.float32)

        # Get values from the critic
        values = self.critic(old_states).numpy()
        values = np.append(values, 0)  # Append 0 for the last value

        # Compute advantages
        advantages = compute_gae(rewards, values, done_vals, self.gamma, self.lam)
        advantages = tf.convert_to_tensor(advantages, dtype=tf.float32)

        # Ensure the policy model is built
        dummy_input = tf.convert_to_tensor(np.zeros((1, old_states.shape[1])), dtype=tf.float32)
        _ = self.policy(dummy_input)

        # Training step
        with tf.GradientTape() as tape:
            probs = self.policy(old_states)
            probs = tf.clip_by_value(probs, 1e-10, 1.0)
            logprobs = tf.math.log(tf.gather(probs, old_actions, axis=1, batch_dims=1))
            dist_entropy = -tf.reduce_sum(probs * tf.math.log(probs), axis=1)
            ratios = tf.exp(logprobs - old_logprobs)
            surr1 = tf.expand_dims(ratios, axis=1) * advantages
            surr2 = tf.expand_dims(tf.clip_by_value(ratios, 1 - self.eps_clip, 1 + self.eps_clip), axis=1) * advantages
            loss = -tf.reduce_mean(tf.minimum(surr1, surr2)) + 0.01 * tf.reduce_mean(dist_entropy)
            value_loss = tf.reduce_mean(tf.square(rewards - self.critic(old_states)))
            total_loss = loss + value_loss

        # Apply gradients
        grads = tape.gradient(total_loss, self.policy.trainable_variables + self.critic.trainable_variables)
        self.optimizer.apply_gradients(zip(grads, self.policy.trainable_variables + self.critic.trainable_variables))

        # Copy weights to policy_old
        self.policy_old.set_weights(self.policy.get_weights())

        # states, actions, logprobs, rewards, _, done_vals = experiences
        # rewards = []
        # discounted_reward = 0
        # for reward, is_terminal in zip(reversed(rewards), reversed(done_vals)):
        #     if is_terminal:
        #         discounted_reward = 0
        #     discounted_reward = reward + (self.gamma * discounted_reward)
        #     rewards.insert(0, discounted_reward)
        

        # # rewards, old_states, old_actions, old_logprobs, *_ = experiences
        # rewards         = tf.convert_to_tensor(rewards, dtype=tf.float32)
        # old_states      = tf.convert_to_tensor(states, dtype=tf.float32)
        # old_actions     = tf.convert_to_tensor(actions, dtype=tf.int32)
        # old_logprobs    = tf.convert_to_tensor(logprobs, dtype=tf.float32)
        # values          = self.critic(old_states).numpy()
        # values          = np.append(values, 0)  # Append 0 for the last value
        
        # advantages = compute_gae(rewards, values, self.gamma, self.lam)
        # advantages = tf.convert_to_tensor(advantages, dtype=tf.float32)
        
        # with tf.GradientTape() as tape:
        #     logprobs = tf.math.log(tf.gather(self.policy(old_states), old_actions, axis=1, batch_dims=1))
        #     dist_entropy = -tf.reduce_sum(self.policy(old_states) * tf.math.log(self.policy(old_states)), axis=1)
        #     ratios = tf.exp(logprobs - old_logprobs)
        #     print(ratios)
        #     print(advantages)
        #     surr1 = tf.expand_dims(ratios, axis=1) * advantages
        #     surr2 = tf.expand_dims(tf.clip_by_value(ratios, 1-self.eps_clip, 1+self.eps_clip), axis=1) * advantages
        #     loss = -tf.reduce_mean(tf.minimum(surr1, surr2)) + 0.01 * tf.reduce_mean(dist_entropy)
        #     value_loss = tf.reduce_mean(tf.square(rewards - self.critic(old_states)))
        #     total_loss = loss + value_loss
        
        # grads = tape.gradient(total_loss, self.policy.trainable_variables + self.critic.trainable_variables)
        # self.optimizer.apply_gradients(zip(grads, self.policy.trainable_variables + self.critic.trainable_variables))
        # self.policy_old.set_weights(self.policy.get_weights())


In [66]:
ppo_agent.select_action(state)

(0, <tf.Tensor: shape=(), dtype=float32, numpy=0.0>)

In [68]:
num_episodes = 1000
max_timesteps = 50
update_timestep = 100
timestep = 0

env = TradingEnv(data=series)

# Store experiences as named tuples
experience = namedtuple("Experience", field_names=["state", "action", "logprob", "reward", "next_state", "done"])

# TRAINING LOG SETUP
# env.create_writer()                # Create TensorBoard writer
state_dim = len(mapping)
action_dim = len(action_space)

# PPO agent and memory buffer
ppo_agent = PPO(state_dim, action_dim)
memory_buffer = deque(maxlen=MEMORY_SIZE)

episodes = np.random.choice(range(0, env.length-max_timesteps), size=num_episodes, replace=False)

for i, episode in enumerate( episodes):
    print(f'Episode {i+1}/{num_episodes}')
    state, reward, done = env.reset(start_index=episode, max_timesteps=max_timesteps)
    episode_reward = 0

    for t in range(max_timesteps-1):
        for i in range(50):
        # print(timestep)
        # timestep += 1

        # Select action
            action, logprob = ppo_agent.select_action(state)
            # print(f'action {action} - logprob {logprob}')
            # next_state, reward, done = env.step(action)

            # Store experience in memory
            memory_buffer.append(experience(state, action, logprob, reward, next_state, done))

            state = next_state

            episode_reward += reward

            # Update PPO agent
            if t+1 % update_timestep == 0:
                # Sample random mini-batch of experience tuples (S,A,R,S') from D
                experiences = utils.get_experiences(memory_buffer)
                ppo_agent.update(experiences)
                # timestep = 0

            if done:
                print(f"Episode {i+1}/{num_episodes} - Timestep {t+1}/{max_timesteps} - Net worth {env.net_worth[-1]}")
                break

    print(f"Episode {episode + 1}, Reward: {episode_reward},")


Episode 1/1000
Episode 717, Reward: 0,
Episode 2/1000
Episode 780, Reward: 0,
Episode 3/1000
Episode 962, Reward: 0,
Episode 4/1000


KeyboardInterrupt: 

In [18]:
ppo_agent.policy.get_weights()

NameError: name 'ppo_agent' is not defined

In [12]:
ppo_agent = PPO(state_dim, action_dim)
max_timesteps = 26
state, reward, done = env.reset(start_index=119, max_timesteps=max_timesteps)
for t in range(max_timesteps-1):
    # Select action
    action, logprob = ppo_agent.select_action(state)
    # print(f'action {action} - logprob {logprob}')
    next_state, reward, done = env.step(action)

    # Store experience in memory
    memory_buffer.append(experience(state, action, logprob, reward, next_state, done))

    state = next_state
    episode_reward += reward
    
states = tf.convert_to_tensor(
    [memory_buffer[e].state for e in range(len(memory_buffer)-1)],
    dtype='float32'
    )

actions = tf.convert_to_tensor(
    [memory_buffer[e].action for e in range(len(memory_buffer)-1)],
    dtype=tf.int32
    )

logprobs = tf.convert_to_tensor(
    [memory_buffer[e].logprob for e in range(len(memory_buffer)-1) ],
     dtype=tf.float32)

rewards = [memory_buffer[e].reward for e in range(len(memory_buffer)-1)]


next_states = tf.convert_to_tensor(
    [memory_buffer[e].next_state for e in range(len(memory_buffer)-1)],
    dtype=tf.float32
    )

done_vals = tf.convert_to_tensor(
    [np.float32(memory_buffer[e].done) for e in range(len(memory_buffer)-1)],
    dtype=tf.float32
    )

experiences = (states, actions, logprobs, rewards, next_states, done_vals)



In [13]:
gamma = 0.99
states, actions, logprobs, rewards, _, done_vals = experiences

# Calculate discounted rewards
discounted_rewards = []
discounted_reward = 0
for reward, is_done in zip(reversed(rewards), reversed(done_vals)):
    if is_done:
        discounted_reward = 0
    discounted_reward = reward + (gamma * discounted_reward)
    discounted_rewards.insert(0, discounted_reward)

# Convert to tensors
rewards = tf.convert_to_tensor(discounted_rewards, dtype=tf.float32)
old_states = tf.convert_to_tensor(states, dtype=tf.float32)
old_actions = tf.convert_to_tensor(actions, dtype=tf.int32)
old_logprobs = tf.convert_to_tensor(logprobs, dtype=tf.float32)

# Get values from the critic
values = ppo_agent.critic(old_states).numpy()
values = np.append(values, 0)  # Append 0 for the last value

# Compute advantages
advantages = compute_gae(rewards, values, done_vals, ppo_agent.gamma, ppo_agent.lam)
advantages = tf.convert_to_tensor(advantages, dtype=tf.float32)

# Ensure the policy model is built
dummy_input = tf.convert_to_tensor(np.zeros((1, old_states.shape[1])), dtype=tf.float32)
_ = ppo_agent.policy(dummy_input)

# Training step
with tf.GradientTape() as tape:
    probs = ppo_agent.policy(old_states)
    probs = tf.clip_by_value(probs, 1e-10, 1.0)
    logprobs = tf.math.log(tf.gather(probs, old_actions, axis=1, batch_dims=1))
    dist_entropy = -tf.reduce_sum(probs * tf.math.log(probs), axis=1)
    ratios = tf.exp(logprobs - old_logprobs)
    surr1 = tf.expand_dims(ratios, axis=1) * advantages
    surr2 = tf.expand_dims(tf.clip_by_value(ratios, 1 - ppo_agent.eps_clip, 1 + ppo_agent.eps_clip), axis=1) * advantages
    loss = -tf.reduce_mean(tf.minimum(surr1, surr2)) + 0.01 * tf.reduce_mean(dist_entropy)
    value_loss = tf.reduce_mean(tf.square(rewards - ppo_agent.critic(old_states)))
    total_loss = loss + value_loss

# Apply gradients
grads = tape.gradient(total_loss, ppo_agent.policy.trainable_variables + ppo_agent.critic.trainable_variables)
ppo_agent.optimizer.apply_gradients(zip(grads, ppo_agent.policy.trainable_variables + ppo_agent.critic.trainable_variables))

# Copy weights to policy_old
ppo_agent.policy_old.set_weights(ppo_agent.policy.get_weights())


In [86]:
ppo_agent.policy_old.get_weights()

[array([[-1.15601808e-01,  1.84389815e-01,  3.01752985e-03,
          5.75450063e-03,  7.34859258e-02, -1.42361224e-01,
          2.78608501e-02, -1.42272025e-01,  4.14957255e-02,
          1.23044103e-02, -2.18932331e-02, -1.55890614e-01,
          3.22590023e-02,  1.67304471e-01, -1.21764243e-02,
          1.22337043e-03, -2.05073550e-01, -6.51282072e-03,
          2.07127944e-01,  2.86708772e-02,  1.29683778e-01,
          9.95540470e-02, -1.48160726e-01, -3.95816565e-02,
          1.62010744e-01, -1.87491447e-01,  1.36520788e-01,
          4.71553206e-03,  1.79946467e-01,  2.73328424e-02,
         -1.76280290e-01,  5.68728596e-02,  5.85278869e-03,
          1.38843015e-01,  5.68162054e-02,  1.59480974e-01,
          2.00298592e-01, -5.38851023e-02,  7.21125752e-02,
         -1.23790801e-02, -1.93298817e-01, -1.49940565e-01,
         -1.16506577e-01, -7.58408010e-02, -1.41963169e-01,
         -1.71932995e-01, -4.25611436e-02, -1.99298114e-02,
         -1.25296682e-01,  2.10477635e-0

In [14]:
advantages

<tf.Tensor: shape=(44,), dtype=float32, numpy=
array([-1.1930662e+10, -1.2127937e+10, -1.2337765e+10, -1.2560848e+10,
       -1.2798065e+10, -1.3050268e+10, -1.3318447e+10, -1.3603593e+10,
       -1.3906797e+10, -1.4229164e+10, -1.4571965e+10, -1.4936414e+10,
       -1.5323933e+10, -1.5735954e+10, -1.6174060e+10, -1.6639864e+10,
       -1.7135172e+10, -1.7661780e+10, -1.8221703e+10, -1.8817044e+10,
       -1.9450073e+10, -2.0123132e+10, -2.0838771e+10, -2.1599689e+10,
       -2.2408741e+10, -2.3269007e+10, -2.4183667e+10, -2.5156188e+10,
       -2.6190240e+10, -2.7289723e+10, -2.8458750e+10, -2.9701761e+10,
       -3.1023380e+10, -3.2428612e+10, -3.3922685e+10, -3.5511341e+10,
       -3.7200531e+10, -3.8996558e+10, -4.0906224e+10, -4.2936689e+10,
       -4.5095608e+10, -4.7391105e+10, -4.9831846e+10, -5.2426977e+10],
      dtype=float32)>

In [94]:
dummy_input = tf.convert_to_tensor(np.zeros((1, state_dim)), dtype=tf.float32)
_ = ppo_agent.policy(dummy_input)

In [95]:
for i, (w1, w2) in enumerate(zip(policy_weights, policy_old_weights)):
    print(f"Layer {i}: policy shape {w1.shape}, policy_old shape {w2.shape}")


Layer 0: policy shape (6, 128), policy_old shape (6, 128)
Layer 1: policy shape (128,), policy_old shape (128,)
Layer 2: policy shape (128, 128), policy_old shape (128, 128)
Layer 3: policy shape (128,), policy_old shape (128,)
Layer 4: policy shape (128, 3), policy_old shape (128, 3)
Layer 5: policy shape (3,), policy_old shape (3,)


In [96]:
gamma = 0.99
states, actions, logprobs, rewards, _, done_vals = experiences
rewards = []
discounted_reward = 0
for reward, is_done in zip(reversed(rewards), reversed(done_vals)):
    if is_done:
        discounted_reward = 0
    discounted_reward = reward + (gamma * discounted_reward)
    rewards.insert(0, discounted_reward)



rewards, old_states, old_actions, old_logprobs, *_ = experiences
rewards         = tf.convert_to_tensor(rewards, dtype=tf.float32)
old_states      = tf.convert_to_tensor(states, dtype=tf.float32)
old_actions     = tf.convert_to_tensor(actions, dtype=tf.int32)
old_logprobs    = tf.convert_to_tensor(logprobs, dtype=tf.float32)
values          = ppo_agent.critic(old_states).numpy()
values          = np.append(values, 0)  # Append 0 for the last value

advantages = compute_gae(rewards, values, ppo_agent.gamma, ppo_agent.lam)
advantages = tf.convert_to_tensor(advantages, dtype=tf.float32)

with tf.GradientTape() as tape:
    probs = ppo_agent.policy(old_states)
    probs = tf.clip_by_value(probs, 1e-10, 1.0)
    logprobs = tf.math.log(tf.gather(probs, old_actions, axis=1, batch_dims=1))
    policy_old_outputs = ppo_agent.policy(old_states)
    policy_old_outputs = tf.clip_by_value(policy_old_outputs, 1e-10, 1.0)
    dist_entropy = -tf.reduce_sum(policy_old_outputs * tf.math.log(policy_old_outputs), axis=1) #-tf.reduce_sum(ppo_agent.policy(old_states) * tf.math.log(ppo_agent.policy(old_states)), axis=1)
    ratios = tf.exp(logprobs - old_logprobs)
    print(ratios)
    # print(advantages)
    surr1 = tf.expand_dims(ratios, axis=1) * advantages
    surr2 = tf.expand_dims(tf.clip_by_value(ratios, 1-ppo_agent.eps_clip, 1+ppo_agent.eps_clip), axis=1) * advantages
    loss = -tf.reduce_mean(tf.minimum(surr1, surr2)) + 0.01 * tf.reduce_mean(dist_entropy)
    value_loss = tf.reduce_mean(tf.square(rewards - ppo_agent.critic(old_states)))
    total_loss = loss + value_loss

grads = tape.gradient(total_loss, ppo_agent.policy.trainable_variables + ppo_agent.critic.trainable_variables)
ppo_agent.optimizer.apply_gradients(zip(grads, ppo_agent.policy.trainable_variables + ppo_agent.critic.trainable_variables))


policy_weights = ppo_agent.policy.get_weights()
policy_old_weights = ppo_agent.policy_old.get_weights()

# for i, (w1, w2) in enumerate(zip(policy_weights, policy_old_weights)):
#     print(f"Layer {i}: policy shape {w1.shape}, policy_old shape {w2.shape}")


ppo_agent.policy_old.set_weights(ppo_agent.policy.get_weights())

tf.Tensor(
[9.99998764e-11 9.99998764e-11 9.99998764e-11 9.99998764e-11
 9.99998764e-11 9.99998764e-11 9.99998764e-11 9.99998764e-11
 9.99998764e-11 9.99998764e-11 9.99998764e-11 9.99998764e-11
 9.99998764e-11 9.99998764e-11 9.99998764e-11 9.99998764e-11
 9.99998764e-11 9.99998764e-11 9.99998764e-11 9.99998764e-11
 9.99998764e-11 9.99998764e-11 9.99998764e-11 9.99998764e-11
 9.99998764e-11 9.99998764e-11 9.99998764e-11 9.99998764e-11
 9.99998764e-11 9.99998764e-11 9.99998764e-11 9.99998764e-11
 9.99998764e-11 9.99998764e-11 9.99998764e-11 9.99998764e-11
 9.99998764e-11 9.99998764e-11 9.99998764e-11 9.99998764e-11
 1.00000064e-10 1.00000064e-10 1.00000064e-10 1.00000064e-10], shape=(44,), dtype=float32)


In [78]:
ppo_agent.policy_old(states)

<tf.Tensor: shape=(199, 3), dtype=float32, numpy=
array([[0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
     

In [158]:
for i, (w1, w2, g1) in enumerate(zip(policy_weights, policy_old_weights, grads)):
    print(f"Layer {i}: policy shape {w1.shape}, policy_old shape {w2.shape} grad shape {g1.shape}")


Layer 0: policy shape (6, 128), policy_old shape (6, 128) grad shape (6, 128)
Layer 1: policy shape (128,), policy_old shape (128,) grad shape (128,)
Layer 2: policy shape (128, 128), policy_old shape (128, 128) grad shape (128, 128)
Layer 3: policy shape (128,), policy_old shape (128,) grad shape (128,)
Layer 4: policy shape (128, 3), policy_old shape (128, 3) grad shape (128, 3)
Layer 5: policy shape (3,), policy_old shape (3,) grad shape (3,)


In [95]:
import tensorflow as tf
from tensorflow.keras import layers

class PolicyNetwork(tf.keras.Model):
    def __init__(self, state_dim, action_dim):
        super(PolicyNetwork, self).__init__()
        self.dense1 = layers.Dense(128, activation='relu')
        self.dense2 = layers.Dense(128, activation='relu')
        self.logits = layers.Dense(action_dim, activation=None)
    
    def call(self, inputs):
        print(f"Input shape: {inputs.shape}")
        x = self.dense1(inputs)
        print(f"Shape after dense1: {x.shape}")
        x = self.dense2(x)
        print(f"Shape after dense2: {x.shape}")
        x = self.logits(x)
        print(f"Shape after logits: {x.shape}")
        return tf.nn.softmax(x)

# Example usage
state_dim = 6  # Example state dimension
action_dim = 3  # Number of actions

policy_net = PolicyNetwork(state_dim, action_dim)
example_input = tf.random.normal((1, state_dim))  # Add batch dimension
output = policy_net(example_input)

print(f"Output: {output}")

# Ensure the input has the correct shape when calling the policy network
state = states[0] #tf.random.normal((state_dim,))  # Example state
print(f"Original state shape: {state.shape}")
state = tf.expand_dims(state, 0)  # Add batch dimension
print(f"State shape after expand_dims: {state.shape}")
output = policy_net(state)

print(f"Output: {output}")


Input shape: (1, 6)
Shape after dense1: (1, 128)
Shape after dense2: (1, 128)
Shape after logits: (1, 3)
Output: [[0.26531073 0.27612218 0.45856702]]
Original state shape: (6,)
State shape after expand_dims: (1, 6)
Input shape: (1, 6)
Shape after dense1: (1, 128)
Shape after dense2: (1, 128)
Shape after logits: (1, 3)
Output: [[1. 0. 0.]]


### Deployment

## Algorithm 1: PPO, Actor-Critic Style

1. **for** iteration = 1, 2, . . .
   - **for** actor = 1, 2, . . . , N
     - Run policy πθold in environment for T timesteps
     - Compute advantage estimates Aˆ1, . . . , AˆT
   - Optimize surrogate L wrt θ, with K epochs and minibatch size M ≤ NT
   - θold ← θ
