In [1]:
from dataclasses import dataclass
import pandas as pd
import numpy as np
from stable_baselines3 import PPO

In [4]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.distributions import Categorical

In [5]:
# Run tensors and models on the first CUDA GPU when one is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

In [2]:
@dataclass
class RandomAgent:
    """Baseline agent that picks its bid and ask actions uniformly at random."""

    env_d: int  # number of ticks away from mid price the market maker can quote

    def __post_init__(self):
        # A freshly built agent starts with a zeroed step counter.
        self.reset()

    def reset(self):
        """Set the step counter ``t`` back to zero."""
        self.t = 0

    def get_action(self):
        """Advance the step counter and draw a random (bid, ask) action pair.

        Returns:
            np.ndarray: ``[bid, ask]``, each drawn uniformly from ``[0, env_d)``.
        """
        self.t = self.t + 1
        # The bid is drawn before the ask, so the global numpy random stream
        # is consumed in a fixed order.
        bid_choice = np.random.randint(0, self.env_d)
        ask_choice = np.random.randint(0, self.env_d)
        return np.array([bid_choice, ask_choice])

In [3]:
@dataclass
class QAgent:
    """Tabular Q-learning agent with an epsilon-greedy behaviour policy.

    One ``(env_d, env_d)`` array of action values is kept for every state key
    ``(i, j)`` with ``i`` in ``[-env_Q, env_Q]`` and ``j`` in ``[0, env_T]``.
    The first array axis is indexed by the bid component of an action and the
    second by the ask component.
    """

    # Environment params
    env_d: int # number of ticks away from mid price the market maker can quote
    env_Q: int # the maximum (absolute) allowed held volume
    env_T: int # the number of time steps the model will be run for

    # Hyperparameters
    learning_rate: float = 0.1
    gamma: float = 0.99
    epsilon: float = 1.0
    epsilon_min: float = 0.01
    epsilon_decay: float = 0.3

    def __post_init__(self):
        # Remember the configured exploration rate so reset() can restore it
        # after decay_epsilon() has lowered it.
        self._initial_epsilon = self.epsilon
        self.reset()

    def reset(self):
        """Return the agent to its freshly constructed state.

        Rebuilds the zero-initialised Q-table, clears the step counter and the
        log, and restores ``epsilon`` to the value given at construction. A
        reset agent therefore explores again instead of exploiting an empty
        table with an already-decayed epsilon.
        """
        self.epsilon = self._initial_epsilon
        # Initialize Q-table
        self.q_table = {(i, j): np.zeros((self.env_d, self.env_d))
                        for i in range(-self.env_Q, self.env_Q + 1)
                        for j in range(self.env_T + 1)}
        # log
        self.t = 0
        self.log = []

    def update_Q(self, action, reward, state, next_state):
        """Apply one Q-learning update and append a record to the log.

        Args:
            action: ``(bid, ask)`` pair; a tuple, list or array such as the
                one returned by ``get_action``.
            reward: reward observed after taking ``action`` in ``state``.
            state: key of the current state in ``q_table``.
            next_state: key of the successor state in ``q_table``.

        Raises:
            KeyError: if ``state`` or ``next_state`` is not a key of ``q_table``.
        """
        # Index a single (bid, ask) cell. Indexing a 2-D array directly with a
        # list/ndarray selects whole *rows* (advanced indexing), which would
        # update every ask value of two rows instead of one entry.
        cell = tuple(int(i) for i in np.atleast_1d(action))
        q_values = self.q_table[state]
        td_target = reward + self.gamma * np.max(self.q_table[next_state])
        q_values[cell] += self.learning_rate * (td_target - q_values[cell])
        # record log
        # Copy every array: a plain dict.copy() would share the arrays with the
        # live table, so all logged "snapshots" would show the latest values.
        # NOTE: this stores a full table copy per step, which is memory-hungry
        # for large env_Q / env_T.
        snapshot = {key: values.copy() for key, values in self.q_table.items()}
        self.log.append({'t':self.t,
                         'state':state, 'action':action, 'reward':reward, 'next_state':next_state,
                         'q_table':snapshot})

    def get_action(self, state):
        """Advance the step counter and choose an action epsilon-greedily.

        With probability ``epsilon`` a uniformly random (bid, ask) pair is
        returned; otherwise the pair with the largest Q-value in ``state``
        (first maximum on ties).

        Returns:
            np.ndarray: ``[bid, ask]`` indices, each in ``[0, env_d)``.
        """
        self.t += 1
        if np.random.rand() < self.epsilon:
            # Random explore
            selected_action_bid = np.random.randint(0, self.env_d)
            selected_action_ask = np.random.randint(0, self.env_d)
            return np.array([selected_action_bid, selected_action_ask])
        # Best exploit: convert the flat argmax back into (bid, ask) indices.
        q_values = self.q_table[state]
        return np.array(np.unravel_index(np.argmax(q_values), q_values.shape))

    def decay_epsilon(self):
        """Lower ``epsilon`` by ``epsilon_decay``, never below ``epsilon_min``."""
        self.epsilon = max(self.epsilon_min, self.epsilon - self.epsilon_decay)

    def get_log(self, show=True):
        """Return the update log as a DataFrame with one row per ``update_Q`` call.

        Args:
            show: when true, also render the frame with ``display``. That name
                is not imported in the visible cells; it is expected to be
                supplied by the notebook environment.
        """
        log_df = pd.DataFrame(self.log)
        if show: display(log_df)
        return log_df

In [4]:
@dataclass
class ExploreFirstAgent:
    """Bandit-style agent that acts randomly for ``max_explore`` steps, then greedily.

    Bid and ask actions are tracked independently; each side keeps a visit
    count and a running mean reward per action.
    """

    num_actions_bid: int
    num_actions_ask: int
    max_explore: int

    def __post_init__(self):
        self.reset()

    def reset(self):
        """Clear the step counter, the log and all per-action statistics."""
        self.t = 0
        self.log = []
        # n(a): number of times each action has been taken
        self.action_counts_bid = np.zeros(self.num_actions_bid, dtype=int)
        self.action_counts_ask = np.zeros(self.num_actions_ask, dtype=int)
        # Q(a): mean reward observed for each action
        self.Q_bid = np.zeros(self.num_actions_bid, dtype=float)
        self.Q_ask = np.zeros(self.num_actions_ask, dtype=float)

    @staticmethod
    def _absorb_reward(values, counts, index, reward):
        """Fold ``reward`` into the running mean stored at ``values[index]``."""
        reward_total = values[index] * counts[index] + reward
        counts[index] = counts[index] + 1
        values[index] = reward_total / counts[index]

    def update_Q(self, action, reward):
        """Credit ``reward`` to both components of ``action`` and log the result.

        Args:
            action: indexable pair ``(bid, ask)``.
            reward: reward credited to the bid action and to the ask action alike.
        """
        picked_bid = action[0]
        self._absorb_reward(self.Q_bid, self.action_counts_bid, picked_bid, reward)
        picked_ask = action[1]
        self._absorb_reward(self.Q_ask, self.action_counts_ask, picked_ask, reward)
        # Store copies so later updates do not alter earlier log rows.
        entry = {
            't': self.t,
            'action_bid': picked_bid,
            'N_bid': self.action_counts_bid.copy(),
            'Q_bid': self.Q_bid.copy(),
            'action_ask': picked_ask,
            'N_ask': self.action_counts_ask.copy(),
            'Q_ask': self.Q_ask.copy(),
        }
        self.log.append(entry)

    def get_action(self):
        """Advance the step counter and return ``[bid, ask]``.

        While ``t <= max_explore`` both components are uniform random draws;
        afterwards each is the argmax of its Q vector.
        """
        self.t += 1
        still_exploring = self.t <= self.max_explore
        if still_exploring:
            # Bid is drawn before ask.
            bid_choice = np.random.randint(0, self.num_actions_bid)
            ask_choice = np.random.randint(0, self.num_actions_ask)
        else:
            bid_choice = np.argmax(self.Q_bid)
            ask_choice = np.argmax(self.Q_ask)
        return [bid_choice, ask_choice]

    def get_log(self, show=True):
        """Return the update log as a DataFrame; render it via ``display`` when ``show`` is true."""
        log_df = pd.DataFrame(self.log)
        if show:
            display(log_df)
        return log_df

In [5]:
@dataclass
class UCBAgent:
    """Upper-confidence-bound agent with independent bid and ask statistics.

    Each side keeps a visit count and a running mean reward per action. An
    action is scored as ``Q(a) + sqrt(4 * ln(t) / (n(a) + 1e-5))`` and the
    highest-scoring action is chosen.
    """

    num_actions_bid: int
    num_actions_ask: int

    def __post_init__(self):
        self.reset()

    def reset(self):
        """Clear the step counter, the log and all per-action statistics."""
        self.t = 0
        self.log = []
        # n(a): number of times each action has been taken
        self.action_counts_bid = np.zeros(self.num_actions_bid, dtype=int)
        self.action_counts_ask = np.zeros(self.num_actions_ask, dtype=int)
        # Q(a): mean reward observed for each action
        self.Q_bid = np.zeros(self.num_actions_bid, dtype=float)
        self.Q_ask = np.zeros(self.num_actions_ask, dtype=float)

    @staticmethod
    def _running_mean_step(values, counts, index, reward):
        """Fold ``reward`` into the running mean stored at ``values[index]``."""
        reward_total = values[index] * counts[index] + reward
        counts[index] = counts[index] + 1
        values[index] = reward_total / counts[index]

    def update_Q(self, action, reward):
        """Credit ``reward`` to both components of ``action`` and log the result.

        Args:
            action: indexable pair ``(bid, ask)``.
            reward: reward credited to the bid action and to the ask action alike.
        """
        bid_index = action[0]
        self._running_mean_step(self.Q_bid, self.action_counts_bid, bid_index, reward)
        ask_index = action[1]
        self._running_mean_step(self.Q_ask, self.action_counts_ask, ask_index, reward)
        # Store copies so later updates do not alter earlier log rows.
        self.log.append({
            't': self.t,
            'action_bid': bid_index,
            'N_bid': self.action_counts_bid.copy(),
            'Q_bid': self.Q_bid.copy(),
            'action_ask': ask_index,
            'N_ask': self.action_counts_ask.copy(),
            'Q_ask': self.Q_ask.copy(),
        })

    def _ucb_argmax(self, values, counts):
        """Return the index maximising ``values`` plus the exploration bonus."""
        # The small delta keeps the denominator non-zero for untried actions.
        delta = 1e-5
        bonus = np.sqrt(4 * np.log(self.t) / (counts + delta))
        return np.argmax(values + bonus)

    def get_action(self):
        """Advance the step counter and return ``[bid, ask]`` chosen by UCB score."""
        self.t += 1
        bid_choice = self._ucb_argmax(self.Q_bid, self.action_counts_bid)
        ask_choice = self._ucb_argmax(self.Q_ask, self.action_counts_ask)
        return [bid_choice, ask_choice]

    def get_log(self, show=True):
        """Return the update log as a DataFrame; render it via ``display`` when ``show`` is true."""
        log_df = pd.DataFrame(self.log)
        if show:
            display(log_df)
        return log_df

In [6]:
@dataclass
class EpsilonGreedyAgent:
    """Epsilon-greedy agent with a fixed exploration rate.

    Bid and ask actions are tracked independently; each side keeps a visit
    count and a running mean reward per action.
    """

    num_actions_bid: int
    num_actions_ask: int
    epsilon: float = 0.1

    def __post_init__(self):
        self.reset()

    def reset(self):
        """Clear the step counter, the log and all per-action statistics."""
        self.t = 0
        self.log = []
        # n(a): number of times each action has been taken
        self.action_counts_bid = np.zeros(self.num_actions_bid, dtype=int)
        self.action_counts_ask = np.zeros(self.num_actions_ask, dtype=int)
        # Q(a): mean reward observed for each action
        self.Q_bid = np.zeros(self.num_actions_bid, dtype=float)
        self.Q_ask = np.zeros(self.num_actions_ask, dtype=float)

    def update_Q(self, action, reward):
        """Credit ``reward`` to both components of ``action`` and log the result.

        Args:
            action: indexable pair ``(bid, ask)``.
            reward: reward credited to the bid action and to the ask action alike.
        """

        def fold_in(values, counts, index):
            # Rebuild the reward sum from the mean, add the new reward, re-average.
            reward_total = values[index] * counts[index] + reward
            counts[index] = counts[index] + 1
            values[index] = reward_total / counts[index]
            return index

        chosen_bid = fold_in(self.Q_bid, self.action_counts_bid, action[0])
        chosen_ask = fold_in(self.Q_ask, self.action_counts_ask, action[1])

        # Store copies so later updates do not alter earlier log rows.
        record = {'t': self.t}
        record.update({'action_bid': chosen_bid,
                       'N_bid': self.action_counts_bid.copy(),
                       'Q_bid': self.Q_bid.copy()})
        record.update({'action_ask': chosen_ask,
                       'N_ask': self.action_counts_ask.copy(),
                       'Q_ask': self.Q_ask.copy()})
        self.log.append(record)

    def get_action(self):
        """Advance the step counter and return ``[bid, ask]``.

        With probability ``epsilon`` both components are uniform random draws;
        otherwise each is the argmax of its Q vector.
        """
        self.t += 1
        if not (np.random.random() < self.epsilon):
            # Best exploit
            return [np.argmax(self.Q_bid), np.argmax(self.Q_ask)]
        # Random explore: bid is drawn before ask.
        random_bid = np.random.randint(0, self.num_actions_bid)
        random_ask = np.random.randint(0, self.num_actions_ask)
        return [random_bid, random_ask]

    def get_log(self, show=True):
        """Return the update log as a DataFrame; render it via ``display`` when ``show`` is true."""
        log_df = pd.DataFrame(self.log)
        if show:
            display(log_df)
        return log_df

In [7]:
@dataclass
class DecayEpsilonGreedyAgent:
    """Epsilon-greedy agent whose exploration rate decays linearly.

    Bid and ask actions are tracked independently; each side keeps a visit
    count and a running mean reward per action. ``decay_epsilon`` lowers
    ``epsilon`` by ``epsilon_decay`` per call, down to ``epsilon_min``.
    """

    num_actions_bid: int
    num_actions_ask: int
    epsilon: float = 0.5
    epsilon_min: float = 0.01
    epsilon_decay: float = 0.01

    def __post_init__(self):
        # Remember the configured exploration rate so reset() can restore it
        # after decay_epsilon() has lowered it.
        self._initial_epsilon = self.epsilon
        self.reset()

    def reset(self):
        """Return the agent to its freshly constructed state.

        Clears the step counter, the log and all per-action statistics, and
        restores ``epsilon`` to the value given at construction. Without this,
        a reset agent would keep the decayed epsilon and barely explore even
        though all its value estimates are zero again.
        """
        self.epsilon = self._initial_epsilon
        # log
        self.t = 0
        self.log = []
        # action counts n(a)
        self.action_counts_bid = np.zeros(self.num_actions_bid, dtype=int)
        self.action_counts_ask = np.zeros(self.num_actions_ask, dtype=int)
        # action value Q(a)
        self.Q_bid = np.zeros(self.num_actions_bid, dtype=float)
        self.Q_ask = np.zeros(self.num_actions_ask, dtype=float)

    @staticmethod
    def _fold_reward(values, counts, index, reward):
        """Fold ``reward`` into the running mean stored at ``values[index]``."""
        sum_reward = values[index] * counts[index] + reward  # Sum of reward
        counts[index] = counts[index] + 1  # Update counts
        values[index] = sum_reward / counts[index]  # Update Q

    def update_Q(self, action, reward):
        """Credit ``reward`` to both components of ``action`` and log the result.

        Args:
            action: indexable pair ``(bid, ask)``.
            reward: reward credited to the bid action and to the ask action alike.
        """
        # bid
        action_bid = action[0]
        self._fold_reward(self.Q_bid, self.action_counts_bid, action_bid, reward)
        # ask
        action_ask = action[1]
        self._fold_reward(self.Q_ask, self.action_counts_ask, action_ask, reward)
        # record log (copies, so later updates do not alter earlier rows)
        self.log.append({'t': self.t,
                         'action_bid':action_bid, 'N_bid':self.action_counts_bid.copy(), 'Q_bid':self.Q_bid.copy(),
                         'action_ask':action_ask, 'N_ask':self.action_counts_ask.copy(), 'Q_ask':self.Q_ask.copy()})

    def get_action(self):
        """Advance the step counter and return ``[bid, ask]``.

        With probability ``epsilon`` both components are uniform random draws;
        otherwise each is the argmax of its Q vector.
        """
        self.t += 1
        # Epsilon-greedy policy
        if np.random.random() < self.epsilon:
            # Random explore
            selected_action_bid = np.random.randint(0, self.num_actions_bid)
            selected_action_ask = np.random.randint(0, self.num_actions_ask)
        else:
            # Best exploit
            selected_action_bid = np.argmax(self.Q_bid)
            selected_action_ask = np.argmax(self.Q_ask)

        return [selected_action_bid, selected_action_ask]

    def decay_epsilon(self):
        """Lower ``epsilon`` by ``epsilon_decay``, never below ``epsilon_min``."""
        self.epsilon = max(self.epsilon_min, self.epsilon - self.epsilon_decay)

    def get_log(self, show=True):
        """Return the update log as a DataFrame with one row per ``update_Q`` call.

        Args:
            show: when true, also render the frame with ``display``. That name
                is not imported in the visible cells; it is expected to be
                supplied by the notebook environment.
        """
        log_df = pd.DataFrame(self.log)
        if show: display(log_df)
        return log_df

In [8]:
class OptimalAgent:
    """Agent that tabulates the environment's analytically optimal quotes.

    The constructor copies ``T``, ``Q``, ``dp`` and ``phi`` from the
    environment. Their meaning is defined by the environment class, which is
    not visible in this file.
    """

    def __init__(self, env):
        self.T = env.T
        self.Q = env.Q
        self.dp = env.dp
        self.phi = env.phi
        # Initialise ``t`` and ``log`` at construction, like the other agents
        # in this notebook, so they exist before the first explicit reset().
        self.reset()

    def reset(self):
        """Clear the step counter and the log."""
        self.t = 0
        self.log = []

    def generate_optimal_depth(self, env, bid=True):
        """Tabulate the optimal depth for every (inventory, time) pair.

        For each inventory ``q`` in ``[-env.Q, env.Q]`` and each time step
        ``t`` in ``[0, self.T]``, the environment is placed in that state by
        assigning ``env.Q_t`` and ``env.t``, and
        ``env.calc_analytically_optimal()`` is evaluated.

        Args:
            env: environment exposing ``Q``, writable ``t`` / ``Q_t``, and
                ``calc_analytically_optimal()``.
            bid: when true, element 0 of the environment's result is taken,
                otherwise element 1 (presumably the bid and ask depth
                respectively; confirm against the environment).

        Returns:
            np.ndarray: one row per inventory level and one column per time
            step, i.e. ``2 * env.Q + 1`` rows and ``self.T + 1`` columns.
        """
        # Remember the environment's current state so the sweep below does not
        # leave it at the last (q, t) visited. Attributes that did not exist
        # beforehand are left as the sweep set them.
        missing = object()
        saved_t = getattr(env, "t", missing)
        saved_inventory = getattr(env, "Q_t", missing)

        side = 1 - bid  # 0 selects the bid entry, 1 the ask entry
        data = []
        try:
            for q in np.arange(start=-env.Q, stop=env.Q + 1):
                data_q = []
                for t in range(self.T + 1):
                    env.t = t
                    env.Q_t = q
                    data_q.append(env.calc_analytically_optimal()[side])
                data.append(data_q)
        finally:
            if saved_t is not missing:
                env.t = saved_t
            if saved_inventory is not missing:
                env.Q_t = saved_inventory

        return np.array(data)

In [6]:
class ActorCritic(nn.Module):
    """Pair of small MLPs: a policy head (actor) and a state-value head (critic).

    Both networks map ``state_dim`` inputs through one 128-unit ReLU layer.
    The actor outputs ``action_dim`` scores that are turned into a categorical
    distribution; the critic outputs a single value.
    """

    def __init__(self, state_dim, action_dim):
        super().__init__()
        hidden_units = 128

        # Actor is built before the critic, so parameter initialisation draws
        # from the random stream in this order.
        actor_layers = [
            nn.Linear(state_dim, hidden_units),
            nn.ReLU(),
            nn.Linear(hidden_units, action_dim),
        ]
        self.actor = nn.Sequential(*actor_layers)

        critic_layers = [
            nn.Linear(state_dim, hidden_units),
            nn.ReLU(),
            nn.Linear(hidden_units, 1),
        ]
        self.critic = nn.Sequential(*critic_layers)

    def forward(self):
        # Not used directly; callers go through act() and evaluate().
        raise NotImplementedError

    def act(self, state, memory):
        """Sample an action for ``state`` and record the step in ``memory``.

        Appends the state tensor, sampled action, its log-probability and the
        critic's value to ``memory.states``, ``memory.actions``,
        ``memory.logprobs`` and ``memory.values``.

        Args:
            state: array-like observation; converted with ``np.array``.
            memory: object exposing the four lists named above.

        Returns:
            int: the sampled action index.
        """
        state_tensor = torch.from_numpy(np.array(state)).float().to(device)
        action_probs = F.softmax(self.actor(state_tensor), dim=-1)
        state_value = self.critic(state_tensor)

        policy = Categorical(action_probs)
        sampled = policy.sample()

        memory.states.append(state_tensor)
        memory.actions.append(sampled)
        memory.logprobs.append(policy.log_prob(sampled))
        memory.values.append(state_value)

        return sampled.item()

    def evaluate(self, state, action):
        """Score given actions under the current policy.

        Args:
            state: tensor of states.
            action: tensor of action indices taken in those states.

        Returns:
            tuple: ``(log-probabilities of the actions, squeezed critic values,
            entropy of the action distribution)``.
        """
        states_f = state.float()
        actions_f = action.float()

        policy = Categorical(F.softmax(self.actor(states_f), dim=-1))
        values = self.critic(states_f)

        log_probs = policy.log_prob(actions_f)
        entropy = policy.entropy()
        return log_probs, torch.squeeze(values), entropy

In [7]:
class PPO:
    """Clipped-surrogate PPO trainer around two ``ActorCritic`` networks.

    ``policy`` is the network being optimised; ``policy_old`` is a copy that
    is synchronised with ``policy`` at the end of every ``update``.

    NOTE: this class has the same name as ``PPO`` imported from
    ``stable_baselines3`` at the top of the file. Once this definition has
    run, the name ``PPO`` refers to this class.

    Args:
        state_dim: size of the state vector fed to the networks.
        action_dim: number of discrete actions.
        lr: Adam learning rate.
        betas: Adam ``betas`` pair.
        gamma: discount factor for the Monte Carlo returns.
        K_epochs: number of optimisation passes per ``update`` call.
        eps_clip: the probability ratio is clamped to ``[1 - eps_clip, 1 + eps_clip]``.
    """

    def __init__(self, state_dim, action_dim, lr, betas, gamma, K_epochs, eps_clip):
        self.lr = lr
        self.betas = betas
        self.gamma = gamma
        self.eps_clip = eps_clip
        self.K_epochs = K_epochs

        self.policy = ActorCritic(state_dim, action_dim).to(device)
        self.optimizer = torch.optim.Adam(self.policy.parameters(), lr=lr, betas=betas)
        self.policy_old = ActorCritic(state_dim, action_dim).to(device)
        self.policy_old.load_state_dict(self.policy.state_dict())

        self.MseLoss = nn.MSELoss()

        # Initialise ``t`` and ``log`` at construction, like the other agents
        # in this notebook, so they exist before the first explicit reset().
        self.reset()

    def reset(self):
        """Clear the step counter and the log (network weights are untouched)."""
        self.t = 0
        self.log = []

    def _discounted_returns(self, rewards, is_terminals):
        """Return the discounted return of every step, in chronological order.

        The running return restarts from zero at each step flagged terminal.
        """
        returns = []
        running = 0
        for reward, done in zip(reversed(rewards), reversed(is_terminals)):
            if done:
                running = 0
            running = reward + (self.gamma * running)
            returns.append(running)
        # Built back-to-front with append, then flipped once: linear time,
        # unlike repeated insert(0, ...).
        returns.reverse()
        return returns

    def update(self, memory):
        """Run ``K_epochs`` PPO optimisation passes over the stored rollout.

        Args:
            memory: object exposing equally long lists ``states``, ``actions``,
                ``logprobs`` (tensors), ``rewards`` and ``is_terminals``.

        Does nothing when ``memory.rewards`` is empty.
        """
        if len(memory.rewards) == 0:
            # Nothing collected: stacking empty lists below would raise.
            return

        # Monte Carlo estimate of state rewards:
        rewards = torch.tensor(
            self._discounted_returns(memory.rewards, memory.is_terminals)
        ).to(device)

        # Normalizing the rewards. The sample standard deviation of a single
        # value is NaN, which would turn every parameter into NaN, so a
        # one-step batch is only centred.
        if rewards.numel() > 1:
            spread = rewards.std()
        else:
            spread = torch.zeros_like(rewards.mean())
        rewards = ((rewards - rewards.mean()) / (spread + 1e-5)).float()

        # convert list to tensor
        old_states = torch.stack(memory.states).to(device).detach()
        old_actions = torch.stack(memory.actions).to(device).detach()
        old_logprobs = torch.stack(memory.logprobs).to(device).detach()

        # Optimize policy for K epochs:
        for _ in range(self.K_epochs):
            # Evaluating old actions and values :
            logprobs, state_values, dist_entropy = self.policy.evaluate(old_states, old_actions)
            # evaluate() squeezes the values, so a batch of one comes back
            # 0-dimensional; align it with the returns before computing losses.
            state_values = state_values.reshape(rewards.shape)

            # Finding the ratio (pi_theta / pi_theta__old):
            ratios = torch.exp(logprobs - old_logprobs)
            advantages = rewards - state_values.detach()
            surr1 = ratios * advantages
            surr2 = torch.clamp(ratios, 1 - self.eps_clip, 1 + self.eps_clip) * advantages
            # Clipped policy loss + value loss - entropy bonus.
            loss = -torch.min(surr1, surr2) + 0.5 * self.MseLoss(state_values, rewards) - 0.01 * dist_entropy

            # take gradient step
            self.optimizer.zero_grad()
            loss.float().mean().backward()
            self.optimizer.step()

        # Copy new weights into old policy:
        self.policy_old.load_state_dict(self.policy.state_dict())

In [8]:
class Memory:
    """Rollout buffer: six parallel lists filled one entry per environment step."""

    def __init__(self):
        # Attribute creation order is kept: actions, states, logprobs,
        # rewards, is_terminals, values.
        self.actions, self.states, self.logprobs = [], [], []
        self.rewards, self.is_terminals, self.values = [], [], []

    def clear_memory(self):
        """Empty every buffer in place, so existing references to the lists stay valid."""
        buffers = (
            self.actions,
            self.states,
            self.logprobs,
            self.rewards,
            self.is_terminals,
            self.values,
        )
        for buffer in buffers:
            buffer.clear()