In [None]:
import random
import warnings
from collections import deque
from pathlib import Path
from typing import Optional

import gymnasium as gym
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from gymnasium.spaces import Discrete, Tuple
from scipy.stats import norm

warnings.filterwarnings("ignore")
from sklearn.gaussian_process import GaussianProcessRegressor

from DQN_Agent import *
# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Seed every random number generator imported by this notebook, not only torch.
# NOTE(review): the agent code in DQN_Agent presumably draws from `random`
# and/or numpy's global generator (e.g. for exploration) — confirm.
SEED = 101
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed(SEED)
    torch.cuda.manual_seed_all(SEED)

In [None]:
# Load the AAPL quotes CSV from the Kaggle input directory and preview it.
quotes_csv_path = "/kaggle/input/aapl-quotes-data/AAPL_Quotes_Data.csv"
data = pd.read_csv(quotes_csv_path)
data.head()

In [None]:
# Preprocessing: keep only the dates that have exactly 390 rows
# (one row per minute); every other date is dropped.
# The date is the part of the timestamp string before the first space.
data["date"] = [stamp.split(" ")[0] for stamp in data["timestamp"]]

# Number of (non-null) timestamps recorded on each date.
num_minutes_each_date = data.groupby("date")["timestamp"].count()
is_full_day = num_minutes_each_date == 390
valid_date = num_minutes_each_date.loc[is_full_day].index.tolist()

# Filter to the valid dates, renumber the rows, and drop the helper columns.
data = (
    data.loc[data["date"].isin(valid_date)]
    .reset_index()
    .drop(columns=["date", "index"])
)
data.head()

In [None]:
# Total number of dates kept (390 rows per date).
# The last 8 dates in the data are used for testing.
len(data) // 390

#### First, we train the agent with the default settings

In [None]:
# Training environment. The positional arguments presumably follow the keyword
# names used in `objective` (num_shares, num_time_units, num_date, data);
# TODO confirm the meaning of the trailing True.
env = TradingEnv(1000, 390, 35, data, True)

# Only the device is overridden; no other agent parameter is passed here.
agent_params = dict(device=device)

# Train the DQN agent for 40 episodes, then save the trained model.
agent = create_and_train_dqn(agent_params, env, num_episodes=40)
agent.save_agent_model("agent_model")

In [None]:
# Evaluate the trained agent on the held-out dates (indices 35 to 42) and
# export the trades it selects.
env = TradingEnv(1000, 390, 43, data, True)

# Collect one frame per date and concatenate once, instead of growing a
# DataFrame inside the loop starting from an empty frame.
# NOTE(review): assumes every frame returned by get_best_actions already has
# the "timestamp" and "shares" columns — confirm.
daily_trades = []
for i in range(35, 43):
    env.reset()
    env.set_current_date(i)
    curr_res, _ = get_best_actions(agent, env, i)
    daily_trades.append(curr_res)
res = pd.concat(daily_trades)

# Build the path with pathlib: the previous literal "data\default_dqn_trades.csv"
# relied on the invalid escape "\d" and, outside Windows, produced a file whose
# name contains a backslash instead of a file inside the data directory.
output_path = Path("data") / "default_dqn_trades.csv"
output_path.parent.mkdir(parents=True, exist_ok=True)
res.to_csv(output_path, index=False)

#### Now we tune the hyperparameters using Bayesian optimization

In [None]:
def objective(gamma: float, epsilon: float, epsilon_decay: float, lr: float) -> float:
    """Train a DQN agent with the given hyperparameters and score it.

    The agent is trained on an environment built with ``num_date = 28`` and
    then evaluated on dates 28 to 34; the rewards of those 7 dates are summed.

    Args:
        gamma: Value passed to ``DQNAgent`` as ``gamma``.
        epsilon: Value passed to ``DQNAgent`` as ``epsilon``.
        epsilon_decay: Value passed to ``DQNAgent`` as ``epsilon_decay``.
        lr: Value passed to ``DQNAgent`` as ``lr``.

    Returns:
        The total reward accumulated over the 7 validation dates.
    """
    # Train the agent
    agent = DQNAgent(device, 390, 1001, gamma = gamma, epsilon = epsilon, epsilon_decay = epsilon_decay, lr = lr)
    # Among the 35 dates available for tuning, the last 7 are held out for validation
    env = TradingEnv(num_shares = 1000, num_time_units = 390, num_date = 28, data = data)
    agent = train_dqn(agent, env)

    # Use the agent on the last 7 days
    total_rewards_all = 0
    for i in range(28, 35):
        env.reset()
        env.set_current_date(i)
        _, total_rewards = get_best_actions(agent, env)
        total_rewards_all += total_rewards

    # Report the accumulated reward (the value returned), not just the last day's
    print(f"Finish trial {gamma}, {epsilon}, {epsilon_decay}, {lr} with rewards: {total_rewards_all}")
    return total_rewards_all

In [None]:
# Gaussian-process model used as a cheap approximation of the objective function.
surrogate_model = GaussianProcessRegressor()


def surrogate(params):
    """Predict the objective at ``params`` with the Gaussian-process surrogate.

    Args:
        params: Hyperparameter points to evaluate, passed straight to
            ``surrogate_model.predict``.

    Returns:
        The ``(mean, std)`` pair produced by ``predict(..., return_std=True)``.
    """
    predicted_mean, predicted_std = surrogate_model.predict(params, return_std=True)
    return predicted_mean, predicted_std

In [None]:
# Acquisition function: scores candidate points for the next trial
def acquisition(possible_params_new):
    """Probability of improvement of each candidate over the current best.

    Args:
        possible_params_new: 2-D array of candidate hyperparameter rows.

    Returns:
        1-D array with one probability-of-improvement score per candidate.
    """
    # Best surrogate mean over the points evaluated so far. The surrogate
    # returns a (mean, std) pair, so unpack it: taking the max over the whole
    # pair would let a standard deviation be mistaken for the best value.
    used_mean, _ = surrogate(params_used)
    curr_max = np.max(used_mean)
    # calculate mean and stdev via surrogate function
    obj_new_mean, obj_new_std = surrogate(possible_params_new)
    # Flatten both so a column-vector mean cannot broadcast against a 1-D std
    # into a matrix of scores.
    obj_new_mean = np.ravel(obj_new_mean)
    obj_new_std = np.ravel(obj_new_std)
    # calculate the probability of improvement (1E-9 guards against zero std)
    probs = norm.cdf((obj_new_mean - curr_max) / (obj_new_std + 1E-9))
    return probs

# now we optimize the acquisition function with random search
def optimize_acquisition(rng, n_candidates: int = 1000):
    """Pick the next hyperparameter point by random search on the acquisition.

    Args:
        rng: NumPy random generator providing ``uniform`` and ``choice``.
        n_candidates: Number of random candidate points to score
            (defaults to 1000).

    Returns:
        1-D array ``[gamma, epsilon, epsilon_decay, learning_rate]`` of the
        candidate with the highest acquisition score.

    Raises:
        ValueError: If ``n_candidates`` is smaller than 1.
    """
    if n_candidates < 1:
        raise ValueError("n_candidates must be at least 1")

    # Explore some possible choices; each hyperparameter is drawn as a column
    column_shape = (n_candidates, 1)
    possible_gamma = rng.uniform(0, 1, size = column_shape)
    possible_epsilon = rng.uniform(0, 1, size = column_shape)
    possible_epsilon_decay = rng.uniform(0.7, 1, size = column_shape)
    possible_learning_rate = rng.choice([0.001, 0.003, 0.01, 0.05, 0.1], size = column_shape)
    possible_params_new = np.hstack((possible_gamma, possible_epsilon, possible_epsilon_decay, possible_learning_rate))

    # Calculate acquisition score
    scores = acquisition(possible_params_new)

    # get the max score and return the matching candidate row
    return possible_params_new[np.argmax(scores), :]

In [None]:
# Bayesian optimization over (gamma, epsilon, epsilon_decay, learning rate).
# History of the evaluated hyperparameter rows and their objective values.
params_used = np.empty((0, 4))
obj_used = np.empty((0, 1))

# Seeded generator shared by the exploration draws and the candidate search.
rng = np.random.default_rng(seed=101)

# Budget: random exploration trials come first, then acquisition-guided trials.
n_explore_trials = 5
n_trials = 15

for i in range(n_explore_trials + n_trials):
    if i >= n_explore_trials:
        # Guided step: evaluate the candidate with the best acquisition score.
        params_new = optimize_acquisition(rng)
        obj_new = objective(*params_new)
    else:
        # Exploration step: sample every hyperparameter at random.
        gamma_new = rng.uniform(0, 1)
        epsilon_new = rng.uniform(0, 1)
        epsilon_decay_new = rng.uniform(0.7, 1)
        learning_rate_new = rng.choice([0.001, 0.003, 0.01, 0.05, 0.1])
        params_new = np.array([gamma_new, epsilon_new, epsilon_decay_new, learning_rate_new])
        obj_new = objective(gamma_new, epsilon_new, epsilon_decay_new, learning_rate_new)

    # Record the new observation and refit the surrogate on the full history.
    params_used = np.concatenate([params_used, params_new.reshape(1, 4)], axis=0)
    obj_used = np.concatenate([obj_used, np.array([obj_new]).reshape(1, 1)], axis=0)
    surrogate_model.fit(params_used, obj_used)

In [None]:
# Print the evaluated hyperparameter row with the highest objective value.
best_trial = np.argmax(obj_used)
print(params_used[best_trial, :])