In [1]:
%matplotlib inline

# Imports

In [4]:
# Author: Till Zemann
# License: MIT License

from __future__ import annotations

from collections import defaultdict

import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import pandas as pd
from matplotlib.patches import Patch
from tqdm import tqdm

import gymnasium as gym
import gym_trading_env
from gym_trading_env.utils.history import History
from gym_trading_env.downloader import download
import datetime


# Create features

- These are the state vector inputs
- this would be an import from `TAindicators`
- only data columns starting with "`feature_`" will appear in the observation space

In [6]:
from ta_indicators import * # Import the indicators

# df is a DataFrame with columns : "open", "high", "low", "close", "Volume USD"

# this can be defined in TAindicators and imported directly to the agent
# after we get through testing things out in notebooks
def define_state_vector(df: pd.DataFrame):
    # TODO: replace these generic features with indicator functions once adapted

    # Create the feature : ( close[t] - close[t-1] )/ close[t-1]
    df["feature_close"] = df["close"].pct_change()

    # Create the feature : open[t] / close[t]
    df["feature_open"] = df["open"]/df["close"]

    # Create the feature : high[t] / close[t]
    df["feature_high"] = df["high"]/df["close"]

    # Create the feature : low[t] / close[t]
    df["feature_low"] = df["low"]/df["close"]

    # Create the feature : volume[t] / max(*volume[t-7*24:t+1])
    df["feature_volume"] = df["Volume USD"] / df["Volume USD"].rolling(7*24).max()

    df.dropna(inplace= True) # Clean again !
    # Eatch step, the environment will return 5 inputs  : "feature_close", "feature_open", "feature_high", "feature_low", "feature_volume"

# Define custom rewards and dynamic features

In [8]:
def add_reward_columns(df: pd.DataFrame):
    """add reward columns to dataframe for incremental updates"""
    for col in ['lr', 'alr', 'var_sum']:
        df[col] = 0

def update_reward_columns(history: History) -> None:
    """Set this episode lr, alr, var_sum, sr, powc"""

    # Using weighted incremental algorithmic approach for average
    # https://math.stackexchange.com/questions/106700/incremental-averaging
    # general formula is: mean = ((n - 1) * last_mean + this_value) / n))

    # logarithmic return
    this_lr = 0
    # if position is 1 (100% BTC)
    if history['position', -1] == 1:
        this_lr = np.log(history['data_close', -1]) - np.log(history['data_close', -2])
    history.__setitem__(('data_lr', -1), this_lr) # update history with new lr


    # running average of logarithmic return
    n = len(history)
    last_alr = history['data_alr', -2]
    this_alr = ((n - 1) * last_alr + this_lr) / n
    history.__setitem__(('data_alr', -1), this_alr) # update history with new alr

    # running variance sum of logarithmic return
    # for each nth row, dividing this sum by n gives population variance
    last_alr = history['data_alr', -2]
    last_var_sum = history['data_var_sum', -2]
    this_var_sum = last_var_sum + abs((this_lr - last_alr) * (this_lr - this_alr))
    history.__setitem__(('data_var_sum', -1), this_var_sum)

def get_random_weights(arr_len):
    """get numpy array of random weights"""
    max_val = 100
    weight_vector = np.zeros(arr_len)
    for i in range(arr_len - 1):
        n = np.random.randint(0, max_val)
        max_val = max_val - n
        weight_vector[i] = n
    weight_vector /= 100
    weight_vector[-1] = 1 - sum(weight_vector[:-1])
    np.random.shuffle(weight_vector)
    return weight_vector


def reward_function(history: History) -> float:
    """reward function for gym-trading-env"""
    update_reward_columns(history)
    average_log_return = history['data_alr', -1]
    var_sum = history['data_var_sum', -1]
    variance = var_sum / len(history)
    std_dev = np.sqrt(variance)
    sharpe_ratio = average_log_return / std_dev
    this_lr = history['data_lr', -1]
    powc = 0
    # if this eposide position is 0 (100% USD) and last position was 1 (100% BTC)
    # this compute time can also be traded for memory by adding a tracking column if needed
    if (history['position', -1] == 0 and history['position', -2] == 1):
        idx = history[-2]['idx']
        while idx >= 0:
            if (history['position', idx] == 0):
                last_lr = history['data_lr', idx + 1]
                powc = this_lr - last_lr
    reward_vector = np.array([average_log_return, sharpe_ratio, powc])
    weight_vector = get_random_weights(len(reward_vector))
    reward = reward_vector @ weight_vector # dot product of random weights and reward values
    return reward

In [9]:
max_val = 100
w = np.zeros(6)
for i in range(5):
    n = np.random.randint(0, max_val)
    max_val = max_val - n
    w[i] = n
w /= 100
w[-1] = 1 - sum(w[:-1])
print(w)
np.random.shuffle(w)
print(w)
print(sum(w))


[0.94 0.   0.03 0.02 0.   0.01]
[0.94 0.   0.01 0.02 0.   0.03]
1.0


# Download training data

In [10]:
# Download data from 2017-01-01 to 2022-12-31
# Download BTC/USDT historical data from Binance and stores it to directory ./data/binance-BTCUSDT-1h.pkl
download(exchange_names = ["binance"],
    symbols= ["BTC/USDT"],
    timeframe= "1h",
    dir = "data",
    since = datetime.datetime(year= 2017, month= 1, day=1),
)
# Import your fresh data
df = pd.read_pickle("./data/binance-BTCUSDT-1h.pkl")
df.head()

ExchangeNotAvailable: binance GET https://api.binance.com/api/v3/exchangeInfo 451  {
  "code": 0,
  "msg": "Service unavailable from a restricted location according to 'b. Eligibility' in https://www.binance.com/en/terms. Please contact customer service if you believe you received this message in error."
}

In [11]:
df = pd.read_pickle('./data/binance-BTCUSDT-1h.pkl')
trainingDF = df.truncate(
    after = pd.Timestamp('2023-01-01'),
    copy = True
)

trainingDF.dropna(inplace=True)
trainingDF.sort_index(inplace=True)
trainingDF.to_csv('./data/binance_training_data.csv')

FileNotFoundError: [Errno 2] No such file or directory: './data/binance-BTCUSDT-1h.pkl'

In [None]:
df = pd.read_pickle('./data/binance-BTCUSDT-1h.pkl')
testingDF = df.truncate(
    before = pd.Timestamp('2023-01-01'),
    copy = True
)

testingDF.dropna(inplace=True)
testingDF.sort_index(inplace=True)
testingDF.to_csv('./data/binance_testing_data.csv')

: 

In [None]:
download(
    exchange_names = ["bitfinex2", "huobi"],
    symbols= ["BTC/USDT"],
    timeframe= "1h",
    dir = "data",
    since= datetime.datetime(year= 2017, month= 1, day=1)
)

: 

In [None]:
huobiDF = pd.read_pickle('./data/huobi-BTCUSDT-1h.pkl')
huobi_training = huobiDF.truncate(after='2023-01-01')
huobi_training.to_csv('./data/huobi-BTCUSDT-1h-training.csv')
huobi_test = huobiDF.truncate(before='2023-01-01')
huobi_test.to_csv('./data/huobi-BTCUSDT-1h-test.csv')

: 

In [None]:
bitfinex2DF = pd.read_pickle('./data/bitfinex2-BTCUSDT-1h.pkl')
bitfinex2_training = bitfinex2DF.truncate(after='2023-01-01')
bitfinex2_test = bitfinex2DF.truncate(before='2023-01-01')
bitfinex2_training.to_csv('./data/bitfinex2-BTCUSDT-1h-training.csv')
bitfinex2_test.to_csv('./data/bitfinex2-BTCUSDT-1h-test.csv')

: 

# Create Environment

In [12]:
env = gym.make("TradingEnv",
        name= "BTCUSD",
        df = df, # Your dataset with your custom features
        positions = [0, 1], # -1 (=SHORT), 0(=SELL ALL), +1 (=BUY ALL)
        #trading_fees = 0.01/100, # 0.01% per stock buy / sell (Binance fees)
        #borrow_interest_rate= 0.0003/100, # 0.0003% per timestep (one timestep = 1h here)
        dynamic_feature_functions = [dynamic_feature_alr],
        reward_function = reward_function
    )

NameError: name 'df' is not defined

# Define Agent

In [None]:
class BitcoinTrainingAgent:
    """Q-learning agent."""
    def __init__(
        self,
        learning_rate: float = 0.001,
        initial_epsilon: float,
        epsilon_decay: float,
        final_epsilon: float,
        discount_factor: float = 0.95,
    ) -> None:
        """Initialize hyperparameters"""
        self.q_values = defaultdict(lambda: np.zeros(env.action_space.n))

        self.lr = learning_rate
        self.discount_factor = discount_factor

        self.epsilon = initial_epsilon
        self.epsilon_decay = epsilon_decay
        self.final_epsilon = final_epsilon

        self.training_error = []

    def get_action(self, observation):
        """Given an observation, choose an action"""
        # with probability epsilon return a random action to explore the environment
        if np.random.random() < self.epsilon:
            return env.action_space.sample()

        # with probability (1 - epsilon) act greedily (exploit)
        else:
            return int(np.argmax(self.q_values[obs]))

    def update(
        self,
        obs: tuple[int, int, bool],
        action: int,
        reward: float,
        terminated: bool,
        next_obs: tuple[int, int, bool],
    ):
        """Updates the Q-value of an action."""
        future_q_value = (not terminated) * np.max(self.q_values[next_obs])
        temporal_difference = (
            reward + self.discount_factor * future_q_value - self.q_values[obs][action]
        )

        self.q_values[obs][action] = (
            self.q_values[obs][action] + self.lr * temporal_difference
        )
        self.training_error.append(temporal_difference)

: 

# Create Agent

In [None]:
# TODO: define hyperparameters and pass to agent
learning_rate = 0.01
n_episodes = 1
initial_epsilon = 1.0
# This reduces exploration over time.
epsilon_decay = start_epsilon / (n_episodes / 2)
final_epsilon = 0.1

agent = BitcoinTrainingAgent(
    learning_rate=learning_rate,
    initial_epsilon=start_epsilon,
    epsilon_decay=epsilon_decay,
    final_epsilon=final_epsilon,
)

: 

# Train model using agent

In [16]:
# Run an episode until it ends :
done, truncated = False, False
observation, info = env.reset()
while not done and not truncated:
    # Pick a position by its index in your position list (=[-1, 0, 1])....usually something like : position_index = your_policy(observation)
    position_index = env.action_space.sample() # At every timestep, pick a random position index from your position list (=[-1, 0, 1])
    observation, reward, done, truncated, info = env.step(position_index)

Market Return : 423.10%   |   Portfolio Return : -94.73%   |   


In [None]:
for episode in tqdm(range(n_episodes)):
    obs, info = env.reset()
    done, truncated = false, false

    while not done:
        action = agent.get_action(obs)
        next_obs, reward, terminated, truncated, info = env.step(action)

        agent.update(obs, action, reward, terminated, next_obs)

        done = terminated or truncated
        obs = next_obs
    
    agent.decay_epsilon()
