In [88]:
%matplotlib inline

# Imports

In [89]:
# Author: Till Zemann
# License: MIT License

from __future__ import annotations

from collections import defaultdict

import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import pandas as pd
from matplotlib.patches import Patch
from tqdm import tqdm

import gymnasium as gym
import gym_trading_env
from gym_trading_env.utils.history import History
from gym_trading_env.downloader import download
import datetime


# Create features

- These are the state vector inputs
- this would be an import from `ta_indicators`
- only data columns starting with "`feature_`" will appear in the observation space

In [90]:
from ta_indicators import * # Import the indicators

# df is a DataFrame with columns : "open", "high", "low", "close", "Volume USD"

# this can be defined in TAindicators and imported directly to the agent
# after we get through testing things out in notebooks
def define_state_vector(df: pd.DataFrame):
    """DEPRECATED.  I changed ta_indicators to put 'feature_' in front of all final indicators."""

    # Create the feature : ( close[t] - close[t-1] )/ close[t-1]
    df["feature_close"] = df["close"].pct_change()

    # Create the feature : open[t] / close[t]
    df["feature_open"] = df["open"]/df["close"]

    # Create the feature : high[t] / close[t]
    df["feature_high"] = df["high"]/df["close"]

    # Create the feature : low[t] / close[t]
    df["feature_low"] = df["low"]/df["close"]

    # Create the feature : volume[t] / max(*volume[t-7*24:t+1])
    df["feature_volume"] = df["Volume USD"] / df["Volume USD"].rolling(7*24).max()

    df.dropna(inplace= True) # Clean again !
    # Eatch step, the environment will return 5 inputs  : "feature_close", "feature_open", "feature_high", "feature_low", "feature_volume"
    

df = pd.read_csv("./data/indicators.csv")
df.dropna(inplace= True)

#df.head()


# Define custom rewards and dynamic features

In [91]:
def add_reward_columns(df: pd.DataFrame):
    """add reward columns to dataframe for incremental updates"""
    for col in ['lr', 'alr', 'var_sum']:
        df[col] = 0

def update_reward_columns(history: History) -> None:
    """Set this episode lr, alr, var_sum, sr, powc"""

    # Using weighted incremental algorithmic approach for average
    # https://math.stackexchange.com/questions/106700/incremental-averaging
    # general formula is: mean = ((n - 1) * last_mean + this_value) / n))

    # logarithmic return
    this_lr = 0
    # if position is 1 (100% BTC)
    if history['position', -1] == 1:
        this_lr = np.log(history['data_close', -1]) - np.log(history['data_close', -2])
    history.__setitem__(('data_lr', -1), this_lr) # update history with new lr


    # running average of logarithmic return
    n = len(history)
    last_alr = history['data_alr', -2]
    this_alr = ((n - 1) * last_alr + this_lr) / n
    history.__setitem__(('data_alr', -1), this_alr) # update history with new alr

    # running variance sum of logarithmic return
    # for each nth row, dividing this sum by n gives population variance
    last_alr = history['data_alr', -2]
    last_var_sum = history['data_var_sum', -2]
    this_var_sum = last_var_sum + abs((this_lr - last_alr) * (this_lr - this_alr))
    history.__setitem__(('data_var_sum', -1), this_var_sum)

def get_random_weights(arr_len):
    """get numpy array of random weights"""
    max_val = 100
    weight_vector = np.zeros(arr_len)
    for i in range(arr_len - 1):
        n = np.random.randint(0, max_val)
        max_val = max_val - n
        weight_vector[i] = n
    weight_vector /= 100
    weight_vector[-1] = 1 - sum(weight_vector[:-1])
    np.random.shuffle(weight_vector)
    return weight_vector


def reward_function(history: History) -> float:
    """reward function for gym-trading-env"""
    update_reward_columns(history)
    average_log_return = history['data_alr', -1]
    var_sum = history['data_var_sum', -1]
    variance = var_sum / len(history)
    std_dev = np.sqrt(variance)
    sharpe_ratio = average_log_return / 0.5
    this_lr = history['data_lr', -1]
    powc = 0
    # if this eposide position is 0 (100% USD) and last position was 1 (100% BTC)
    # this compute time can also be traded for memory by adding a tracking column if needed
    if (history['position', -1] == 0 and history['position', -2] == 1):
        idx = history[-2]['idx']
        
        # This is an infinite loop if idx == 0 and history['position', idx] != 0.
        while idx >= 0:
            if (history['position', idx] == 0):
                last_lr = history['data_lr', idx + 1]
                powc = this_lr - last_lr
    reward_vector = np.array([average_log_return, sharpe_ratio, powc])
    weight_vector = get_random_weights(len(reward_vector))
    reward = reward_vector @ weight_vector # dot product of random weights and reward values
    return reward

def dynamic_features(history: History) -> float:
    """Calculates dynamic features."""
    #dyn_features = [last_position, real_position]
    #return dyn_features
    
    pass

In [92]:
max_val = 100
w = np.zeros(6)
for i in range(5):
    n = np.random.randint(0, max_val)
    max_val = max_val - n
    w[i] = n
w /= 100
w[-1] = 1 - sum(w[:-1])
print(w)
np.random.shuffle(w)
print(w)
print(sum(w))


[0.89 0.   0.   0.   0.02 0.09]
[0.   0.   0.09 0.02 0.89 0.  ]
1.0


# Prepare Dataset

In [93]:
#df = pd.read_csv("./data/indicators.csv")

"""trainingDF = df.truncate(
    after = pd.Timestamp('2023-01-01'),
    copy = True
)
testingDF = df.truncate(
    before = pd.Timestamp('2023-01-01'),
    copy = True
)"""
add_reward_columns(df)
df.head()


Unnamed: 0,date,close,open,high,low,vol,feature_ROC_2,feature_ROC_4,feature_ROC_6,feature_ROC_8,...,feature_ATR_2,feature_ATR_4,feature_ATR_6,feature_ATR_8,feature_ATR_16,feature_ATR_32,feature_ATR_64,lr,alr,var_sum
128,2018-01-04,15100.0,14917.0,15267.8,14100.0,21.32K,3.275263,14.165925,4.10087,1.623446,...,1121.1,1376.725,1637.716667,1645.2625,2044.16875,2029.025,1375.210938,0,0,0
129,2018-01-05,16999.0,15095.0,17170.0,14750.0,23.27K,13.202417,25.693765,25.35267,12.420934,...,1793.9,1734.225,1618.033333,1662.475,2085.41875,2080.025,1403.415625,0,0,0
130,2018-01-06,17067.0,16960.3,17132.2,16167.9,18.56K,13.305409,20.152726,28.641627,18.179089,...,1692.15,1406.625,1481.866667,1651.325,1952.83125,2097.659375,1412.132812,0,0,0
131,2018-01-07,16180.0,17090.9,17090.9,15658.5,12.49K,-4.155853,11.522696,26.919518,27.575779,...,1198.35,1496.125,1555.6,1513.1125,1752.10625,2082.121875,1425.732812,0,0,0
132,2018-01-08,14944.2,16200.0,16287.6,13180.0,26.61K,-12.727139,1.077295,12.332664,21.176851,...,2270.0,1981.075,1694.416667,1678.9,1830.14375,2074.859375,1469.440625,0,0,0


In [94]:
"""
df = pd.read_pickle('./data/binance-BTCUSDT-1h.pkl')
trainingDF = df.truncate(
    after = pd.Timestamp('2023-01-01'),
    copy = True
)

trainingDF.dropna(inplace=True)
trainingDF.sort_index(inplace=True)
trainingDF.to_csv('./data/binance_training_data.csv')
"""

"\ndf = pd.read_pickle('./data/binance-BTCUSDT-1h.pkl')\ntrainingDF = df.truncate(\n    after = pd.Timestamp('2023-01-01'),\n    copy = True\n)\n\ntrainingDF.dropna(inplace=True)\ntrainingDF.sort_index(inplace=True)\ntrainingDF.to_csv('./data/binance_training_data.csv')\n"

In [95]:
"""df = pd.read_pickle('./data/binance-BTCUSDT-1h.pkl')
testingDF = df.truncate(
    before = pd.Timestamp('2023-01-01'),
    copy = True
)

testingDF.dropna(inplace=True)
testingDF.sort_index(inplace=True)
testingDF.to_csv('./data/binance_testing_data.csv')
"""

"df = pd.read_pickle('./data/binance-BTCUSDT-1h.pkl')\ntestingDF = df.truncate(\n    before = pd.Timestamp('2023-01-01'),\n    copy = True\n)\n\ntestingDF.dropna(inplace=True)\ntestingDF.sort_index(inplace=True)\ntestingDF.to_csv('./data/binance_testing_data.csv')\n"

In [96]:
"""download(
    exchange_names = ["bitfinex2", "huobi"],
    symbols= ["BTC/USDT"],
    timeframe= "1h",
    dir = "data",
    since= datetime.datetime(year= 2017, month= 1, day=1)
)"""

'download(\n    exchange_names = ["bitfinex2", "huobi"],\n    symbols= ["BTC/USDT"],\n    timeframe= "1h",\n    dir = "data",\n    since= datetime.datetime(year= 2017, month= 1, day=1)\n)'

In [97]:
"""huobiDF = pd.read_pickle('./data/huobi-BTCUSDT-1h.pkl')
huobi_training = huobiDF.truncate(after='2023-01-01')
huobi_training.to_csv('./data/huobi-BTCUSDT-1h-training.csv')
huobi_test = huobiDF.truncate(before='2023-01-01')
huobi_test.to_csv('./data/huobi-BTCUSDT-1h-test.csv')"""

"huobiDF = pd.read_pickle('./data/huobi-BTCUSDT-1h.pkl')\nhuobi_training = huobiDF.truncate(after='2023-01-01')\nhuobi_training.to_csv('./data/huobi-BTCUSDT-1h-training.csv')\nhuobi_test = huobiDF.truncate(before='2023-01-01')\nhuobi_test.to_csv('./data/huobi-BTCUSDT-1h-test.csv')"

In [98]:
"""bitfinex2DF = pd.read_pickle('./data/bitfinex2-BTCUSDT-1h.pkl')
bitfinex2_training = bitfinex2DF.truncate(after='2023-01-01')
bitfinex2_test = bitfinex2DF.truncate(before='2023-01-01')
bitfinex2_training.to_csv('./data/bitfinex2-BTCUSDT-1h-training.csv')
bitfinex2_test.to_csv('./data/bitfinex2-BTCUSDT-1h-test.csv')"""

"bitfinex2DF = pd.read_pickle('./data/bitfinex2-BTCUSDT-1h.pkl')\nbitfinex2_training = bitfinex2DF.truncate(after='2023-01-01')\nbitfinex2_test = bitfinex2DF.truncate(before='2023-01-01')\nbitfinex2_training.to_csv('./data/bitfinex2-BTCUSDT-1h-training.csv')\nbitfinex2_test.to_csv('./data/bitfinex2-BTCUSDT-1h-test.csv')"

# Create Environment

In [99]:
env = gym.make("TradingEnv",
        name= "BTCUSD",
        df = df, # Your dataset with your custom features
        positions = [0, 1], # -1 (=SHORT), 0(=SELL ALL), +1 (=BUY ALL)
        #trading_fees = 0.01/100, # 0.01% per stock buy / sell (Binance fees)
        #borrow_interest_rate= 0.0003/100, # 0.0003% per timestep (one timestep = 1h here)
        #dynamic_feature_functions = [dynamic_features]
        reward_function = reward_function,
        portfolio_initial_value = 10000,
        #max_episode_duration = 1000,
    )

# Define Agent

In [100]:
class BitcoinTrainingAgent:
    """Q-learning agent."""
    def __init__(
        self,
        initial_epsilon: float,
        epsilon_decay: float,
        final_epsilon: float,
        learning_rate: float = 0.001,
        discount_factor: float = 0.95,
    ) -> None:
        """Initialize hyperparameters"""
        self.q_values = defaultdict(lambda: np.zeros(env.action_space.n))

        self.lr = learning_rate
        self.discount_factor = discount_factor

        self.epsilon = initial_epsilon
        self.epsilon_decay = epsilon_decay
        self.final_epsilon = final_epsilon

        self.training_error = []

    def get_action(self, obs):
        """Given an observation, choose an action"""
        # with probability epsilon return a random action to explore the environment
        if np.random.random() < self.epsilon:
            return env.action_space.sample()

        # with probability (1 - epsilon) act greedily (exploit)
        else:
            return int(np.argmax(self.q_values[obs]))

    def update(
        self,
        obs: tuple[int, int, bool],
        action: int,
        reward: float,
        terminated: bool,
        next_obs: tuple[int, int, bool],
    ):
        """Updates the Q-value of an action."""
        next_obs_tuple = tuple(next_obs)
        obs_tuple = tuple(obs)
        future_q_value = (not terminated) * np.max(self.q_values[next_obs_tuple])
        temporal_difference = (
            reward + self.discount_factor * future_q_value - self.q_values[obs_tuple][action]
        )

        self.q_values[obs_tuple][action] = (
            self.q_values[obs_tuple][action] + self.lr * temporal_difference
        )
        self.training_error.append(temporal_difference)
   
    def decay_epsilon(self):
        self.epsilon = max(self.final_epsilon, self.epsilon * self.epsilon_decay)

# Create Agent

In [101]:
# Hyperparameters passed to agent
learning_rate = 0.01
n_episodes = 1
initial_epsilon = 1.0

# Episolon is involved in exporation/exploitation tradeoff, decay reduces exploration over time.
epsilon_decay = initial_epsilon / (n_episodes / 2)
final_epsilon = 0.1

agent = BitcoinTrainingAgent(
    learning_rate=learning_rate,
    initial_epsilon=initial_epsilon,
    epsilon_decay=epsilon_decay,
    final_epsilon=final_epsilon,
)

# Train model using agent

In [102]:
"""# Run an episode until it ends :
done, truncated = False, False
observation, info = env.reset()
while not done and not truncated:
    # Pick a position by its index in your position list (=[-1, 0, 1])....usually something like : position_index = your_policy(observation)
    position_index = env.action_space.sample() # At every timestep, pick a random position index from your position list (=[-1, 0, 1])
    observation, reward, done, truncated, info = env.step(position_index)
"""

'# Run an episode until it ends :\ndone, truncated = False, False\nobservation, info = env.reset()\nwhile not done and not truncated:\n    # Pick a position by its index in your position list (=[-1, 0, 1])....usually something like : position_index = your_policy(observation)\n    position_index = env.action_space.sample() # At every timestep, pick a random position index from your position list (=[-1, 0, 1])\n    observation, reward, done, truncated, info = env.step(position_index)\n'

In [104]:
for episode in tqdm(range(n_episodes)):
    obs, info = env.reset()
    done, truncated = False, False

    while not done:
        print("step, loop")

        for i in tqdm(range(100)):
            print(0, i)

            action = agent.get_action(obs)

            print(1, i)

            next_obs, reward, terminated, truncated, info = env.step(action)

            print(2, i)

            agent.update(obs, action, reward, terminated, next_obs)

            print(3, i)
 
            done = terminated or truncated
            obs = next_obs

            print(4, i)
        
    
    agent.decay_epsilon()


  0%|          | 0/1 [00:00<?, ?it/s]

step, loop




0 0
1 0
2 0
3 0
4 0
0 1
1 1


  1%|          | 1/100 [00:25<41:55, 25.41s/it]
  0%|          | 0/1 [00:25<?, ?it/s]


KeyboardInterrupt: 