In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import datetime

from sb3_contrib import RecurrentPPO
from stable_baselines3.common.evaluation import evaluate_policy
import stable_baselines3

import gymnasium as gym
import gym_trading_env
from gym_trading_env.utils.history import History
from gym_trading_env.downloader import download
from gym_trading_env.renderer import Renderer

from stable_baselines3.common.env_checker import check_env


In [2]:
"""feature_list = []
for col in df.columns:
    feature_list.append(col)
"""

'feature_list = []\nfor col in df.columns:\n    feature_list.append(col)\n'

In [3]:
def feature_cols(df):
    df["feature_close"] = df["close"]
    df["feature_high"] = df["high"]
    df["feature_low"] = df["low"]
    df["feature_open"] = df["open"]
    df["feature_vol"] = df["vol"]

def calculate_reward_cols(df, window_size):
    prices = df["feature_close"]
    # This needs to calculate only when the model has a position.
    log_returns = prices.pct_change().apply(lambda x: np.log(1 + x))
    df["log_return"] = log_returns
    df["avg_log_return"] = df["log_return"].rolling(window=window_size).mean()
    df["avg_log_return_std"] = df["avg_log_return"].std()


df = pd.read_csv("./data/indicators.csv", parse_dates=["date"], index_col="date")
feature_cols(df)
#calculate_reward_cols(df, 30)
df.dropna(inplace=True)

print(df.shape)

(1863, 45)


In [4]:
env = gym.make(
    "TradingEnv",
    name="BTCUSD",
    df=df,
    positions=[0, 1],
    )

obs = env.reset()
obs

(array([ 3.2752628e+00,  1.4165925e+01,  4.1008701e+00,  1.6234456e+00,
        -2.6530146e+01,  4.3732853e+01,  1.7506500e+02, -5.6983795e+01,
        -2.0178050e+02, -7.5750748e+01, -1.2202590e+02, -1.4951251e+02,
         4.6771442e+01,  6.7115637e+02,  6.6474899e+01,  1.4187622e+01,
        -1.3082519e+01, -2.9601984e+01, -2.9691364e+01,  2.0186232e+01,
        -4.2739563e+00,  5.0000000e+01,  4.5869595e+01,  3.6106823e+01,
         3.2149414e+01,  2.9272116e+01,  3.6292324e+01,  3.7901905e+01,
         1.1211000e+03,  1.3767250e+03,  1.6377167e+03,  1.6452625e+03,
         2.0441687e+03,  2.0290250e+03,  1.3752109e+03,  1.5100000e+04,
         1.5267800e+04,  1.4100000e+04,  1.4917000e+04,  2.1320000e+04,
         0.0000000e+00,  0.0000000e+00], dtype=float32),
 {'idx': 0,
  'step': 0,
  'date': numpy.datetime64('2018-01-04T00:00:00.000000000'),
  'position_index': 0,
  'position': 0,
  'real_position': 0,
  'data_low': 14100.0,
  'data_open': 14917.0,
  'data_vol': 21320.0,
  'da

Running env with random action taken each step

In [19]:
done, truncated = False, False
observation, info = env.reset()
while not done and not truncated:
    # Pick a position by its index in your position list (=[-1, 0, 1])....usually something like : position_index = your_policy(observation)
    position_index = env.action_space.sample() # At every timestep, pick a random position index from your position list (=[-1, 0, 1])
    observation, reward, done, truncated, info = env.step(position_index)

Market Return : 44.32%   |   Portfolio Return : 190.86%   |   


In [23]:
model = RecurrentPPO('MlpLstmPolicy', # feed-forward neural network with multiple hidden layers
            env, # environment in which the agent interacts and learns
            verbose=1, # enables the training progress to be printed during the learning process
            gamma=0.95, # determines the importance of future rewards compared to immediate rewards
            n_steps=15, # steps to collect samples from the environment before performing an update
            ent_coef=0.01, # encourages exploration by adding entropy to the policy loss
            learning_rate=0.001, # controls the step size at which model's parameters are updated based on the gradient of the loss function
            clip_range=0.1, # limits the update to a certain range to prevent large policy updates
            batch_size=15)

model.learn(total_timesteps=10000)

Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
----------------------------
| time/              |     |
|    fps             | 107 |
|    iterations      | 1   |
|    time_elapsed    | 0   |
|    total_timesteps | 15  |
----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 48          |
|    iterations           | 2           |
|    time_elapsed         | 0           |
|    total_timesteps      | 30          |
| train/                  |             |
|    approx_kl            | 0.004607499 |
|    clip_fraction        | 0.107       |
|    clip_range           | 0.1         |
|    entropy_loss         | -0.692      |
|    explained_variance   | -10.8       |
|    learning_rate        | 0.001       |
|    loss                 | -0.0395     |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0231     |
|    value_loss           | 0.

<sb3_contrib.ppo_recurrent.ppo_recurrent.RecurrentPPO at 0x1494c97c790>

#Need to split the dataset into training and eval.

#Hourly data?

#Multi-vectored environment?

In [27]:
obs, info = env.reset()


for i in range(1000):
    action, _state = model.predict(obs)
    obs, reward, done, trauncated, info = env.step(action)
    print(info["portfolio_valuation"])

1000.0000000000001
1000.0000000000001
1000.0000000000001
1000.0000000000001
1000.0000000000001
1000.0000000000001
890.1040582027289
890.1040582027289
890.1040582027289
890.1040582027289
890.1040582027289
890.1040582027289
890.1040582027289
890.1040582027289
890.1040582027289
890.1040582027289
890.1040582027289
890.1040582027289
893.5625402284653
893.5625402284653
877.4642932350604
877.4642932350604
909.9864164670569
909.9864164670569
909.9864164670569
909.9864164670569
909.9864164670569
817.067075935288
817.067075935288
817.067075935288
726.6568277502746
726.6568277502746
726.6568277502746
719.851025551964
719.851025551964
719.851025551964
706.8379771175751
706.8379771175751
780.5061921051529
780.5061921051529
863.7760294406482
863.7760294406482
876.9966064840421
876.9966064840421
823.4473843435627
823.4473843435627
827.1564736671977
827.1564736671977
777.0159384866936
777.0159384866936
742.5116821743964
742.5116821743964
742.5116821743964
742.5116821743964
742.5116821743964
742.511682