In [4]:
from vpk_custom_functions import general_notebook_settings
from vpk_custom_functions import activate_parent
general_notebook_settings()
activate_parent('iqinvest')

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
DataFrame width maximized!
Autoreloading of functions enabled!
Working directory set to: /workspaces/iqinvest


## Sample

In [3]:
import gym
from gym import spaces
import numpy as np

class SimpleEnv(gym.Env):
    def __init__(self):
        super(SimpleEnv, self).__init__()
        
        # Define action space: The agent can choose between 0 or 1
        self.action_space = spaces.Discrete(2)
        
        # Define observation space: The state is just one number
        self.observation_space = spaces.Box(low=0, high=10, shape=(1,))
        
        # Initialize the environment
        self.state = 5  # Starting state
        
    def reset(self):
        """Reset the environment to the initial state."""
        self.state = 5
        return np.array([self.state])
    
    def step(self, action):
        """Take an action and return (new_state, reward, done, info)."""
        if action == 0:  # If action is 0, decrease the state
            self.state -= 1
        elif action == 1:  # If action is 1, increase the state
            self.state += 1
        
        # Reward: High reward for being close to 10
        reward = 10 - abs(self.state - 10)
        
        # Done: Episode ends if state goes out of bounds
        done = self.state < 0 or self.state > 20
        
        return np.array([self.state]), reward, done, {}


In [4]:
# Test the Simple Environment
env = SimpleEnv()
state = env.reset()
done = False

while not done:
    action = env.action_space.sample()  # Choose a random action
    state, reward, done, _ = env.step(action)
    print(f"State: {state}, Reward: {reward}, Done: {done}")

State: [4], Reward: 4, Done: False
State: [3], Reward: 3, Done: False
State: [4], Reward: 4, Done: False
State: [3], Reward: 3, Done: False
State: [4], Reward: 4, Done: False
State: [3], Reward: 3, Done: False
State: [4], Reward: 4, Done: False
State: [3], Reward: 3, Done: False
State: [4], Reward: 4, Done: False
State: [5], Reward: 5, Done: False
State: [4], Reward: 4, Done: False
State: [3], Reward: 3, Done: False
State: [2], Reward: 2, Done: False
State: [3], Reward: 3, Done: False
State: [2], Reward: 2, Done: False
State: [3], Reward: 3, Done: False
State: [4], Reward: 4, Done: False
State: [3], Reward: 3, Done: False
State: [2], Reward: 2, Done: False
State: [1], Reward: 1, Done: False
State: [0], Reward: 0, Done: False
State: [-1], Reward: -1, Done: True


In [5]:
type(env.action_space)

gym.spaces.discrete.Discrete

## with stocks

In [2]:
import gym
from gym import spaces
import numpy as np
import logging

# Setting up logging
logging.basicConfig(level=logging.INFO, format='%(message)s')
logger = logging.getLogger()

# Define the Portfolio Environment
class PortfolioEnv(gym.Env):
    def __init__(self, data: np.ndarray, initial_cash: float = 10000.0, transaction_fee: float = 0.001):
        """
        Initialize the portfolio environment.
        :param data: Historical asset price data (numpy array).
        :param initial_cash: Starting capital.
        :param transaction_fee: Transaction fee percentage.
        """
        super(PortfolioEnv, self).__init__()
        
        self.data = data
        self.initial_cash = initial_cash
        self.num_assets = data.shape[1]
        self.transaction_fee = transaction_fee
        
        # Define state space and action space
        self.observation_space = spaces.Box(low=0, high=np.inf, shape=(self.num_assets + 2,))
        self.action_space = spaces.Box(low=0, high=1, shape=(self.num_assets,))
        
        self.reset()

    def reset(self):
        """
        Reset the environment to the initial state.
        """
        self.current_step = 0
        self.cash = self.initial_cash
        self.portfolio = np.zeros(self.num_assets)
        logger.info(f"Environment reset. Initial cash: {self.cash}, Portfolio: {self.portfolio}")
        return self._get_observation()

    def step(self, action: np.ndarray):
        """
        Take an action in the environment.
        :param action: Allocation proportions for each asset.
        """
        
        # Normalize action to ensure it sums to 1
        action = action / (np.sum(action) + 1e-8)  # Add small epsilon to avoid division by zero
        logger.info(f"Step {self.current_step}: Taking action: {[f'{x:.3f}' for x in action]}")
        
        # Calculate portfolio rebalancing
        current_prices = self.data[self.current_step]
        logger.info(f"current prices: {[f'{x:.3f}' for x in current_prices]}")

        portfolio_value = np.dot(self.portfolio, current_prices) + self.cash
        new_portfolio = portfolio_value * action / current_prices

        transaction_costs = np.sum(np.abs(new_portfolio - self.portfolio)) * self.transaction_fee  # Transaction fee
        
        reward = portfolio_value - transaction_costs - self.cash
        
        # Update state
        self.cash = portfolio_value - np.sum(new_portfolio * current_prices)
        self.portfolio = new_portfolio
        logger.info(f"New portfolio: {[f'{x:.3f}' for x in self.portfolio]}, Remaining cash: {self.cash: .3f}, Reward: {reward: .3f}")

        self.current_step += 1
        done = self.current_step >= len(self.data) - 1
        return self._get_observation(), reward, done, {}

    def _get_observation(self):
        """
        Generate the current observation.
        """
        current_prices = self.data[self.current_step]
        portfolio_value = np.dot(self.portfolio, current_prices)
        return np.concatenate([current_prices, [portfolio_value, self.cash]])

In [3]:
# # Simulating Sample Data
# np.random.seed(42)
# sample_data = np.random.uniform(low=50, high=150, size=(10, 5))

# # Running the environment
# env = PortfolioEnv(data=sample_data, initial_cash=10000.0)
# state = env.reset()

# for _ in range(len(sample_data)):
#     logging.info("-------------------")
#     # Sample a random action (proportions for each asset)
#     action = env.action_space.sample()
#     state, reward, done, _ = env.step(action)
#     if done:
#         logger.info("End of the episode reached.")
#         break

# PPO

In [7]:
import numpy as np
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv

# Simulated historical price data (rows: timesteps, columns: assets)
# Replace this with your actual historical data
data = np.random.uniform(low=10, high=100, size=(500, 3))  # 500 timesteps, 3 assets

# Initialize the environment
env = PortfolioEnv(data=data, initial_cash=10000.0, transaction_fee=0.001)

# Wrap the environment in DummyVecEnv for compatibility with stable-baselines3
env = DummyVecEnv([lambda: env])

# Define the PPO model
model = PPO(
    "MlpPolicy",  # Multi-Layer Perceptron policy
    env,          # Portfolio environment
    verbose=1,    # Log training process
    tensorboard_log="./ppo_portfolio_tensorboard/"  # TensorBoard log directory
)

# Train the model
model.learn(total_timesteps=100000)  # Adjust timesteps based on the size of your dataset

# Save the trained model
model.save("ppo_portfolio_model")

# Test the model
obs = env.reset()
for _ in range(200):  # Test for 200 timesteps
    action, _ = model.predict(obs)  # Use the trained model to predict actions
    obs, reward, done, _ = env.step(action)
    if done:
        break


Environment reset. Initial cash: 10000.0, Portfolio: [0. 0. 0.]
Environment reset. Initial cash: 10000.0, Portfolio: [0. 0. 0.]


Using cpu device


ImportError: Trying to log data to tensorboard but tensorboard is not installed.

In [8]:
! pip list | grep tensorboard

tensorboard               2.18.0
tensorboard-data-server   0.7.2


In [11]:
! mkdir ppo_portfolio_tensorboard