In [1]:
!pip install stable-baselines3 torch

Collecting stable-baselines3
  Downloading stable_baselines3-2.5.0-py3-none-any.whl.metadata (4.8 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (

In [2]:
import pandas as pd
import numpy as np
import os

In [3]:
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM
from tensorflow.keras.models import load_model
from sklearn.metrics import mean_squared_error, mean_absolute_error
import math

In [4]:
import pandas as pd

def get_forex_data(csv_path='/content/Foreign_Exchange_Rates.csv'):
    """Load the exchange-rate CSV and return the three columns the environment uses.

    Args:
        csv_path: Location of the CSV file. Cells marked ``ND`` are read as
            missing values. Defaults to the original Colab path.

    Returns:
        A new DataFrame with columns ``'EURO AREA - EURO/US$'``,
        ``'UNITED KINGDOM - UNITED KINGDOM POUND/US$'`` and ``'YEN/EURO'``.
        Gaps in the two source columns are filled by pandas' default
        (linear, forward-direction) interpolation, so missing values at the
        very start of the file remain NaN.
    """
    eur_col = 'EURO AREA - EURO/US$'
    gbp_col = 'UNITED KINGDOM - UNITED KINGDOM POUND/US$'

    data_set = pd.read_csv(csv_path, na_values='ND')

    # Interpolate only the two rate columns that are actually used. Running
    # interpolate() over the whole frame also touches object-dtype columns
    # (e.g. a date column), which recent pandas versions deprecate/reject.
    df = data_set[[eur_col, gbp_col]].infer_objects().interpolate()

    # Derived cross rate. NOTE(review): the column is named 'YEN/EURO' but it
    # is computed as pound-column / euro-column; the name is kept because
    # State.step looks the column up by this exact label.
    df['YEN/EURO'] = df[gbp_col] / df[eur_col]

    return df

In [5]:
import gymnasium as gym
from gymnasium import Env
from gymnasium.spaces import Discrete, Box


class State:
    """Trading state: a price table, a cursor into it, and a small portfolio.

    The price table is expected to contain the columns
    'EURO AREA - EURO/US$', 'UNITED KINGDOM - UNITED KINGDOM POUND/US$' and
    'YEN/EURO'.

    NOTE(review): although the portfolio key and local variables are called
    'EUR' / ``eur_usd``, ``step`` unpacks the *second* of those columns (the
    UK pound column) as the traded rate, and ``encode`` observes the same
    column. The behaviour is kept as-is; confirm which pair is intended.
    """

    def __init__(self, bars_count=30, prediction_count=4):
        # prediction_count is accepted for interface compatibility; it is not used.
        self.bars_count = bars_count
        self._prices = None
        self._first_diff = None
        self._offset = None
        self.balance = None
        self.portfolio = None
        self.trade_max_percentage = None

    def reset(self, prices, offset, initial_balance, trade_max_percentage=0.2):
        """Start a new episode at row ``offset`` of ``prices``.

        Args:
            prices: DataFrame of rates (see class docstring for columns).
            offset: Starting row index; must be >= bars_count - 1 and leave at
                least one following row, because step() reads row offset + 1.
            initial_balance: Starting USD holding (also the starting balance).
            trade_max_percentage: Fraction of the current balance that a
                single action of magnitude 1 may trade.

        Raises:
            AssertionError: If ``offset`` is outside the usable range.
        """
        assert offset >= self.bars_count - 1, "Offset must allow for sufficient historical data"
        assert offset < len(prices) - 1, "Offset must leave at least one future row for step()"
        self._prices = prices

        # Column-wise min-max normalisation of the first differences.
        # Row 0 of the differences is NaN by construction.
        first_differences = prices.diff()
        diff_min = first_differences.min()
        diff_max = first_differences.max()
        self._first_diff = (first_differences - diff_min) / (diff_max - diff_min)

        self._offset = offset
        self.balance = initial_balance
        self.trade_max_percentage = trade_max_percentage
        self.portfolio = {'USD': initial_balance, 'EUR': 0, 'JPY': 0}

    def step(self, action, trade_percentage, reward_type="Direct"):
        """Apply one trade, advance the cursor by one row, and compute the reward.

        Args:
            action: Indexable whose first element is the trade signal
                (the environment's action space bounds it to [-1, 1]).
                Positive converts USD into the foreign holding, negative
                converts back, zero does nothing.
            trade_percentage: Accepted for interface compatibility; not used.
            reward_type: Only "Direct" (log return of the portfolio value)
                yields a non-zero reward; anything else gives reward 0.

        Returns:
            (reward, done, info) where info holds the new balance and a
            snapshot of the portfolio.
        """
        reward = 0
        columns = ['EURO AREA - EURO/US$', 'UNITED KINGDOM - UNITED KINGDOM POUND/US$', 'YEN/EURO']
        current_price = self._prices.iloc[self._offset][columns].values
        next_price = self._prices.iloc[self._offset + 1][columns].values
        # Positional unpacking: the middle entry (second column) is the traded rate.
        _, eur_usd, _ = current_price
        _, eur_usd_1, _ = next_price
        max_trade_amount = self.balance * self.trade_max_percentage

        # Portfolio value in USD before the trade, at the current rate.
        portfolio_value = (self.portfolio['USD'] + self.portfolio['EUR'] / eur_usd)
        if action[0] > 0:
            # Buy: spend at most the available USD.
            trade_amount = abs(max_trade_amount * action[0])
            trade_volume = min(self.portfolio['USD'], trade_amount)

            self.portfolio['EUR'] += trade_volume * eur_usd
            self.portfolio['USD'] -= trade_volume

        elif action[0] < 0:
            # Sell: trade_volume is in foreign units, capped by the holding.
            trade_amount = abs(max_trade_amount * action[0])
            trade_volume = min(self.portfolio['EUR'], trade_amount * eur_usd)

            self.portfolio['USD'] += trade_volume / eur_usd
            self.portfolio['EUR'] -= trade_volume

        # Portfolio value in USD after the trade, at the next row's rate.
        portfolio_value_1 = (self.portfolio['USD'] + self.portfolio['EUR'] / eur_usd_1)

        if reward_type == "Direct":
            if portfolio_value_1 / portfolio_value <= 0:
                print(f"ERROR!!! { portfolio_value_1}/{portfolio_value}")
                print(f"portfolio_value = {self.portfolio['USD']} + {self.portfolio['EUR']} / {eur_usd} = {portfolio_value}")
                print(f"portfolio_value_1 = {self.portfolio['USD']} + {self.portfolio['EUR']} / {eur_usd_1} = {portfolio_value_1}")
                reward = 0

            else:
                reward = math.log(portfolio_value_1 / portfolio_value)

        self.balance = portfolio_value_1
        self._offset += 1
        done = self._offset >= len(self._prices) - 2

        info = {
            "balance": self.balance,
            # Snapshot, not the live dict: later steps mutate self.portfolio in
            # place, which would silently rewrite infos a caller has stored.
            "portfolio": dict(self.portfolio),
        }

        return reward, done, info

    def encode(self):
        """Return the observation as a float32 vector of length 3.

        Layout: [normalised first difference of the UK pound column at the
        cursor, USD holding / balance, 'EUR' holding / balance].

        NOTE(review): the 'EUR' holding is in foreign units while balance is
        in USD, so the last two entries are not guaranteed to sum to 1.
        """
        current_prices = self._first_diff.iloc[self._offset]
        encoded_prices = np.array(current_prices[['UNITED KINGDOM - UNITED KINGDOM POUND/US$']]).flatten()
        portfolio_fraction = np.array([self.portfolio['USD'], self.portfolio['EUR']]) / self.balance
        encoded_features = np.concatenate([
            encoded_prices,
            portfolio_fraction,
        ])
        return encoded_features.astype(np.float32)

    @property
    def shape(self):
        # One price feature plus two portfolio entries (see encode()).
        return (3,)

In [6]:
from gymnasium.spaces import Discrete, Box
class ForexTradingEnv(Env):
    """Gymnasium environment that wraps ``State`` over a price DataFrame.

    Action space: one float in [-1, 1] (sign selects buy/sell, magnitude
    scales the trade). Observation space: the 3-vector produced by
    ``State.encode``.

    NOTE(review): the observation space is declared as [0, 1]; the portfolio
    entries of the observation are not clipped, so confirm they stay in range.
    """

    def __init__(self, df, initial_balance=1000, bars_count=30):
        super().__init__()
        self.df = df
        self.initial_balance = initial_balance
        self.bars_count = bars_count
        self.state = State(bars_count=self.bars_count)
        self.action_space = Box(low=-1, high=1, shape=(1,), dtype=np.float32)
        self.observation_space = Box(
            low=0, high=1, shape=self.state.shape, dtype=np.float32
        )

    def seed(self, seed):
        """Legacy seeding hook: seeds the global NumPy RNG and this env's own RNG."""
        np.random.seed(seed)
        # reset() draws from self.np_random, so reseed it too to keep this
        # method effective for callers that still use it.
        self.np_random = np.random.default_rng(seed)

    def reset(self, seed=None, options=None):
        """Start a new episode at a random row and return (observation, info).

        Args:
            seed: Forwarded to ``Env.reset`` to (re)seed ``self.np_random``;
                with the same seed the same starting offset is drawn.
            options: Accepted for Gymnasium API compatibility; unused.
        """
        super().reset(seed=seed)
        # Draw from the environment's own generator so that `seed` is honoured.
        # The upper bound is exclusive, so the largest offset is len(df) - 2
        # and State.step can always read the following row.
        offset = int(self.np_random.integers(self.bars_count - 1, len(self.df) - 1))
        self.state.reset(prices=self.df, offset=offset, initial_balance=self.initial_balance)
        return self.state.encode(), {}

    def step(self, action):
        """Advance one row; returns (observation, reward, terminated, truncated, info)."""
        reward, terminated, info = self.state.step(action, trade_percentage=1)
        truncated = self.state._offset >= len(self.df) - 1
        observation = self.state.encode()
        return observation, reward, terminated, truncated, info

    def render(self, mode='human'):
        """Print the current step index, portfolio and balance."""
        if mode != 'human':
            raise NotImplementedError("Only 'human' rendering mode is supported.")
        print(f"Step: {self.state._offset}")
        print(f"Portfolio: {self.state.portfolio}")
        print(f"Balance: {self.state.balance}")

In [15]:
!pip install sb3-contrib torch




Collecting sb3-contrib
  Downloading sb3_contrib-2.5.0-py3-none-any.whl.metadata (4.1 kB)
Downloading sb3_contrib-2.5.0-py3-none-any.whl (92 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.8/92.8 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sb3-contrib
Successfully installed sb3-contrib-2.5.0


In [23]:

# import os
# from stable_baselines3 import PPO
# from stable_baselines3.common.env_util import make_vec_env
# from stable_baselines3.common.vec_env import VecNormalize
# import torch
# import warnings

# warnings.simplefilter(action="ignore", category=FutureWarning)
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# # Set up the PPO model
# data = get_forex_data()
# kwargs = {"df": data}
# envs = make_vec_env(ForexTradingEnv, n_envs=4, env_kwargs=kwargs)
# envs = VecNormalize(envs, norm_obs=True, norm_reward=True, clip_obs=10.0)

# model_path = "/content/output/ppo_forex.zip"  # Ensure the correct file extension
# log_dir = "/content/output/ppo_logs/"

# # Load the existing model if it exists, otherwise create a new one
# if os.path.exists(model_path):
#     print(f"Loading existing model from {model_path}...")
#     model = PPO.load(model_path, env=envs, device=device, tensorboard_log=log_dir)
# else:
#     print("No existing model found. Training from scratch...")
#     model = PPO("MlpPolicy", envs, verbose=1, device=device)

# # Train the model
# try:
#     model.learn(total_timesteps=10_000_000)  # Adjust as needed
# except KeyboardInterrupt:
#     model.save(model_path)
#     print(f"Training interrupted. Model saved to {model_path}")

# # Save the model after training
# model.save(model_path)
# print(f"Model saved to {model_path}")

import os
from sb3_contrib import RecurrentPPO
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.vec_env import VecNormalize
import torch
import warnings

warnings.simplefilter(action="ignore", category=FutureWarning)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model_path = "/content/output/rppo_forex.zip"  # Updated model path for R-PPO
# Running observation/reward statistics of VecNormalize live outside the model
# zip, so they get their own file next to it.
vecnormalize_path = "/content/output/rppo_forex_vecnormalize.pkl"
log_dir = "/content/output/rppo_logs/"
os.makedirs(os.path.dirname(model_path), exist_ok=True)

# Set up the Recurrent PPO model (with LSTM Policy)
data = get_forex_data()
kwargs = {"df": data}
envs = make_vec_env(ForexTradingEnv, n_envs=4, env_kwargs=kwargs)
if os.path.exists(vecnormalize_path):
    # Restore the statistics the saved policy was trained with; starting from
    # fresh statistics would feed a reloaded model differently scaled inputs.
    envs = VecNormalize.load(vecnormalize_path, envs)
else:
    envs = VecNormalize(envs, norm_obs=True, norm_reward=True, clip_obs=10.0)

# Load the existing model if it exists, otherwise create a new one
if os.path.exists(model_path):
    print(f"Loading existing model from {model_path}...")
    model = RecurrentPPO.load(model_path, env=envs, device=device, tensorboard_log=log_dir)
else:
    print("No existing model found. Training from scratch...")
    # Log to the same directory as the load branch so both paths behave alike.
    model = RecurrentPPO(
        "MlpLstmPolicy", envs, verbose=1, device=device, n_steps=256,
        tensorboard_log=log_dir,
    )  # Using LSTM Policy

# Train the model; a manual interrupt still falls through to the save below.
interrupted = False
try:
    model.learn(total_timesteps=10_000_000)  # Adjust as needed
except KeyboardInterrupt:
    interrupted = True

# Save the model and the normalisation statistics together
model.save(model_path)
envs.save(vecnormalize_path)
if interrupted:
    print(f"Training interrupted. Model saved to {model_path}")
print(f"Model saved to {model_path}")


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
|    policy_gradient_loss | -0.00918  |
|    std                  | 0.0332    |
|    value_loss           | 0.00545   |
---------------------------------------
---------------------------------------
| rollout/                |           |
|    ep_len_mean          | 2.63e+03  |
|    ep_rew_mean          | 0.247     |
| time/                   |           |
|    fps                  | 142       |
|    iterations           | 2396      |
|    time_elapsed         | 17252     |
|    total_timesteps      | 2453504   |
| train/                  |           |
|    approx_kl            | 1.0793734 |
|    clip_fraction        | 0.573     |
|    clip_range           | 0.2       |
|    entropy_loss         | 1.98      |
|    explained_variance   | 0.938     |
|    learning_rate        | 0.0003    |
|    loss                 | -0.016    |
|    n_updates            | 28730     |
|    policy_gradient_loss | 0.0254    |
|    std       

In [17]:
# Colab-only cell: mounts Google Drive at /content/drive.
# NOTE(review): the training cell saves under /content/output, not under the
# mounted Drive path — confirm whether checkpoints are meant to be persisted there.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [9]:
# obs = envs.reset()
# while True:
#     action, _states = model.predict(obs)
#     print(action)
#     obs, rewards, dones, info = envs.step(action)
#     if dones.any():
#         break  # Exit loop when an episode ends
#     # Render or log as needed
#     # print(rewards)
# print("Evaluation complete!")

  and should_run_async(code)


In [10]:
# import matplotlib.pyplot as plt
# import numpy as np

# portfolio_history = [
#     {"USD": [], "EUR": [], "BLC": []} for _ in range(envs.num_envs)
# ]

# # Reset the environments
# obs = envs.reset()
# while True:
#     # Get the predicted action from the model
#     action, _states = model.predict(obs)
#     # Perform the action in the environment
#     obs, rewards, dones, infos = envs.step(action)

#     # Append the portfolio values for each environment
#     for i, info in enumerate(infos):
#         portfolio = info["portfolio"]
#         portfolio_history[i]["USD"].append(portfolio.get("USD"))
#         portfolio_history[i]["EUR"].append(portfolio.get("EUR"))
#         portfolio_history[i]["BLC"].append(info["balance"])

#     # Break the loop when an episode ends
#     if dones.any():
#         break

# print("Evaluation complete!")

# # Plot the portfolio progression for each environment
# for i in range(envs.num_envs):
#     plt.figure(figsize=(10, 6))
#     plt.plot(portfolio_history[i]["USD"], label="USD", color="blue")
#     plt.plot(portfolio_history[i]["EUR"], label="EUR", color="green")
#     plt.plot(portfolio_history[i]["BLC"], label="BLC", color="red")
#     plt.title(f"Portfolio Progression in Environment {i + 1}")
#     plt.xlabel("Steps")
#     plt.ylabel("Portfolio Value")
#     plt.legend()
#     plt.grid()
#     plt.show()

In [2]:

# import matplotlib.pyplot as plt
# import numpy as np

# portfolio_history = [
#     {"USD": [], "EUR": [], "BLC": []} for _ in range(envs.num_envs)
# ]

# # Reset the environments
# obs = envs.reset()
# while True:
#     # Get the predicted action from the model
#     action, _states = model.predict(obs)
#     # Perform the action in the environment
#     obs, rewards, dones, infos = envs.step(action)

#     # Append the portfolio values for each environment
#     for i, info in enumerate(infos):
#         portfolio = info["portfolio"]
#         portfolio_history[i]["USD"].append(portfolio.get("USD"))
#         portfolio_history[i]["EUR"].append(portfolio.get("EUR"))
#         portfolio_history[i]["BLC"].append(info["balance"])

#     # Break the loop when an episode ends
#     if dones.any():
#         break

# print("Evaluation complete!")

# # Plot the portfolio progression for each environment (USD and EUR only)
# for i in range(envs.num_envs):
#     plt.figure(figsize=(10, 6))
#     plt.plot(portfolio_history[i]["USD"], label="USD", color="blue")
#     plt.plot(portfolio_history[i]["EUR"], label="EUR", color="green")
#     plt.title(f"Portfolio Progression in Environment {i + 1}")
#     plt.xlabel("Steps")
#     plt.ylabel("Portfolio Value")
#     plt.legend()
#     plt.grid()
#     plt.show()

# # Plot the balance (BLC) separately
# for i in range(envs.num_envs):
#     plt.figure(figsize=(10, 6))
#     plt.plot(portfolio_history[i]["BLC"], label="Balance (BLC)", color="red")
#     plt.title(f"Balance Progression in Environment {i + 1}")
#     plt.xlabel("Steps")
#     plt.ylabel("Balance Value")
#     plt.legend()
#     plt.grid()
#     plt.show()

import matplotlib.pyplot as plt
import numpy as np

# This cell depends on `envs` and `model` created by the training cell; on a
# fresh kernel run that cell first (otherwise `envs` is undefined).


def _plot_history(series_by_label, title, ylabel):
    """Draw one figure with a line per (label, colour, values) entry."""
    plt.figure(figsize=(10, 6))
    for label, color, values in series_by_label:
        plt.plot(values, label=label, color=color)
    plt.title(title)
    plt.xlabel("Steps")
    plt.ylabel(ylabel)
    plt.legend()
    plt.grid()
    plt.show()


portfolio_history = [
    {"USD": [], "EUR": [], "BLC": []} for _ in range(envs.num_envs)
]

# Freeze the VecNormalize wrapper while evaluating: do not update the running
# statistics and report raw rewards. The previous flags are restored afterwards
# so that training can be resumed with the same wrapper.
previous_training, previous_norm_reward = envs.training, envs.norm_reward
envs.training = False
envs.norm_reward = False
try:
    # Reset the environments
    obs = envs.reset()

    # Initialize LSTM states and episode start signals
    lstm_states = None
    episode_starts = np.ones((envs.num_envs,), dtype=bool)

    # Main loop for evaluation
    while True:
        # Get the predicted action from the model
        action, lstm_states = model.predict(
            obs, state=lstm_states, episode_start=episode_starts, deterministic=True
        )

        # Perform the action in the environment
        obs, rewards, dones, infos = envs.step(action)

        # Append the portfolio values for each environment
        for i, info in enumerate(infos):
            portfolio = info["portfolio"]
            portfolio_history[i]["USD"].append(portfolio.get("USD"))
            portfolio_history[i]["EUR"].append(portfolio.get("EUR"))
            portfolio_history[i]["BLC"].append(info["balance"])

        # Update episode start signals (reset LSTM states for new episodes)
        episode_starts = dones

        # Stop as soon as ANY environment finishes its episode
        if dones.any():
            break
finally:
    envs.training = previous_training
    envs.norm_reward = previous_norm_reward

print("Evaluation complete!")

# Plot the portfolio progression for each environment (USD and EUR only)
for i in range(envs.num_envs):
    _plot_history(
        [
            ("USD", "blue", portfolio_history[i]["USD"]),
            ("EUR", "green", portfolio_history[i]["EUR"]),
        ],
        title=f"Portfolio Progression in Environment {i + 1}",
        ylabel="Portfolio Value",
    )

# Plot the balance (BLC) separately
for i in range(envs.num_envs):
    _plot_history(
        [("Balance (BLC)", "red", portfolio_history[i]["BLC"])],
        title=f"Balance Progression in Environment {i + 1}",
        ylabel="Balance Value",
    )



NameError: name 'envs' is not defined