In [None]:
!pip install sb3-contrib torch

In [2]:
import pandas as pd
import numpy as np
import os
import torch as th
import torch.nn as nn
from stable_baselines3.common.torch_layers import BaseFeaturesExtractor
from gymnasium import spaces

In [3]:
def get_forex_data(csv_path='/kaggle/input/forex-dataset/Foreign_Exchange_Rates.csv'):
    """Load the EUR and JPY rate columns and derive the YEN/EURO cross rate.

    Args:
        csv_path: Location of the exchange-rate CSV. It must contain the
            columns 'EURO AREA - EURO/US$' and 'JAPAN - YEN/US$'. Cells equal
            to 'ND' are read as missing values.

    Returns:
        A new DataFrame with three columns: the two rate columns (gaps filled
        by linear interpolation) and 'YEN/EURO', computed as the yen column
        divided by the euro column. Leading missing values, which have no
        earlier point to interpolate from, stay NaN.

    Raises:
        KeyError: if one of the two required columns is absent.
    """
    eur_col = 'EURO AREA - EURO/US$'
    jpy_col = 'JAPAN - YEN/US$'

    raw = pd.read_csv(csv_path, na_values='ND')

    # Select the rate columns *before* interpolating: handing text columns
    # (dates, labels) to interpolate() is deprecated in recent pandas, and the
    # per-column result for the rates is the same either way.
    # to_numeric(errors='coerce') turns any other stray marker into NaN so it
    # is interpolated like 'ND' instead of leaving the column as object dtype.
    rates = raw[[eur_col, jpy_col]].apply(pd.to_numeric, errors='coerce')
    rates = rates.interpolate()

    rates['YEN/EURO'] = rates[jpy_col] / rates[eur_col]
    return rates


  and should_run_async(code)


In [4]:
import gymnasium as gym
from gymnasium import Env
from gymnasium.spaces import Discrete, Box
import math


class State:
    """Bookkeeping for one trading episode: price cursor, holdings and observation encoding.

    Prices come from a DataFrame holding the columns in ``PRICE_COLUMNS``.
    The trading arithmetic treats the two rate columns as "units of foreign
    currency per 1 USD": a purchase converts ``usd * rate`` and holdings are
    valued as ``units / rate``.
    """

    PRICE_COLUMNS = ['EURO AREA - EURO/US$', 'JAPAN - YEN/US$', 'YEN/EURO']
    # Reward for a leg whose requested trade could not move any funds.
    EMPTY_TRADE_PENALTY = -10

    def __init__(self):
        self._prices = None
        self._first_diff = None
        self._offset = None
        self.balance = None
        self.portfolio = None
        self.euro_buy_value = None
        self.yen_buy_value = None
        self.trade_max_percentage = None

    def reset(self, prices, offset, initial_balance, trade_max_percentage ):
        """Start a new episode at row ``offset`` of ``prices`` with all funds in USD.

        Args:
            prices: DataFrame containing the columns in ``PRICE_COLUMNS``.
            offset: Row index of the first step.
            initial_balance: Starting USD amount.
            trade_max_percentage: Fraction of the balance a single leg may trade per step.
        """
        self._prices = prices
        first_differences = prices.diff()
        # Min-max normalize the first differences. The min/max are taken over
        # the whole frame passed in, i.e. over every row, not just past ones.
        self._first_diff =  (first_differences - first_differences.min()) / (first_differences.max() - first_differences.min())
        self._offset = offset
        self.balance = initial_balance
        self.trade_max_percentage = trade_max_percentage
        self.portfolio = {'USD': initial_balance, 'EUR': 0, 'JPY':0}
        self.euro_buy_value = 0
        self.yen_buy_value = 0

    def step(self, action, reward_type = "Direct"):
        """Apply one action, advance the cursor by one row and return ``(reward, done)``.

        Args:
            action: Sequence with at least two numbers in [-1, 1].
                ``action[0]`` trades USD<->EUR and ``action[1]`` trades
                USD<->JPY; positive buys the foreign currency, negative sells
                it, zero skips the leg. The EUR leg is executed first, so it
                changes the USD available to the JPY leg.
            reward_type: Accepted for backward compatibility. The returned
                reward has never depended on it.

        Returns:
            reward: Sum of the shaping rewards of both legs. A leg earns a
                positive reward when the next row's rate moves in favour of
                the trade just made (the bought currency gains value against
                USD, or the sold currency loses value), scaled by 10 for EUR
                and 10/100 for JPY. A leg that moves no funds earns
                ``EMPTY_TRADE_PENALTY``.
            done: True once the cursor reaches ``len(prices) - 2``.
        """
        eur_usd, jpy_usd, _ = self._prices.iloc[self._offset][self.PRICE_COLUMNS].values
        eur_usd_1, jpy_usd_1, _ = self._prices.iloc[self._offset + 1][self.PRICE_COLUMNS].values
        max_trade_amount = self.balance * self.trade_max_percentage

        # Both legs contribute to the reward; previously the JPY leg replaced
        # the EUR leg's value whenever action[1] was non-zero.
        eur_reward, eur_cost_delta = self._trade_leg(
            'EUR', action[0], max_trade_amount, eur_usd, eur_usd_1, 10)
        self.euro_buy_value += eur_cost_delta

        jpy_reward, jpy_cost_delta = self._trade_leg(
            'JPY', action[1], max_trade_amount, jpy_usd, jpy_usd_1, 10 / 100)
        self.yen_buy_value += jpy_cost_delta

        # Mark the portfolio to market in USD at the current (pre-move) rates.
        self.balance = (self.portfolio['USD']
                        + self.portfolio['EUR'] / eur_usd
                        + self.portfolio['JPY'] / jpy_usd)

        self._offset += 1
        done = self._offset >= len(self._prices) - 2
        return eur_reward + jpy_reward, done

    def _trade_leg(self, currency, signal, max_trade_amount, rate, next_rate, scale):
        """Execute one USD<->``currency`` trade and return ``(reward, usd_cost_basis_delta)``."""
        if signal > 0:
            # Buy: spend USD, capped by the cash actually available.
            usd_spent = min(self.portfolio['USD'], abs(max_trade_amount * signal))
            self.portfolio[currency] += usd_spent * rate
            self.portfolio['USD'] -= usd_spent
            if usd_spent == 0:
                return self.EMPTY_TRADE_PENALTY, 0
            # rate is foreign units per USD, so the position gains USD value
            # when the rate falls (units / next_rate > units / rate).
            return (rate - next_rate) * scale, usd_spent
        if signal < 0:
            # Sell: convert foreign units back to USD, capped by the holding.
            units_sold = min(self.portfolio[currency], abs(max_trade_amount * signal) * rate)
            self.portfolio['USD'] += units_sold / rate
            self.portfolio[currency] -= units_sold
            if units_sold == 0:
                return self.EMPTY_TRADE_PENALTY, 0
            # Selling pays off when the currency is worth less afterwards,
            # i.e. when the rate rises.
            return (next_rate - rate) * scale, -(units_sold / rate)
        # Zero (or NaN) signal: the leg is skipped.
        return 0, 0

    def encode(self):
        """Return the 6-element observation for the current row.

        The first three values are the normalized first differences of the
        three price columns at the cursor. The last three are the USD, EUR and
        JPY holdings divided by ``balance``.
        NOTE(review): EUR and JPY holdings are in their own units while
        ``balance`` is in USD, so these are ratios rather than true portfolio
        fractions; confirm this is intended.
        """
        current_prices = self._first_diff.iloc[self._offset]
        encoded_prices = np.array(current_prices[self.PRICE_COLUMNS]).flatten()
        portfolio_fraction = np.array([self.portfolio['USD'],self.portfolio['EUR'],self.portfolio['JPY']])/self.balance

        # Combine all features into a single array
        encoded_features = np.concatenate([
            encoded_prices,
            portfolio_fraction,
            # [self.euro_buy_value, self.yen_buy_value/100, self.trade_max_percentage]
        ])
        return encoded_features

    @property
    def shape(self):
        """Shape of the array returned by ``encode``."""
        return (3+3,)  # 3 normalized price differences + 3 portfolio ratios


In [5]:
class ForexTradingEnv(Env):
    """Single-step forex environment built on ``State``.

    Actions are two floats in [-1, 1] (EUR leg, JPY leg). Observations are the
    6-element vector produced by ``State.encode``.

    Note that ``reset`` deviates from the standard Gymnasium signature: it
    takes ``sequence_length`` and returns only the observation. It is meant to
    be driven by a wrapper that supplies that argument.
    """

    def __init__(self, df, initial_balance=1000):
        super(ForexTradingEnv, self).__init__()
        self.df = df
        self.initial_balance = initial_balance
        self.state = State()
        # One action component per traded currency pair.
        self.action_space = spaces.Box(low=-1, high=1, shape=(2,), dtype=np.float32)
        self.observation_space = Box(
            low=0, high=np.inf, shape=self.state.shape, dtype=np.float32
        )

    def seed(self, seed):
        """Seed the global NumPy generator and this environment's own generator."""
        np.random.seed(seed)
        # Also seed self.np_random, which reset() draws the start offset from.
        super().reset(seed=seed)

    def reset(self,sequence_length, seed=None):
        """Begin a new episode at a random row and return the first observation.

        Args:
            sequence_length: Smallest allowed start row.
            seed: Optional seed. When given, the start row is reproducible.

        Returns:
            The initial observation (no info dict).

        Raises:
            ValueError: if the data has too few rows for the requested
                ``sequence_length``.
        """
        super().reset(seed=seed)
        upper = len(self.df) - 2  # exclusive; leaves room for the look-ahead row in State.step
        if upper <= sequence_length:
            raise ValueError(
                f"Need more than {sequence_length + 2} rows to start an episode, got {len(self.df)}"
            )
        # Draw from the generator seeded by super().reset() so `seed` is honoured.
        offset = int(self.np_random.integers(sequence_length, upper))
        self.state.reset(prices=self.df, offset=offset, initial_balance=self.initial_balance, trade_max_percentage= 0.2)
        return self._observation()

    def step(self, action):
        """Advance one row; returns ``(observation, reward, terminated, truncated, info)``."""
        reward, terminated = self.state.step(action)
        truncated = self.state._offset >= len(self.df) - 1
        observation = self._observation()
        info = {
            "balance": self.state.balance,
            # Copy so stored infos are not changed by later trades.
            "portfolio": dict(self.state.portfolio),
        }
        return observation, reward, terminated, truncated, info

    def _observation(self):
        """Encode the state using the dtype declared by ``observation_space``."""
        return np.asarray(self.state.encode(), dtype=self.observation_space.dtype)

    def render(self, mode='human'):
        """Print the current step, holdings and balance."""
        if mode != 'human':
            raise NotImplementedError("Only 'human' rendering mode is supported.")
        print(f"Step: {self.state._offset}")
        print(f"Portfolio: {self.state.portfolio}")
        print(f"Balance: {self.state.balance}")

In [6]:
class SequenceEnvironment(gym.Env):
    """Wraps ``ForexTradingEnv`` so each observation is a window of the last ``sequence_length`` observations.

    The observation has shape ``(sequence_length, n_features)``; the action
    space is that of the wrapped environment.
    """

    def __init__(self, df, sequence_length):
        super(SequenceEnvironment, self).__init__()
        self.original_env = ForexTradingEnv(df)
        self.sequence_length = sequence_length
        self.buffer = []
        self.observation_space = spaces.Box(
            low=np.repeat(self.original_env.observation_space.low[None, :], sequence_length, axis=0),
            high=np.repeat(self.original_env.observation_space.high[None, :], sequence_length, axis=0),
            dtype=self.original_env.observation_space.dtype
        )
        self.action_space = self.original_env.action_space

    def reset(self,seed=None, options=None):
        """Reset the wrapped environment and return ``(stacked_observation, info)``.

        Args:
            seed: Forwarded to the wrapped environment so the episode start
                can be reproduced.
            options: Accepted for Gymnasium API compatibility; unused.
        """
        super().reset(seed=seed)
        obs = self.original_env.reset(self.sequence_length, seed=seed)
        # No history exists yet, so the window starts as copies of the first observation.
        self.buffer = [obs] * self.sequence_length
        return self._stacked(), {}

    def step(self, action):
        """Step the wrapped environment and slide the window forward by one observation."""
        obs, reward, terminated, truncated, info = self.original_env.step(action)
        self.buffer.pop(0)
        self.buffer.append(obs)
        return self._stacked(), reward, terminated, truncated, info

    def _stacked(self):
        """Stack the window using the dtype declared by ``observation_space``."""
        return np.array(self.buffer, dtype=self.observation_space.dtype)

    def get_balance(self):
        """Return the wrapped environment's current balance."""
        return self.original_env.state.balance
                    


In [7]:
import torch as th
import torch.nn as nn
from stable_baselines3.common.torch_layers import BaseFeaturesExtractor
from gymnasium import spaces

class LSTMFeatureExtractor(BaseFeaturesExtractor):
    """Feature extractor that runs an LSTM over a 2-D observation window.

    The observation space must have shape ``(sequence_length, num_features)``.
    The LSTM output at the final time step goes through two ReLU-activated
    linear layers to produce ``features_dim`` features.
    """

    def __init__(self, observation_space: spaces.Box, features_dim: int = 256, lstm_hidden_size: int = 128):
        super().__init__(observation_space, features_dim)
        self.lstm_hidden_size = lstm_hidden_size

        # The window length and per-step feature count come from the space itself.
        window_shape = observation_space.shape
        self.sequence_length, self.num_features = window_shape

        # Single-layer LSTM over inputs laid out as (batch, time, features).
        self.lstm = nn.LSTM(
            input_size=self.num_features,
            hidden_size=lstm_hidden_size,
            num_layers=1,
            batch_first=True,
        )

        # Two-layer head applied to the last LSTM output.
        self.linear1 = nn.Linear(lstm_hidden_size, 128)
        self.linear2 = nn.Linear(128, features_dim)
        self.activation = nn.ReLU()

    def forward(self, observations: th.Tensor) -> th.Tensor:
        """Map a batch of observation windows to a batch of feature vectors."""
        # Every call starts from an all-zero hidden and cell state.
        state_shape = (1, observations.size(0), self.lstm_hidden_size)
        initial_state = (
            th.zeros(state_shape, device=observations.device),
            th.zeros(state_shape, device=observations.device),
        )

        sequence_out, _ = self.lstm(observations, initial_state)

        # Only the final time step feeds the linear head.
        hidden = self.activation(self.linear1(sequence_out[:, -1, :]))
        return self.activation(self.linear2(hidden))

In [None]:
import os
import torch
import numpy as np
from sb3_contrib import RecurrentPPO
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import CheckpointCallback
from stable_baselines3.common.env_checker import check_env
# Train on the GPU when one is visible to torch, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Exchange-rate frame consumed by every environment instance.
data = get_forex_data()

# Optional sanity check of the environment against the Gymnasium API:
# test_env= SequenceEnvironment(df=data,sequence_length=60)
# check_env(test_env, warn=True)

# Ten parallel environments, each observing a 60-step window.
kwargs = dict(df=data, sequence_length=60)
envs = make_vec_env(SequenceEnvironment, n_envs=10, env_kwargs=kwargs)

checkpoint_dir = './checkpoints/'
os.makedirs(checkpoint_dir, exist_ok=True)

# The save interval is divided by the number of environments (at least 1),
# presumably so a checkpoint lands about every 2,000,000 total steps.
save_freq = max(2000000 // envs.num_envs, 1)
checkpoint_callback = CheckpointCallback(
    save_freq=save_freq,
    save_path=checkpoint_dir,
    name_prefix='ppo_forex',
)

# Policy configuration: the custom LSTM feature extractor plus the
# recurrent policy's own LSTM settings.
policy_kwargs = {
    "features_extractor_class": LSTMFeatureExtractor,
    "features_extractor_kwargs": {"features_dim": 256, "lstm_hidden_size": 128},
    "lstm_hidden_size": 256,
    "n_lstm_layers": 1,
    "shared_lstm": False,
    "enable_critic_lstm": True,
}

def get_latest_checkpoint(checkpoint_dir, prefix='ppo_forex'):
    """Return the most recently modified checkpoint in ``checkpoint_dir``.

    Args:
        checkpoint_dir: Directory to search (not recursive).
        prefix: Only entries whose name starts with this string are considered.

    Returns:
        ``os.path.join(checkpoint_dir, name)`` for the entry with the newest
        modification time, or None when the directory does not exist or
        holds no matching entry.
    """
    if not os.path.isdir(checkpoint_dir):
        return None
    candidates = [
        os.path.join(checkpoint_dir, name)
        for name in os.listdir(checkpoint_dir)
        if name.startswith(prefix)
    ]
    if not candidates:
        return None
    # Only the newest entry is needed, so take the max instead of sorting.
    return max(candidates, key=os.path.getmtime)

# Resuming is opt-in. With the default (False) every run trains a fresh model,
# as before; set to True to continue from the newest checkpoint when one exists.
RESUME_FROM_CHECKPOINT = False

model_path = get_latest_checkpoint(checkpoint_dir)
resumed = RESUME_FROM_CHECKPOINT and model_path is not None
if resumed:
    print(f"Loading model from {model_path}")
    model = RecurrentPPO.load(model_path, env=envs, device=device)
else:
    if model_path is None:
        print("No checkpoint found, initializing new model.")
    else:
        # Report accurately: a checkpoint exists but is deliberately not used.
        print(f"Checkpoint {model_path} found but resuming is disabled, initializing new model.")
    model = RecurrentPPO(
        "MlpLstmPolicy",
        envs,
        policy_kwargs=policy_kwargs,
        learning_rate = 0.0001,
        ent_coef = 0.1,
        verbose=2,
        device=device)

try:
    model.learn(
        total_timesteps=6000000,
        progress_bar=False,
        callback=checkpoint_callback,
        # Keep the step counter when continuing a run so new checkpoints do
        # not reuse the step numbers of earlier ones.
        reset_num_timesteps=not resumed)
except KeyboardInterrupt:
    # Keep whatever has been learned so far when training is stopped by hand.
    print("Training interrupted. Saving current model...")
    model.save(os.path.join(checkpoint_dir, 'ppo_forex_interrupt'))
    print(f"Model saved to {os.path.join(checkpoint_dir, 'ppo_forex_interrupt')}")


In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Per-environment traces collected during evaluation.
balance_history = [[] for _ in range(envs.num_envs)]
portfolio_history = [
    {"USD": [], "EUR": [], "JPY": []} for _ in range(envs.num_envs)
]

obs = envs.reset()
lstm_states = None
# Every environment starts a fresh episode, so the recurrent state is reset on the first call.
episode_starts = np.ones((envs.num_envs,), dtype=bool)

# Roll the deterministic policy forward until the first environment finishes.
while True:
    action, lstm_states = model.predict(obs, state=lstm_states, episode_start=episode_starts, deterministic=True)
    obs, rewards, dones, infos = envs.step(action)

    # Record balance and holdings for each environment.
    for env_idx, info in enumerate(infos):
        balance_history[env_idx].append(info["balance"])

        portfolio = info["portfolio"]
        portfolio_history[env_idx]["USD"].append(portfolio.get("USD"))
        portfolio_history[env_idx]["EUR"].append(portfolio.get("EUR"))
        # JPY is divided by 100 before plotting alongside the other holdings.
        portfolio_history[env_idx]["JPY"].append(portfolio.get("JPY")/100)

    print(f"action = \n {action}\n")
    episode_starts = dones

    if dones.any():
        break

print("Evaluation complete!")

# Balance of every environment on one shared figure.
fig, ax = plt.subplots(figsize=(10, 6))
for env_number, balances in enumerate(balance_history[:envs.num_envs], start=1):
    ax.plot(balances, label=f"Env {env_number}")

ax.set_title("Balance Over Steps for All Environments")
ax.set_xlabel("Steps")
ax.set_ylabel("Balance")
ax.legend()
ax.grid()
plt.show()

# One figure per environment showing how each holding evolved.
series_colors = (("USD", "blue"), ("EUR", "green"), ("JPY", "red"))
for env_number, holdings in enumerate(portfolio_history[:envs.num_envs], start=1):
    fig, ax = plt.subplots(figsize=(10, 6))
    for currency, color in series_colors:
        ax.plot(holdings[currency], label=currency, color=color)
    ax.set_title(f"Portfolio Progression in Environment {env_number}")
    ax.set_xlabel("Steps")
    ax.set_ylabel("Portfolio Value")
    ax.legend()
    ax.grid()
    plt.show()


# obs = envs.reset()
# lstm_states = None
# episode_starts = np.ones((envs.num_envs,), dtype=bool)
# while True:
#     action, lstm_states = model.predict(obs, state=lstm_states, episode_start=episode_starts, deterministic=True)
#     obs, rewards, dones, info = envs.step(action)
#     print(f"action = \n {action}\n")
#     episode_starts = dones
#     if dones.any():
#         break
# print("Evaluation complete!")


In [None]:
from IPython.display import FileLink
# Offer the newest checkpoint for download. Fail with a clear message when
# none exists instead of passing None to FileLink.
model_path = get_latest_checkpoint(checkpoint_dir)
if model_path is None:
    raise FileNotFoundError(f"No checkpoint found in {checkpoint_dir}")
FileLink(model_path)