In [None]:
!pip install tianshou

In [15]:
import pandas as pd
import numpy as np
import os
import torch as th
import torch.nn as nn
import gymnasium as gym
from gymnasium import Env
from gymnasium.spaces import Discrete, Box
import math

In [16]:
def get_forex_data(csv_path='/kaggle/input/train-dataset/Foreign_Exchange_Rates.csv'):
    """Load EUR/USD and JPY/USD rates and derive the YEN/EURO cross rate.

    Args:
        csv_path: Location of the exchange-rate CSV. Defaults to the Kaggle
            input path this notebook was written against.

    Returns:
        A new DataFrame with the columns 'EURO AREA - EURO/US$',
        'JAPAN - YEN/US$' and 'YEN/EURO'. Cells marked 'ND' in the file are
        read as missing and filled by linear interpolation.
    """
    rate_columns = ['EURO AREA - EURO/US$', 'JAPAN - YEN/US$']

    # 'ND' marks a day without a quote; read it as NaN so it can be interpolated.
    raw = pd.read_csv(csv_path, na_values='ND')

    # Interpolate only the two rate columns that are actually used. This keeps
    # any non-numeric column in the file out of the interpolation and avoids
    # mutating the loaded frame in place.
    df = raw[rate_columns].astype('float64').interpolate()

    # Derived cross rate: (YEN per USD) / (EUR per USD) = YEN per EUR.
    df['YEN/EURO'] = df['JAPAN - YEN/US$'] / df['EURO AREA - EURO/US$']

    return df

In [17]:
class State:
    """Portfolio and price-cursor state for the USD/EUR/JPY trading simulation.

    Holdings are kept per currency in ``portfolio`` (in units of that
    currency), ``balance`` is the USD value of the whole portfolio after the
    last step, and ``euro_buy_value`` / ``yen_buy_value`` track the USD spent
    on the current EUR / JPY holdings minus the USD proceeds of later sales.
    """

    # Column order used wherever a price row is unpacked as
    # (eur_usd, jpy_usd, jpy_eur).
    _PRICE_COLUMNS = ['EURO AREA - EURO/US$', 'JAPAN - YEN/US$', 'YEN/EURO']

    def __init__(self):
        self._prices = None
        self._first_diff = None
        self._offset = None
        self.balance = None
        self.portfolio = None
        self.euro_buy_value = None
        self.yen_buy_value = None
        self.trade_max_percentage = None

    def reset(self, prices, offset, initial_balance, trade_max_percentage):
        """Start a new episode.

        Args:
            prices: DataFrame holding the three rate columns.
            offset: Row index of the first time step of the episode.
            initial_balance: Starting balance, held entirely in USD.
            trade_max_percentage: Fraction of the current balance that a
                single action component of magnitude 1 may trade.
        """
        self._prices = prices
        first_differences = prices.diff()
        # Min-max normalise the first differences, column by column.
        diff_min = first_differences.min()
        diff_max = first_differences.max()
        self._first_diff = (first_differences - diff_min) / (diff_max - diff_min)
        self._offset = offset
        self.balance = initial_balance
        self.trade_max_percentage = trade_max_percentage
        self.portfolio = {'USD': initial_balance, 'EUR': 0, 'JPY': 0}
        self.euro_buy_value = 0
        self.yen_buy_value = 0

    def step(self, action, reward_type="inDirect"):
        """Execute one action at the current offset and advance one row.

        Args:
            action: Three floats (intended range -1..1), read but never
                modified. ``action[0]`` trades USD<->EUR (positive buys EUR),
                ``action[1]`` trades USD<->JPY (positive buys JPY) and
                ``action[2]`` trades EUR<->JPY (positive buys JPY).
            reward_type: ``"Direct"`` makes the reward the change in the
                portfolio's USD value over this step. Any other value uses
                the profit realised by the selling legs of this step.

        Returns:
            ``(reward, done)`` where ``done`` is True once the offset has
            reached the last row of the price table.
        """
        reward = 0
        eur_usd, jpy_usd, jpy_eur = self._prices.iloc[self._offset][self._PRICE_COLUMNS].values
        max_trade_amount = self.balance * self.trade_max_percentage

        # Work on a private copy: the loop cancellation below rewrites the
        # components and must not leak back into the caller's array.
        action = np.array(action, dtype=np.float64)

        # Remove the looping transaction: three legs that only move money in
        # a circle cancel out by their common magnitude.
        if action[0] < 0 and action[1] > 0 and action[2] < 0:
            common = min(-action[0], action[1], -action[2])
            action[0] += common
            action[1] -= common
            action[2] += common
        elif action[0] > 0 and action[1] < 0 and action[2] > 0:
            common = min(action[0], -action[1], action[2])
            action[0] -= common
            action[1] += common
            action[2] -= common

        # Realised profit is accumulated across the three legs so that a
        # later leg cannot erase what an earlier leg earned in the same step.

        # USD and EUR, positive means buy EUR
        if action[0] > 0:
            trade_amount = abs(max_trade_amount * action[0])
            trade_volume = min(self.portfolio['USD'], trade_amount)  # in USD
            self.portfolio['EUR'] += trade_volume * eur_usd
            self.portfolio['USD'] -= trade_volume
            self.euro_buy_value += trade_volume
        elif action[0] < 0:
            trade_amount = abs(max_trade_amount * action[0])
            trade_volume = min(self.portfolio['EUR'], trade_amount * eur_usd)  # in EUR
            if trade_volume > 0:
                # Sale price per EUR minus average cost per EUR, both in USD.
                reward += trade_volume * (1 / eur_usd - (self.euro_buy_value / self.portfolio['EUR']))
            self.portfolio['USD'] += trade_volume / eur_usd
            self.portfolio['EUR'] -= trade_volume
            self.euro_buy_value -= trade_volume / eur_usd

        # USD and YEN, positive means buy YEN
        if action[1] > 0:
            trade_amount = abs(max_trade_amount * action[1])
            trade_volume = min(self.portfolio['USD'], trade_amount)  # in USD
            self.portfolio['JPY'] += trade_volume * jpy_usd
            self.portfolio['USD'] -= trade_volume
            self.yen_buy_value += trade_volume
        elif action[1] < 0:
            trade_amount = abs(max_trade_amount * action[1])
            trade_volume = min(self.portfolio['JPY'], trade_amount * jpy_usd)  # in JPY
            if trade_volume > 0:
                reward += trade_volume * (1 / jpy_usd - self.yen_buy_value / self.portfolio['JPY'])
            self.portfolio['USD'] += trade_volume / jpy_usd
            self.portfolio['JPY'] -= trade_volume
            self.yen_buy_value -= trade_volume / jpy_usd

        # EUR and YEN, positive means buy YEN
        if action[2] > 0:
            trade_amount = abs(max_trade_amount * action[2])
            trade_volume = min(self.portfolio['EUR'], trade_amount * eur_usd)  # in EUR
            if trade_volume > 0:
                reward += trade_volume * (1 / eur_usd - self.euro_buy_value / self.portfolio['EUR'])
            self.portfolio['JPY'] += trade_volume * jpy_eur
            self.portfolio['EUR'] -= trade_volume
            # Move the USD value of the traded EUR from the EUR to the JPY basis.
            self.euro_buy_value -= trade_volume / eur_usd
            self.yen_buy_value += trade_volume / eur_usd
        elif action[2] < 0:
            trade_amount = abs(max_trade_amount * action[2])
            trade_volume = min(self.portfolio['JPY'], trade_amount * jpy_usd)  # in JPY
            if trade_volume > 0:
                reward += trade_volume * (1 / jpy_usd - self.yen_buy_value / self.portfolio['JPY'])
            self.portfolio['EUR'] += trade_volume / jpy_eur
            self.portfolio['JPY'] -= trade_volume
            # Move the USD value of the traded JPY from the JPY to the EUR basis.
            self.euro_buy_value += trade_volume / jpy_usd
            self.yen_buy_value -= trade_volume / jpy_usd

        # Mark the whole portfolio to USD at the current rates.
        portfolio_value = (
            self.portfolio['USD']
            + self.portfolio['EUR'] / eur_usd
            + self.portfolio['JPY'] / jpy_usd
        )
        if reward_type == "Direct":
            reward = portfolio_value - self.balance
        self.balance = portfolio_value
        self._offset += 1
        done = self._offset >= len(self._prices) - 1
        return reward, done

    def encode(self):
        """Return the 9-element observation for the current offset.

        Layout: the three normalised first differences at the current row,
        the three holdings (USD, EUR, JPY) each divided by ``balance``, then
        ``euro_buy_value``, ``yen_buy_value / 100`` and
        ``trade_max_percentage``.
        """
        current_prices = self._first_diff.iloc[self._offset]
        encoded_prices = np.array(current_prices[self._PRICE_COLUMNS]).flatten()
        holdings = np.array([self.portfolio['USD'], self.portfolio['EUR'], self.portfolio['JPY']])
        portfolio_fraction = holdings / self.balance

        # Combine all features into a single array
        encoded_features = np.concatenate([
            encoded_prices,
            portfolio_fraction,
            [self.euro_buy_value, self.yen_buy_value / 100, self.trade_max_percentage]
        ])
        return encoded_features

    @property
    def shape(self):
        """Shape of the array returned by ``encode``."""
        return (3 + 3 + 3,)  # 3 prices + 3 portfolio + 3 additional values

In [18]:
class ForexTradingEnv(Env):
    """Gymnasium environment trading USD, EUR and JPY over a historical rate table.

    Actions are three floats in [-1, 1], one per currency pair. Observations
    are the 9-element vector produced by ``State.encode``.
    """

    def __init__(self, df, initial_balance=1000):
        """
        Args:
            df: Rate table handed to ``State.reset`` at every reset.
            initial_balance: Starting USD balance of each episode.
        """
        super().__init__()
        self.df = df
        self.initial_balance = initial_balance
        self.state = State()
        # One action component per currency pair.
        self.action_space = Box(low=-1, high=1, shape=(3,), dtype=np.float32)
        # Unbounded on both sides: State.step subtracts sale proceeds from the
        # euro/yen buy values, so those observation features can go negative.
        self.observation_space = Box(
            low=-np.inf, high=np.inf, shape=self.state.shape, dtype=np.float32
        )

    def seed(self, seed):
        """Seed NumPy's global RNG, from which ``reset`` draws the start offset."""
        np.random.seed(seed)

    def reset(self, sequence_length, seed=None):
        """Start a new episode at a random row of the rate table.

        Args:
            sequence_length: Lowest row index the episode may start at.
            seed: Optional seed. When given it is applied to the RNG that
                draws the start offset, so the same seed always yields the
                same starting row.

        Returns:
            The initial observation (no info dict).

        Raises:
            ValueError: If the table is too short to start at or after
                ``sequence_length``.
        """
        super().reset(seed=seed)
        if seed is not None:
            # The offset below comes from NumPy's global RNG (the one seed()
            # controls), so an explicit seed has to be applied to that RNG.
            self.seed(seed)

        low, high = sequence_length, len(self.df) - 1
        if low >= high:
            raise ValueError(
                f"sequence_length={sequence_length} leaves no valid start offset "
                f"in a table of {len(self.df)} rows"
            )
        offset = np.random.randint(low, high)
        self.state.reset(
            prices=self.df,
            offset=offset,
            initial_balance=self.initial_balance,
            trade_max_percentage=1,
        )
        return self.state.encode()

    def step(self, action):
        """Apply ``action`` and return ``(obs, reward, terminated, truncated, info)``.

        ``terminated`` and ``truncated`` are both derived from the same
        condition: the offset having reached the last row of the table.
        """
        reward, terminated = self.state.step(action)
        truncated = self.state._offset >= len(self.df) - 1
        observation = self.state.encode()
        info = {
            "balance": self.state.balance,  # USD value of the portfolio after this step
        }
        return observation, reward, terminated, truncated, info

    def render(self, mode='human'):
        """Print the current step, holdings and balance (``'human'`` mode only)."""
        if mode != 'human':
            raise NotImplementedError("Only 'human' rendering mode is supported.")
        print(f"Step: {self.state._offset}")
        print(f"Portfolio: {self.state.portfolio}")
        print(f"Balance: {self.state.balance}")
class SequenceEnvironment(gym.Env):
    """Wrap ``ForexTradingEnv`` so each observation is the last N step observations.

    Observations have shape ``(sequence_length, features)``; the action space
    is that of the wrapped environment.
    """

    def __init__(self, df, sequence_length):
        """
        Args:
            df: Rate table passed to the wrapped ``ForexTradingEnv``.
            sequence_length: Number of consecutive observations per window.
        """
        super().__init__()
        self.original_env = ForexTradingEnv(df)
        self.sequence_length = sequence_length
        self.buffer = []
        inner_space = self.original_env.observation_space
        # Stack the per-step bounds sequence_length times along a new first axis.
        self.observation_space = Box(
            low=np.repeat(inner_space.low[None, :], sequence_length, axis=0),
            high=np.repeat(inner_space.high[None, :], sequence_length, axis=0),
            dtype=inner_space.dtype
        )
        self.action_space = self.original_env.action_space

    def reset(self, seed=None, options=None):
        """Reset the wrapped environment and fill the window with its first observation.

        Args:
            seed: Optional seed, forwarded to the wrapped environment's reset.
            options: Accepted for compatibility with the gymnasium reset
                signature; not used.

        Returns:
            ``(window, info)`` where ``info`` is an empty dict.
        """
        super().reset(seed=seed)
        # Forward the seed: previously it was dropped here, so seeding the
        # wrapper never reached the environment that draws the start offset.
        obs = self.original_env.reset(self.sequence_length, seed=seed)
        # No history yet, so the first observation is repeated across the window.
        self.buffer = [obs] * self.sequence_length
        return np.array(self.buffer), {}

    def step(self, action):
        """Step the wrapped environment and slide the window forward by one."""
        obs, reward, terminated, truncated, info = self.original_env.step(action)
        self.buffer.pop(0)
        self.buffer.append(obs)
        return np.array(self.buffer), reward, terminated, truncated, info

    def get_balance(self):
        """Return the wrapped environment's current portfolio balance."""
        return self.original_env.state.balance

In [19]:
import torch.nn.functional as F

class LSTMFeatureExtractor(nn.Module):
    """Single-layer LSTM followed by two ReLU-activated linear layers.

    Takes a batch of sequences and returns one feature vector per sequence,
    computed from the LSTM output at the last time step.
    """

    def __init__(self, input_dim, lstm_hidden_size=128, features_dim=256):
        """
        Args:
            input_dim: Number of features per time step.
            lstm_hidden_size: Hidden size of the LSTM.
            features_dim: Size of the returned feature vector.
        """
        super().__init__()
        # Kept for backward compatibility; forward() places tensors on the
        # device the weights actually live on rather than relying on this.
        self.device = "cuda" if th.cuda.is_available() else "cpu"
        self.lstm_hidden_size = lstm_hidden_size
        self.lstm = nn.LSTM(input_size=input_dim, hidden_size=lstm_hidden_size, batch_first=True)
        self.linear1 = nn.Linear(lstm_hidden_size, 128)
        self.linear2 = nn.Linear(128, features_dim)

    def forward(self, x, hidden_state=None):
        """
        Args:
            x: Array or tensor indexed as (batch, time, feature), as implied
                by ``batch_first=True`` and the ``[:, -1, :]`` slice below.
            hidden_state: Optional ``(h_0, c_0)`` pair for the LSTM; zeros
                are used when omitted.

        Returns:
            ``(features, (h_n, c_n))``.
        """
        # Follow the module's own weights so inputs and parameters always
        # share a device, wherever the module has been moved.
        device = next(self.parameters()).device
        # Accepts NumPy arrays and tensors alike, and enforces float32.
        x = th.as_tensor(x, dtype=th.float32, device=device)

        if hidden_state is None:
            h_0 = th.zeros(1, x.size(0), self.lstm_hidden_size, device=device)
            c_0 = th.zeros(1, x.size(0), self.lstm_hidden_size, device=device)
        else:
            h_0, c_0 = hidden_state

        lstm_out, (h_n, c_n) = self.lstm(x, (h_0, c_0))
        last_time_step_out = lstm_out[:, -1, :]
        x = F.relu(self.linear1(last_time_step_out))
        features = F.relu(self.linear2(x))
        return features, (h_n, c_n)

class ActorLSTM(nn.Module):
    """Actor network: LSTM feature extractor followed by a linear action head."""

    def __init__(self, input_dim, lstm_hidden_size=128, features_dim=256, action_dim=3):
        """
        Args:
            input_dim: Number of features per time step.
            lstm_hidden_size: Hidden size of the LSTM.
            features_dim: Size of the extracted feature vector.
            action_dim: Number of action components produced by the head.
        """
        super().__init__()
        # Kept for backward compatibility; forward() uses the device the
        # weights actually live on.
        self.device = "cuda" if th.cuda.is_available() else "cpu"
        self.feature_extractor = LSTMFeatureExtractor(input_dim, lstm_hidden_size, features_dim)
        self.actor_head = nn.Linear(features_dim, action_dim)
        # Registered parameter; forward() does not read it.
        self.log_std = nn.Parameter(th.zeros(action_dim))

    def forward(self, obs, state=None, **kwargs):
        """Return ``(logits, state)`` for a batch of observation sequences.

        Extra keyword arguments are accepted and ignored.
        """
        # Put the input where the weights are and enforce float32, whether
        # it arrives as a NumPy array or as a tensor.
        device = next(self.parameters()).device
        obs = th.as_tensor(obs, dtype=th.float32, device=device)
        features, state = self.feature_extractor(obs, state)
        logits = self.actor_head(features)
        return logits, state


class CriticLSTM(nn.Module):
    """Critic network: LSTM feature extractor followed by a scalar value head."""

    def __init__(self, input_dim, lstm_hidden_size=128, features_dim=256):
        """
        Args:
            input_dim: Number of features per time step.
            lstm_hidden_size: Hidden size of the LSTM.
            features_dim: Size of the extracted feature vector.
        """
        super().__init__()
        # Kept for backward compatibility; forward() uses the device the
        # weights actually live on.
        self.device = "cuda" if th.cuda.is_available() else "cpu"
        self.feature_extractor = LSTMFeatureExtractor(input_dim, lstm_hidden_size, features_dim)
        self.critic_head = nn.Linear(features_dim, 1)

    def forward(self, obs, state=None, **kwargs):
        """Return ``(value, state)`` for a batch of observation sequences.

        Extra keyword arguments are accepted and ignored.
        """
        # Put the input where the weights are and enforce float32, whether
        # it arrives as a NumPy array or as a tensor.
        device = next(self.parameters()).device
        obs = th.as_tensor(obs, dtype=th.float32, device=device)
        features, state = self.feature_extractor(obs, state)
        value = self.critic_head(features)
        return value, state

In [None]:
from tianshou.policy import PPOPolicy
import torch
from torch.optim import Adam
from tianshou.env import DummyVectorEnv
from torch.distributions import Independent, Normal
from tianshou.utils.net.common import ActorCritic, Net
from tianshou.utils.net.continuous import ActorProb, Critic
# NOTE(review): every name assigned in this cell except `envs` is assigned
# again by the next cell; `envs` is the only object created here that later
# cells still read.
data =  get_forex_data()
# Initialize your environment
seed = 434
device= "cuda" if torch.cuda.is_available() else "cpu"
# Single environment, used below only to read the observation/action shapes.
env = SequenceEnvironment(df=data, sequence_length=60)
# The same factory lambda is listed ten times, one entry per vectorised env.
envs = DummyVectorEnv([lambda: SequenceEnvironment(df=data, sequence_length=60)] * 10)
# Define network dimensions
input_dim = env.observation_space.shape[1]  # Number of features per time step
action_dim = env.action_space.shape[0]      # Number of components in the Box action
preprocess_output_dim = 256

# actor = ActorLSTM(input_dim=input_dim, lstm_hidden_size=128, features_dim=256, action_dim=action_dim)
# critic = CriticLSTM(input_dim=input_dim, lstm_hidden_size=128, features_dim=256)
# Seed NumPy's global RNG (ForexTradingEnv.reset draws its start offset from
# it), torch, and the vectorised environments.
np.random.seed(seed)
torch.manual_seed(seed)
envs.seed(seed)
# model
# The same `net` instance is handed to both the actor and the critic, so the
# two heads share one feature extractor.
net = LSTMFeatureExtractor(input_dim=input_dim,features_dim=preprocess_output_dim)
actor = ActorProb(net, action_dim, unbounded=True, device=device, preprocess_net_output_dim=preprocess_output_dim).to(device)
critic = Critic(net,device=device,preprocess_net_output_dim=preprocess_output_dim).to(device)
actor_critic = ActorCritic(actor, critic)
# orthogonal initialization
# Only nn.Linear layers are touched; the LSTM keeps its default initialisation.
for m in actor_critic.modules():
    if isinstance(m, torch.nn.Linear):
        torch.nn.init.orthogonal_(m.weight)
        torch.nn.init.zeros_(m.bias)
# Builds a diagonal Gaussian over the action vector from a single
# (loc, scale) tuple.
# NOTE(review): the next cell defines `dist(loc, scale)` with two positional
# arguments instead -- confirm which calling convention the installed
# tianshou's PPOPolicy uses for `dist_fn`.
def dist(loc_scale: tuple[torch.Tensor, torch.Tensor]):
    loc, scale = loc_scale
    return Independent(Normal(loc, scale), 1)
optim = torch.optim.Adam(actor_critic.parameters(), lr=3e-4)
policy = PPOPolicy(
    actor=actor,
    critic=critic,
    optim=optim,
    dist_fn=dist,
    action_space=env.action_space,
    max_grad_norm=0.5,
    discount_factor=1,
    gae_lambda=0.95,
    vf_coef=0.5,
    ent_coef=0.01,
    action_bound_method="clip",
    reward_normalization=True,
    dual_clip=None,
    value_clip=True,
    advantage_normalization=True,
    recompute_advantage=True,
)

In [None]:
from tianshou.env import DummyVectorEnv
from tianshou.data import Collector, VectorReplayBuffer
from tianshou.policy import PPOPolicy
from tianshou.trainer import OnpolicyTrainer
from tianshou.utils import TensorboardLogger
from torch.distributions import Independent, Normal
from torch.utils.tensorboard import SummaryWriter
from tianshou.utils.net.common import ActorCritic, Net
from tianshou.utils.net.continuous import ActorProb, Critic
import os
import torch
import numpy as np

# Builds a diagonal Gaussian over the action vector from separate loc and
# scale tensors. This replaces the single-tuple `dist` defined in the
# previous cell.
# NOTE(review): confirm which calling convention the installed tianshou's
# PPOPolicy uses for `dist_fn` (one tuple vs. two positional arguments).
def dist(loc,scale):
    return Independent(Normal(loc, scale), 1)
data = get_forex_data()
# Environment setup
# `env` is only used to read the observation/action shapes below.
env = SequenceEnvironment(df=data, sequence_length=60)
# Train and test environments are built from the same `data`; there is no
# held-out split, so test rewards are measured on the training series.
train_envs = DummyVectorEnv([lambda: SequenceEnvironment(df=data, sequence_length=60)] * 10)
test_envs = DummyVectorEnv([lambda: SequenceEnvironment(df=data, sequence_length=60)] * 10)

# Seed setup
seed = 434
# ForexTradingEnv.reset draws its start offset from NumPy's global RNG.
np.random.seed(seed)
torch.manual_seed(seed)
train_envs.seed(seed)
test_envs.seed(seed)

# Model setup
input_dim = env.observation_space.shape[1]  # features per time step
action_dim = env.action_space.shape[0]  # components of the Box action
preprocess_output_dim = 256
device = "cuda" if torch.cuda.is_available() else "cpu"

# The same `net` instance is handed to both the actor and the critic, so the
# two heads share one feature extractor.
net = LSTMFeatureExtractor(input_dim=input_dim, features_dim=preprocess_output_dim)
actor = ActorProb(net, action_dim, unbounded=True, device=device, preprocess_net_output_dim=preprocess_output_dim).to(device)
critic = Critic(net, device=device, preprocess_net_output_dim=preprocess_output_dim).to(device)
actor_critic = ActorCritic(actor, critic)

# Orthogonal initialization
# Only nn.Linear layers are touched; the LSTM keeps its default initialisation.
for m in actor_critic.modules():
    if isinstance(m, torch.nn.Linear):
        torch.nn.init.orthogonal_(m.weight)
        torch.nn.init.zeros_(m.bias)

optim = torch.optim.Adam(actor_critic.parameters(), lr=3e-4)

policy = PPOPolicy(
    actor=actor,
    critic=critic,
    optim=optim,
    dist_fn=dist,
    discount_factor=1,
    max_grad_norm=0.5,
    eps_clip=0.2,
    vf_coef=0.5,
    ent_coef=0.01,
    reward_normalization=True,
    advantage_normalization=True,
    recompute_advantage=True,
    value_clip=True,
    gae_lambda=0.95,
    action_space=env.action_space,
)

# Collector setup
train_collector = Collector(
    policy,
    train_envs,
    VectorReplayBuffer(20000, len(train_envs))
)
test_collector = Collector(policy, test_envs)

# Logging setup
log_path = os.path.join("log", "forex_trading", "ppo")
writer = SummaryWriter(log_path)
logger = TensorboardLogger(writer, save_interval=1)

# Save best policy
save_best_fn = lambda policy: torch.save(policy.state_dict(), os.path.join(log_path, "policy.pth"))

# Early-stop criterion handed to the trainer.
# NOTE(review): 1e5 is a placeholder threshold -- confirm it against the
# reward scale the environment actually produces.
def stop_fn(mean_rewards: float):
    return mean_rewards >= 1e5  # Set an appropriate threshold for stopping

# Writes policy and optimiser state to one fixed file (overwritten on every
# call) and returns its path. The epoch/step arguments are not used.
def save_checkpoint_fn(epoch: int, env_step: int, gradient_step: int):
    ckpt_path = os.path.join(log_path, "checkpoint.pth")
    torch.save({
        "model": policy.state_dict(),
        "optim": optim.state_dict(),
    }, ckpt_path)
    return ckpt_path

# Training
trainer = OnpolicyTrainer(
    policy=policy,
    train_collector=train_collector,
    test_collector=test_collector,
    max_epoch=50,
    step_per_epoch=1000,
    repeat_per_collect=10,
    episode_per_test=10,
    batch_size=64,
    episode_per_collect=200,
    stop_fn=stop_fn,
    save_best_fn=save_best_fn,
    logger=logger,
    save_checkpoint_fn=save_checkpoint_fn,
)

# Iterate the trainer and print each yielded statistics object.
for epoch_stat in trainer:
    print(f"Epoch: {epoch_stat.epoch}")
    print(epoch_stat)

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from tianshou.data import Collector, VectorReplayBuffer
from tianshou.trainer import onpolicy_trainer

# Set up training and test collectors
# Both collectors run on `envs`, the vector env created in the first setup
# cell, and therefore share the same environment instances. `policy` is the
# object built (and already trained) by the previous cell.
train_collector = Collector(policy, envs, VectorReplayBuffer(20000, 10))
test_collector = Collector(policy, envs)

# Training configuration
# NOTE(review): this uses the function-style `onpolicy_trainer`, while the
# previous cell uses the `OnpolicyTrainer` class -- confirm the installed
# tianshou version provides both. As written, this trains the same `policy`
# for a further 50 epochs on top of the previous cell's training.
result = onpolicy_trainer(
    policy=policy,
    train_collector=train_collector,
    test_collector=test_collector,
    max_epoch=50,  # Total number of epochs to train
    step_per_epoch=1000,  # Total steps per epoch
    repeat_per_collect=10,  # Gradient updates per data collection
    episode_per_test=10,  # Episodes for testing after each epoch
    batch_size=64,  # Batch size for updates
    step_per_collect=200,  # Steps per data collection
    save_best_fn=lambda policy: torch.save(policy.state_dict(), 'ppo_policy.pth')
)

# Print training results
print(f"Training complete. Best policy reward: {result['best_reward']}")

# Save the trained policy
torch.save(policy.state_dict(), 'ppo_policy_final.pth')

# Load the trained policy (if needed)
# Reloads the file written just above, so this is a save/load round trip.
policy.load_state_dict(torch.load('ppo_policy_final.pth'))

# --- Evaluation ---
# Set up balance tracking
balance_history = [[] for _ in range(10)]  # Assuming 10 environments

# NOTE(review): the reset result is bound to `obs` directly -- confirm
# whether the vector env returns the observations alone or an (obs, info)
# pair.
obs = envs.reset()
lstm_states = None
episode_starts = np.ones((10,), dtype=bool)  # Assuming 10 environments

while True:
    # Get actions from the policy
    # NOTE(review): confirm that PPOPolicy.forward accepts raw observations
    # plus `state`/`episode_start` keywords and returns an (action, state)
    # pair; the call below assumes that convention.
    action, lstm_states = policy.forward(obs, state=lstm_states, episode_start=episode_starts)
    obs, rewards, terminated, truncated, infos = envs.step(action)

    # Track balances for each environment
    # "balance" is the key ForexTradingEnv.step puts in its info dict.
    for i, info in enumerate(infos):
        balance_history[i].append(info["balance"])

    episode_starts = terminated | truncated
    # The loop ends as soon as the first environment finishes, so every
    # balance history has the length of the shortest episode.
    if np.any(terminated | truncated):  # Stop if any environment is done
        break

print("Evaluation complete!")

# --- Plot Results ---
# Plot the balance history for all environments
plt.figure(figsize=(10, 6))
for i in range(len(balance_history)):
    plt.plot(balance_history[i], label=f"Env {i + 1}")

plt.title("Balance Over Steps for All Environments")
plt.xlabel("Steps")
plt.ylabel("Balance")
plt.legend()
plt.grid()
plt.show()
