In [1]:

import sys
import gymnasium as gym
sys.path.insert(0, '/Users/james/Desktop/DRL_Hedging/')
import numpy as np
import torch
import matplotlib.pyplot as plt
from financial_models.asset_price_models import GBM
from financial_models.option_price_models import BSM
from torch import nn
from hedging_env_gymnasium_a import HedgingEnv
from baseline_agent import DeltaNeutralAgent


# --- Reproducibility ---------------------------------------------------------
# Seed both NumPy's legacy global RNG and PyTorch.
seed = 345
np.random.seed(seed)
torch.manual_seed(seed)

# --- Experiment parameters ---------------------------------------------------
# Year length used to turn day counts into year fractions; it was previously a
# bare 252 repeated in every expression below.
TRADING_DAYS_PER_YEAR = 252

mu = 0                      # passed to GBM as `mu`
D = 5                       # presumably hedging steps per trading day (dt = 1/(252*D)) — TODO confirm
T = 10                      # presumably option maturity in trading days (BSM receives T/252) — TODO confirm
num_steps = T*D             # number of steps handed to the environment
s_0 = 100                   # passed to GBM as `s_0`
strike_price = s_0          # strike is set equal to s_0
sigma = 0.15                # used as GBM `sigma` and BSM `volatility`
r = 0                       # used as BSM `risk_free_interest_rate` and env `r`
trading_cost_para=0.005     # forwarded to HedgingEnv; semantics defined there
L=100                       # forwarded to HedgingEnv; semantics defined there

# Single definition of the time step so the asset-price model and the
# option-pricing model can never be given different values.
dt = 1/(TRADING_DAYS_PER_YEAR*D)

apm = GBM(mu=mu, dt=dt, s_0=s_0, sigma=sigma)
opm = BSM(strike_price=strike_price, risk_free_interest_rate=r, volatility=sigma,
          T=T/TRADING_DAYS_PER_YEAR, dt=dt)
env = HedgingEnv(asset_price_model=apm, D=D, T=T, num_steps=num_steps, trading_cost_para=trading_cost_para,r=r,
                 L=L, strike_price=strike_price, initial_holding_delta=False, mode="RA",
                 option_price_model=opm)




In [2]:


class NormalizeObservationWrapper(gym.Wrapper):
    """Min-max scale every observation returned by the wrapped environment.

    Each component is transformed as ``(obs - low) / (high - low)``, where
    ``low`` and ``high`` are the bounds of the wrapped environment's
    observation space. Observations that lie inside those bounds therefore end
    up in ``[0, 1]``.

    The wrapper also replaces ``observation_space`` with a unit box so that
    code inspecting the space (environment checkers, RL libraries) sees bounds
    that match what ``reset``/``step`` actually return.

    Notes:
        * The wrapped env's observation space must expose ``low``/``high``
          arrays (i.e. behave like a ``Box``).
        * Bounds are assumed to be finite; infinite bounds are not handled and
          would produce non-finite observations.
        * A dimension whose ``high == low`` is divided by 1 instead of 0, so an
          in-bounds observation maps to 0 there rather than to nan/inf.
    """

    def __init__(self, env):
        """Cache the wrapped env's bounds and advertise the normalized space.

        Args:
            env: Environment to wrap; its ``observation_space`` must provide
                ``low``, ``high``, ``shape`` and ``dtype``.
        """
        super().__init__(env)
        self.low = env.observation_space.low
        self.high = env.observation_space.high

        # Pre-compute the divisor once, swapping zero-width ranges for 1 to
        # avoid a division by zero on degenerate dimensions.
        span = self.high - self.low
        self._span = np.where(span == 0, np.ones_like(span), span)

        # Keep the declared dtype when it is already floating point; otherwise
        # fall back to float32, because normalized values are fractional.
        space_dtype = env.observation_space.dtype
        if not np.issubdtype(space_dtype, np.floating):
            space_dtype = np.float32
        self.observation_space = gym.spaces.Box(
            low=0.0,
            high=1.0,
            shape=env.observation_space.shape,
            dtype=space_dtype,
        )

    def observation(self, observation):
        """Return ``observation`` rescaled by the cached bounds."""
        return (observation - self.low) / self._span

    def step(self, action):
        """Step the wrapped env and normalize the observation it returns.

        Returns:
            The usual 5-tuple ``(observation, reward, terminated, truncated,
            info)``; everything except the observation is passed through
            unchanged.
        """
        observation, reward, terminated, truncated, info = self.env.step(action)
        return self.observation(observation), reward, terminated, truncated, info

    def reset(self, **kwargs):
        """Reset the wrapped env (forwarding ``kwargs``) and normalize the first observation.

        Returns:
            ``(observation, info)`` with the observation normalized.
        """
        observation, info = self.env.reset(**kwargs)
        return self.observation(observation), info


# Usage: wrap the hedging environment so every observation returned by
# reset()/step() is min-max scaled with the observation-space bounds.
normalized_env = NormalizeObservationWrapper(env)



In [4]:
# Run simulations
# Roll out one episode on the normalized environment with a constant action of
# 1 and record the reward of every step.
reward_array = np.zeros(D*T)
state,info = normalized_env.reset()
done = False
count=0
while not done:
    action = np.ones(1)
    # Unpack both end-of-episode flags explicitly. Ignoring `truncated` would
    # keep the loop running (and overrun reward_array) if the episode ends via
    # a time limit. Avoiding `_` as a target also leaves IPython's last-output
    # variable untouched.
    state, reward, terminated, truncated, info = normalized_env.step(action)
    done = terminated or truncated
    print(state)
    reward_array[count] = reward
    count+=1

reward_array

[1.         0.50392455 0.98       0.01618268]
[1.         0.50539666 0.96       0.01791972]
[1.         0.50811505 0.94       0.02151997]
[1.         0.5095865  0.92       0.02357482]
[1.         0.5081625  0.9        0.02137435]
[1.         0.5045183  0.88       0.01632452]
[1.         0.50789386 0.86       0.0207697 ]
[1.         0.510645   0.84       0.02480079]
[1.         0.5125196  0.82       0.02772047]
[1.         0.5160982  0.8        0.03376484]
[1.         0.51846415 0.78       0.0379712 ]
[1.        0.5180701 0.76      0.0371947]
[1.        0.5208237 0.74      0.0422754]
[1.         0.5195187  0.72       0.03977701]
[1.         0.51166755 0.7        0.02577105]
[1.         0.51096034 0.68       0.02451512]
[1.         0.5108748  0.66       0.02427612]
[1.         0.51077646 0.64       0.02401493]
[1.         0.51265574 0.62       0.02707143]
[1.         0.51405054 0.6        0.02942487]
[1.         0.51351666 0.58       0.02840278]
[1.         0.5115217  0.56       0.024861

  d_1 = (np.log(asset_price / self.strike_price) + (self.risk_free_interest_rate + self.volatility**2 / 2)


array([-2.41215206e+01,  4.78545001e+00,  1.50106897e+00,  4.93745870e+00,
       -8.57343785e+00, -4.74433267e+01, -3.52926510e+00,  3.89052056e+00,
        4.85473122e+00,  4.93636848e+00,  3.87501603e+00, -1.16930357e-01,
        3.35521352e+00, -1.17886482e+00, -3.13489997e+01, -1.71086713e+00,
        6.56708500e-01,  6.24255759e-01,  4.55609057e+00,  3.41071221e+00,
       -4.67115761e-01, -5.48710779e+00, -1.09675859e+00,  4.20995254e+00,
       -2.97229021e+01,  1.62372092e+00, -2.91839456e+01, -1.41983006e+01,
        3.01819841e+00,  3.88802295e+00, -4.78825023e+00, -1.05709044e+00,
        1.89556214e+00, -6.91003178e+00, -9.12754912e+00,  4.65795142e+00,
        3.78142674e+00,  4.36763464e+00, -1.25091372e+01,  4.95196650e+00,
        9.83586543e-01,  4.23078264e+00, -2.52610552e+00,  2.36665335e+00,
        2.62028822e-01, -4.33349563e-01,  6.04296366e-01,  3.74105966e-03,
        4.35295296e-06,  1.42108547e-12])

In [3]:
# Take a single step with action [0] to inspect the tuple the wrapped env returns.
# NOTE(review): this cell steps without calling reset() first, so the result
# depends on whatever episode state the kernel currently holds — confirm this is
# intended, or reset before stepping.
normalized_env.step([0])

(array([0.        , 0.5033172 , 0.98      , 0.01622231], dtype=float32),
 -35.07453294392704,
 False,
 {'delta': 0.589145273459667})

  d_1 = (np.log(asset_price / self.strike_price) + (self.risk_free_interest_rate + self.volatility**2 / 2)


array([ 2.58113292e+01, -5.68915671e+01,  3.52109980e+00,  2.24147321e+00,
        4.06331029e+01,  1.00944499e+01, -3.97203349e+01,  3.37152979e-01,
        1.68331575e+01,  2.37596332e+01, -1.42322828e+00, -1.31913042e+01,
        1.27335767e+01,  6.52256211e+00, -6.67644051e+00, -1.07765685e+01,
        2.78897354e+01,  2.83353085e+01, -2.90320922e-01, -2.57538222e+00,
       -1.74660714e+01,  9.97953938e+00,  3.28300988e+01,  8.64718092e+00,
        5.08175856e+00,  8.33407182e+00,  3.77470581e+00,  2.16075266e+00,
       -6.41458726e-01,  2.29992164e+00,  1.84445357e+00, -2.60579269e-01,
        2.27341003e-01,  7.09484486e-01,  1.64131919e-01,  4.66167642e-01,
       -1.62239650e+00,  1.88728671e+00,  7.38051844e-01,  1.64452844e-02,
        4.76309017e-01,  3.15130103e-01,  1.58854576e-02,  2.86509221e-03,
        3.39029148e-05,  6.43224350e-07,  1.09261757e-07,  1.22221591e-09,
        8.09250699e-15,  4.89046434e-33,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  

In [21]:
# Total reward of the episode: sum of the per-step rewards that the simulation
# loop stored in reward_array.
reward_array.sum()

0.12821581392691417

In [9]:
# Validate the raw (un-normalized) environment with Stable-Baselines3's
# environment checker before handing it to the training cells below.
from stable_baselines3.common.env_checker import check_env
check_env(env)

## Train PPO


In [10]:
from stable_baselines3 import PPO
from stable_baselines3.ppo.policies import MlpPolicy

# Train a PPO agent with the default MLP policy on the raw (un-normalized)
# environment; verbose=1 prints the rollout/training tables.
model = PPO(MlpPolicy, env, verbose=1)
# `total_timesteps` is documented as an int, so pass an integer literal rather
# than the float 1e7. This is a long run: the recorded session was interrupted
# by hand after the first few iterations.
model.learn(total_timesteps=10_000_000)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


  d_1 = (np.log(asset_price / self.strike_price) + (self.risk_free_interest_rate + self.volatility**2 / 2)


---------------------------------
| rollout/           |          |
|    ep_len_mean     | 128      |
|    ep_rew_mean     | -0.138   |
| time/              |          |
|    fps             | 1444     |
|    iterations      | 1        |
|    time_elapsed    | 1        |
|    total_timesteps | 2048     |
---------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 128          |
|    ep_rew_mean          | 0.322        |
| time/                   |              |
|    fps                  | 1192         |
|    iterations           | 2            |
|    time_elapsed         | 3            |
|    total_timesteps      | 4096         |
| train/                  |              |
|    approx_kl            | 0.0058937264 |
|    clip_fraction        | 0.0519       |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.41        |
|    explained_variance   | 0.0629       |
|    learning_r

KeyboardInterrupt: 

In [None]:
from stable_baselines3 import SAC
from stable_baselines3.sac.policies import MlpPolicy

# Train a Soft Actor-Critic agent on the raw (un-normalized) environment for
# 10000 steps, printing progress tables (verbose=1).
# Note: this import rebinds the name `MlpPolicy`, and the assignment below
# rebinds `model`; both previously referred to the PPO objects.
model = SAC(policy=MlpPolicy, env=env, verbose=1)
model.learn(total_timesteps=10000)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 128      |
|    ep_rew_mean     | -91      |
| time/              |          |
|    episodes        | 4        |
|    fps             | 86       |
|    time_elapsed    | 5        |
|    total_timesteps | 512      |
| train/             |          |
|    actor_loss      | 0.721    |
|    critic_loss     | 1.26     |
|    ent_coef        | 0.886    |
|    ent_coef_loss   | -0.173   |
|    learning_rate   | 0.0003   |
|    n_updates       | 411      |
---------------------------------


KeyboardInterrupt: 