# Preconditioning ADAM
#### Author: JP Melo

In this notebook we explore how preconditioning Adam with curvature information (via a K-FAC preconditioner) changes convergence speed.

### Imports

In [16]:
from derpinns.nn import *
from derpinns.utils import *
from derpinns.trainer import *
import torch
import kfac

## Parameters

In [None]:
# Fix the torch and NumPy global seeds for reproducibility
torch.manual_seed(0)
np.random.seed(0)

# ---- Global settings shared by every run in this notebook ----
assets = 2

sampler = "Sobol"
nn_shape = "64x3"
device = torch.device("cpu")
dtype = torch.float32

# ---- Option valuation parameters ----
# sigma: the same value (0.2) for every asset.
# rho:   ones on the diagonal, 0.25 everywhere else.
params = OptionParameters(
    n_assets=assets,
    tau=1.0,
    sigma=np.full(assets, 0.2),
    rho=np.where(np.eye(assets, dtype=bool), 1.0, 0.25),
    r=0.05,
    strike=100,
    payoff=payoff,
)

# ---- Dataset to train over ----
batch_size = 500
total_iter = 1_000
boundary_samples = 20_000
# Interior and initial sample counts both scale with twice the number of assets
interior_samples = 2 * assets * boundary_samples
initial_samples = 2 * assets * boundary_samples

dataset = SampledDataset(
    params,
    interior_samples,
    initial_samples,
    boundary_samples,
    sampler,
    dtype,
    device,
    seed=0,
)

## Training

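We train the same model architecture with and without K-FAC preconditioning. Both runs use the same `DimlessBS` closure and the same Adam optimizer settings; the only difference is that the first trainer is additionally given a `KFACPreconditioner`.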

### With KFAC

In [18]:
# Re-seed at the top of the cell so that this run does not depend on which
# cells were executed earlier in the kernel (re-running the cell reproduces it).
# NOTE(review): assumes weights_init and the shuffling loader draw from the
# global torch / NumPy RNGs -- confirm against derpinns.
torch.manual_seed(0)
np.random.seed(0)

# Build the network with the configured dtype (instead of a hard-coded
# torch.float32) so it stays consistent with the closure and trainer below.
model = build_nn(
    nn_shape=nn_shape,
    input_dim=assets,
    dtype=dtype,
).apply(weights_init).to(device)
model.train()

# Same optimizer settings as the run without preconditioning
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
# K-FAC preconditioner attached to this model; handed to the trainer below
preconditioner = kfac.preconditioner.KFACPreconditioner(model)

# Only request pinned host memory when training on CUDA; with the CPU device
# configured in this notebook the flag brings no benefit.
loader_opts = {
    'batch_size': batch_size,
    "shuffle": True,
    "pin_memory": device.type == "cuda",
}

# Parenthesised chains replace backslash continuations (the original ended the
# last call with a dangling "\" before a blank line).
closure = (
    DimlessBS()
    .with_dataset(dataset, loader_opts=loader_opts)
    .with_model(model)
    .with_device(device)
    .with_dtype(dtype)
)

trainer = (
    PINNTrainer()
    .with_optimizer(optimizer)
    .with_device(device)
    .with_dtype(dtype)
    .with_training_step(closure)
    .with_preconditioner(preconditioner)
    .with_epochs(total_iter)
)

trainer.train()

Adam training: 100%|██████████| 1000/1000 [03:05<00:00,  5.38it/s, Interior=0.000769, Boundary=0.000340, Initial=0.000784, Total=0.001894, Max Error=29.7139282227, L2 Error=0.0463983119]


In [19]:
# Keep the recorded training state of the preconditioned run and plot its
# smoothed loss history.
with_pre_state = trainer.closure.get_state()
plot_loss(with_pre_state, smooth_window=10, smooth=True)

# 'l2_rel_error' entry returned by compare_with_mc for the trained model;
# multiplied by 100 when printed.
with_pre_results = compare_with_mc(
    model,
    params,
    n_prices=200,
    n_simulations=10_000,
    dtype=dtype,
    device=device,
    seed=42,
)['l2_rel_error']
print("L2 Error: ", with_pre_results*100)

L2 Error:  3.1773658


### Without KFAC

In [20]:
# Build the net to be used
model = build_nn(
    nn_shape=nn_shape,
    input_dim=assets,
    dtype=torch.float32
).apply(weights_init).to(device)

# we use the same optimizer for both cases
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

model.train()

closure = DimlessBS()\
    .with_dataset(dataset, loader_opts={'batch_size': batch_size, "shuffle": True, "pin_memory": True})\
    .with_model(model)\
    .with_device(device)\
    .with_dtype(dtype)

trainer = PINNTrainer()\
    .with_optimizer(optimizer)\
    .with_device(device)\
    .with_dtype(dtype)\
    .with_training_step(closure)\
    .with_epochs(total_iter)\

trainer.train()

Adam training: 100%|██████████| 1000/1000 [03:02<00:00,  5.49it/s, Interior=0.000759, Boundary=0.000654, Initial=0.005610, Total=0.007022, Max Error=85.5677490234, L2 Error=0.1341786534]


In [21]:
# Keep the recorded training state of the baseline run and plot its smoothed
# loss history.
without_pre_state = trainer.closure.get_state()
plot_loss(without_pre_state, smooth_window=10, smooth=True)

# 'l2_rel_error' entry returned by compare_with_mc for the trained model;
# multiplied by 100 when printed.
without_pre_results = compare_with_mc(
    model,
    params,
    n_prices=200,
    n_simulations=10_000,
    dtype=dtype,
    device=device,
    seed=42,
)['l2_rel_error']
print("L2 Error: ", without_pre_results*100)

L2 Error:  3.9504097


6.2328877/3.9504097

### Compare both runs

In [26]:
# Hand both recorded training states to the comparison helper; the labels are
# listed in the same order as the states.
compare_loss_histories(
    [with_pre_state, without_pre_state],
    smooth_window=10,
    smooth=True,
    labels=["With preconditioning", "Without preconditioning"],
)

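Preconditioning has a significant impact on training with Adam. After 1,000 iterations, the final loss of the run with KFAC is lower than the final loss of the run without it by the following percentages (a negative value means the preconditioned run ended with a higher loss for that term):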

In [31]:
def _last(state, key):
    """Return the final recorded entry of ``state[key]``."""
    return state[key][-1]


# Per-term reduction, in percent, of the final loss with preconditioning
# relative to the final loss without it: (1 - with / without) * 100.
for _label, _key in (
    ("Boundary: ", 'boundary_loss'),
    ("Interior: ", 'interior_loss'),
    ("Initial: ", 'initial_loss'),
):
    print(_label, (1 - _last(with_pre_state, _key)/_last(without_pre_state, _key))*100)

# Totals are the sum of the three final loss terms of each run.
total_with = (
    _last(with_pre_state, 'initial_loss')
    + _last(with_pre_state, 'interior_loss')
    + _last(with_pre_state, 'boundary_loss')
)
total_without = (
    _last(without_pre_state, 'initial_loss')
    + _last(without_pre_state, 'interior_loss')
    + _last(without_pre_state, 'boundary_loss')
)
print("Total: ", (1-total_with/total_without)*100)

Boundary:  47.90573123249503
Interior:  -1.3537768690625285
Initial:  86.01604817761297
Total:  73.02870165239695
