The purpose of this notebook is to determine the best learning rate for each of the architectures, after making sure that their numbers of trainable parameters are on the same scale.


First section: Checking the number of parameters of each model and adjusting the architectures until they are of the same order of magnitude

In [33]:
import torch
import normflows as nf
import matplotlib.pyplot as plt
import numpy as np
from tqdm import tqdm
from torch.utils.data import DataLoader as dl
from torch.utils.data import TensorDataset as ds

In [34]:
# Set to False to force CPU execution even when a GPU is available.
enable_cuda = True
if torch.cuda.is_available() and enable_cuda:
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# 2-D diagonal Gaussian available as a base distribution for the flows.
base = nf.distributions.DiagGaussian(2)

# Registry of candidate architectures; each builder cell appends one model.
Models = list()

In [35]:
def RotationalQuadraticSpline():
    """Build an autoregressive rational-quadratic spline flow for 2-D data.

    The flow stacks K = 7 pairs of
    ``AutoregressiveRationalQuadraticSpline(latent_size, hidden_layers,
    hidden_units)`` followed by ``LULinearPermute(latent_size)``.

    Returns:
        nf.NormalizingFlow: the assembled (untrained) model, with its own
        base distribution.
    """
    K = 7

    latent_size = 2
    hidden_units = 270
    hidden_layers = 2

    flows = []
    for _ in range(K):
        flows.append(
            nf.flows.AutoregressiveRationalQuadraticSpline(
                latent_size, hidden_layers, hidden_units
            )
        )
        flows.append(nf.flows.LULinearPermute(latent_size))

    # Give this model its OWN base distribution. normflows' DiagGaussian is
    # trainable by default, so reusing one module-level instance across
    # several models would let training (or .to(device)) of one model
    # silently modify the base of every other model being compared.
    q0 = nf.distributions.DiagGaussian(latent_size)

    # Construct flow model
    return nf.NormalizingFlow(q0, flows)

Models.append(RotationalQuadraticSpline())

In [36]:
#Real NVP Model here
def Real_nvp_init():
    """Build a Real NVP flow for 2-D data.

    The flow stacks 160 pairs of an ``AffineCouplingBlock`` (whose parameter
    map is an MLP with layer sizes [1, 128, 32, 84, 64, 16, 2], created with
    ``init_zeros=True``) followed by a ``Permute(2, mode="swap")``.

    Returns:
        nf.NormalizingFlow: the assembled (untrained) model, with its own
        base distribution.
    """
    num_layers = 160
    latent_size = 2

    Real_NVP_flows = []
    for _ in range(num_layers):
        # A fresh MLP per coupling block so that no weights are shared
        # between layers.
        parameters = nf.nets.MLP([1, 128, 32, 84, 64, 16, 2], init_zeros=True)
        Real_NVP_flows.append(nf.flows.AffineCouplingBlock(param_map=parameters))
        Real_NVP_flows.append(nf.flows.Permute(latent_size, mode="swap"))

    # Give this model its OWN base distribution. normflows' DiagGaussian is
    # trainable by default, so reusing one module-level instance across
    # several models would let training (or .to(device)) of one model
    # silently modify the base of every other model being compared.
    q0 = nf.distributions.DiagGaussian(latent_size)

    Real_NVP_model = nf.NormalizingFlow(q0=q0, flows=Real_NVP_flows)

    return Real_NVP_model

Models.append(Real_nvp_init())

In [37]:
def MAF_init():
    """Build a masked autoregressive flow (MAF) for 2-D data.

    The flow stacks K = 8 pairs of
    ``MaskedAffineAutoregressive(latent_size, hidden)`` followed by a
    ``Permute(2, mode="swap")``.

    Returns:
        nf.NormalizingFlow: the assembled (untrained) model, with its own
        base distribution.
    """
    hidden = 256  # number of hidden units
    latent_size = 2  # input dimensions
    K = 8

    MAF_flows = []
    for _ in range(K):
        MAF_flows.append(nf.flows.MaskedAffineAutoregressive(latent_size, hidden))
        MAF_flows.append(nf.flows.Permute(latent_size, mode="swap"))

    # Give this model its OWN base distribution. normflows' DiagGaussian is
    # trainable by default, so reusing one module-level instance across
    # several models would let training (or .to(device)) of one model
    # silently modify the base of every other model being compared.
    q0 = nf.distributions.DiagGaussian(latent_size)

    MAF_model = nf.NormalizingFlow(q0=q0, flows=MAF_flows)

    return MAF_model

Models.append(MAF_init())

In [38]:
#Neural Spline Flow
def Neural_Spline_Flow():
    """Build a neural spline flow for 2-D data.

    The flow stacks K = 8 pairs of
    ``AutoregressiveRationalQuadraticSpline(latent_size, hidden_layers,
    hidden_units)`` followed by ``LULinearPermute(latent_size)``.

    Side effects:
        Calls ``torch.manual_seed(0)``, i.e. it reseeds torch's GLOBAL random
        number generator before the layers are created.

    Returns:
        nf.NormalizingFlow: the assembled (untrained) model, with its own
        base distribution.
    """
    K = 8
    # NOTE(review): this is the only builder that seeds the RNG; the seed is
    # kept here so the initial weights of this model stay as before.
    torch.manual_seed(0)

    latent_size = 2
    hidden_units = 252
    hidden_layers = 2

    neural_flows = []
    for _ in range(K):
        neural_flows.append(
            nf.flows.AutoregressiveRationalQuadraticSpline(
                latent_size, hidden_layers, hidden_units
            )
        )
        neural_flows.append(nf.flows.LULinearPermute(latent_size))

    # Give this model its OWN base distribution. normflows' DiagGaussian is
    # trainable by default, so reusing one module-level instance across
    # several models would let training (or .to(device)) of one model
    # silently modify the base of every other model being compared.
    q0 = nf.distributions.DiagGaussian(latent_size)

    Neural_Spline_model = nf.NormalizingFlow(q0=q0, flows=neural_flows)

    return Neural_Spline_model

Models.append(Neural_Spline_Flow())


In [39]:
# Report how many trainable parameters each registered architecture has,
# so the models can be checked to be of comparable size.
for model in Models:
    # Only parameters that receive gradients count as trainable.
    model_parameters = [p for p in model.parameters() if p.requires_grad]
    params = sum(p.numel() for p in model_parameters)

    print(f'model has {params} trainable parameters')

model has 2141738 trainable parameters
model has 2187204 trainable parameters
model has 2119716 trainable parameters
model has 2139396 trainable parameters


Second Section: Training the models and seeing how they perform. This section will later be adapted into a
function that facilitates hyperparameter search.

As seen above, all the models have a similar number of trainable parameters. These are the models that I will use to compare and find their optimal learning rates.

In [40]:
def train_model(model, target_data_loader, learning_rate, max_epochs=1000, tolerance=0.0001):
    """Train a flow model with Adam on the forward-KLD loss and plot the loss curve.

    Training stops early once the mean loss of an epoch differs from the mean
    loss of the previous epoch by less than ``tolerance``. Convergence is
    judged per epoch rather than per mini-batch, because two consecutive
    mini-batches can have nearly equal losses purely by chance.

    Args:
        model: model exposing ``forward_kld(batch)``, ``parameters()``,
            ``train()`` and ``to(device)``.
        target_data_loader: iterable of batches; element ``[0]`` of each
            batch is the data tensor passed to ``model.forward_kld``.
        learning_rate: learning rate given to ``torch.optim.Adam``.
        max_epochs: maximum number of passes over ``target_data_loader``.
        tolerance: early-stopping threshold on the absolute change of the
            mean epoch loss.

    Returns:
        The model (moved to the module-level ``device``) after training.
    """
    model = model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    previous_loss = float('inf')
    losses = []  # per-batch losses, used only for the plot

    for epoch in range(max_epochs):
        model.train()
        epoch_total = 0.0
        finite_batches = 0

        for data in target_data_loader:
            optimizer.zero_grad()

            data = data[0].to(device)  # Extract data from TensorDataset and move to device
            loss = model.forward_kld(data)
            current_loss = loss.item()
            losses.append(current_loss)

            # Skip the update for NaN/inf losses so a single bad batch cannot
            # corrupt the weights; such batches are also excluded from the
            # epoch mean used for the convergence test.
            if torch.isfinite(loss):
                loss.backward()
                optimizer.step()
                epoch_total += current_loss
                finite_batches += 1

        if finite_batches == 0:
            # No usable batch this epoch (empty loader or only non-finite
            # losses): there is nothing to compare against.
            continue

        epoch_loss = epoch_total / finite_batches
        # previous_loss starts at +inf, so the first epoch can never trigger
        # early stopping.
        loss_diff = abs(epoch_loss - previous_loss)
        if loss_diff < tolerance:
            print(f"Early stopping at epoch {epoch+1} with loss difference {loss_diff:.6f}")
            break

        previous_loss = epoch_loss

    plt.figure()
    plt.plot(losses)
    plt.xlabel("Batch update")
    plt.ylabel("Forward KLD loss")
    plt.show()

    return model