# Training a neural network model

## 1. Importing necessary libraries

First, we import the Python libraries needed to define our neural network models and to handle a few auxiliary tasks.

In [None]:
import datetime
import os
import sys
import time
from pathlib import Path
from timeit import default_timer as timer
from typing import Dict, List

import matplotlib.pyplot as plt
import numpy as np
import torch
from sklearn.model_selection import train_test_split
from torch import nn
from torch.utils.data import TensorDataset, DataLoader
from torch.utils.tensorboard import SummaryWriter
from torchinfo import summary
from tqdm import tqdm

Second, we import a module that defines a class which helps us load the atmosphere quantities and the Stokes parameters, just as was done in the notebook about [Charging the data](./Charging_the_data.ipynb).

In [None]:
sys.path.append("../modules")
from ChargeData import MURaM

## 2. Dataset

### 2.1 Charge the data

Using the `MURaM` class, we will load the granular-intergranular balanced data for several filenames.

In [None]:
# Names of the files that are read to build the dataset
filenames = [
    "080000",
    "085000",
    "090000",
]

Let's concatenate the data of all the files to create a unified dataset of atmosphere magnitudes with their corresponding Stokes parameter spectra.

In [None]:
# Per-file pieces; they are joined into single arrays once every file is processed
atm_chunks = []
stokes_chunks = []

for fln in filenames:
    # One MURaM object per filename; the preparation methods run in this fixed order
    muram = MURaM(filename=fln)
    for step_name in (
        "charge_quantities",
        "optical_depth_stratification",
        "degrade_spec_resol",
        "scale_quantities",
        "gran_intergran_balance",
    ):
        getattr(muram, step_name)()

    atm_chunks.append(muram.atm_quant)
    stokes_chunks.append(muram.stokes)

# Join the pieces of all files along the first axis
atm_data = np.concatenate(atm_chunks, axis=0)
stokes_data = np.concatenate(stokes_chunks, axis=0)
    

In [None]:
# Display the shapes of the concatenated atmosphere and Stokes arrays
atm_data.shape, stokes_data.shape

### 2.2 Training and testing sets

Once the data is loaded, we need to split it into training and testing sets so it can be used in the neural network learning process. With `test_size=0.33`, the test set holds 33% of the whole dataset and the train set the remaining 67%.

In [None]:
# Stokes data are the inputs and atmosphere data the outputs; 33% of the samples
# are held out for testing, and random_state=42 makes the split reproducible.
in_train, in_test, out_train, out_test = train_test_split(stokes_data, atm_data, test_size=0.33, random_state=42)

In [None]:
# Report the shapes of the four arrays returned by the split
print("in_train shape:", in_train.shape)
print("out_train shape:", out_train.shape)
print("in_test shape:", in_test.shape)
print("out_test shape:", out_test.shape)

Having the test and train sets defined, let's charge them inside pytorch dataloaders for the training of the model.

In [None]:
# Device-agnostic setup: use the GPU when one is available, otherwise the CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Tensors stored in: {device}")

In [None]:
# Turn the four NumPy arrays into tensors and move each one to the chosen device
in_train, in_test, out_train, out_test = (
    torch.from_numpy(array).to(device)
    for array in (in_train, in_test, out_train, out_test)
)

In [None]:
# Report the sizes of the four tensors after the conversion
print("in_train shape:", in_train.size())
print("out_train shape:", out_train.size())
print("in_test shape:", in_test.size())
print("out_test shape:", out_test.size())

Because the output of the neural network model we are going to use is linear, it is necessary to flatten the last two axes of the output datasets, which correspond to the atmosphere magnitudes.

In [None]:
# Merge axes 1 and 2 of the output tensors into one, keeping axis 0 as it is
out_train = out_train.reshape(out_train.size(0), out_train.size(1) * out_train.size(2))
out_test = out_test.reshape(out_test.size(0), out_test.size(1) * out_test.size(2))

In [None]:
# Report the sizes again now that the output tensors have been flattened
print("in_train shape:", in_train.size())
print("out_train shape:", out_train.size())
print("in_test shape:", in_test.size())
print("out_test shape:", out_test.size())

Finally, having both datasets converted to tensors, let's save them in their corresponding pytorch dataloaders. Here we will define the batch size hyperparameter for the training process.

In [None]:
# Number of samples per batch
batch_size = 80

# Pair every input tensor with its output tensor
train_dataset = TensorDataset(in_train.to(device), out_train.to(device))
test_dataset = TensorDataset(in_test.to(device), out_test.to(device))

# The training loader reshuffles on every pass; the testing loader keeps its order
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

print(f"Length of train dataloader: {len(train_dataloader)} batches of {batch_size}")
print(f"Length of test dataloader: {len(test_dataloader)} batches of {batch_size}")

# Draw one training batch to inspect the shapes the model will receive
train_features_batch, train_labels_batch = next(iter(train_dataloader))
print(f"""
Shape of each batch input and output:
train input batch shape: {train_features_batch.shape}, 
train output batch shape: {train_labels_batch.shape}
        """ )

## 3. Neural network models

Now that we have the datasets defined, let's create our neural network model. 

### 3.1 Model creation

First, let's use a very simple 1D convolutional neural network architecture with a linear output.

In [None]:
class SimpleModel(nn.Module):
    """Single 1D convolution followed by a linear output layer.

    The layers run in this order: ``Conv1d`` (72 output channels, kernel size 2,
    stride 1, padding 1), ``ReLU``, ``Flatten``, ``Dropout`` (p=0.5) and ``Linear``.

    Args:
        in_shape: Number of input channels of the convolution.
        out_shape: Number of output features of the final linear layer.
        hidden_units: Accepted for interface compatibility; no layer uses it.
        in_length: Length of the last axis of the input. The default of 4 yields
            the 360 flattened features the linear layer was previously
            hard-coded for (72 channels * 5 positions).
    """

    def __init__(self, in_shape, out_shape, hidden_units, in_length=4):
        super().__init__()
        conv_channels = 72
        kernel_size = 2
        stride = 1
        padding = 1
        # Conv1d output length with dilation 1: floor((L + 2p - k) / s) + 1
        conv_length = (in_length + 2 * padding - kernel_size) // stride + 1
        self.simple_conv = nn.Sequential(
            nn.Conv1d(
                in_channels=in_shape,
                out_channels=conv_channels,
                kernel_size=kernel_size,
                stride=stride,
                padding=padding,
            ),
            nn.ReLU(),
            nn.Flatten(),
            nn.Dropout(p=0.5, inplace=False),
            nn.Linear(in_features=conv_channels * conv_length, out_features=out_shape),
        )

    def forward(self, x):
        """Pass ``x`` through the layer stack and return the linear output."""
        return self.simple_conv(x)


# 36 input channels and 6*20 = 120 outputs
# NOTE(review): presumably 6 atmosphere quantities x 20 depth points - confirm.
# The instance is bound to `model`, the name every later cell refers to.
model = SimpleModel(36, 6 * 20, hidden_units=4096).float()
# Previous name of the same object, kept so existing references keep working
simple_model = model

In [None]:
# Print the torchinfo summary of the model
summary(model)

It's important that the datasets and the model are both on the same device for performing the training.

In [None]:
# Move the model to the device selected earlier, where the tensors were also sent
print("\nThe model will be runned in:", device)
model.to(device)

Define the model loss function and optimizer, along with the learning rate and epochs hyperparameters.

In [None]:
# Number of training epochs and the optimizer learning rate
epochs, lr = 20, 1e-5

# Mean squared error between prediction and target; also known as the
# "criterion" or "cost function"
loss_fn = nn.MSELoss()

# Adam optimizer over all the parameters of the model
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

Next, we define the paths where the results of the training and the trained weights will be saved.

In [None]:
# Root folder for every training result
results_out = "Results/"
Path(results_out).mkdir(parents=True, exist_ok=True)

# Training folder for the specific hyperparameters
pth_out = results_out + f"{epochs}E_" + f"{lr}lr/"
Path(pth_out).mkdir(parents=True, exist_ok=True)

# Folder and file name where the model weights are saved
MODEL_PATH = Path(pth_out + "model_weights/")
MODEL_PATH.mkdir(parents=True, exist_ok=True)
MODEL_NAME = "inversion_" + str(epochs) + "E" + str(lr) + "lr" + ".pth"
MODEL_SAVE_PATH = MODEL_PATH / MODEL_NAME

# Reload the weights of an earlier run with the same hyperparameters, if any.
# map_location remaps the stored tensors onto the current device, so weights
# written on a GPU can still be loaded on a CPU-only machine.
if MODEL_SAVE_PATH.exists():
    model.load_state_dict(torch.load(f=MODEL_SAVE_PATH, map_location=device))

### 3.2 Training the model

We have everything set up! Let's code the training process. First, let's define the functions for training and testing.

In [None]:
def train_step(model: nn.Module, train_dataloader: DataLoader, loss_fn, optimizer, device):
    """
    Train the model for one pass over all the batches of the train dataloader.

    The model is cast to double precision in place, and every batch is moved to
    ``device`` and cast to double before the forward pass.

    Args:
        model(nn.Module): Model to be trained.
        train_dataloader(torch.utils.data.Dataloader): Dataloader of the training dataset.
        loss_fn: Loss function for the training process.
        optimizer: Optimizer function for the training process.
        device: Device on which the batches are placed before the forward pass.
    Returns:
        float: Mean of the per-batch losses over the whole pass.
    Raises:
        ValueError: If the dataloader contains no batches.
    """
    n_batches = len(train_dataloader)
    if n_batches == 0:
        raise ValueError("train_dataloader has no batches to train on")

    # Both calls are loop invariant, so they run once instead of once per batch
    model.double()
    model.train()

    train_loss = 0.0
    for batch, (X, y) in enumerate(train_dataloader):
        # 1. Forward pass
        X, y = X.to(device).double(), y.to(device).double()
        y_pred = model(X)

        # 2. Calculate loss (per batch)
        loss = loss_fn(y_pred.double(), y)
        # .item() stores a plain number; adding the tensor itself would keep the
        # autograd graph of every batch alive until the end of the epoch
        train_loss += loss.item()

        # 3. Optimizer zero grad
        optimizer.zero_grad()

        # 4. Loss backward
        loss.backward()

        # 5. Optimizer step
        optimizer.step()

        # Print out how many samples have been seen
        if batch % 400 == 0:
            print(f"Looked at {batch * len(X)}/{len(train_dataloader.dataset)} samples")

    # Average loss per batch over the epoch
    return train_loss / n_batches

def test_step(model: nn.Module, test_dataloader: DataLoader, loss_fn):
    """
    Function that performs the testing step over the testin dataloader.
    
    Args:
        model(nn.Module): Model to be tested.
        test_dataloader(torch.utils.data.Dataloader): Dataloader of the testing dataset.
        loss_fn: Loss function for the testing the model results.
    Returns:
        None
    """
    # Setup variables for accumulatively adding up loss and accuracy 
    test_loss, test_acc = 0, 0 
    model.eval()
    with torch.inference_mode():
        for X, y in test_dataloader:
            # 1. Forward pass
            test_pred = model(X)
        
            # 2. Calculate loss (accumatively)
            test_loss += loss_fn(test_pred, y) # accumulatively add up the loss per epoch

        # Calculations on test metrics need to happen inside torch.inference_mode()

        # Divide total test loss by length of test dataloader (per batch)
        test_loss /= len(test_dataloader)
    
    return test_loss

def train(model: torch.nn.Module, 
          train_dataloader: torch.utils.data.DataLoader, 
          test_dataloader: torch.utils.data.DataLoader, 
          optimizer: torch.optim.Optimizer,
          loss_fn: torch.nn.Module,
          epochs: int,
          device: torch.device,
          writer=None) -> Dict[str, List]:
    """Trains and tests a PyTorch model.

    Passes a target PyTorch model through train_step() and test_step()
    for a number of epochs, training and testing the model in the same
    epoch loop.

    Calculates, prints and stores the losses throughout. When a ``writer``
    is given, the model graph is logged once and the losses every epoch.

    Args:
      model: A PyTorch model to be trained and tested.
      train_dataloader: A DataLoader instance for the model to be trained on.
      test_dataloader: A DataLoader instance for the model to be tested on.
      optimizer: A PyTorch optimizer to help minimize the loss function.
      loss_fn: A PyTorch loss function to calculate loss on both datasets.
      epochs: An integer indicating how many epochs to train for.
      device: A target device to compute on (e.g. "cuda" or "cpu").
      writer: Optional object with the ``add_scalars``, ``add_graph`` and
        ``close`` methods (e.g. a tensorboard SummaryWriter). It is closed
        before returning. When None, nothing is logged.

    Returns:
      A dictionary with the training and testing loss, holding one float
      per epoch.
      In the form: {"train_loss": [...],
                    "test_loss": [...]}
      For example if training for epochs=2:
                   {"train_loss": [2.0616, 1.0537],
                    "test_loss": [1.2641, 1.5706]}
    """
    # Create empty results dictionary
    results = {"train_loss": [],
               "test_loss": [],
    }

    if writer is not None:
        # Log the architecture once, tracing it with a real training batch cast
        # to the device and dtype of the model parameters
        example_input, _ = next(iter(train_dataloader))
        reference = next(model.parameters(), None)
        if reference is not None:
            example_input = example_input.to(device=reference.device,
                                             dtype=reference.dtype)
        writer.add_graph(model=model, input_to_model=example_input)

    # Loop through training and testing steps for a number of epochs
    for epoch in tqdm(range(epochs)):
        # Positional calls following the parameter order of both step functions
        train_loss = float(train_step(model, train_dataloader, loss_fn, optimizer, device))
        test_loss = float(test_step(model, test_dataloader, loss_fn))

        # Print out what's happening
        print(
          f"Epoch: {epoch+1} | "
          f"train_loss: {train_loss:.4f} | "
          f"test_loss: {test_loss:.4f} | "
        )

        # Update results dictionary
        results["train_loss"].append(train_loss)
        results["test_loss"].append(test_loss)

        # Experiment tracking: add the loss results to the writer
        if writer is not None:
            writer.add_scalars(main_tag="Loss", 
                               tag_scalar_dict={"train_loss": train_loss,
                                                "test_loss": test_loss},
                               global_step=epoch)

    # Close the writer
    if writer is not None:
        writer.close()

    # Return the filled results at the end of the epochs
    return results

def print_train_time(start: float, end: float, device: torch.device = None):
    """Report and return the time elapsed between two timestamps.

    Args:
        start (float): Timestamp taken when the computation started.
        end (float): Timestamp taken when the computation finished.
        device ([type], optional): Device shown in the printed message. Defaults to None.

    Returns:
        float: ``end - start``, i.e. the elapsed time in the units of the inputs
        (the message labels it as seconds).
    """
    elapsed = end - start
    print(f"Train time on {device}: {elapsed:.3f} seconds")
    return elapsed

Let's train!

In [None]:
# One loss value per epoch for each of the two sets
train_loss_history = np.zeros((epochs,))
test_loss_history = np.zeros((epochs,))

total_train_time_model = 0
# Set timers
train_time_start_on_cpu = timer()
start = time.time()
for epoch in tqdm(range(epochs)):
    print(f"Epoch: {epoch}\n-------")
    ### Training
    train_loss = train_step(model, train_dataloader, loss_fn, optimizer, device)
    # float() accepts plain numbers as well as 0-d tensors on any device
    train_loss_history[epoch] = float(train_loss)

    ### Testing
    test_loss = test_step(model, test_dataloader, loss_fn)
    test_loss_history[epoch] = float(test_loss)

    ## Print out what's happening
    print(f"\nTrain loss: {train_loss:.5f} | Test loss: {test_loss:.5f}")

    # The start timestamp is taken once before the loop, so each reading is
    # already the time elapsed since training began. It is assigned rather than
    # added, because summing cumulative readings would overcount.
    train_time_end_on_cpu = timer()
    total_train_time_model = print_train_time(start=train_time_start_on_cpu, 
                                            end=train_time_end_on_cpu,
                                            device=str(next(model.parameters()).device))

    # Save the model state dict after every epoch
    print(f"Saving model to: {MODEL_SAVE_PATH}")
    torch.save(obj=model.state_dict(), # saving the state_dict() only saves the learned parameters
            f=MODEL_SAVE_PATH)

# Folder for the loss histories and the runtime of this run
metrics_out = pth_out+"loss_metrics/"
if not os.path.exists(metrics_out):
    os.mkdir(metrics_out)

train_loss_history_path = metrics_out+"train_loss_history"+str(epochs)+"E"+str(lr)+"lr"+".npy"
test_loss_history_path = metrics_out+"test_loss_history"+str(epochs)+"E"+str(lr)+"lr"+".npy"

np.save(train_loss_history_path, train_loss_history)
np.save(test_loss_history_path, test_loss_history)

# Wall-clock duration of the whole cell up to this point, written as H:MM:SS
runtime = time.time()-start
with open(metrics_out+"runtime.txt", "w") as f:
    f.write(str(datetime.timedelta(seconds=runtime)))


### 3.3 Looking up the metrics

Let's check out the metrics on how the model got trained.

In [None]:
# Two side-by-side panels: training loss on the left, testing loss on the right
fig, ax = plt.subplots(1, 2, figsize=(2*5,5))
epochs_a = range(epochs)

ax[0].plot(epochs_a, train_loss_history)
ax[0].set_yscale("log")
ax[0].set_xlabel("epochs")
ax[0].set_ylabel("Loss")
ax[0].set_title("Train")
ax[1].plot(epochs_a, test_loss_history)
ax[1].set_yscale("log")
ax[1].set_xlabel("epochs")
# This panel shows the test loss history, so it is titled accordingly
ax[1].set_title("Test")
fig.tight_layout()