# GeoProj Model Training

##### **Load libraries and CUDA**



In [3]:
import torch
import os
import torch.nn as nn
from torch.optim.lr_scheduler import StepLR
import math

from typing import Tuple

from dataloaderNetS import get_loader
from modelNetS_batch_size import EncoderNet, ModelNet, EPELoss
from models.utils import load_config

# Location of the project's config.ini. The default is the original
# machine-specific path; set the GEOPROJ_CONFIG_PATH environment variable to
# run the notebook on another machine without editing this cell.
config_path = os.environ.get(
    'GEOPROJ_CONFIG_PATH',
    'C:/Users/JoelVP/Desktop/UPV/ImageEnhancementTFG/imageenhancementtfg/data/config.ini',
)
config = load_config(config_path)

In [4]:
# Report which CUDA device (index 0) will be used, or state that none exists.
if not torch.cuda.is_available():
    print("No GPU available")
else:
    gpu_info = torch.cuda.get_device_properties(0)
    # total_memory is divided by 1024**3 to display it in GB.
    print(f"GPU Name: {gpu_info.name}")
    print(f"GPU Memory: {gpu_info.total_memory / 1024**3:.2f} GB")

GPU Name: Tesla T4
GPU Memory: 14.75 GB


##### **Define Params**

In [5]:
# Root directory of the dataset, taken from the 'lens_distortion' section of
# the loaded config. build_dataset() appends the split sub-folders to it.
dataset_dir = config['lens_distortion']['dataset_dir']

# **Sweep Config**



In [6]:
# Base sweep definition: Bayesian search. The remaining sections
# (metric, parameters, ...) are attached to this dict by the cells below.
sweep_config = dict(method='bayes')

In [None]:
# Hyperband early-termination settings for the sweep:
#   min_iter - minimum number of iterations before termination is considered
#   eta      - reduction of configurations upon reaching the 3 iterations
early_terminate = dict(type='hyperband', min_iter=3, eta=3)

sweep_config['early_terminate'] = early_terminate

In [7]:
# The sweep minimizes the value logged under the key "loss"
# (train() logs the average training loss of each epoch under that key).
metric = dict(name='loss', goal='minimize')

sweep_config['metric'] = metric

In [8]:
# Search space of the sweep, assembled one hyperparameter at a time.
parameters_dict = {}

# Optimizer names understood by build_optimizer().
parameters_dict['optimizer'] = {'values': ['adam', 'sgd']}

# NOTE(review): uniform sampling over a range spanning three orders of
# magnitude draws most values near the upper bound; consider a log-uniform
# distribution if small learning rates should be explored equally.
parameters_dict['learning_rate'] = {
    'distribution': 'uniform',
    'min': 1e-6,
    'max': 1e-3,
}

parameters_dict['batch_size'] = {'values': [16, 32, 64]}

# Fixed for every run of the sweep.
parameters_dict['epochs'] = {'value': 10}

sweep_config['parameters'] = parameters_dict

In [9]:
# Pretty-print the assembled sweep configuration for a visual check before it
# is registered with wandb.sweep().
import pprint
pprint.pprint(sweep_config)

{'method': 'grid',
 'metric': {'goal': 'minimize', 'name': 'loss'},
 'parameters': {'batch_size': {'value': 32},
                'epochs': {'value': 10},
                'learning_rate': {'values': [1e-06, 1e-05, 0.0001]},
                'optimizer': {'value': 'adam'}}}


##### **AUX FUNCTIONS**

In [10]:
# CUDA availability, evaluated once when this cell runs; read by the
# validation loop in train() to decide whether to move batches to the GPU.
use_GPU = torch.cuda.is_available()

def build_dataset(batch_size):
    """Create the training and validation loaders for barrel distortion.

    Both loaders come from ``get_loader`` and read from sub-folders of the
    module-level ``dataset_dir``. The validation loader is built from the
    ``test_*`` folders.

    Args:
        batch_size: Batch size forwarded to ``get_loader`` for both loaders.

    Returns:
        Tuple ``(train_loader, val_loader)``.
    """

    def _barrel_loader(image_dir, flow_dir):
        # Shared call: only the two directories differ between the splits.
        return get_loader(distortedImgDir=image_dir,
                          flowDir=flow_dir,
                          batch_size=batch_size,
                          distortion_type=['barrel'])

    train_loader = _barrel_loader(f'{dataset_dir}/train_distorted',
                                  f'{dataset_dir}/train_flow')
    val_loader = _barrel_loader(f'{dataset_dir}/test_distorted',
                                f'{dataset_dir}/test_flow')

    return train_loader, val_loader

def build_network(batch_size):
    """Instantiate the two networks and the loss, on the GPU when available.

    Args:
        batch_size: Forwarded to the ``ModelNet`` constructor.

    Returns:
        Tuple ``(model_1, model_2, criterion)`` holding an ``EncoderNet``, a
        ``ModelNet`` configured for ``'barrel'`` and an ``EPELoss``.
    """
    components = (
        EncoderNet([1,1,1,1,2]),
        ModelNet('barrel', batch_size),
        EPELoss(),
    )

    # Move every component to the GPU only when CUDA is present.
    if torch.cuda.is_available():
        components = tuple(component.cuda() for component in components)

    model_1, model_2, criterion = components
    return model_1, model_2, criterion

def build_optimizer(network, optimizer, learning_rate):
    """Create the optimizer selected by name, plus its StepLR scheduler.

    Args:
        network: Module whose ``parameters()`` will be optimized.
        optimizer: Optimizer name, either ``'adam'`` or ``'sgd'``
            (SGD is created with momentum 0.9).
        learning_rate: Initial learning rate.

    Returns:
        Tuple ``(optimizer, scheduler)``. The scheduler is a ``StepLR`` that
        halves the learning rate (gamma=0.5) every 2 scheduler steps.

    Raises:
        ValueError: If ``optimizer`` is not one of the supported names.
            Previously the raw string fell through to ``StepLR`` and failed
            there with an unrelated-looking error.
    """
    if optimizer == 'adam':
        optim = torch.optim.Adam(network.parameters(), lr=learning_rate)
    elif optimizer == "sgd":
        optim = torch.optim.SGD(network.parameters(),
                                lr=learning_rate, momentum=0.9)
    else:
        raise ValueError(
            f"Unsupported optimizer {optimizer!r}; expected 'adam' or 'sgd'"
        )

    scheduler = StepLR(optim, step_size=2, gamma=0.5)

    return optim, scheduler

## **TRAIN**

In [11]:
# Ejecutar en caso de error con wandb.login()
!pip install wandb -qU

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m52.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m207.3/207.3 kB[0m [31m26.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m300.2/300.2 kB[0m [31m37.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.7/62.7 kB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[?25h

In [12]:
# SECURITY: the W&B API key must never be stored in the notebook. Export it as
# the WANDB_API_KEY environment variable before starting the kernel. When the
# variable is unset, key=None lets wandb fall back to its own credential lookup.
# NOTE(review): the key previously committed in this cell should be revoked.
import wandb
wandb.login(key=os.environ.get("WANDB_API_KEY"))

[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [13]:
# Register the sweep described by sweep_config in the given W&B project; the
# returned id is what wandb.agent() is given to start runs.
sweep_id = wandb.sweep(sweep_config, project="pytorch-sweeps-demo-tfg")

Create sweep with ID: ylkb2h6s
Sweep URL: https://wandb.ai/joeldev/pytorch-sweeps-demo-tfg/sweeps/ylkb2h6s


**Start training**

In [14]:
def train_epoch(epoch, model_1, model_2, train_loader, optimizer, criterion, batch_size):
    """Train for one epoch and return the mean per-batch loss.

    Args:
        epoch: Epoch index; only used to compute the global "step" value
            logged to W&B.
        model_1: First network; receives the distorted images.
        model_2: Second network; receives model_1's output and produces the
            flow that is compared against the ground truth.
        train_loader: Sized iterable yielding ``(disimgs, disx, disy)`` batches.
        optimizer: Optimizer stepped once per batch.
        criterion: Loss, called as ``criterion(flow_output, flow_truth)``.
        batch_size: Only used to derive the logging interval.

    Returns:
        float: Sum of the per-batch losses divided by ``len(train_loader)``.

    Raises:
        ValueError: If ``train_loader`` yields no batches (previously this
            surfaced as a ZeroDivisionError when averaging).
    """
    num_batches = len(train_loader)
    if num_batches == 0:
        raise ValueError("train_loader is empty; cannot train an epoch without batches")

    # Number of batches between two "train_loss" log entries.
    # NOTE(review): len(train_loader) is divided by batch_size here; if the
    # intent was "steps per epoch", confirm whether the dataset length was
    # meant instead. Kept as-is to preserve the logging cadence.
    log_interval = math.ceil(num_batches / batch_size)

    # CUDA availability does not change during the epoch; query it once.
    on_gpu = torch.cuda.is_available()

    cumu_loss = 0.0    # running sum over the whole epoch
    window_loss = 0.0  # running sum since the last log entry

    for i, (disimgs, disx, disy) in enumerate(train_loader):
        if on_gpu:
            disimgs = disimgs.cuda()
            disx = disx.cuda()
            disy = disy.cuda()

        optimizer.zero_grad()

        # Ground-truth flow: x and y components concatenated along dim 1.
        flow_truth = torch.cat([disx, disy], dim=1)

        # Forward pass
        flow_output = model_2(model_1(disimgs))

        # Calculate loss
        loss = criterion(flow_output, flow_truth)

        # Backward pass and optimization step
        loss.backward()
        optimizer.step()

        # Accumulate total loss
        loss_value = loss.item()
        cumu_loss += loss_value
        window_loss += loss_value

        if (i + 1) % log_interval == 0:
            last_loss = window_loss / log_interval
            print(f"Iter {i + 1} Loss {last_loss}")
            wandb.log({"train_loss": last_loss, "step": epoch * num_batches + i + 1})
            window_loss = 0.0  # Reset the window after logging

    # Calculate average loss for the epoch
    average_loss = cumu_loss / num_batches

    print("Average Epoch Loss",average_loss)

    return average_loss

In [15]:
def train(config=None):
    """Run one W&B training run: build everything, then train and validate.

    For every epoch the function trains with ``train_epoch``, evaluates on the
    validation loader under ``torch.no_grad()``, logs both averages to W&B and
    steps the learning-rate scheduler.

    Args:
        config: Optional configuration forwarded to ``wandb.init``. When the
            function is called by ``wandb.agent`` the values come from the
            sweep controller through ``wandb.config``.
    """
    # Initialize a new wandb run
    with wandb.init(config=config):
        # If called by wandb.agent, as below,
        # this config will be set by Sweep Controller
        config = wandb.config

        # Plot every "epoch/*" metric against "epoch/step" instead of the
        # default W&B step.
        wandb.define_metric("epoch/step")
        wandb.define_metric("epoch/*", step_metric="epoch/step")

        train_loader, val_loader = build_dataset(config.batch_size)
        model_1, model_2, criterion = build_network(config.batch_size)
        # Only model_1's parameters are handed to the optimizer.
        # NOTE(review): confirm model_2 has no trainable parameters.
        optimizer, scheduler = build_optimizer(model_1, config.optimizer, config.learning_rate)

        for epoch in range(config.epochs):
            model_1.train()
            model_2.train()

            avg_train_loss = train_epoch(epoch, model_1, model_2, train_loader, optimizer, criterion, config.batch_size)
            # "loss" is the metric name the sweep is configured to minimize.
            wandb.log({"loss": avg_train_loss, "epoch": epoch})

            running_val_loss = 0.0
            # Set the model to evaluation mode, disabling dropout and using population
            # statistics for batch normalization.
            model_1.eval()
            model_2.eval()

            # Release cached GPU memory once before validation; doing it for
            # every batch only adds overhead.
            if use_GPU:
                torch.cuda.empty_cache()

            # Disable gradient computation and reduce memory consumption.
            with torch.no_grad():
                for i, (disimgs, disx, disy) in enumerate(val_loader):
                    if use_GPU:
                        disimgs = disimgs.cuda()
                        disx = disx.cuda()
                        disy = disy.cuda()

                    # Ground-truth flow: x and y components concatenated along dim 1.
                    flow_truth = torch.cat([disx, disy], dim=1)

                    # In one step
                    flow_output = model_2(model_1(disimgs))

                    val_loss = criterion(flow_output, flow_truth)
                    # .item() keeps the accumulator a plain Python float, so a
                    # tensor is not what ends up printed and logged below.
                    running_val_loss += val_loss.item()

            avg_val_loss = running_val_loss / len(val_loader)
            print(f'EPOCH {epoch}, LOSS train {avg_train_loss} LOSS val {avg_val_loss}')
            wandb.log({
            "epoch/avg_train_loss": avg_train_loss,
            "epoch/avg_val_loss": avg_val_loss,
            "epoch/step":epoch})

            scheduler.step()


In [16]:
# Start a sweep agent that calls train() once per run, for at most 20 runs.
wandb.agent(sweep_id, train, count=20)

[34m[1mwandb[0m: Agent Starting Run: o2ippa6w with config:
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	learning_rate: 1e-06
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: Currently logged in as: [33mjoeldev[0m. Use [1m`wandb login --relogin`[0m to force relogin


Iter 2 Loss 13.715343475341797
Iter 4 Loss 12.33252239227295
Iter 6 Loss 11.641318321228027
Iter 8 Loss 10.450703144073486
Iter 10 Loss 11.887238502502441
Average Epoch Loss 12.00542516708374
Optimizer in epoch 0 Adam (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    initial_lr: 1e-06
    lr: 1e-06
    maximize: False
    weight_decay: 0
)
EPOCH 0, LOSS train 12.00542516708374 LOSS val 10.482401847839355
Iter 2 Loss 10.166038990020752
Iter 4 Loss 9.462498188018799
Iter 6 Loss 9.34909439086914
Iter 8 Loss 11.296338081359863
Iter 10 Loss 9.366063594818115
Average Epoch Loss 9.928006649017334
Optimizer in epoch 1 Adam (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    initial_lr: 1e-06
    lr: 1e-06
    maximize: False
    weight_decay: 0
)
EPOCH 1, LOSS train 9.928

VBox(children=(Label(value='0.001 MB of 0.016 MB uploaded\r'), FloatProgress(value=0.07378513577894236, max=1.…

0,1
epoch,▁▂▃▃▄▅▆▆▇█
epoch/avg_train_loss,█▅▃▃▁▂▁▁▁▁
epoch/avg_val_loss,█▅▂▁▁▁▁▁▁▁
epoch/step,▁▂▃▃▄▅▆▆▇█
loss,█▅▃▃▁▂▁▁▁▁
step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
train_loss,█▇▆▅▅▄▄▆▅▃▄▂▂▂▃▄▂▂▃▂▂▄▃▃▂▁▂▂▂▂▂▂▂▁▃▂▂▃▁▁

0,1
epoch,9.0
epoch/avg_train_loss,7.07973
epoch/avg_val_loss,7.62096
epoch/step,9.0
loss,7.07973
step,100.0
train_loss,6.52036


[34m[1mwandb[0m: Agent Starting Run: s7qm9tig with config:
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	learning_rate: 1e-05
[34m[1mwandb[0m: 	optimizer: adam


Iter 2 Loss 15.95506477355957
Iter 4 Loss 10.850293159484863
Iter 6 Loss 9.042210578918457
Iter 8 Loss 7.042848587036133
Iter 10 Loss 8.392436027526855
Average Epoch Loss 10.256570625305176
Optimizer in epoch 0 Adam (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    initial_lr: 1e-05
    lr: 1e-05
    maximize: False
    weight_decay: 0
)
EPOCH 0, LOSS train 10.256570625305176 LOSS val 9.048460960388184
Iter 2 Loss 11.600654602050781
Iter 4 Loss 20.18429946899414
Iter 6 Loss 11.715482711791992
Iter 8 Loss 26.280030250549316
Iter 10 Loss 15.644510269165039
Average Epoch Loss 17.084995460510253
Optimizer in epoch 1 Adam (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    initial_lr: 1e-05
    lr: 1e-05
    maximize: False
    weight_decay: 0
)
EPOCH 1, LOSS train 17.

VBox(children=(Label(value='0.001 MB of 0.016 MB uploaded\r'), FloatProgress(value=0.07360104550314839, max=1.…

0,1
epoch,▁▂▃▃▄▅▆▆▇█
epoch/avg_train_loss,▁▅▃▂▄█▂▃▃▂
epoch/avg_val_loss,▇▃▁▂▄▇████
epoch/step,▁▂▃▃▄▅▆▆▇█
loss,▁▅▃▂▄█▂▃▃▂
step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
train_loss,▂▂▁▁▂▃▂▄▂▂▁▁▁▁▁▁▁▅▁▁█▁▂▃▂▂▁▁▁▂▂▁▂▂▂▁▁▂▂▂

0,1
epoch,9.0
epoch/avg_train_loss,11.60439
epoch/avg_val_loss,9.12511
epoch/step,9.0
loss,11.60439
step,100.0
train_loss,12.0567


[34m[1mwandb[0m: Agent Starting Run: f8ttcym0 with config:
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	optimizer: adam


Iter 2 Loss 13.547564506530762
Iter 4 Loss 10.222711086273193
Iter 6 Loss 13.007150173187256
Iter 8 Loss 9.502224922180176
Iter 10 Loss 9.34099817276001
Average Epoch Loss 11.12412977218628
Optimizer in epoch 0 Adam (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    initial_lr: 0.0001
    lr: 0.0001
    maximize: False
    weight_decay: 0
)
EPOCH 0, LOSS train 11.12412977218628 LOSS val 8.59827995300293
Iter 2 Loss 47.30213737487793
Iter 4 Loss 180.280029296875
Iter 6 Loss 465.9822692871094
Iter 8 Loss 464.2354431152344
Iter 10 Loss 326.4822311401367
Average Epoch Loss 296.8564220428467
Optimizer in epoch 1 Adam (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    initial_lr: 0.0001
    lr: 0.0001
    maximize: False
    weight_decay: 0
)
EPOCH 1, LOSS train 296.856

VBox(children=(Label(value='0.001 MB of 0.016 MB uploaded\r'), FloatProgress(value=0.07367357851583388, max=1.…

0,1
epoch,▁▂▃▃▄▅▆▆▇█
epoch/avg_train_loss,▁▅▅▄██▃▃▃▃
epoch/avg_val_loss,▁▁▁▁▄▆▅▇█▇
epoch/step,▁▂▃▃▄▅▆▆▇█
loss,▁▅▅▄██▃▃▃▃
step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
train_loss,▁▁▁▁▁▂▃▃▂▂▂▂▂▂▂▃▂█▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▂▁▁▁▁▁▂

0,1
epoch,9.0
epoch/avg_train_loss,143.73981
epoch/avg_val_loss,81.37956
epoch/step,9.0
loss,143.73981
step,100.0
train_loss,203.18204
