In [1]:
import argparse
import datetime
import json
import numpy as np
import os
import time
from pathlib import Path


import pdb


In [2]:

import torch
import torch.backends.cudnn as cudnn
from torch.utils.tensorboard import SummaryWriter
import torchvision.transforms as transforms
import torchvision.datasets as datasets


In [3]:
import timm
#assert timm.__version__ == "0.3.2"  # version check
import timm.optim.optim_factory as optim_factory

  from .autonotebook import tqdm as notebook_tqdm


In [4]:


import util.misc as misc
from util.misc import NativeScalerWithGradNormCount as NativeScaler
from util.custom_dataset import CustomDataset
import util.transform_npy as transform_npy


import models_mae

from engine_pretrain import train_one_epoch


In [5]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torchvision import transforms, datasets
import logging
import os


from torch.nn import DataParallel

from functools import partial

from tqdm import tqdm 

In [6]:


# Directories for training logs and for saved models; change both to suit your machine.
log_dir, output_dir = (
    'D:/dados_tcc/output_dir/logs',    # log files
    'D:/dados_tcc/output_dir/models',  # saved model weights
)


In [7]:

# Hyperparameters and run settings, grouped by purpose.

# -- reproducibility --
seed = 42

# -- data loading --
input_size = 224   # side length (pixels) of the square model input
batch_size = 64
num_workers = 2    # NOTE(review): not passed to the DataLoader in the visible cells
pin_mem = True

# -- optimisation schedule --
learning_rate = 0.001
start_epoch = 0
epochs = 50

# -- checkpoint resume --
resume_path = ''   # presumably a checkpoint path to resume from; empty means none -- TODO confirm


In [8]:
def get_args_parser():
    """Build the argument parser named ``train_one_epoch-args``.

    The parser is created with ``add_help=False`` and defines six options:
    ``--accum_iter``, ``--epochs``, ``--mask_ratio``, ``--warmup_epochs``,
    ``--lr`` and ``--min_lr``.

    Returns:
        argparse.ArgumentParser: the configured (not yet parsed) parser.
    """
    # (flag, keyword arguments for add_argument), in registration order.
    option_table = (
        ('--accum_iter', dict(
            default=1, type=int,
            help='Accumulate gradient iterations (for increasing the effective batch size under memory constraints)')),
        ('--epochs', dict(default=100, type=int)),
        ('--mask_ratio', dict(
            default=0.75, type=float,
            help='Masking ratio (percentage of removed patches).')),
        ('--warmup_epochs', dict(
            type=int, default=5, metavar='N',
            help='epochs to warmup LR')),
        ('--lr', dict(
            type=float, default=None, metavar='LR',
            help='learning rate (absolute lr)')),
        ('--min_lr', dict(
            type=float, default=0., metavar='LR',
            help='lower lr bound for cyclic schedulers that hit 0')),
    )

    arg_parser = argparse.ArgumentParser('train_one_epoch-args', add_help=False)
    for flag, add_kwargs in option_table:
        arg_parser.add_argument(flag, **add_kwargs)
    return arg_parser
    



# Parse an empty argv so every option starts at its default, then overwrite the
# two values that this notebook configures itself.
args_train_epoch = get_args_parser().parse_args([])
vars(args_train_epoch).update(epochs=epochs, lr=learning_rate)

    


In [9]:
# Root folder of the dataset.
data_path = "D:/dados_tcc"

# Train on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Seed the PyTorch and NumPy random number generators with the notebook-level `seed`.
torch.manual_seed(seed)
np.random.seed(seed)

# Let cuDNN benchmark and pick convolution algorithms for speed.
# NOTE(review): benchmark mode may select different algorithms from run to run,
# so results are not guaranteed to be bit-for-bit reproducible despite the seed.
cudnn.benchmark = True

# Create the log directory and open a TensorBoard writer on it.
os.makedirs(log_dir, exist_ok=True)
log_writer = SummaryWriter(log_dir=log_dir)

# Create the directory that receives the saved models.
os.makedirs(output_dir, exist_ok=True)



# Load the data.
# Preprocessing / augmentation pipeline applied to every training sample.
transform_train = transforms.Compose([
    transform_npy.AddingPad((512,512)),
    transform_npy.ResizeNumpy((input_size, input_size)),
    transform_npy.RandomHorizontalFlipNpy(),
    transform_npy.RandomRotationNpy(degrees=(-30, 30)),  # Random rotation, degrees=(-30, 30)
    # Copy the array before ToTensor (the original note says this avoids a "grad error";
    # presumably the flip/rotation can return a non-contiguous view -- TODO confirm).
    transforms.Lambda(lambda data: data.copy()),
    transforms.ToTensor(),
    # Single-channel mean/std; presumably statistics of this dataset -- TODO confirm.
    transforms.Normalize(mean=[130.10511327778235], std=[316.09062860899644])
])

# Training dataset read from the 'train' subfolder of data_path.
train_dataset = CustomDataset(data_path=os.path.join(data_path, 'train'), transform=transform_train)

# Random sampling without replacement; this is what shuffle=True would build internally.
sampler_train = torch.utils.data.RandomSampler(train_dataset)

# Single DataLoader for training. `sampler` and `shuffle` are mutually exclusive,
# so the explicit sampler replaces shuffle=True. drop_last discards the final
# incomplete batch.
train_loader = torch.utils.data.DataLoader(
    train_dataset,
    sampler=sampler_train,
    batch_size=batch_size,
    pin_memory=pin_mem,
    drop_last=True,
)
    


In [10]:

# Define the model (the MaskedAutoencoderViT class comes from models_mae).

# Instantiate the model for single-channel inputs of size input_size x input_size.
model = models_mae.MaskedAutoencoderViT(
        img_size = input_size, patch_size=16, in_chans=1, embed_dim=768, depth=6, num_heads=6,
        decoder_embed_dim=512, decoder_depth=4, decoder_num_heads=8,
        mlp_ratio=4, norm_layer= partial(nn.LayerNorm, eps=1e-6))

# Move the parameters to the selected device and make sure they are float32.
model.to(device)
model = model.float()
model = DataParallel(model)  # Wrap for multi-GPU data-parallel training


model_without_ddp = model.module  # The underlying model without the DataParallel wrapper


# Gradient scaler and optimizer.
# NativeScaler is this notebook's alias for util.misc.NativeScalerWithGradNormCount:
# it is a scaler, not a loss function (the model's forward pass returns the loss).
loss_scaler = NativeScaler()
criterion = loss_scaler  # backward-compatible alias for the previous, misleading name
optimizer = torch.optim.AdamW(model.parameters(), lr= learning_rate, betas=(0.9, 0.95))



In [11]:
import logging
import time

# ...

# Configure file logging for this run.
log_filename = os.path.join(log_dir, 'training_v0_224.log')
logging.basicConfig(filename=log_filename, level=logging.INFO, format='%(asctime)s [%(levelname)s] - %(message)s')

# Single checkpoint file; every save overwrites the previous one.
model_checkpoint = os.path.join(output_dir, 'model_v0_224.pt')

# Training loop
for epoch in range(epochs):
    start_time = time.time()  # Epoch start time

    logging.info(f'Época [{epoch+1}/{epochs}]')

    # tqdm wraps the loader to show a progress bar for the epoch
    progress_bar = tqdm(train_loader, desc=f'Época [{epoch+1}/{epochs}]', leave=False, dynamic_ncols=True)

    epoch_losses = []  # Per-batch losses of this epoch

    for batch_idx, batch in enumerate(progress_bar):
        images = batch.to(device).float()

        # Reset the gradients
        optimizer.zero_grad()

        # Forward pass: the model returns the loss as the first of three outputs
        loss, _, _ = model(images)
        # With more than one GPU, DataParallel gathers the per-replica scalar
        # losses into a vector and backward() would reject it. Averaging is a
        # no-op when the loss is already a scalar.
        loss = loss.mean()

        # Backpropagation and optimizer step
        loss.backward()
        optimizer.step()

        # Read the loss value once (each .item() call synchronises with the device)
        loss_value = loss.item()

        # Update the text shown on the progress bar
        progress_bar.set_postfix(loss=f'{loss_value:.4f}')

        # Record the batch loss
        epoch_losses.append(loss_value)

    end_time = time.time()  # Epoch end time
    epoch_time = end_time - start_time  # Total epoch duration

    # Mean loss of the epoch. An empty loader (e.g. fewer samples than one full
    # batch with drop_last=True) yields NaN instead of a ZeroDivisionError.
    avg_epoch_loss = sum(epoch_losses) / len(epoch_losses) if epoch_losses else float('nan')

    # Write the epoch duration and mean loss to the log file
    logging.info(f'Tempo da Época [{epoch+1}/{epochs}]: {epoch_time:.2f} segundos - Perda Média: {avg_epoch_loss:.4f}')

    # Save every 5 epochs and also after the final epoch, so the last epochs of
    # training are never lost (with epochs=50 the "every 5" rule alone stops at
    # epoch index 45).
    # NOTE: `model` is the DataParallel wrapper, so the saved state_dict keys
    # carry the 'module.' prefix.
    if epoch % 5 == 0 or epoch + 1 == epochs:
        torch.save(model.state_dict(), model_checkpoint)


                                                                               

KeyboardInterrupt: 

In [None]:


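# NOTE(review): everything in this cell is a disabled alternative training path kept
# for reference. Before re-enabling it, make sure `loss_scaler` and `data_loader_train`
# are defined (the active loader in this notebook is named `train_loader`).
# The output recorded below ended with
# "AttributeError: 'str' object has no attribute 'output_dir'"; presumably
# misc.save_model_new expects an args-like object rather than the `output_dir`
# string as its first argument -- TODO confirm against util/misc.py.
# # Load model checkpoint and optimizer state if available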
# misc.load_model_new(resume_path, 
#                     model_without_ddp=model_without_ddp, 
#                     optimizer=optimizer, 
#                     loss_scaler=loss_scaler)


# # Start training loop for specified number of epochs
# print(f"Start training for {epochs} epochs")
# start_time = time.time()



# for epoch in range(start_epoch, epochs):
    
#     # Perform one epoch of training and get training statistics
#     train_stats = train_one_epoch(
#         model, 
#         data_loader_train,
#         optimizer, 
#         device, 
#         epoch, 
#         loss_scaler,
#         log_writer=log_writer,
#         args= args_train_epoch
#     )
    
#     # Save model checkpoint and statistics periodically
#     if output_dir and (epoch % 5 == 0 or epoch + 1 == epochs):
#         misc.save_model_new(
#             output_dir, model=model, model_without_ddp=model_without_ddp, optimizer=optimizer,
#             loss_scaler=loss_scaler, epoch=epoch)
    
#     # Prepare log statistics for logging
#     log_stats = {**{f'train_{k}': v for k, v in train_stats.items()},
#                     'epoch': epoch,}
    
#     # Write log statistics to file if applicable
#     if output_dir and misc.is_main_process():
#         if log_writer is not None:
#             log_writer.flush()
#         with open(os.path.join(output_dir, "log.txt"), mode="a", encoding="utf-8") as f:
#             f.write(json.dumps(log_stats) + "\n")

# # Calculate total training time and print
# total_time = time.time() - start_time
# total_time_str = str(datetime.timedelta(seconds=int(total_time)))
# print('Training time {}'.format(total_time_str))

Start training for 10 epochs
log_dir: D:/dados_tcc/output_dir/logs
Epoch: [0]  [  0/552]  eta: 1:17:23  lr: 0.000000  loss: 2.2565 (2.2565)  time: 8.4125  data: 2.6350  max mem: 1452
Epoch: [0]  [ 10/552]  eta: 0:44:04  lr: 0.000004  loss: 2.1298 (2.0922)  time: 4.8791  data: 0.7350  max mem: 1907
Epoch: [0]  [ 20/552]  eta: 0:44:57  lr: 0.000007  loss: 1.7570 (1.8240)  time: 4.9029  data: 0.5318  max mem: 1907
Epoch: [0]  [ 30/552]  eta: 0:42:14  lr: 0.000011  loss: 1.3481 (1.6289)  time: 4.8434  data: 0.5238  max mem: 1907
Epoch: [0]  [ 40/552]  eta: 0:41:53  lr: 0.000014  loss: 1.0799 (1.4868)  time: 4.7418  data: 0.6080  max mem: 1907
Epoch: [0]  [ 50/552]  eta: 0:41:58  lr: 0.000018  loss: 1.0107 (1.3983)  time: 5.2653  data: 0.6795  max mem: 1907
Epoch: [0]  [ 60/552]  eta: 0:41:48  lr: 0.000022  loss: 0.9556 (1.3248)  time: 5.4826  data: 0.6752  max mem: 1907
Epoch: [0]  [ 70/552]  eta: 0:41:22  lr: 0.000025  loss: 0.8937 (1.2603)  time: 5.4903  data: 0.7022  max mem: 1907
Epoch

AttributeError: 'str' object has no attribute 'output_dir'