In [1]:
import os
from os.path import join as osp
from pathlib import Path
import sys
import logging 
import argparse
import torch
import torch.nn as nn
import numpy
import pickle

from src.dataset import get_dataloaders
from src.model import get_model
from src.hyps import get_hyps
from src.utils import run_training

In [2]:
class CONFIG:
    """Settings for a training run, exposed as plain class attributes.

    The class is used as a namespace: it is passed uninstantiated to
    ``main``/``train``, which read the attributes below directly.
    """

    # Dataset root; presumably consumed by get_dataloaders -- confirm there.
    data: str = "data/imagewoof2/"
    # Mini-batch size; presumably consumed by get_dataloaders -- confirm there.
    batch_size: int = 4

    # Epoch count handed to run_training by train().
    epochs: int = 10

    # Names handed to get_hyps by train() to select optimizer and scheduler.
    optimizer: str = "RAdam"
    scheduler: str = "CosineAnnealingLR"

    # Path of a checkpoint to load before training; "" means none is loaded.
    checkpoint: str = ""

In [3]:
def train(opt):
    """Train the model configured by ``opt`` and pickle the loss history.

    Args:
        opt: Configuration object (e.g. ``CONFIG``) exposing ``optimizer``,
            ``scheduler``, ``epochs`` and ``checkpoint`` attributes. The
            whole object is also passed to ``get_dataloaders``.

    Side effects:
        Writes the history returned by ``run_training`` to
        ``history_loss.pkl`` in the current working directory, overwriting
        any existing file of that name (so a second run replaces the
        history of the first).
    """
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda:0" if use_cuda else "cpu")
    # logging formats lazily with %-style placeholders; passing the device as
    # an extra argument without one makes the handler report a formatting
    # error instead of printing the device.
    logging.info("Device name: %s", device)

    train_dl, val_dl = get_dataloaders(opt)

    model = get_model()
    model.to(device)

    if opt.checkpoint:
        # map_location remaps the stored tensors onto the device selected
        # above, so a checkpoint written on GPU still loads on a CPU-only box.
        state_dict = torch.load(opt.checkpoint, map_location=device)
        model.load_state_dict(state_dict)

    optimizer, scheduler, criterion = get_hyps(opt.optimizer, opt.scheduler, model)

    model, history = run_training(model, optimizer, scheduler, train_dl,
                                  val_dl, criterion, device, opt.epochs)

    with open("history_loss.pkl", "wb") as pkl_handle:
        pickle.dump(history, pkl_handle)

In [4]:
def main(cfg):
    """Notebook entry point: forward the configuration to ``train``.

    Args:
        cfg: Configuration object passed unchanged as ``train``'s ``opt``.
    """
    train(opt=cfg)

In [5]:
# First run: on a top-to-bottom execution CONFIG.checkpoint is still "" here,
# so train() skips the checkpoint load and runs for CONFIG.epochs (10) epochs.
main(CONFIG)

[INFO] Using GPU: NVIDIA GeForce RTX 3060 Laptop GPU



  0%|          | 0/2256 [00:00<?, ?it/s]

	add_(Number alpha, Tensor other)
Consider using one of the following signatures instead:
	add_(Tensor other, *, Number alpha) (Triggered internally at  ..\torch\csrc\utils\python_arg_parser.cpp:1174.)
  p.data.mul_(self.alpha).add_(1.0 - self.alpha, param_state['cached_params'])  # crucial line


Train epoch loss: 2.244933715367571


  0%|          | 0/492 [00:00<?, ?it/s]

Val epoch loss: 2.2625553141298047
[34mValidation Loss Improved (inf ---> 2.2625553141298047)
Model Saved[0m



  0%|          | 0/2256 [00:00<?, ?it/s]

Train epoch loss: 2.1058261191591305


  0%|          | 0/492 [00:00<?, ?it/s]

Val epoch loss: 2.012172334641408
[34mValidation Loss Improved (2.2625553141298047 ---> 2.012172334641408)
Model Saved[0m



  0%|          | 0/2256 [00:00<?, ?it/s]

Train epoch loss: 2.0151145668528603


  0%|          | 0/492 [00:00<?, ?it/s]

Val epoch loss: 1.8231561761619663
[34mValidation Loss Improved (2.012172334641408 ---> 1.8231561761619663)
Model Saved[0m



  0%|          | 0/2256 [00:00<?, ?it/s]

Train epoch loss: 1.9623277686694836


  0%|          | 0/492 [00:00<?, ?it/s]

Val epoch loss: 1.8433713672057337



  0%|          | 0/2256 [00:00<?, ?it/s]

Train epoch loss: 1.8815308959877237


  0%|          | 0/492 [00:00<?, ?it/s]

Val epoch loss: 1.7039527819034677
[34mValidation Loss Improved (1.8231561761619663 ---> 1.7039527819034677)
Model Saved[0m



  0%|          | 0/2256 [00:00<?, ?it/s]

Train epoch loss: 1.7999426773782317


  0%|          | 0/492 [00:00<?, ?it/s]

Val epoch loss: 1.5984682591400186
[34mValidation Loss Improved (1.7039527819034677 ---> 1.5984682591400186)
Model Saved[0m



  0%|          | 0/2256 [00:00<?, ?it/s]

Train epoch loss: 1.705149888648843


  0%|          | 0/492 [00:00<?, ?it/s]

Val epoch loss: 1.5952890367415697
[34mValidation Loss Improved (1.5984682591400186 ---> 1.5952890367415697)
Model Saved[0m



  0%|          | 0/2256 [00:00<?, ?it/s]

Train epoch loss: 1.6198515339311979


  0%|          | 0/492 [00:00<?, ?it/s]

Val epoch loss: 1.4797512689417147
[34mValidation Loss Improved (1.5952890367415697 ---> 1.4797512689417147)
Model Saved[0m



  0%|          | 0/2256 [00:00<?, ?it/s]

Train epoch loss: 1.5543889460675682


  0%|          | 0/492 [00:00<?, ?it/s]

Val epoch loss: 1.3756312911035329
[34mValidation Loss Improved (1.4797512689417147 ---> 1.3756312911035329)
Model Saved[0m



  0%|          | 0/2256 [00:00<?, ?it/s]

Train epoch loss: 1.496486041668459


  0%|          | 0/492 [00:00<?, ?it/s]

Val epoch loss: 1.3614422963089723
[34mValidation Loss Improved (1.3756312911035329 ---> 1.3614422963089723)
Model Saved[0m

Training complete in 3h 17m 56s
Best Loss: 1.3614


In [6]:
# Continue training for 15 more epochs, starting from a saved checkpoint.
# The file name suggests it is the best model of the first run
# (loss 1.3614, epoch 10) -- presumably written by run_training; confirm.
# NOTE: this mutates the shared CONFIG class in place, so re-running earlier
# cells after this one will pick up these values instead of the defaults.
CONFIG.checkpoint = "Loss1.3614_epoch10.pt"
CONFIG.epochs = 15

In [7]:
# Second run: same entry point, now using the checkpoint and epoch count set
# on CONFIG in the previous cell.
# NOTE(review): the recorded output of this run shows the best-loss tracker
# restarting from inf, several `nan` validation losses and one spike (~22.0);
# worth investigating before relying on the saved model.
main(CONFIG)

[INFO] Using GPU: NVIDIA GeForce RTX 3060 Laptop GPU



  0%|          | 0/2256 [00:00<?, ?it/s]

Train epoch loss: 1.3907491633443967


  0%|          | 0/492 [00:00<?, ?it/s]

Val epoch loss: 1.3180871802481964
[34mValidation Loss Improved (inf ---> 1.3180871802481964)
Model Saved[0m



  0%|          | 0/2256 [00:00<?, ?it/s]

Train epoch loss: 1.392816056543631


  0%|          | 0/492 [00:00<?, ?it/s]

Val epoch loss: 1.303227211370757
[34mValidation Loss Improved (1.3180871802481964 ---> 1.303227211370757)
Model Saved[0m



  0%|          | 0/2256 [00:00<?, ?it/s]

Train epoch loss: 1.3675616993601862


  0%|          | 0/492 [00:00<?, ?it/s]

Val epoch loss: 1.303143229973653
[34mValidation Loss Improved (1.303227211370757 ---> 1.303143229973653)
Model Saved[0m



  0%|          | 0/2256 [00:00<?, ?it/s]

Train epoch loss: 1.3264105333715466


  0%|          | 0/492 [00:00<?, ?it/s]

Val epoch loss: 1.2844667123395026
[34mValidation Loss Improved (1.303143229973653 ---> 1.2844667123395026)
Model Saved[0m



  0%|          | 0/2256 [00:00<?, ?it/s]

Train epoch loss: 1.284739937210548


  0%|          | 0/492 [00:00<?, ?it/s]

Val epoch loss: 1.2148337700525353
[34mValidation Loss Improved (1.2844667123395026 ---> 1.2148337700525353)
Model Saved[0m



  0%|          | 0/2256 [00:00<?, ?it/s]

Train epoch loss: 1.2640770372293941


  0%|          | 0/492 [00:00<?, ?it/s]

Val epoch loss: nan



  0%|          | 0/2256 [00:00<?, ?it/s]

Train epoch loss: 1.2371733504618314


  0%|          | 0/492 [00:00<?, ?it/s]

Val epoch loss: 1.2795117698755845



  0%|          | 0/2256 [00:00<?, ?it/s]

Train epoch loss: 1.2195923009240035


  0%|          | 0/492 [00:00<?, ?it/s]

Val epoch loss: 1.1955707440791346
[34mValidation Loss Improved (1.2148337700525353 ---> 1.1955707440791346)
Model Saved[0m



  0%|          | 0/2256 [00:00<?, ?it/s]

Train epoch loss: 1.1846748946353476


  0%|          | 0/492 [00:00<?, ?it/s]

Val epoch loss: 22.00307343627515



  0%|          | 0/2256 [00:00<?, ?it/s]

Train epoch loss: 1.1584187981454617


  0%|          | 0/492 [00:00<?, ?it/s]

Val epoch loss: nan



  0%|          | 0/2256 [00:00<?, ?it/s]

Train epoch loss: 1.140413059991725


  0%|          | 0/492 [00:00<?, ?it/s]

Val epoch loss: nan



  0%|          | 0/2256 [00:00<?, ?it/s]

Train epoch loss: 1.1174053487942575


  0%|          | 0/492 [00:00<?, ?it/s]

Val epoch loss: 1.1404172915573731
[34mValidation Loss Improved (1.1955707440791346 ---> 1.1404172915573731)
Model Saved[0m



  0%|          | 0/2256 [00:00<?, ?it/s]

Train epoch loss: 1.1016994436793293


  0%|          | 0/492 [00:00<?, ?it/s]

Val epoch loss: nan



  0%|          | 0/2256 [00:00<?, ?it/s]

Train epoch loss: 1.0868972000923562


  0%|          | 0/492 [00:00<?, ?it/s]

Val epoch loss: 1.1287894243501044
[34mValidation Loss Improved (1.1404172915573731 ---> 1.1287894243501044)
Model Saved[0m



  0%|          | 0/2256 [00:00<?, ?it/s]

Train epoch loss: 1.06737120238179


  0%|          | 0/492 [00:00<?, ?it/s]

Val epoch loss: 1.0989954318063317
[34mValidation Loss Improved (1.1287894243501044 ---> 1.0989954318063317)
Model Saved[0m

Training complete in 4h 44m 44s
Best Loss: 1.0990
