In [1]:
import os
import time
import shutil
import numpy as np
import tensorflow as tf
import core.utils as utils
from tqdm import tqdm
from core.dataset import Dataset
from core.yolov3 import YOLOv3, decode, compute_loss
from core.config import cfg

# ---------------------------------------------------------------------------
# Data source and step bookkeeping
# ---------------------------------------------------------------------------
trainset = Dataset('train')
logdir = "./data/log_try3"
steps_per_epoch = len(trainset)
# Step counter shared with the training step; starts at 1 and is never
# touched by the optimizer (trainable=False).
global_steps = tf.Variable(1, trainable=False, dtype=tf.int64)
warmup_steps = steps_per_epoch * cfg.TRAIN.WARMUP_EPOCHS
total_steps = steps_per_epoch * cfg.TRAIN.EPOCHS

# ---------------------------------------------------------------------------
# Model assembly
# ---------------------------------------------------------------------------
input_tensor = tf.keras.layers.Input([416, 416, 3])
conv_tensors = YOLOv3(input_tensor)

# The model exposes, for every tensor returned by YOLOv3, the raw tensor
# immediately followed by its decoded counterpart:
#   [conv_0, pred_0, conv_1, pred_1, ...]
output_tensors = []
for i, conv_tensor in enumerate(conv_tensors):
    pred_tensor = decode(conv_tensor, i)
    output_tensors += [conv_tensor, pred_tensor]

model = tf.keras.Model(inputs=input_tensor, outputs=output_tensors)
optimizer = tf.keras.optimizers.Adam()

# ---------------------------------------------------------------------------
# Summary writer: start from an empty log directory on every run
# ---------------------------------------------------------------------------
if os.path.exists(logdir):
    shutil.rmtree(logdir)
writer = tf.summary.create_file_writer(logdir)

def train_step(image_data, target):
    """Run one optimisation step on a single batch and log the result.

    The forward pass and the loss are recorded on a gradient tape, the
    gradients are applied with the module-level ``optimizer``, and a progress
    line is printed.  Afterwards ``global_steps`` is advanced, the learning
    rate for the following step is set (linear warm-up while
    ``global_steps < warmup_steps``, then a cosine curve from
    ``cfg.TRAIN.LR_INIT`` down to ``cfg.TRAIN.LR_END``), and the scalars are
    written to ``writer``.

    Args:
        image_data: Batch of inputs passed to ``model``.
        target: Indexable with one entry per output pair; ``target[i]`` is
            unpacked into the extra positional arguments of ``compute_loss``.

    Returns:
        None.
    """
    # Only the forward pass and the loss need to be recorded; gradient
    # computation and all bookkeeping run after the tape context is closed.
    with tf.GradientTape() as tape:
        pred_result = model(image_data, training=True)
        giou_loss = conf_loss = prob_loss = 0

        # The model outputs are interleaved as (conv, pred) pairs; three
        # pairs are consumed here.
        for i in range(3):
            conv, pred = pred_result[i * 2], pred_result[i * 2 + 1]
            loss_items = compute_loss(pred, conv, *target[i], i)
            giou_loss += loss_items[0]
            conf_loss += loss_items[1]
            prob_loss += loss_items[2]

        total_loss = giou_loss + conf_loss + prob_loss

    gradients = tape.gradient(total_loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))

    # `learning_rate` is the canonical attribute; `lr` is only a
    # backward-compatibility alias for it.
    tf.print("=> STEP %4d   lr: %.6f   giou_loss: %4.2f   conf_loss: %4.2f   "
             "prob_loss: %4.2f   total_loss: %4.2f" % (global_steps,
                                                       optimizer.learning_rate.numpy(),
                                                       giou_loss, conf_loss,
                                                       prob_loss, total_loss))

    # Advance the step counter, then set the learning rate that the NEXT
    # call will train with.
    global_steps.assign_add(1)
    if global_steps < warmup_steps:
        # Linear warm-up towards LR_INIT.
        lr = global_steps / warmup_steps * cfg.TRAIN.LR_INIT
    else:
        # Cosine curve from LR_INIT to LR_END over the remaining steps.
        progress = (global_steps - warmup_steps) / (total_steps - warmup_steps)
        lr = cfg.TRAIN.LR_END + 0.5 * (cfg.TRAIN.LR_INIT - cfg.TRAIN.LR_END) * (
            1 + tf.cos(progress * np.pi)
        )
    optimizer.learning_rate.assign(lr.numpy())

    # NOTE: global_steps was incremented above, so the scalars below are
    # tagged with the next step index, not the one printed for this batch.
    with writer.as_default():
        tf.summary.scalar("lr", optimizer.learning_rate, step=global_steps)
        tf.summary.scalar("loss/total_loss", total_loss, step=global_steps)
        tf.summary.scalar("loss/giou_loss", giou_loss, step=global_steps)
        tf.summary.scalar("loss/conf_loss", conf_loss, step=global_steps)
        tf.summary.scalar("loss/prob_loss", prob_loss, step=global_steps)
    # Flush every step so the event file stays current while training runs.
    writer.flush()


# Main loop: one full pass over `trainset` per epoch, with the weights written
# to the same checkpoint path after every pass (each save replaces the last).
for epoch in range(cfg.TRAIN.EPOCHS):
    for batch in trainset:
        image_data, target = batch
        train_step(image_data, target)
    model.save_weights("./yolov3_try3")



=> STEP    1   lr: 0.001000   giou_loss: 75.73   conf_loss: 1917.41   prob_loss: 106.43   total_loss: 2099.56
=> STEP    2   lr: 0.001000   giou_loss: 71.57   conf_loss: 2130.75   prob_loss: 109.59   total_loss: 2311.92
=> STEP    3   lr: 0.001000   giou_loss: 95.48   conf_loss: 1461.70   prob_loss: 146.66   total_loss: 1703.85
=> STEP    4   lr: 0.001000   giou_loss: 43.77   conf_loss: 1244.27   prob_loss: 62.07   total_loss: 1350.11
=> STEP    5   lr: 0.001000   giou_loss: 89.90   conf_loss: 910.81   prob_loss: 130.83   total_loss: 1131.54
=> STEP    6   lr: 0.001000   giou_loss: 46.98   conf_loss: 700.04   prob_loss: 62.80   total_loss: 809.82
=> STEP    7   lr: 0.001000   giou_loss: 31.22   conf_loss: 558.36   prob_loss: 40.51   total_loss: 630.09
=> STEP    8   lr: 0.001000   giou_loss: 72.91   conf_loss: 451.13   prob_loss: 88.83   total_loss: 612.87
=> STEP    9   lr: 0.001000   giou_loss: 60.49   conf_loss: 368.84   prob_loss: 67.77   total_loss: 497.10
=> STEP   10   lr: 0.001

ResourceExhaustedError: {{function_node __wrapped__Mul_device_/job:localhost/replica:0/task:0/device:GPU:0}} failed to allocate memory [Op:Mul]