In [1]:
# Written by hy on 2023/9/5
import torch
import import_ipynb
import my_dataset_coco
import torchvision
from train_utils import train_eval_utils as utils
# from hy_tools.net_resnet50 import get_model_instance_segmentation_resnet50_fpn_v2 as model_v2
# from net_resnet50 import get_model_instance_segmentation_resnet50_fpn 
from hy_tools.nets_option import create_model
import transforms
import datetime
import time
# from torch.utils.tensorboard import SummaryWriter

importing Jupyter notebook from D:\qk_maskrcnn_trs\hy_tools\nets_option.ipynb




In [2]:
import os

# Switch to the project root so the relative paths used by the training cells
# ("./qk_data/...", "./save_weights_*") resolve.
# The location can be overridden with the QK_PROJECT_ROOT environment variable;
# the default is the original hard-coded directory, so existing setups are unaffected.
PROJECT_ROOT = os.environ.get("QK_PROJECT_ROOT", r"D:\qk_maskrcnn_trs")
os.chdir(PROJECT_ROOT)
os.getcwd()  # last expression of the cell: show the resulting working directory

In [3]:
def _append_epoch_metrics(results_file, epoch, coco_info, mean_loss, current_lr):
    """Append one epoch's metrics as a single text line to ``results_file``.

    The line contains every entry of ``coco_info`` followed by the mean
    training loss (4 decimals each) and finally the learning rate (6 decimals).
    """
    fields = [f"{value:.4f}" for value in list(coco_info) + [mean_loss]]
    fields.append(f"{current_lr:.6f}")
    with open(results_file, "a") as f:
        f.write("epoch:{} {}".format(epoch, '  '.join(fields)) + "\n")


def train_qk(model, root, num_epochs, savewt2path, lr, name, batch_size=2, nw=0):
    """Train ``model`` and evaluate it after every epoch.

    For each epoch the function trains via ``utils.train_one_epoch``, steps the
    learning-rate scheduler, evaluates via ``utils.evaluate``, appends the
    detection / segmentation metrics to two text files in the current working
    directory, and saves a checkpoint. After the last epoch it plots the
    loss / learning-rate curve and the mAP curve through ``plot_curve``.

    Args:
        model: the network to train; it is moved to the selected device in place.
        root: path of the dataset directory handed to
            ``my_dataset_coco.CocoDetection``.
        num_epochs: number of epochs to run.
        savewt2path: directory in which per-epoch checkpoints
            (``epoch_{epoch}.pth``) are stored; created if missing.
        lr: initial learning rate for SGD.
        name: tag used in the result file names and passed to the plot helpers.
        batch_size: batch size of the training loader (validation uses 1).
        nw: number of DataLoader worker processes.

    Returns:
        None.
    """
    # One timestamp is shared by the start banner and both result file names.
    now = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    print('hy listen for me model training', now)

    det_results_file = f"det_results{now}_{name}.txt"
    seg_results_file = f"seg_results{now}_{name}.txt"

    train_loss = []
    learning_rate = []
    val_map = []

    # Train on the GPU, or on the CPU if no GPU is available.
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    print('now is using device:', device)
    # Pinned host memory only pays off when batches are copied to a CUDA device.
    pin_memory = device.type == 'cuda'

    data_transform = {
        "train": transforms.Compose([transforms.ToTensor(),
                                     transforms.RandomHorizontalFlip(0.5)]),
        "val": transforms.Compose([transforms.ToTensor()])
    }

    # Training split (CocoDetection's default `dataset` argument is used here;
    # presumably that is the train split - TODO confirm in my_dataset_coco).
    train_dataset = my_dataset_coco.CocoDetection(root, transforms=data_transform["train"])
    train_data_loader = torch.utils.data.DataLoader(train_dataset,
                                                    batch_size=batch_size,
                                                    shuffle=True,
                                                    pin_memory=pin_memory,
                                                    num_workers=nw,
                                                    collate_fn=train_dataset.collate_fn)

    # Validation split, evaluated one image at a time with its own collate_fn.
    val_dataset = my_dataset_coco.CocoDetection(root, dataset='val',
                                                transforms=data_transform["val"])
    val_data_loader = torch.utils.data.DataLoader(val_dataset,
                                                  batch_size=1,
                                                  shuffle=False,
                                                  pin_memory=pin_memory,
                                                  num_workers=nw,
                                                  collate_fn=val_dataset.collate_fn)

    # Move the model to the selected device.
    model.to(device)

    # Optimise only the parameters that require gradients.
    params = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.SGD(params, lr=lr,
                                momentum=0.9, weight_decay=0.0005)
    # Multiply the learning rate by 0.1 every 10 epochs.
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                   step_size=10,
                                                   gamma=0.1)

    # Create the checkpoint directory up front so an invalid path fails before
    # any training time is spent, not after the first epoch.
    os.makedirs(savewt2path, exist_ok=True)

    for epoch in range(num_epochs):
        start_time = time.time()

        # Train for one epoch, printing every 10 iterations. The returned
        # learning rate gets its own name so the `lr` argument is not shadowed.
        mloss, current_lr = utils.train_one_epoch(model, optimizer, train_data_loader,
                                                  device, epoch, print_freq=10)
        mean_loss = mloss.item()
        train_loss.append(mean_loss)
        learning_rate.append(current_lr)

        # Update the learning rate.
        lr_scheduler.step()

        # Evaluate on the validation set.
        det_info, seg_info = utils.evaluate(model, val_data_loader, device=device)

        # Elapsed wall-clock time of this epoch (end minus start, so it is positive).
        elapsed = time.time() - start_time
        print('第{0}次运行,运行一次的时间为{1},共有{2}次运行，还有{3}次'.format(
            epoch, elapsed, num_epochs, num_epochs - epoch - 1))

        # Each result line holds the evaluation metrics plus loss and learning rate.
        _append_epoch_metrics(det_results_file, epoch, det_info, mean_loss, current_lr)
        _append_epoch_metrics(seg_results_file, epoch, seg_info, mean_loss, current_lr)

        # Second detection metric; the original author labels it "pascal mAP"
        # (presumably COCO AP at IoU=0.50 - TODO confirm against utils.evaluate).
        val_map.append(det_info[1])

        # Save a resumable checkpoint for this epoch.
        checkpoint = {
            'model': model.state_dict(),
            'optimizer': optimizer.state_dict(),
            'lr_scheduler': lr_scheduler.state_dict(),
            'epoch': epoch}
        torch.save(checkpoint, os.path.join(savewt2path, f"epoch_{epoch}.pth"))

    # Plot the loss and learning-rate curve.
    if train_loss and learning_rate:
        from plot_curve import plot_loss_and_lr
        plot_loss_and_lr(train_loss, learning_rate, name)

    # Plot the mAP curve.
    if val_map:
        from plot_curve import plot_map
        plot_map(val_map, name)

    print("That's it!", 'model training is over', datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))

In [None]:
# Build the 'resnet34' model variant (3 classes passed to create_model)
# and start training for 30 epochs.
model_r34 = create_model(num_classes=3, model_name='resnet34')
train_qk(
    model_r34,
    root=r"./qk_data/coco2017",
    num_epochs=30,
    savewt2path=r"./save_weights_v2_3",
    lr=0.01,
    name='R34',
    batch_size=2,
)  # start training

In [8]:
# Build the 'mobilenet_v3_large' model variant (3 classes passed to create_model)
# and start training for 30 epochs.
model_mb = create_model(num_classes=3, model_name='mobilenet_v3_large')
train_qk(
    model_mb,
    root=r"./qk_data/coco2017",
    num_epochs=30,
    savewt2path=r"./save_weights_mb",
    lr=0.01,
    name='mb',
    batch_size=2,
)  # start training

hy listen for me model training 20230917-101848
now is using device: cuda
loading annotations into memory...
Done (t=3.72s)
creating index...
index created!
loading annotations into memory...
Done (t=0.24s)
creating index...
index created!
Epoch: [0]  [  0/551]  eta: 0:48:03.064322  lr: 0.010000  loss: 6.0377 (6.0377)  loss_classifier: 1.0690 (1.0690)  loss_box_reg: 0.0365 (0.0365)  loss_mask: 4.1926 (4.1926)  loss_objectness: 0.6977 (0.6977)  loss_rpn_box_reg: 0.0419 (0.0419)  time: 5.2324  data: 0.3434  max mem: 2917
Epoch: [0]  [ 10/551]  eta: 0:10:15.422363  lr: 0.010000  loss: 1.1694 (3.8888)  loss_classifier: 0.2353 (0.3542)  loss_box_reg: 0.1773 (0.0983)  loss_mask: 0.5980 (2.9812)  loss_objectness: 0.1437 (0.4246)  loss_rpn_box_reg: 0.0151 (0.0305)  time: 1.1376  data: 0.2449  max mem: 3020
Epoch: [0]  [ 20/551]  eta: 0:08:19.258540  lr: 0.010000  loss: 1.5887 (2.7152)  loss_classifier: 0.3384 (0.3365)  loss_box_reg: 0.3004 (0.1715)  loss_mask: 0.4973 (1.8488)  loss_objectness:

In [7]:
# Build the 'vgg16' model variant (3 classes passed to create_model)
# and start training for 30 epochs.
# NOTE(review): the recorded run of this cell with batch_size=2 aborted with
# "CUDA out of memory" on a 4 GiB GPU before the first iteration was logged.
# The batch size is therefore lowered to 1 to reduce peak memory use; this is
# not verified to fit - confirm on the target GPU. If other models from earlier
# cells are still resident on the GPU, restart the kernel before running this.
model_vgg = create_model(num_classes=3, model_name='vgg16')
train_qk(
    model_vgg,
    root=r"./qk_data/coco2017",
    num_epochs=30,
    savewt2path=r"./save_weights_vgg",
    lr=0.01,
    name='vgg',
    batch_size=1,
)  # start training

hy listen for me model training 20230917-143255
now is using device: cuda
loading annotations into memory...
Done (t=0.11s)
creating index...
index created!
loading annotations into memory...
Done (t=0.05s)
creating index...
index created!


OutOfMemoryError: CUDA out of memory. Tried to allocate 50.00 MiB (GPU 0; 4.00 GiB total capacity; 3.39 GiB already allocated; 0 bytes free; 3.48 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF