## torch_to_trt.ipynb로 변환한 결과, 아웃풋의 크기도 다르고 성능도 제대로 나오지 않아서 구글링한 끝에 torch2trt_dynamic이라는 것을 알게 됨
## 이것은 CRAFT와 같이 정해진 인풋 이미지 크기의 영향을 받지 않는 모델을 위해서 torch2trt를 커스텀한 것임

#### https://github.com/grimoire/torch2trt_dynamic

In [1]:
import os, torch

In [2]:
# Order CUDA devices by PCI bus id and restrict the visible device list to
# index 0 for this process.
os.environ.update({
    "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
    "CUDA_VISIBLE_DEVICES": "0",
    "NVIDIA_VISIBLE_DEVICES": "0",
})

In [3]:
# Use the GPU when CUDA is usable, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print('Device:', device)
# torch.cuda.current_device() initialises CUDA and raises when no CUDA
# device is usable, which would defeat the CPU fallback chosen above, so it
# is only queried when the CUDA branch was actually taken.
if device.type == "cuda":
    print('Current cuda device:', torch.cuda.current_device())
print('Count of using GPUs:', torch.cuda.device_count())

Device: cuda
Current cuda device: 0
Count of using GPUs: 1


In [4]:
import sys
import os
import time
import argparse

import torch
import torch.nn as nn
import torch.backends.cudnn as cudnn
from torch.autograd import Variable

from PIL import Image

import cv2
from skimage import io
import numpy as np
from text_detection import craft_utils
from text_detection import imgproc
import text_detection.file_utils
import json
import zipfile

from text_detection.craft import CRAFT

from collections import OrderedDict

import matplotlib.pyplot as plt

def copyStateDict(state_dict):
    """Return a copy of ``state_dict`` with a leading ``module.`` prefix removed.

    The first key decides whether the prefix is stripped: when it starts with
    ``"module."`` (the prefix carried by checkpoints saved from a wrapped
    model such as ``nn.DataParallel``), the first dot-separated component of
    every key is dropped. Otherwise keys are copied unchanged.

    Args:
        state_dict: Mapping of parameter names to values. May be empty.

    Returns:
        A new ``OrderedDict`` with the same values in the same order.
    """
    # next(iter(...), None) tolerates an empty mapping, where indexing the
    # key list would raise IndexError.
    first_key = next(iter(state_dict), None)
    # Require the trailing dot so that keys such as "modules_a.weight" are
    # not mistaken for a wrapped-model prefix.
    strip_prefix = first_key is not None and first_key.startswith("module.")

    new_state_dict = OrderedDict()
    for key, value in state_dict.items():
        # partition(".")[2] is everything after the first dot ("" if none).
        name = key.partition(".")[2] if strip_prefix else key
        new_state_dict[name] = value
    return new_state_dict

def str2bool(v):
    """Interpret a string as a boolean flag.

    Returns True when ``v``, compared case-insensitively, is one of
    "yes", "y", "true", "t" or "1"; every other string yields False.
    """
    truthy_words = {"yes", "y", "true", "t", "1"}
    normalized = v.lower()
    return normalized in truthy_words

def test_net(net, image, text_threshold, link_threshold, low_text, cuda, poly,refine_net=None):
    """Run text detection on one image and return boxes, polygons and a heatmap.

    Besides its parameters, this function reads the module-level ``args``
    (``canvas_size``, ``mag_ratio``, ``show_time``) and ``device``.

    Args:
        net: Detection network; called as ``net(x)`` and expected to return
            a ``(y, feature)`` pair.
        image: Input image handed to ``imgproc.resize_aspect_ratio``
            (presumably an H x W x C array -- TODO confirm against callers).
        text_threshold: Passed through to ``craft_utils.getDetBoxes``.
        link_threshold: Passed through to ``craft_utils.getDetBoxes``.
        low_text: Passed through to ``craft_utils.getDetBoxes``.
        cuda: When truthy, the input tensor is moved to ``device``.
        poly: Passed through to ``craft_utils.getDetBoxes``.
        refine_net: Optional refiner called as ``refine_net(y, feature)``;
            when given, its first output channel replaces the link score.

    Returns:
        Tuple ``(boxes, polys, ret_score_text)``: the coordinate-adjusted
        boxes, the polygons (with ``None`` entries replaced by the matching
        box), and the heatmap image of the text and link scores side by side.
    """
    t0 = time.time()

    # resize
    img_resized, target_ratio, size_heatmap = imgproc.resize_aspect_ratio(
        image, args.canvas_size, interpolation=cv2.INTER_LINEAR, mag_ratio=args.mag_ratio
    )
    ratio_h = ratio_w = 1 / target_ratio

    # preprocessing
    x = imgproc.normalizeMeanVariance(img_resized)

    x = torch.from_numpy(x).permute(2, 0, 1)    # [h, w, c] to [c, h, w]
    x = Variable(x.unsqueeze(0))                # [c, h, w] to [b, c, h, w]
    if cuda:
        x = x.to(device)

    # forward pass
    with torch.no_grad():
        y, feature = net(x)

    # make score and link map
    score_text = y[0,:,:,0].cpu().data.numpy()
    score_link = y[0,:,:,1].cpu().data.numpy()

    # refine link
    if refine_net is not None:
        with torch.no_grad():
            y_refiner = refine_net(y, feature)
        score_link = y_refiner[0,:,:,0].cpu().data.numpy()

    # From here on t0 holds the elapsed seconds of everything above
    # (resize, preprocessing, forward passes) and t1 starts the
    # post-processing timer. The original never defined t1, so enabling
    # show_time raised NameError, and t0 was printed as a raw timestamp.
    t0 = time.time() - t0
    t1 = time.time()

    # Post-processing
    boxes, polys = craft_utils.getDetBoxes(score_text, score_link, text_threshold, link_threshold, low_text, poly)

    # coordinate adjustment
    boxes = craft_utils.adjustResultCoordinates(boxes, ratio_w, ratio_h)
    polys = craft_utils.adjustResultCoordinates(polys, ratio_w, ratio_h)
    # Fall back to the plain box wherever no polygon was produced.
    for k, candidate in enumerate(polys):
        if candidate is None:
            polys[k] = boxes[k]

    t1 = time.time() - t1

    # render results (optional)
    render_img = score_text.copy()
    render_img = np.hstack((render_img, score_link))
    ret_score_text = imgproc.cvt2HeatmapImg(render_img)

    if args.show_time:
        print("\ninfer/postproc time : {:.3f}/{:.3f}".format(t0, t1))

    return boxes, polys, ret_score_text

In [5]:
class Args():
    """Plain container for the notebook's run settings.

    Each constructor argument is stored unchanged as an attribute of the
    same name. ``test_net`` reads ``canvas_size``, ``mag_ratio`` and
    ``show_time`` from the module-level instance; the model-loading cells
    read ``trained_model``, ``refiner_model`` and ``cuda``.
    """

    def __init__(
        self,
        cuda=False,
        trained_model='weights/craft_mlt_25k.pth',
        text_threshold=0.7,
        low_text=0.4,
        link_threshold=0.4,
        canvas_size=1280,
        mag_ratio=1.5,
        poly=False,
        show_time=False,
        test_folder='/data/',
        refine=False,
        refiner_model='weights/craft_refiner_CTW1500.pth',
    ):
        # Execution target and checkpoint for the detector
        self.cuda = cuda
        self.trained_model = trained_model

        # Detection thresholds
        self.text_threshold = text_threshold
        self.low_text = low_text
        self.link_threshold = link_threshold

        # Image resizing
        self.canvas_size = canvas_size
        self.mag_ratio = mag_ratio

        # Output / diagnostics options
        self.poly = poly
        self.show_time = show_time
        self.test_folder = test_folder

        # Link refiner
        self.refine = refine
        self.refiner_model = refiner_model
        
        
def img_show(img, size=(15, 15)):
    """Display ``img`` with matplotlib after a BGR-to-RGB conversion.

    Args:
        img: Image passed to ``cv2.cvtColor`` with ``cv2.COLOR_BGR2RGB``.
        size: Value written to matplotlib's global ``figure.figsize``
            setting before drawing.
    """
    # Note: this changes the global rcParams, not just this figure.
    plt.rcParams["figure.figsize"] = size
    rgb_image = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    plt.imshow(rgb_image)
    plt.show()

In [6]:
import os
    
# Run settings for this notebook; anything not listed keeps the Args default.
args = Args(
    test_folder='./',
    text_threshold=0.8,
    link_threshold=0.4,
    canvas_size=1200,
    refine=False,
    poly=False,
    cuda=True,
)

In [7]:
regist_img_num = 0
# test_img_num = 1

# load net
net = CRAFT()     # initialize

print('Loading weights from checkpoint (' + args.trained_model + ')')

# Map the checkpoint onto the selected `device` rather than a hard-coded
# "cuda", so that loading also works when `device` fell back to the CPU.
# The loaded dict is deliberately not bound to a name: it would otherwise
# stay alive and distort the memory figure measured below.
net.load_state_dict(copyStateDict(torch.load(args.trained_model, map_location=device)))
print(device)

net = net.to(device)
cudnn.benchmark = True

# Inference only: switch the network to evaluation mode.
net.eval()

# CUDA memory currently allocated by tensors, in MB; kept as the baseline
# that the refiner cell subtracts from.
craft_memory = torch.cuda.memory_allocated()/1024/1024

print("torch.cuda.memory_allocated: %fMB"%(craft_memory))

Loading weights from checkpoint (weights/craft_mlt_25k.pth)
cuda
torch.cuda.memory_allocated: 81.041504MB


In [8]:
print("current_memory:", craft_memory)

# LinkRefiner: build the refiner network and load its checkpoint.
from refinenet import RefineNet
refine_net = RefineNet()
print('Loading weights of refiner from checkpoint (' + args.refiner_model + ')')

print("torch.cuda.memory_allocated: %fMB"%(torch.cuda.memory_allocated()/1024/1024))

# The checkpoint is mapped to "cuda" or "cpu" according to args.cuda; only
# the CUDA path additionally moves the network to `device` and reports it.
refiner_location = "cuda" if args.cuda else 'cpu'
refine_net.load_state_dict(
    copyStateDict(torch.load(args.refiner_model, map_location=refiner_location))
)
if args.cuda:
    refine_net = refine_net.to(device)
    print(device)

refine_net.eval()
# Polygon output is switched on whenever the refiner is loaded.
args.poly = True

# Memory after loading the refiner; the last line prints the increase over
# the baseline recorded after loading the detector.
refine_memory = torch.cuda.memory_allocated()/1024/1024
print("after_memory:", refine_memory)
print("torch.cuda.memory_allocated: %fMB"%(refine_memory - craft_memory))

current_memory: 81.04150390625
Loading weights of refiner from checkpoint (weights/craft_refiner_CTW1500.pth)
torch.cuda.memory_allocated: 81.041504MB
cuda
after_memory: 82.80859375
torch.cuda.memory_allocated: 1.767090MB


### 아래에서 torch2trt_dynamic 함수를 통해 모델을 로드 

### 변환 과정에서 "unknown interpolate type, use linear instead"라는 오류 메시지에 직면함

### 이를 해결하기 위해 plugin을 다른 것으로 해보는 시도를 했지만 잘 안됨 

### 그러던 중 unknown interpolate type 메시지는 TensorRT docker 컨테이너의 버전이 낮아서 특정 레이어를 지원하지 않기 때문일 수 있다는 것을 알게 됨.

### 실제로 github에서도 tensorRT 7.x버전 이상에서 잘 작동된다고 하여 해당 버전에 맞게 컨테이너를 다시 띄워주었다.

### 또한 torch2onnx, onnx2trt의 단계를 거치지 않고 torch2trt 모듈로만 구현하기에는 한계(특정레이어 지원X, github star수 낮음 등)가 있다고 판단되어 

### torch2onnx, onnx2trt의 두 단계로 변환을 시도하였음

In [18]:
from torch2trt_dynamic import torch2trt_dynamic

# Example input handed to the converter, placed on the GPU.
input_tensor = torch.randn((1, 3, 768, 768), requires_grad=False).cuda().to(device=device)

# One [min, opt, max] shape triple per network input (a single input here).
opt_shape_param = [
    [
        [1, 3, 256, 256],      # min
        [1, 3, 700, 700],      # opt
        [1, 3, 1200, 1200],    # max
    ],
]

model_trt = torch2trt_dynamic(
    net,
    [input_tensor],
    fp16_mode=True,
    opt_shape_param=opt_shape_param,
)

# Destination for the serialized engine. The write itself is left disabled:
# with open(output_detec, "wb") as f:
#         f.write(model_trt.engine.serialize())
output_path = './weights/'
output_detec = os.path.join(output_path, "detec_rt.engine")


unknown interpolate type, use linear instead.
unknown interpolate type, use linear instead.
unknown interpolate type, use linear instead.


In [15]:
x = torch.rand(1,3,256,256).cuda()
with torch.no_grad():
    y = net(x)
    y_trt = model_trt(x)

# net(x) returns a pair (it is unpacked as `y, feature = net(x)` in
# test_net), so the raw results cannot be subtracted from each other.
# Normalise both sides to sequences and compare them output by output.
# NOTE(review): model_trt is assumed to return its outputs in the same
# order as net -- confirm against torch2trt_dynamic.
ref_outputs = y if isinstance(y, (tuple, list)) else (y,)
trt_outputs = y_trt if isinstance(y_trt, (tuple, list)) else (y_trt,)
if len(ref_outputs) != len(trt_outputs):
    raise ValueError(
        "output count mismatch: PyTorch=%d, TensorRT=%d"
        % (len(ref_outputs), len(trt_outputs))
    )

# check the output against PyTorch
for idx, (ref_out, trt_out) in enumerate(zip(ref_outputs, trt_outputs)):
    print("output %d max abs diff:" % idx, torch.max(torch.abs(ref_out - trt_out)))

AttributeError: module 'tensorrt' has no attribute 'bool'