# Predict NEOCR using Detectron2

In [1]:
# Some basic setup:
# Setup detectron2 logger
import detectron2
from detectron2.utils.logger import setup_logger
setup_logger()

# import some common libraries
import numpy as np
import os, json, cv2, random
import torch, torchvision

# import some common detectron2 utilities
from detectron2 import model_zoo
from detectron2.engine import DefaultPredictor
from detectron2.config import get_cfg
from detectron2.utils.visualizer import Visualizer
from detectron2.data import MetadataCatalog, DatasetCatalog

In [2]:
# define function of display_image for notebook
import matplotlib.pyplot as plt

def display_image(cv2_img):
    plt.figure(figsize=(15,8))
    plt.imshow(cv2.cvtColor(cv2_img, cv2.COLOR_BGR2RGB))
    plt.axis('off')
    plt.show()

## Define function to get detectron2's standard format

In [3]:
from xml.etree import ElementTree
from xml.etree.ElementTree import Element, SubElement
from lxml import etree
import codecs
import glob
from tqdm import tqdm

from detectron2.structures import BoxMode


def get_neocr_dicts(img_dir, xml_dir):
    ENCODE_METHOD = 'utf-8'
    num_files = len(os.listdir(xml_dir))

    dataset_dicts = []
    for idx, xml_file in tqdm(enumerate(glob.iglob(f'{xml_dir}/*.xml')), total=num_files):
        record = {}
        
        # process XML
        parser = etree.XMLParser(encoding=ENCODE_METHOD)
        xmltree = ElementTree.parse(xml_file, parser=parser).getroot()

        filename = os.path.join(img_dir, xmltree.find('filename').text)# ~.jpg
        height, width = cv2.imread(filename).shape[:2]
        
        record["file_name"] = filename
        record["image_id"] = idx
        record["height"] = height
        record["width"] = width
        
        objs = []
        for object_iter in xmltree.findall('object'):

            polygon_iter = object_iter.find("polygon")

            rectangle = []
            for pt_iter in polygon_iter.findall("pt"):
                x = int(pt_iter.find('x').text)
                y = int(pt_iter.find('y').text)
                coordinate = [x, y]
                rectangle.append(coordinate)
            rectangle = np.array(rectangle)

            x_min, y_min = np.min(rectangle, axis=0)
            x_max, y_max = np.max(rectangle, axis=0)
            
            obj = {
                    "bbox": [x_min, y_min, x_max, y_max],
                    "bbox_mode": BoxMode.XYXY_ABS,
                    "category_id": 0,
                }
            
            objs.append(obj)
            
        record["annotations"] = objs
        dataset_dicts.append(record)
    
    return dataset_dicts

In [4]:
for d in ["train", "val"]:
    DatasetCatalog.register("neocr_" + d, lambda d=d: get_neocr_dicts("images/" + d, "annotations/" + d))
    MetadataCatalog.get("neocr_" + d).set(thing_classes=["text"])
neocr_metadata = MetadataCatalog.get("neocr_train")

## Inference by using the trained model
Now, let's run inference with the trained model on the neocr validation dataset.\
First, let's create a predictor using the model we just trained:

In [5]:
cfg = get_cfg()
cfg.merge_from_file(model_zoo.get_config_file("COCO-Detection/faster_rcnn_R_50_FPN_3x.yaml")) # detect only box / Not use mask images
cfg.DATASETS.TRAIN = ("neocr_train",)
cfg.DATASETS.TEST = ("neocr_val", )
cfg.DATALOADER.NUM_WORKERS = 2 # default 4
cfg.SOLVER.IMS_PER_BATCH = 2 # default 16
cfg.SOLVER.BASE_LR = 0.00025  # pick a good LR / default 0.001
cfg.MODEL.ROI_HEADS.BATCH_SIZE_PER_IMAGE = 128   # faster, and good enough for this toy dataset (default: 512)
cfg.MODEL.ROI_HEADS.NUM_CLASSES = 1  # only has one class (text)

cfg.MODEL.DEVICE = 'cpu' # Use CPU on Mac
cfg.OUTPUT_DIR = './output_prediction'
cfg.MODEL.WEIGHTS = "output/model_final.pth"
cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = 0.7 # set a custom testing threshold

os.makedirs(cfg.OUTPUT_DIR, exist_ok=True) # create directory

predictor = DefaultPredictor(cfg)

Then, we randomly select several samples to visualize the prediction results.

In [6]:
dataset_dicts = get_neocr_dicts("images/val", "annotations/val")

100%|██████████| 40/40 [00:05<00:00,  7.43it/s]


In [15]:
save_dir = 'output/annotated_images'
os.makedirs(save_dir, exist_ok=True)

for d in tqdm(dataset_dicts):
    filename = os.path.basename(d["file_name"])
    im = cv2.imread(d["file_name"])
    outputs = predictor(im)
    v = Visualizer(im[:, :, ::-1],
                   metadata=neocr_metadata, 
                   scale=0.5,
    )
    out = v.draw_instance_predictions(outputs["instances"].to("cpu"))
    
    cv2.imwrite(f'{save_dir}/annotated_{filename}', out.get_image()[:, :, ::-1])

100%|██████████| 40/40 [05:26<00:00,  8.16s/it]


## Evaluate trained model
We can also evaluate its performance using AP metric implemented in COCO API.\
It takes ? minutes on CPU.\
This gives an AP of ~0. Bad???

In [36]:
from detectron2.evaluation import COCOEvaluator, inference_on_dataset
from detectron2.data import build_detection_test_loader
from detectron2.engine import DefaultTrainer

evaluator = COCOEvaluator("neocr_val", cfg, False, output_dir="./output_prediction/")
val_loader = build_detection_test_loader(cfg, "neocr_val")

trainer = DefaultTrainer(cfg)

print(inference_on_dataset(trainer.model, val_loader, evaluator))
# another equivalent way is to use trainer.test

100%|██████████| 131/131 [00:09<00:00, 13.61it/s]

[32m[08/24 02:04:27 d2.data.common]: [0mSerializing 131 elements to byte tensors and concatenating them all ...
[32m[08/24 02:04:27 d2.data.common]: [0mSerialized dataset takes 0.12 MiB
[32m[08/24 02:04:27 d2.data.dataset_mapper]: [0mAugmentations used in training: [ResizeShortestEdge(short_edge_length=(800, 800), max_size=1333, sample_style='choice')]





[32m[08/24 02:04:27 d2.engine.defaults]: [0mModel:
GeneralizedRCNN(
  (backbone): FPN(
    (fpn_lateral2): Conv2d(256, 256, kernel_size=(1, 1), stride=(1, 1))
    (fpn_output2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (fpn_lateral3): Conv2d(512, 256, kernel_size=(1, 1), stride=(1, 1))
    (fpn_output3): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (fpn_lateral4): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1))
    (fpn_output4): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (fpn_lateral5): Conv2d(2048, 256, kernel_size=(1, 1), stride=(1, 1))
    (fpn_output5): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (top_block): LastLevelMaxPool()
    (bottom_up): ResNet(
      (stem): BasicStem(
        (conv1): Conv2d(
          3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False
          (norm): FrozenBatchNorm2d(num_features=64, eps=1e-05)
        )
      )
 

100%|██████████| 528/528 [00:42<00:00, 12.39it/s]

[32m[08/24 02:05:10 d2.data.build]: [0mRemoved 0 images with no usable annotations. 528 images left.
[32m[08/24 02:05:10 d2.data.common]: [0mSerializing 528 elements to byte tensors and concatenating them all ...





[32m[08/24 02:05:10 d2.data.common]: [0mSerialized dataset takes 0.52 MiB
[32m[08/24 02:05:10 d2.data.dataset_mapper]: [0mAugmentations used in training: [ResizeShortestEdge(short_edge_length=(640, 672, 704, 736, 768, 800), max_size=1333, sample_style='choice'), RandomFlip()]
[32m[08/24 02:05:10 d2.data.build]: [0mUsing training sampler TrainingSampler
[32m[08/24 02:05:10 d2.evaluation.evaluator]: [0mStart inference on 131 images
[32m[08/24 02:05:16 d2.evaluation.evaluator]: [0mInference done 1/131. 5.7278 s / img. ETA=0:13:10
[32m[08/24 02:05:23 d2.evaluation.evaluator]: [0mInference done 2/131. 5.9915 s / img. ETA=0:13:16
[32m[08/24 02:05:29 d2.evaluation.evaluator]: [0mInference done 3/131. 6.2484 s / img. ETA=0:13:35
[32m[08/24 02:05:36 d2.evaluation.evaluator]: [0mInference done 4/131. 6.3178 s / img. ETA=0:13:33
[32m[08/24 02:05:42 d2.evaluation.evaluator]: [0mInference done 5/131. 6.2285 s / img. ETA=0:13:13
[32m[08/24 02:05:48 d2.evaluation.evaluator]: [0mIn