In [1]:
import json
import cv2
import numpy as np

In [2]:
def convert_data(item, prefix="train/", undirected=False, label_map=None):
    """Convert one document record into a flat, JSON-serialisable dict.

    Args:
        item: Mapping with an ``"id"`` string and a ``"document"`` list. Each
            entry of ``"document"`` must provide ``"id"``, ``"box"`` (four
            numbers), ``"text"``, ``"label"`` and ``"linking"`` (an iterable of
            pairs of entity ids).
        prefix: String prepended to ``item["id"] + ".jpg"`` to form the image
            path.
        undirected: When true, every edge ``(i, j)`` is accompanied by its
            reverse ``(j, i)``.
        label_map: Optional mapping from label name to integer id. When
            omitted, the module-level ``label2id`` is used, so it must have
            been defined before the call.

    Returns:
        A dict with the keys ``"file_name"``, ``"height"``, ``"width"``,
        ``"annotations"`` (one ``{"box", "text", "label"}`` dict per entity,
        where ``"box"`` holds eight floats) and ``"edges"`` (a sorted list of
        unique ``(index, index)`` tuples, indices being positions within
        ``item["document"]``).

    Raises:
        OSError: If the image cannot be read.
        KeyError: If a link refers to an unknown entity id or a label is
            missing from the label mapping.
    """
    file_name = prefix + item["id"] + ".jpg"

    # cv2.imread reports failure by returning None rather than raising, which
    # would otherwise surface as an opaque AttributeError on ``.shape``.
    image = cv2.imread(file_name)
    if image is None:
        raise OSError(f"could not read image: {file_name!r}")
    height, width = image.shape[:2]

    if label_map is None:
        label_map = label2id

    entities = item["document"]
    id2index = {entity["id"]: index for index, entity in enumerate(entities)}

    # Collect edges in a set so that repeated links collapse to one entry.
    edge_set = {
        (id2index[link[0]], id2index[link[1]])
        for entity in entities
        for link in entity["linking"]
    }
    if undirected:
        # Union (not extend) so pairs already linked both ways are not doubled.
        edge_set |= {(dst, src) for src, dst in edge_set}
    # Sorted for a reproducible output order.
    edges = sorted(edge_set)

    annotations = []
    for entity in entities:
        # The box is given as two opposite corners (presumably top-left and
        # bottom-right -- confirm against the dataset); expand it to the four
        # corners (x1, y1), (x3, y1), (x3, y3), (x1, y3).
        x1, y1, x3, y3 = (float(value) for value in entity["box"])
        annotations.append(
            {
                "box": [x1, y1, x3, y1, x3, y3, x1, y3],
                "text": entity["text"],
                "label": label_map[entity["label"]],
            }
        )

    return {
        "file_name": file_name,
        "height": height,
        "width": width,
        "annotations": annotations,
        "edges": edges,
    }

# Load XFUND training and validation data

In [11]:
# Read the training and validation annotation files. The encoding is pinned to
# UTF-8 so that non-ASCII text is decoded the same way on every platform
# instead of depending on the locale's default encoding.
with open("zh.train.json", "r", encoding="utf-8") as f:
    d = json.load(f)

with open("zh.val.json", "r", encoding="utf-8") as f:
    d_val = json.load(f)

In [4]:
# Position of every entity id within the first training document.
id2index = {
    entity["id"]: position
    for position, entity in enumerate(d["documents"][0]["document"])
}

In [6]:
# get node edges

# Unique (source index, target index) pairs over all links of the first
# training document, using the positions stored in id2index.
link = list(
    set(
        (id2index[pair[0]], id2index[pair[1]])
        for entity in d["documents"][0]["document"]
        for pair in entity["linking"]
    )
)

In [15]:
# get all chars

# Sorted list of the distinct characters found in any entity text of the
# training documents.
chars = sorted(
    {
        ch
        for page in d["documents"]
        for entity in page["document"]
        for ch in entity["text"]
    }
)

In [9]:
# get label list

# Every entity label of the training documents (repeats included), followed by
# a label -> integer id mapping numbered in sorted label order.
labels_ls = [
    entity["label"]
    for page in d["documents"]
    for entity in page["document"]
]
label2id = {
    name: index
    for index, name in enumerate(sorted(set(labels_ls)))
}
label2id

{'answer': 0, 'header': 1, 'other': 2, 'question': 3}

## Convert to MMOCR format

In [10]:
# training data

# One JSON object per line. ensure_ascii=False writes non-ASCII text verbatim,
# so the file encoding is pinned to UTF-8 instead of the locale default.
with open("zh_train.txt", "w", encoding="utf-8") as f:
    for item in d["documents"]:
        out = convert_data(item, undirected=False)
        f.write(json.dumps(out, ensure_ascii=False))
        f.write("\n")

In [12]:
# val data

# One JSON object per line, with images looked up under "val/". The file
# encoding is pinned to UTF-8 because ensure_ascii=False emits non-ASCII text
# verbatim.
with open("zh_val.txt", "w", encoding="utf-8") as f:
    for item in d_val["documents"]:
        out = convert_data(item, prefix="val/", undirected=False)
        f.write(json.dumps(out, ensure_ascii=False))
        f.write("\n")

In [13]:
# save class list

# One "<id> <label>" line per class. Label names come from the data, so the
# encoding is pinned to UTF-8 rather than left to the locale default.
with open("class_list.txt", "w", encoding="utf-8") as f:
    for c, idx in label2id.items():
        f.write(f"{idx} {c}\n")

In [16]:
# save char dict

# One character per line. The characters are taken from the annotation text
# and may be non-ASCII, so the file is written as UTF-8 explicitly.
# NOTE(review): a character that is itself a line break would produce an
# ambiguous entry -- confirm the source text contains none.
with open("dict.txt", "w", encoding="utf-8") as f:
    for ch in chars:
        f.write(ch)
        f.write("\n")