In [1]:
from bs4 import BeautifulSoup
import cv2
import numpy as np
import os
import scipy
from scipy import sparse
from tqdm import tqdm

In [2]:
# Collect the 8-character stems of the page images stored in 'prima'.
# A page image is recognised by its 12-character name ending in '.tif'
# (8-character stem + extension); the explicit extension check keeps other
# 12-character files from being picked up and then failing when the matching
# '.tif' is read later. os.listdir returns entries in arbitrary order, so the
# stems are sorted to make every run process the files in the same order.
file_names = sorted(
    file[:8]
    for file in os.listdir('prima')
    if len(file) == 12 and file.endswith('.tif')
)

In [3]:
def extract_mask(xml_string, img_shape):
    """Rasterise the text regions described in a layout XML into a binary mask.

    Only text regions that are referenced from a ``RegionRefIndexed`` element
    are drawn. References whose target is not a ``TextRegion`` (or does not
    exist) and text regions without any ``Point`` children are skipped.

    Parameters
    ----------
    xml_string : str
        Contents of the layout XML document.
    img_shape : tuple of int
        Shape of the mask to allocate; passed straight to ``np.zeros``.

    Returns
    -------
    numpy.ndarray
        ``uint8`` array of shape ``img_shape`` holding 1 inside (and on the
        2-pixel outline of) every drawn text region and 0 elsewhere.
    """
    # NOTE: with the 'lxml' (HTML) parser BeautifulSoup lower-cases tag and
    # attribute names, which is why all lookups below are lowercase.
    soup_xml = BeautifulSoup(xml_string, 'lxml')

    text_region_points = []

    for region_ref in soup_xml.find_all('regionrefindexed'):
        ref_id = region_ref['regionref']
        text_region = soup_xml.find('textregion', {'id': ref_id})

        if text_region is None:
            # The reference points at something that is not a text region
            # (or at nothing at all); there is no polygon to draw for it.
            continue

        polygon = [
            [int(point['x']), int(point['y'])]
            for point in text_region.find_all('point')
        ]

        if polygon:
            # OpenCV contour functions work on 32-bit integer points; be
            # explicit instead of relying on an implicit int64 conversion.
            text_region_points.append(np.array(polygon, dtype=np.int32))

    canvas = np.zeros(img_shape, np.uint8)

    if not text_region_points:
        # Nothing to draw: return the all-zero mask.
        return canvas

    # Colour is 1 because the mask is binary. Fill every polygon first, then
    # trace its outline with thickness 2.
    cv2.drawContours(canvas, text_region_points, -1, 1, -1)
    cv2.polylines(canvas, text_region_points, isClosed=True, color=1, thickness=2)

    return canvas

In [5]:
# For every page image: read it to obtain its height/width, build the binary
# text-region mask from the matching 'pc-<stem>.xml' layout file, and store the
# mask next to the image as a compressed sparse matrix ('mask_<stem>.npz').
for file_name in tqdm(file_names):
    img_path = f'prima/{file_name}.tif'
    img = cv2.imread(img_path)
    if img is None:
        # cv2.imread does not raise on a missing/unreadable file, it returns
        # None; fail here with the offending path instead of an opaque
        # AttributeError on `img.shape` below.
        raise OSError(f'cv2.imread could not read image: {img_path}')

    # Explicit encoding so the result does not depend on the platform default.
    # NOTE(review): assumes the XML files are UTF-8 (the XML default) - confirm.
    with open(f'prima/pc-{file_name}.xml', encoding='utf-8') as f:
        xml_string = f.read()

    # Only the first two dimensions (rows, columns) are needed for the mask.
    mask = extract_mask(xml_string, img.shape[:2])
    sparse_mask = sparse.csr_matrix(mask)

    sparse.save_npz(f'prima/mask_{file_name}.npz', sparse_mask)

100%|██████████| 50/50 [05:40<00:00,  6.80s/it]
