In [81]:
import pandas as pd
import numpy as np
import os
import cv2
from PIL import Image
import json
import shutil

In [82]:
# Input locations: the per-page annotation files and the page images they describe.
txtpath = './../../data/ocr/docbank/txt_processed/'
imgpath = './../../data/ocr/docbank/images_processed/'

# os.listdir returns entries in arbitrary (filesystem-dependent) order. The
# train/val split below slices this list by position, so sort it to make the
# split reproducible across runs and machines.
txt_files = sorted(os.listdir(txtpath))

# Output roots for the recognition (cropped tokens) and detection (full pages)
# datasets, for both splits.
# NOTE(review): the val roots reuse the '*_train' folder names - confirm intended.
rec_train_path = './../../data/ocr/docbank/train/recognition_train/'
det_train_path = './../../data/ocr/docbank/train/detection_train/'
rec_val_path = './../../data/ocr/docbank/val/recognition_train/'
det_val_path = './../../data/ocr/docbank/val/detection_train/'

### Make Directories

In [83]:
# Create the 'images/' folder under each of the four dataset roots.
# exist_ok=True keeps the cell safe to re-run and avoids the check-then-create
# race of a separate os.path.exists() test.
for dataset_root in (rec_train_path, det_train_path, rec_val_path, det_val_path):
    os.makedirs(dataset_root + 'images/', exist_ok=True)

## Split Files and Get Train Labels

In [84]:
# Hold out the first fifth of the annotation files (integer division) for
# validation; everything after that index is used for training.
n_val = len(txt_files) // 5
val_files, train_files = txt_files[:n_val], txt_files[n_val:]

### Get Cropped Images

In [85]:
# Build the recognition training set: crop every annotated box out of each
# training page, save it as '<count>.jpg', and record its token in labels.json.
count = 0     # running index of successfully saved crops; also the crop's file name
labels = {}   # crop file name -> token text
skipped = 0   # crops that could not be written

imgs_path = rec_train_path + 'images/'
for file in train_files:
    df = pd.read_csv(txtpath + file, delimiter=' ')
    # The context manager guarantees the page's file handle is released,
    # including when the annotation file has no rows.
    with Image.open(imgpath + file[:-4] + ".jpg") as img:
        # Iterate the columns positionally rather than with label lookups (df[col][i]).
        for X1, Y1, X2, Y2, token in zip(df['X1'], df['Y1'], df['X2'], df['Y2'], df['token']):
            crop = img.crop((X1, Y1, X2, Y2))
            file_name = str(count) + '.jpg'
            try:
                crop.save(imgs_path + file_name)
            except Exception:
                # Best-effort: skip crops that cannot be saved. Catching Exception
                # (not a bare except) keeps the cell interruptible with Ctrl-C /
                # "Interrupt kernel".
                skipped += 1
                continue
            # Only crops that were actually written get a label and consume an index.
            labels[file_name] = token
            count += 1

if skipped:
    print('Skipped', skipped, 'crop(s) that could not be saved')

with open(rec_train_path + 'labels.json', 'w', encoding='utf-8') as f:
    json.dump(labels, f, ensure_ascii=False, indent=4)

### Save Detection Data

In [86]:
# Build the detection training set: copy each training page into the detection
# images folder and record its dimensions and word boxes in labels.json.
det_labels = {}   # page file name -> {'img_dimensions', 'img_hash', 'polygons'}

for file in train_files:
    df = pd.read_csv(txtpath + file, delimiter=' ')
    file_name = file[:-4] + ".jpg"

    # cv2.imread signals failure by returning None instead of raising; fail
    # with a clear message before anything is copied.
    img = cv2.imread(imgpath + file_name)
    if img is None:
        raise OSError('cv2.imread could not read ' + imgpath + file_name)

    # Fix: training pages belong in the *train* detection folder (they were
    # previously copied into det_val_path).
    shutil.copy(imgpath + file_name, det_train_path + 'images/' + file_name)

    # One 4-point entry per annotated box.
    # NOTE(review): corner order is (X1,Y1), (X2,Y1), (X1,Y2), (X2,Y2), which is
    # not a perimeter walk - confirm the consumer only needs the extents.
    boxes = [
        [
            [row['X1'], row['Y1']],
            [row['X2'], row['Y1']],
            [row['X1'], row['Y2']],
            [row['X2'], row['Y2']],
        ]
        for _, row in df.iterrows()
    ]

    det_labels[file_name] = {
        'img_dimensions': img.shape,  # array shape as returned by cv2.imread
        'img_hash': '',
        'polygons': boxes,
    }

with open(det_train_path + 'labels.json', 'w', encoding='utf-8') as f:
    json.dump(det_labels, f, ensure_ascii=False, indent=4)


## Get Val Labels

In [87]:
# Build the recognition validation set: crop every annotated box out of each
# validation page, save it as '<count>.jpg', and record its token in labels.json.
count = 0     # running index of successfully saved crops; also the crop's file name
labels = {}   # crop file name -> token text
skipped = 0   # crops that could not be written

imgs_path = rec_val_path + 'images/'
for file in val_files:
    df = pd.read_csv(txtpath + file, delimiter=' ')
    # The context manager guarantees the page's file handle is released,
    # including when the annotation file has no rows.
    with Image.open(imgpath + file[:-4] + ".jpg") as img:
        # Iterate the columns positionally rather than with label lookups (df[col][i]).
        for X1, Y1, X2, Y2, token in zip(df['X1'], df['Y1'], df['X2'], df['Y2'], df['token']):
            crop = img.crop((X1, Y1, X2, Y2))
            file_name = str(count) + '.jpg'
            try:
                crop.save(imgs_path + file_name)
            except Exception:
                # Best-effort: skip crops that cannot be saved. Catching Exception
                # (not a bare except) keeps the cell interruptible with Ctrl-C /
                # "Interrupt kernel".
                skipped += 1
                continue
            # Only crops that were actually written get a label and consume an index.
            labels[file_name] = token
            count += 1

if skipped:
    print('Skipped', skipped, 'crop(s) that could not be saved')

with open(rec_val_path + 'labels.json', 'w', encoding='utf-8') as f:
    json.dump(labels, f, ensure_ascii=False, indent=4)

In [88]:
# Build the detection validation set: copy each validation page into the
# detection images folder and record its dimensions and word boxes in labels.json.
det_labels = {}   # page file name -> {'img_dimensions', 'img_hash', 'polygons'}

for file in val_files:
    df = pd.read_csv(txtpath + file, delimiter=' ')
    file_name = file[:-4] + ".jpg"

    # cv2.imread signals failure by returning None instead of raising; fail
    # with a clear message before anything is copied.
    img = cv2.imread(imgpath + file_name)
    if img is None:
        raise OSError('cv2.imread could not read ' + imgpath + file_name)

    shutil.copy2(imgpath + file_name, det_val_path + 'images/' + file_name)

    # One 4-point entry per annotated box.
    # NOTE(review): corner order is (X1,Y1), (X2,Y1), (X1,Y2), (X2,Y2), which is
    # not a perimeter walk - confirm the consumer only needs the extents.
    boxes = [
        [
            [row['X1'], row['Y1']],
            [row['X2'], row['Y1']],
            [row['X1'], row['Y2']],
            [row['X2'], row['Y2']],
        ]
        for _, row in df.iterrows()
    ]

    det_labels[file_name] = {
        'img_dimensions': img.shape,  # array shape as returned by cv2.imread
        'img_hash': '',
        'polygons': boxes,
    }

with open(det_val_path + 'labels.json', 'w', encoding='utf-8') as f:
    json.dump(det_labels, f, ensure_ascii=False, indent=4)
