### Imports

In [1]:
import os
import json
from tqdm import tqdm
from PIL import ImageDraw, Image
from fuzzywuzzy import fuzz

from PIL import Image
from surya.recognition import RecognitionPredictor
from surya.detection import DetectionPredictor


  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Instantiate the Surya predictors once at module level: get_surya_predictions
# reads these two globals on every call instead of constructing its own.
recognition_predictor = RecognitionPredictor()
detection_predictor = DetectionPredictor()

In [3]:
# ---------------------------------------------------------------------------
# Run configuration: everything a reader might want to tune lives in this cell.
# ---------------------------------------------------------------------------

# Dataset sub-folder name; used to build both the input and the output paths.
NAME = "telugu"

MAX_MATCHES = 5

CUT_OFF_THRESHOLD = 70
QUESTION_WEIGHT = 0.2
ANSWER_WEIGHT = 0.8

DATA_DIR = "/data/BADRI/FINAL/THESIS/GRVQA/gr-doc-vqa-grounding/data/"

# Grounding granularity. The visualisation cell tests `LEVEL == "word"`, so the
# only meaningful values are "line" and "word"; any other value would silently
# fall through to the line-level branch, hence the explicit check below.
LEVEL = "line" # or "word"
if LEVEL not in ("line", "word"):
    raise ValueError(f'LEVEL must be "line" or "word", got {LEVEL!r}')

IMG_DIR = f"{DATA_DIR}/input/{NAME}/"
JSON_FILE = f"{DATA_DIR}/input/data2.json"
OUT_DIR = f"{DATA_DIR}/output/grounding/{NAME}/surya/{LEVEL}/"

# Language codes handed to the recognition predictor.
LANGS = ["en", "te"]


# Alternative dataset configuration, kept for reference:
# IMG_DIR = "/data/BADRI/FINAL/THESIS/GRVQA/data/CircularsVQA/BHASHINI_TESTSET/final/"
# JSON_FILE = "/data/BADRI/FINAL/THESIS/GRVQA/data/CircularsVQA/BHASHINI_TESTSET/final_annotations.json"
# OUT_DIR = f"/data/BADRI/FINAL/THESIS/GRVQA/gr-doc-vqa-grounding/data/output/grounding/{NAME}/doctr/{LEVEL}/"

OUT_DET_DIR = os.path.join(OUT_DIR, "detections")
OUT_IMG_DIR = os.path.join(OUT_DIR, "images")
OUT_JSON_DIR = os.path.join(OUT_DIR, "json")

# exist_ok=True makes the cell idempotent and avoids the check-then-create race
# of a separate os.path.exists() test.
for _out_dir in (OUT_DIR, OUT_IMG_DIR, OUT_JSON_DIR, OUT_DET_DIR):
    os.makedirs(_out_dir, exist_ok=True)



# English function words; not referenced by any code visible in this notebook.
# NOTE(review): presumably consumed by the question/answer matching helper — confirm.
stop_words = {'what', 'is', 'the', 'this', 'that', 'these', 'those', 'which', 'how', 'why', 'where', 'when', 'who', 'will', 'be', 'and', 'or', 'in', 'at', 'to', 'for', 'of', 'with', 'by'}

# # Read json data
# with open(JSON_FILE, "r") as f:
#     data = json.load(f)




### Functions

In [4]:
def get_surya_predictions(img_path = None, langs = None):
    """Run Surya OCR on one image and return its text lines plus an annotated copy.

    Args:
        img_path: Path of the image to process. The ``None`` default is kept only
            for signature compatibility; a real path is required.
        langs: Language codes passed to the recognition predictor. Defaults to
            ``["en"]`` when omitted.

    Returns:
        A ``(predictions, image)`` tuple. ``predictions`` is a list of
        ``{"bbox": ..., "text": ...}`` dicts, one per entry in ``text_lines`` of
        the first prediction result. ``image`` is an RGB copy of the input with a
        red rectangle drawn around every returned bbox.

    Raises:
        ValueError: If ``img_path`` is ``None``.
    """
    if img_path is None:
        raise ValueError("img_path is required")
    # Avoid a shared mutable default argument; build the default per call.
    if langs is None:
        langs = ["en"]

    # Convert to RGB so the coloured outline is drawn as an actual colour even
    # for grayscale/palette inputs; the copy also lets the file handle be
    # released as soon as the `with` block ends.
    with Image.open(img_path) as source_image:
        image = source_image.convert("RGB")

    preds = recognition_predictor([image], [langs], detection_predictor)

    # One drawing context for the whole image instead of one per text line.
    draw = ImageDraw.Draw(image)

    predictions = []
    for pred in preds[0].text_lines:
        predictions.append({"bbox": pred.bbox, "text": pred.text})
        # Mark the detected line on the returned image.
        draw.rectangle(pred.bbox, outline='red', width=2)

    return predictions, image



        
def longest_consecutive_range(indices):
    """Return the longest run of consecutive integers contained in ``indices``.

    Duplicates are ignored and the input order does not matter. When several
    runs share the maximum length, the one with the smallest values wins.
    An empty input yields an empty list.
    """
    if not indices:
        return []

    ordered = sorted(set(indices))
    total = len(ordered)

    best_start, best_len = 0, 0
    run_start = 0
    # Walk one step past the end so the final run is closed by the same branch.
    for pos in range(1, total + 1):
        run_ended = pos == total or ordered[pos] != ordered[pos - 1] + 1
        if not run_ended:
            continue
        run_len = pos - run_start
        # Strict comparison keeps the earliest run on ties.
        if run_len > best_len:
            best_start, best_len = run_start, run_len
        run_start = pos

    return ordered[best_start:best_start + best_len]


def get_word_level_matches(answer_text, top_k_matches):
    """Collect word bounding boxes that cover the answer inside each match.

    For every match, words whose lower-cased text is a substring of the
    lower-cased answer are selected, and only the longest run of consecutive
    word positions is kept (via ``longest_consecutive_range``).

    Args:
        answer_text: The answer string to locate.
        top_k_matches: Iterable of match dicts; each must have a ``'words'``
            list whose items carry ``'text'`` and ``'bbox'``.

    Returns:
        A flat list of the ``'bbox'`` values of the kept words, in match order
        and then word order.
    """
    # Lower-case the answer once instead of once per word.
    answer_lower = answer_text.lower()

    bboxes = []
    for match in top_k_matches:
        words = match['words']
        indices = [
            index
            for index, word in enumerate(words)
            # An empty/whitespace-only token is a substring of every string, so
            # it would always "match" and could bridge or extend the run.
            if word['text'].strip() and word['text'].lower() in answer_lower
        ]
        for index in longest_consecutive_range(indices):
            bboxes.append(words[index]['bbox'])
    return bboxes

In [5]:
# Run OCR on every file in IMG_DIR and persist, per input file, the predictions
# as JSON and the annotated detection image as PNG.
# sorted() makes the processing order deterministic (os.listdir order is arbitrary).
for file in tqdm(sorted(os.listdir(IMG_DIR))):
    IMG_PATH = os.path.join(IMG_DIR, file)
    # Skip sub-directories: they cannot be opened as images.
    if not os.path.isfile(IMG_PATH):
        continue

    predictions, image = get_surya_predictions(IMG_PATH, LANGS)

    # Save predictions as JSON. ensure_ascii=False writes raw non-ASCII text, so
    # pin the encoding instead of relying on the platform default.
    with open(os.path.join(OUT_JSON_DIR, f"{file}.json"), "w", encoding="utf-8") as f:
        json.dump(predictions, f, indent=4, ensure_ascii=False)

    # Save the image with the detected boxes drawn on it.
    image.save(os.path.join(OUT_DET_DIR, f"{file}.png"))

  0%|          | 0/13 [00:00<?, ?it/s]

Loaded recognition model s3://text_recognition/2025_02_18 on device cuda with dtype torch.float16
Loaded detection model s3://text_detection/2025_02_28 on device cuda with dtype torch.float16


Detecting bboxes: 100%|██████████| 1/1 [00:00<00:00,  1.10it/s]
Recognizing Text: 100%|██████████| 1/1 [00:01<00:00,  1.97s/it]
  8%|▊         | 1/13 [00:04<00:50,  4.19s/it]

Loaded recognition model s3://text_recognition/2025_02_18 on device cuda with dtype torch.float16
Loaded detection model s3://text_detection/2025_02_28 on device cuda with dtype torch.float16


Detecting bboxes: 100%|██████████| 1/1 [00:00<00:00,  1.81it/s]
Recognizing Text: 100%|██████████| 1/1 [00:01<00:00,  1.42s/it]
 15%|█▌        | 2/13 [00:07<00:40,  3.69s/it]

Loaded recognition model s3://text_recognition/2025_02_18 on device cuda with dtype torch.float16
Loaded detection model s3://text_detection/2025_02_28 on device cuda with dtype torch.float16


Detecting bboxes: 100%|██████████| 1/1 [00:00<00:00,  2.05it/s]
Recognizing Text: 100%|██████████| 1/1 [00:00<00:00,  1.08it/s]
 23%|██▎       | 3/13 [00:10<00:31,  3.17s/it]

Loaded recognition model s3://text_recognition/2025_02_18 on device cuda with dtype torch.float16
Loaded detection model s3://text_detection/2025_02_28 on device cuda with dtype torch.float16


Detecting bboxes: 100%|██████████| 1/1 [00:00<00:00,  2.11it/s]
Recognizing Text: 100%|██████████| 1/1 [00:01<00:00,  1.34s/it]
 31%|███       | 4/13 [00:13<00:28,  3.13s/it]

Loaded recognition model s3://text_recognition/2025_02_18 on device cuda with dtype torch.float16
Loaded detection model s3://text_detection/2025_02_28 on device cuda with dtype torch.float16


Detecting bboxes: 100%|██████████| 1/1 [00:00<00:00,  1.96it/s]
Recognizing Text: 100%|██████████| 1/1 [00:00<00:00,  1.49it/s]
 38%|███▊      | 5/13 [00:15<00:23,  2.93s/it]

Loaded recognition model s3://text_recognition/2025_02_18 on device cuda with dtype torch.float16
Loaded detection model s3://text_detection/2025_02_28 on device cuda with dtype torch.float16


Detecting bboxes: 100%|██████████| 1/1 [00:00<00:00,  2.07it/s]
Recognizing Text: 100%|██████████| 1/1 [00:01<00:00,  1.19s/it]
 46%|████▌     | 6/13 [00:18<00:20,  2.96s/it]

Loaded recognition model s3://text_recognition/2025_02_18 on device cuda with dtype torch.float16
Loaded detection model s3://text_detection/2025_02_28 on device cuda with dtype torch.float16


Detecting bboxes: 100%|██████████| 1/1 [00:00<00:00,  1.88it/s]
Recognizing Text: 100%|██████████| 1/1 [00:01<00:00,  1.41s/it]
 54%|█████▍    | 7/13 [00:22<00:19,  3.21s/it]

Loaded recognition model s3://text_recognition/2025_02_18 on device cuda with dtype torch.float16
Loaded detection model s3://text_detection/2025_02_28 on device cuda with dtype torch.float16


Detecting bboxes: 100%|██████████| 1/1 [00:00<00:00,  2.35it/s]
Recognizing Text: 100%|██████████| 1/1 [00:01<00:00,  1.23s/it]
 62%|██████▏   | 8/13 [00:25<00:15,  3.14s/it]

Loaded recognition model s3://text_recognition/2025_02_18 on device cuda with dtype torch.float16
Loaded detection model s3://text_detection/2025_02_28 on device cuda with dtype torch.float16


Detecting bboxes: 100%|██████████| 1/1 [00:00<00:00,  1.73it/s]
Recognizing Text: 100%|██████████| 1/1 [00:00<00:00,  1.12it/s]
 69%|██████▉   | 9/13 [00:28<00:12,  3.10s/it]

Loaded recognition model s3://text_recognition/2025_02_18 on device cuda with dtype torch.float16
Loaded detection model s3://text_detection/2025_02_28 on device cuda with dtype torch.float16


Detecting bboxes: 100%|██████████| 1/1 [00:00<00:00,  2.30it/s]
Recognizing Text: 100%|██████████| 1/1 [00:01<00:00,  1.26s/it]
 77%|███████▋  | 10/13 [00:31<00:09,  3.07s/it]

Loaded recognition model s3://text_recognition/2025_02_18 on device cuda with dtype torch.float16
Loaded detection model s3://text_detection/2025_02_28 on device cuda with dtype torch.float16


Detecting bboxes: 100%|██████████| 1/1 [00:00<00:00,  1.93it/s]
Recognizing Text: 100%|██████████| 1/1 [00:01<00:00,  1.34s/it]
 85%|████████▍ | 11/13 [00:34<00:06,  3.05s/it]

Loaded recognition model s3://text_recognition/2025_02_18 on device cuda with dtype torch.float16
Loaded detection model s3://text_detection/2025_02_28 on device cuda with dtype torch.float16


Detecting bboxes: 100%|██████████| 1/1 [00:00<00:00,  2.11it/s]
Recognizing Text: 100%|██████████| 1/1 [00:01<00:00,  1.26s/it]
 92%|█████████▏| 12/13 [00:38<00:03,  3.19s/it]

Loaded recognition model s3://text_recognition/2025_02_18 on device cuda with dtype torch.float16
Loaded detection model s3://text_detection/2025_02_28 on device cuda with dtype torch.float16


Detecting bboxes: 100%|██████████| 1/1 [00:00<00:00,  1.84it/s]
Recognizing Text: 100%|██████████| 1/1 [00:01<00:00,  1.28s/it]
100%|██████████| 13/13 [00:41<00:00,  3.19s/it]


In [None]:
# Load the question/answer annotations in this cell so it does not depend on a
# `data` variable left over from an earlier kernel session.
with open(JSON_FILE, "r", encoding="utf-8") as f:
    data = json.load(f)

# NOTE(review): get_matched_regions is not defined in the visible part of this
# notebook — confirm it is defined or imported before running this cell.
for image_name, qna_pairs in tqdm(data.items()):
    IMG_PATH = os.path.join(IMG_DIR, image_name)

    # OCR predictions are stored per image as "<image file name>.json".
    json_file = os.path.join(OUT_JSON_DIR, f"{image_name}.json")
    with open(json_file, "r", encoding="utf-8") as f:
        predictions = json.load(f)

    for qna_count, qna_pair in enumerate(qna_pairs):
        # Fresh RGB copy per QnA pair so boxes from one pair never leak into the
        # next; the copy also lets the file handle close immediately.
        with Image.open(IMG_PATH) as source_image:
            image = source_image.convert("RGB")

        # Create the drawing context up front. Binding it only inside the match
        # loops left it undefined (or pointing at the previous, already-saved
        # image) whenever a pair produced no matches.
        draw = ImageDraw.Draw(image)

        question_text = qna_pair['question']
        answer_text = qna_pair['answer']

        top_k_matches = get_matched_regions(question_text, answer_text, predictions)

        if LEVEL == "word":
            word_level_matches = get_word_level_matches(answer_text, top_k_matches)
            for bbox in word_level_matches:
                draw.rectangle(bbox, outline='green', width=2)
        else:
            for match in top_k_matches:
                draw.rectangle(match['bbox'], outline='blue', width=2)

        # Write the QnA pair onto the image.
        # NOTE(review): Pillow's default font may not cover non-Latin scripts
        # such as Telugu — confirm the text renders, or pass an explicit font.
        draw.text((10, 10), "Question:" + question_text, fill='red')
        draw.text((10, 25), "Answer: "+answer_text, fill='red')

        image.save(os.path.join(OUT_IMG_DIR, f"{image_name}_{qna_count}.png"))

        # break
        