In [1]:
import requests

import torch
from PIL import Image
from transformers import AutoProcessor, AutoModelForCausalLM 


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
device = "cuda:0" if torch.cuda.is_available() else "cpu"
device

'cuda:0'

In [3]:


torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

model = AutoModelForCausalLM.from_pretrained("microsoft/Florence-2-large-ft", torch_dtype=torch_dtype, trust_remote_code=True).to(device)
processor = AutoProcessor.from_pretrained("microsoft/Florence-2-large-ft", trust_remote_code=True)




In [4]:
import os 
# Path to the folder containing images
training_images_folder = 'dataset/training_data/images'

# List all files in the image folder
training_files = [f for f in os.listdir(training_images_folder) if f.endswith(('png', 'jpg', 'jpeg'))]


training_images = {}
# Loop through and open images
for training_file in training_files:
    image_path = os.path.join(training_images_folder, training_file)
    img = Image.open(image_path)
    
    training_images[training_file] = img

training_images_names = list(training_images.keys())

In [5]:
len(training_images)

149

In [6]:
import os
from PIL import Image

# Path to the folder containing images
testing_images_folder = 'dataset/testing_data/images'

# List all files in the image folder
testing_files = [f for f in os.listdir(testing_images_folder) if f.endswith(('png', 'jpg', 'jpeg'))]

testing_images = {}
# Loop through and open images
for testing_file in testing_files:
    image_path = os.path.join(testing_images_folder, testing_file)
    img = Image.open(image_path)
    
    testing_images[testing_file] = img
testing_images_names = list(testing_images.keys())

In [7]:
def quad_to_rect(quad_box):
    """
    Convert a single quadrilateral box to a rectangular bounding box.

    Args:
        quad_box (list): List of 8 coordinates [x1, y1, x2, y2, x3, y3, x4, y4] representing
                         the corners of a quadrilateral.

    Returns:
        list: List of 4 coordinates [x_min, y_min, x_max, y_max] representing the smallest enclosing rectangle.
    """
    # Extract coordinates
    x1, y1, x2, y2, x3, y3, x4, y4 = quad_box
    
    # Calculate the min and max coordinates to form the rectangular bounding box
    x_min = min(x1, x2, x3, x4)
    y_min = min(y1, y2, y3, y4)
    x_max = max(x1, x2, x3, x4)
    y_max = max(y1, y2, y3, y4)
    
    # Return the rectangular bounding box
    return [x_min, y_min, x_max, y_max]

# Example usage
quad_box = [50, 30, 200, 40, 190, 110, 40, 100]  # Example quadrilateral box

rect_box = quad_to_rect(quad_box)
print(rect_box)


[40, 30, 200, 110]


In [8]:
def florence_ocr(image):
    prompt = "<OCR_WITH_REGION>"


    inputs = processor(text=prompt, images=image, return_tensors="pt").to(device, torch_dtype)
    
    generated_ids = model.generate(
        input_ids=inputs["input_ids"],
        pixel_values=inputs["pixel_values"], 
        max_new_tokens=5000,
        num_beams=2,
        do_sample=False
    )
    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=False)[0]

    parsed_answer = processor.post_process_generation(generated_text, task=prompt, image_size=(image.width, image.height))
    
    result = {
        "text_lines": [],
    }
    
    
    for box, label in zip(parsed_answer['<OCR_WITH_REGION>']['quad_boxes'] , parsed_answer['<OCR_WITH_REGION>']['labels']):
        rect_box = quad_to_rect(box)
         # Loop through the text lines
        result["text_lines"].append({
            "text": label,
            "bbox": rect_box
        })    
    return result



In [9]:
from tqdm import tqdm

def run_ocr_on_images(images_dict, langs =  ["en"]):
    images = list(images_dict.values())
    names = list(images_dict.keys())
   
    predictions = {}
    for i, image in tqdm(enumerate(images), total=len(images)):
        if len(image.getbands()) == 1:  # Grayscale images have only one band
            image = image.convert("RGB")
        
            
        prediction = florence_ocr(image)
        predictions[names[i]] = prediction
        del prediction  # Delete prediction to free memory
        torch.cuda.empty_cache()  # Clear unused memory cache

    return predictions 


In [10]:
for k,v in training_images.items():
    if k == '93380187.png':
        ex = {k:v}
        break
ex,list(ex.values())[0].convert("RGB").size 

({'93380187.png': <PIL.PngImagePlugin.PngImageFile image mode=L size=754x1000>},
 (754, 1000))

In [11]:
testing_images_ocr = run_ocr_on_images(testing_images)

  attn_output = torch.nn.functional.scaled_dot_product_attention(
 12%|█▏        | 6/50 [01:03<08:08, 11.09s/it]This is a friendly reminder - the current text generation call will exceed the model's predefined maximum length (1024). Depending on the model, you may observe exceptions, performance degradation, or nothing at all.
 14%|█▍        | 7/50 [01:31<09:21, 13.06s/it]


RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [None]:
training_images_ocr = run_ocr_on_images(training_images)

In [119]:
import os
import json

# Path to the folder where results will be stored
testing_data_folder = 'dataset/testing_data'
surya_results_folder = os.path.join(testing_data_folder, 'florence_ocr')

# Create the directory if it doesn't exist
if not os.path.exists(surya_results_folder):
    os.makedirs(surya_results_folder)


testing_data_folder = 'dataset/training_data'
surya_results_folder = os.path.join(testing_data_folder, 'florence_ocr')

# Create the directory if it doesn't exist
if not os.path.exists(surya_results_folder):
    os.makedirs(surya_results_folder)


In [None]:
import json
c=0
for key , val in testing_images_ocr.items():
    file_path = os.path.join('dataset/testing_data/florence_ocr', key.split('.')[0]+'.json')
    # Write the dictionary to a JSON file
    with open(file_path, 'w') as json_file:
        json.dump(val, json_file, indent=4)
    c+=1
c

In [None]:
training_images_ocr.items()

In [None]:
c = 0
for key , val in training_images_ocr.items():
    file_path = os.path.join('dataset/training_data/florence_ocr', key.split('.')[0]+'.json')
    with open(file_path, 'w') as json_file:
        json.dump(val, json_file, indent=4)
    c+=1
c

In [74]:
from PIL import Image, ImageDraw, ImageFont
import numpy as np 
import random


colormap = ['blue','orange','green','purple','brown','pink','gray','olive','cyan','red',
            'lime','indigo','violet','aqua','magenta','coral','gold','tan','skyblue']
def draw_ocr_bboxes(image, prediction, scale=1):
    image_cp = image.copy()
    draw = ImageDraw.Draw(image_cp)
    bboxes, labels = prediction['quad_boxes'], prediction['labels']
    for box, label in zip(bboxes, labels):
        color = random.choice(colormap)
        new_box = (np.array(box) * scale).tolist()
        draw.polygon(new_box, width=3, outline=color)
        draw.text((new_box[0]+8, new_box[1]+2),
                    "{}".format(label),
                    align="right",
        
                    fill=color)
       
    display(image_cp)
    return image_cp

In [None]:
image

In [None]:
image_boxes = draw_ocr_bboxes(image , parsed_answer['<OCR_WITH_REGION>'])