In [None]:
# Install the tesseract-ocr system package via apt-get; pytesseract (imported
# below) is used for OCR later in this notebook and presumably needs this binary.
!sudo apt-get install tesseract-ocr

In [None]:
# Python dependencies for this notebook: UI (gradio), OCR bindings (pytesseract),
# image handling (opencv), the Groq LangChain integration, json5 and LangChain itself.
# NOTE(review): versions are unpinned, so a fresh run may pull newer releases.
%pip install gradio pytesseract opencv-python-headless
%pip install -qU langchain-groq
%pip install -qU json5
%pip install langchain
%pip install langchain-community

In [11]:
import numpy as np
import gradio as gr
import pytesseract
from pytesseract import Output
import cv2
import numpy as np
from PIL import Image, ImageDraw

In [12]:
# Resolve the Groq API key. Inside Colab it comes from the notebook's secret
# store; anywhere else (where google.colab cannot be imported) fall back to the
# GROQ_API_KEY environment variable so the notebook still runs top to bottom.
try:
    from google.colab import userdata
except ImportError:
    import os
    groq_key = os.environ.get('GROQ_API_KEY')
else:
    groq_key = userdata.get('GROQ_API_KEY')

In [13]:
from langchain_groq import ChatGroq

# Settings for the shared Groq chat model used by the extraction chain.
# temperature=0 keeps the extraction output as repeatable as the backend allows.
# NOTE(review): confirm the model id below is still served by Groq.
groq_chat_settings = {
    "model": "mixtral-8x7b-32768",
    "temperature": 0,
    "max_tokens": None,
    "timeout": None,
    "max_retries": 2,
    "groq_api_key": groq_key,
}
llm = ChatGroq(**groq_chat_settings)

In [15]:
import os
import json5 as json
import pandas as pd
from time import sleep
from langchain import PromptTemplate, LLMChain
from langchain.llms import OpenAI

In [16]:
def get_extracted_json(str_data):
    """Ask the LLM to extract the key invoice fields from OCR text.

    The text is substituted into a one-shot prompt (one worked example) that
    requests Customer Name, Invoice No., Invoice Date and Total Amount in a
    fixed JSON layout, and the prompt is run through an ``LLMChain`` backed by
    the module-level ``llm``.

    Args:
        str_data: Invoice text to analyse; it fills the ``{invoice_text}``
            placeholder of the prompt.

    Returns:
        Whatever ``LLMChain.run`` returns for the prompt (the caller in this
        notebook passes it to ``extract_json``, so presumably the raw
        completion text).
    """
    # Doubled braces are literal braces for the prompt template; only
    # {invoice_text} is a substitution slot.
    extraction_template = """
    You are an assistant that extracts information from invoices. Given the following text input, extract the Customer Name, Invoice No., Invoice Date, and Total Amount. Return the extracted information in a JSON format as shown below:

    Input:
    {invoice_text}

    Output format:
    {{
      "Customer Name": {{
        "value": "<Customer Name>"
      }},
      "Invoice No.": {{
        "value": "<Invoice No.>"
      }},
      "Invoice Date": {{
        "value": "<Invoice Date>"
      }},
      "Total Amount": {{
        "value": "<Total Amount>"
      }}
    }}

    Example Input:
    "Invoice no: 40378170 Date of issue: Seller: Patel, Thompson and Montgomery 356 Kyle Vista New James, MA 46228 Tax Id: 958-74-3511 IBAN: GB77WRBQ31965128414006 ITEMS No. Description Qty 1. Leed's Wine Companion Bottle 1,00 Corkscrew Opener Gift Box Set with Foil Cutter SUMMARY VAT [%] 10% Total 10/15/2012 Client: Jackson, Odonnell and Jackson 267 John Track Suite 841 Jenniferville, PA 98601 Tax Id: 998-87-7723 UM Net price Net worth VAT [%] each 7,50 7,50 10% Net worth VAT 7,50 0,75 $ 7,50 $ 0,75 Gross worth 8,25 Gross worth 8,25 $ 8,25"

    Example Output:
    {{
      "Customer Name": {{
        "value": "Jackson, Odonnell and Jackson"
      }},
      "Invoice No.": {{
        "value": "40378170"
      }},
      "Invoice Date": {{
        "value": "10/15/2012"
      }},
      "Total Amount": {{
        "value": "$ 8,25"
      }}
    }}

    Now, provide the output for the given input.
    """
    invoice_prompt = PromptTemplate(
        input_variables=["invoice_text"],
        template=extraction_template,
    )
    extraction_chain = LLMChain(prompt=invoice_prompt, llm=llm)
    return extraction_chain.run(invoice_text=str_data)


In [17]:
def extract_json(text):
  """Extracts the JSON object embedded in a text string.

  Everything from the first '{' to the last '}' is handed to the ``json``
  module bound in this notebook, so any prose or code fences an LLM wraps
  around the object are ignored.

  Args:
      text (str): The text string containing the JSON data.

  Returns:
      The parsed data (a dict when the span is a well-formed object).

  Raises:
      ValueError: If ``text`` contains no '{' ... '}' span. Errors raised by
          the parser for a malformed span propagate unchanged.
  """
  start = text.find('{')
  end = text.rfind('}')
  # Without this guard the slice below would be empty and the parser would
  # fail with an unhelpful "empty input" style error.
  if start == -1 or end < start:
    raise ValueError("No JSON object found in text")
  return json.loads(text[start:end + 1])

In [18]:
def update_json(json_data, coordinates):
  """Attaches a 'coordinate' entry to every field of an extraction result.

  Args:
    json_data: Mapping of field name to a dict holding at least a 'value' key.
    coordinates: Mapping of paragraph text to its bounding-box coordinates.

  Returns:
    A new dict with the same keys; each field is a shallow copy of the input
    field plus a 'coordinate' key. The coordinate is looked up by the field's
    value (by "Invoice no: <value>" for the 'Invoice No.' field) and is the
    string "NA" when there is no exact match.
  """
  def _with_coordinate(field_name, field):
    enriched = field.copy()
    raw_value = field['value']
    # The invoice number is matched against the full "Invoice no: ..." text.
    lookup_text = f"Invoice no: {raw_value}" if field_name == 'Invoice No.' else raw_value
    enriched['coordinate'] = coordinates.get(lookup_text, "NA")
    return enriched

  return {
      field_name: _with_coordinate(field_name, field)
      for field_name, field in json_data.items()
  }

In [19]:
def ocr_using_teseract(image, line_gap=10):
    """
    Extracts text from an image using Tesseract OCR, grouped into paragraphs.

    Recognised words are accumulated into the current paragraph. A paragraph is
    closed whenever the entry following an accepted word has a `top` value that
    differs from that word's `top` by more than `line_gap`.

    Args:
        image: The input image, passed unchanged to `pytesseract.image_to_data`.
        line_gap: Maximum difference between consecutive `top` values (in the
            units Tesseract reports them) that still counts as the same
            paragraph. Defaults to 10, the previously hard-coded threshold.

    Returns:
        A list of dictionaries, where each dictionary represents a paragraph extracted from the image.
        Each dictionary contains two keys:
            - `coordinates`: A list of four numbers representing the bounding box coordinates (x1, y1, x2, y2) of the paragraph.
            - `text`: The paragraph's words, each preceded by a single space
              (so the text starts with a space).
    """
    data = pytesseract.image_to_data(image, output_type=Output.DICT)

    def _new_paragraph():
        # inf/0 seeds let the first word's box win every min()/max() below.
        return {'coordinates': [float('inf'), float('inf'), 0, 0], 'text': ""}

    paragraphs = []
    current_paragraph = _new_paragraph()
    last_index = len(data['text']) - 1

    for i, raw_text in enumerate(data['text']):
        text = raw_text.strip()
        if not text:
            continue
        # Confidence may arrive as an int, a float, or a decimal string such as
        # '96.5'; int() alone raises ValueError on the latter, so go via float.
        if int(float(data['conf'][i])) <= 0:
            continue

        x, y, w, h = data['left'][i], data['top'][i], data['width'][i], data['height'][i]
        box = current_paragraph['coordinates']
        box[0] = min(box[0], x)      # x1
        box[1] = min(box[1], y)      # y1
        box[2] = max(box[2], x + w)  # x2
        box[3] = max(box[3], y + h)  # y2
        current_paragraph['text'] += " " + text

        # A large vertical jump to the next entry ends the current paragraph.
        if i < last_index and abs(data['top'][i + 1] - y) > line_gap:
            paragraphs.append(current_paragraph)
            current_paragraph = _new_paragraph()

    if current_paragraph['text']:
        paragraphs.append(current_paragraph)
    return paragraphs

In [20]:
def get_context(paragraphs):
    """
    Builds the LLM context string and a text-to-coordinates lookup.

    Args:
        paragraphs (list): Dictionaries with a 'text' and a 'coordinates' entry each.

    Returns:
        tuple: A tuple containing:
            - Every paragraph's stripped text, each followed by " \t\t", concatenated in order.
            - A dictionary mapping stripped paragraph text to its coordinates
              (for repeated text the last paragraph's coordinates are kept).
    """
    cleaned_texts = [entry['text'].strip() for entry in paragraphs]
    text_to_box = {
        cleaned: entry['coordinates']
        for cleaned, entry in zip(cleaned_texts, paragraphs)
    }
    joined_context = "".join(f"{cleaned} \t\t" for cleaned in cleaned_texts)
    return joined_context, text_to_box

In [44]:
def ocr_with_bounding_boxes(image: np.ndarray):
    """Runs OCR + LLM field extraction on an image and annotates the located fields.

    Pipeline: OCR the image into paragraphs, send the concatenated text to the
    LLM, parse its JSON answer, attach paragraph coordinates to each extracted
    field, then draw a labelled rectangle for every field that was located.

    Args:
        image: A NumPy array representing the image to be processed; it is
            converted with COLOR_RGB2BGR before OCR and drawing.

    Returns:
        A tuple containing:
            - The image (converted back with COLOR_BGR2RGB) with bounding boxes
              and field-name labels drawn on it.
            - A strict-JSON string of the extracted fields and their
              coordinates ("NA" for fields that could not be located).
    """
    # The notebook binds `json` to json5, whose dumps() emits JSON5 (unquoted
    # keys, trailing commas). Use the stdlib encoder so the textbox labelled
    # "Extracted JSON" really contains JSON.
    import json as std_json

    open_cv_image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)

    data = ocr_using_teseract(open_cv_image)

    context, dict_of_paragraphs = get_context(data)
    json_result = get_extracted_json(context)
    json_data = extract_json(json_result)
    updated_json = update_json(json_data, dict_of_paragraphs)
    print(updated_json)

    for key, value in updated_json.items():
        coords = value['coordinate']
        # update_json stores the placeholder string "NA" when no paragraph
        # matched; indexing that into cv2.rectangle would raise, so skip it.
        if not isinstance(coords, (list, tuple)) or len(coords) != 4:
            continue

        cv2.rectangle(open_cv_image, (coords[0], coords[1]), (coords[2], coords[3]), color=(255, 0, 0), thickness=2)

        # Label the box with the field name, slightly above its top-left corner.
        text_position = (coords[0], coords[1] - 20)
        cv2.putText(open_cv_image, key, text_position, cv2.FONT_HERSHEY_SIMPLEX, 1.5, (255, 0, 0), 2, cv2.LINE_AA)

    result_image = cv2.cvtColor(open_cv_image, cv2.COLOR_BGR2RGB)
    json_output = std_json.dumps(updated_json, indent=4)
    return result_image, json_output

# Gradio UI: one image in, the annotated image plus the extracted fields out.
uploaded_image = gr.Image(type="numpy")
annotated_image = gr.Image(type="numpy")
extracted_json_box = gr.Textbox(label="Extracted JSON")

interface = gr.Interface(
    fn=ocr_with_bounding_boxes,
    inputs=uploaded_image,
    outputs=[annotated_image, extracted_json_box],
    title="OCR with Bounding Boxes and Text Extraction",
    description="Upload an image to extract text using OCR and visualize bounding boxes.",
)

interface.launch()


Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://77525079127e49245f.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


