In [4]:
import xml.etree.ElementTree as ET
import os

def parse_xml_annotations(xml_file):
    annotations = []

    tree = ET.parse(xml_file)
    root = tree.getroot()

    for image in root.findall('image'):
        filename = image.get('name')
        width = int(image.get('width'))
        height = int(image.get('height'))

        for obj in image.findall('box'):
            label = obj.get('label')
            xtl = float(obj.get('xtl'))
            ytl = float(obj.get('ytl'))
            xbr = float(obj.get('xbr'))
            ybr = float(obj.get('ybr'))
            text = obj.find('attribute').text.strip() if obj.find('attribute') is not None else ""

            annotations.append((filename, width, height, label, xtl, ytl, xbr, ybr, text))

        for obj in image.findall('polygon'):
            label = obj.get('label')
            points = obj.get('points')
            # You can parse the points string here if needed
            text = obj.find('attribute').text.strip() if obj.find('attribute') is not None else ""

            annotations.append((filename, width, height, label, points, text))

    return annotations

# Example usage
xml_file = 'dataset/annotations.xml'
annotations = parse_xml_annotations(xml_file)
for annotation in annotations:
    print(annotation)


('images/0.jpg', 612, 1023, 'shop', 153.76, 194.59, 481.86, 245.4, 'WALMART')
('images/0.jpg', 612, 1023, 'total', 351.28, 604.95, 525.83, 624.91, 'TOTAL 5.11')
('images/0.jpg', 612, 1023, 'item', 106.25, 517.3, 555.1, 541.35, 'FRAP 001200010451 F 5.48 N')
('images/0.jpg', 612, 1023, 'item', 104.28, 473.2, 556.78, 519.9, 'BANANAS 000000004011KF 0.41 lb @ 1 lb /0.49 0.20 N')
('images/0.jpg', 612, 1023, 'date_time', 214.25, 881.94, 437.99, 894.78, '08/20/10 13:12:01')
('images/0.jpg', 612, 1023, 'receipt', '39.15,93.53;186.88,67.24;270.27,54.55;332.35,48.66;447.00,54.10;537.19,63.62;593.83,73.13;609.24,67.24;612.00,71.84;612.00,784.65;605.62,862.11;596.10,926.01;589.75,936.89;514.98,931.90;388.54,926.01;196.85,918.31;91.72,913.32;83.56,895.65;72.68,852.60;67.24,800.48;87.64,776.01;118.45,725.71;148.82,680.39;155.61,643.23;153.35,605.16;144.28,587.49;133.41,571.63;108.94,562.56;91.72,561.66;71.78,582.96;50.93,593.38;44.13,521.33;34.16,431.14;25.10,325.10;21.93,270.72;21.93,220.87;25.10,15

In [5]:
import xml.etree.ElementTree as ET
import csv

def parse_xml_to_csv(xml_file, csv_file):
    tree = ET.parse(xml_file)
    root = tree.getroot()

    # Open or create the CSV file
    with open(csv_file, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        # Write the header row
        writer.writerow(['Image ID', 'Label', 'Attributes', 'Points'])

        for image in root.iter('image'):
            image_id = image.attrib['id']
            for box in image.findall('box'):
                label = box.attrib['label']
                xtl = box.attrib.get('xtl')
                ytl = box.attrib.get('ytl')
                xbr = box.attrib.get('xbr')
                ybr = box.attrib.get('ybr')
                rotation = box.attrib.get('rotation', '0')
                attributes = {attribute.attrib['name']: attribute.text for attribute in box.findall('attribute')}
                writer.writerow([image_id, label, attributes, f"({xtl}, {ytl}), ({xbr}, {ybr}), rotation={rotation}"])

            for polygon in image.findall('polygon'):
                label = polygon.attrib['label']
                points = polygon.attrib.get('points')
                writer.writerow([image_id, label, '{}', points])

# Usage
xml_file = 'dataset/annotations.xml'
csv_file = 'dataset/receipt.csv'
parse_xml_to_csv(xml_file, csv_file)


# box file

In [10]:
import xml.etree.ElementTree as ET

def parse_xml_annotations(xml_file):
    tree = ET.parse(xml_file)
    root = tree.getroot()
    annotations = []
    for image in root.findall('image'):
        for box in image.findall('box'):
            label = box.get('label')
            xmin = float(box.get('xtl'))
            ymin = float(box.get('ytl'))
            xmax = float(box.get('xbr'))
            ymax = float(box.get('ybr'))
            # Find the 'attribute' element with the name 'text' and get its text
            text = None
            for attribute in box.findall('attribute'):
                if attribute.get('name') == 'text':
                    text = attribute.text
                    break
            annotations.append({'label': label, 'xmin': xmin, 'ymin': ymin, 'xmax': xmax, 'ymax': ymax, 'text': text})
    return annotations

def write_box_file(annotations, output_file):
    with open(output_file, 'w') as f:
        for ann in annotations:
            # Some annotations may not have text; handle this case.
            text = ann['text'] if ann['text'] is not None else ""
            line = f"{ann['label']} {ann['xmin']} {ann['ymin']} {ann['xmax']} {ann['ymax']} 0 {text}\n"
            f.write(line)

annotations = parse_xml_annotations('dataset/annotations.xml')
write_box_file(annotations, 'dataset/annotations.box')


In [7]:
import csv
import re

def parse_csv_annotations(csv_file):
    annotations = []
    with open(csv_file, 'r') as file:
        reader = csv.DictReader(file)
        for row in reader:
            label = row['Label']
            attributes = eval(row['Attributes']) if row['Attributes'] else {}
            text = attributes.get('text', '')

            # Using regex to match coordinates before "rotation" part
            points_str = row['Points']
            matches = re.match(r"\(([^)]+)\), \(([^)]+)\)", points_str)
            if matches:
                point1 = matches.group(1)
                point2 = matches.group(2)
                xmin, ymin = map(float, point1.split(', '))
                xmax, ymax = map(float, point2.split(', '))
                xmin, ymin, xmax, ymax = map(int, [xmin, ymin, xmax, ymax])

                annotations.append({'ImageID': row['Image ID'], 'label': label, 'xmin': xmin, 'ymin': ymin, 'xmax': xmax, 'ymax': ymax, 'text': text})
    return annotations

def write_box_file(annotations, output_file):
    with open(output_file, 'w') as f:
        for ann in annotations:
            # Tesseract .box file format requires coordinates and text for each bounding box
            # Coordinates need to be adjusted if working with Tesseract's training process
            line = f"{ann['xmin']} {ann['ymin']} {ann['xmax']} {ann['ymax']} {ann['text']}\n"
            f.write(line)

annotations = parse_csv_annotations('dataset/receipt.csv')
write_box_file(annotations, 'dataset/receipt.box')


# TIF

In [14]:
def print_box_file_content(box_file):
    """
    Print the contents of a .box file.
    
    Args:
    - box_file (str): Path to the .box file.
    """
    with open(box_file, 'r') as f:
        box_content = f.read()
    print(f"Contents of {box_file}:")
    print(box_content)

# Paths to the .box files
receipt_box_file = 'dataset/receipt.box'
annotations_box_file = 'dataset/annotations.box'

# Print the contents of the .box files
print_box_file_content(receipt_box_file)
print_box_file_content(annotations_box_file)


Contents of dataset/receipt.box:
153 194 481 245 WALMART
351 604 525 624 TOTAL 5.11
106 517 555 541 FRAP 001200010451 F 5.48 N
104 473 556 519 BANANAS 000000004011KF 0.41 lb @ 1 lb /0.49 0.20 N
214 881 437 894 08/20/10 13:12:01
144 774 529 790 TOTAL $38.68
131 427 467 445 MINI-PEARL TOMATOES.. 2.49
133 446 468 464 PKG SHREDDED MOZZARELLA LITE T 3.99
135 464 468 482 EGGS 1 DOZ ORGANIC BROWN. 3.79
134 482 469 500 BEANS GARBANZO 0.89
137 573 476 610 GROCERY NON TAXABLE 0.98 2@ 0.49
134 516 471 535 A-AVOCADOS HASS BAG 4CT 3.99
135 499 469 517 SPROUTED CA STYLE 2.99
137 612 479 648 BANANAS ORGANIC 0.87 3EA @ 0.29/EA
138 650 483 669 CREAMY SALTED PEANUT BUTTER 2.49
139 670 484 688 WHL WHT PITA BREAD 1.69
140 690 487 728 GROCERY NON TAXABLE 1.38 2 2@ 0.69
124 331 465 349 R-CARROTS SHREDDED 10 OZ 1.29
190 138 495 166 TRADER JOE'S
135 535 474 553 A-APPLE BAG JAZZ 2 LB 2.99
135 553 474 571 A-PEPPER BELL EACH XL RED 0.99
125 350 466 368 R-CUCUMBERS PERSIAN 1 LB 1.99
127 370 466 388 TOMATOES CRUSH

In [15]:
def adjust_box_for_tesseract(box_file_path, output_file_path):
    with open(box_file_path, 'r') as box_file, open(output_file_path, 'w') as output_file:
        line_count = 0
        char_count = 0
        for line in box_file:
            parts = line.strip().split(' ')
            if len(parts) < 6:  # Check if the line has enough parts
                print(f"Skipping malformed line: {line.strip()}")
                continue
            left, bottom, right, top = map(int, parts[:4])
            text = ' '.join(parts[4:])
            page_number = 0  # Assuming a single page document; adjust if different.

            # Calculate the width of each character's bounding box.
            text_length = len(text)
            width_per_char = (right - left) / text_length

            for i, char in enumerate(text):
                char_left = int(left + i * width_per_char)
                char_right = int(left + (i + 1) * width_per_char)
                # Writing each character with its bounding box to the new box file.
                output_line = f"{char} {char_left} {bottom} {char_right} {top} {page_number}\n"
                output_file.write(output_line)
                char_count += 1
            line_count += 1

        print(f"Processed {line_count} lines and {char_count} characters.")

# Specify the input .box file and the output file for adjusted content.
input_box_file = 'dataset/receipt.box'
output_box_file = 'dataset/adjusted_receipt.box'

# Call the function with your box file and specify the output file name.
adjust_box_for_tesseract(input_box_file, output_box_file)

print(f"Adjusted .box file saved to: {output_box_file}")


Skipping malformed line: 153 194 481 245 WALMART
Skipping malformed line: 280 276 411 301 SPAR
Skipping malformed line: 220 225 541 271 WALMART
Skipping malformed line: 216 105 426 146 Walmart
Skipping malformed line: 872 503 1380 599 Walmart
Skipping malformed line: 253 196 432 237 Walmart
Skipping malformed line: 148 273 426 327 Walmart
Skipping malformed line: 320 264 528 302 Walmart
Skipping malformed line: 295 135 401 156 Walmart
Skipping malformed line: 304 317 421 342 Walmart
Skipping malformed line: 160 82 264 102 Waalmart
Skipping malformed line: 275 109 379 134 Walmart
Skipping malformed line: 120 21 285 53 Walmart
Skipping malformed line: 149 273 427 327 Walmart
Processed 220 lines and 6115 characters.
Adjusted .box file saved to: dataset/adjusted_receipt.box


In [16]:
from PIL import Image
import os

def convert_images_to_tiff(source_dir, dest_dir, image_extension):
    if not os.path.exists(dest_dir):
        os.makedirs(dest_dir)

    for image_name in os.listdir(source_dir):
        if image_name.endswith(image_extension):
            file_path = os.path.join(source_dir, image_name)
            img = Image.open(file_path)
            tiff_path = os.path.join(dest_dir, os.path.splitext(image_name)[0] + '.tif')
            img.save(tiff_path, 'TIFF')

# Example usage:
convert_images_to_tiff('dataset/images', 'dataset/output_tif', '.jpg')  # or '.png' as needed
