In [82]:
from datasets import load_dataset 

# Load the "nielsr/funsd-layoutlmv3" dataset through the `datasets` library.
# A later cell indexes funsd['train'] to compare it with the locally parsed data.
# this dataset uses the new Image feature :)
funsd = load_dataset("nielsr/funsd-layoutlmv3")

In [2]:
import os
import json
from collections import defaultdict
from PIL import Image

class CustomFunsdDataset:
    """In-memory loader for a FUNSD-style dataset stored on disk.

    ``data_dir`` is expected to contain ``training_data`` and ``testing_data``
    folders. Each of them holds an ``annotations`` folder with one JSON file
    per document and an ``images`` folder with a PNG of the same base name.

    Call :meth:`split_generators` to populate the splits, then index the
    object with ``"train"`` or ``"test"`` to get the list of sample dicts.
    """

    # Integer id for every BIO tag this loader can emit. Built once here
    # instead of once per annotation file.
    LABEL_MAP = {
        "O": 0,
        "B-HEADER": 1,
        "I-HEADER": 2,
        "B-QUESTION": 3,
        "I-QUESTION": 4,
        "B-ANSWER": 5,
        "I-ANSWER": 6,
    }

    def __init__(self, data_dir):
        """Remember the dataset root; no files are read until split_generators()."""
        self.data_dir = data_dir
        self.splits = defaultdict(list)  # For train/test split data

    def load_image(self, image_path):
        """Load an image from a given path."""
        image = Image.open(image_path)
        return image

    def parse_annotation(self, annotation_path):
        """Parse the JSON annotation file and return the decoded object."""
        with open(annotation_path, "r", encoding="utf8") as f:
            data = json.load(f)
        return data

    @staticmethod
    def _linked_ids(item, words, file):
        """Return the ids linked to ``item`` (its own id removed), or None if it has no links."""
        if len(item['linking']) == 0:
            return None

        linking = []
        for pair in item['linking']:
            linking.extend([it for it in pair if it != item['id']])

        if len(linking) == 0:
            # Every link pointed back at the entity itself; report it so the
            # annotation can be inspected by hand.
            print(item['linking'], linking, file, words)
        return linking

    def load_dataset(self, filepath):
        """Load every annotated document found under ``filepath``.

        Args:
            filepath: Split directory containing ``annotations`` and ``images``.

        Returns:
            A list with one dict per annotation file. Each dict has the keys
            ``id``, ``tokens``, ``line_boxes``, ``bboxes``, ``ner_tags``,
            ``line_ids``, ``linkings``, ``image`` and ``image_name``. The
            per-token lists are aligned index by index. All tokens of one
            entity share the same ``linkings`` list object (or ``None``).

        Raises:
            KeyError: If an entity label produces a tag missing from LABEL_MAP.
        """
        ann_dir = os.path.join(filepath, "annotations")
        img_dir = os.path.join(filepath, "images")

        # Only regular *.json files are annotations. Skipping everything else
        # keeps notebook/OS artefacts (.ipynb_checkpoints, .DS_Store, ...)
        # from crashing the loader or shifting the sample ids.
        annotation_files = [
            name
            for name in sorted(os.listdir(ann_dir))
            if name.lower().endswith(".json")
            and os.path.isfile(os.path.join(ann_dir, name))
        ]

        samples = []
        for guid, file in enumerate(annotation_files):
            tokens = []
            boxes = []
            ner_tags = []
            line_boxes = []
            line_ids = []
            linkings = []

            # Load annotation
            data = self.parse_annotation(os.path.join(ann_dir, file))

            # Load corresponding image. Swap only the extension: a plain
            # str.replace("json", "png") would also rewrite "json" inside the
            # base name.
            image_file = os.path.splitext(file)[0] + ".png"
            image = self.load_image(os.path.join(img_dir, image_file))

            # Extract tokens, boxes, and NER tags from annotation
            for item in data["form"]:
                label = item["label"]
                # Whitespace-only words carry no token and are dropped.
                words = [w for w in item["words"] if w["text"].strip() != ""]

                linking = self._linked_ids(item, words, file)
                line_id = item['id']

                if len(words) == 0:
                    continue

                for position, w in enumerate(words):
                    if label == "other":
                        tag = "O"
                    elif position == 0:
                        tag = "B-" + label.upper()
                    else:
                        tag = "I-" + label.upper()

                    tokens.append(w["text"])
                    ner_tags.append(self.LABEL_MAP[tag])
                    boxes.append(w["box"])
                    line_boxes.append(item['box'])
                    linkings.append(linking)
                    line_ids.append(line_id)

            assert (
                len(ner_tags) == len(tokens) == len(boxes) == len(line_boxes)
                == len(linkings) == len(line_ids)
            ), "Lengths of ner_tags, tokens, boxes, line_boxes, linkings, and line_ids must be equal."
            samples.append({
                "id": str(guid),
                "tokens": tokens,
                'line_boxes': line_boxes,
                "bboxes": boxes,
                "ner_tags": ner_tags,
                'line_ids': line_ids,
                'linkings': linkings,
                "image": image,
                'image_name': file
            })

        return samples

    def split_generators(self):
        """Load the ``training_data`` and ``testing_data`` folders into the splits."""
        train_dir = os.path.join(self.data_dir, "training_data")
        test_dir = os.path.join(self.data_dir, "testing_data")

        # Load train and test data
        self.splits["train"] = self.load_dataset(train_dir)
        self.splits["test"] = self.load_dataset(test_dir)

    def __repr__(self):
        """Customize the printed representation of the dataset."""
        train_size = len(self.splits["train"])
        test_size = len(self.splits["test"])

        return (
            f"CustomFunsdDataset:\n"
            f"DatasetDict({{\n"
            f"    train: Dataset({{features: ['id', 'tokens', 'line_boxes', 'bboxes', 'ner_tags','line_ids','linkings','image','image_name'], num_rows: {train_size}}}),\n"
            f"    test: Dataset({{features: ['id', 'tokens', 'line_boxes', 'bboxes', 'ner_tags','line_ids','linkings','image','image_name'], num_rows: {test_size}}})\n"
            f"}})"
        )

    def __getitem__(self, split):
        """Allow access to train or test splits like dataset['train']."""
        return self.splits[split]

# Example usage
# An empty data_dir makes the "training_data" / "testing_data" folders resolve
# relative to the current working directory.
data_dir = ""
dataset = CustomFunsdDataset(data_dir)

# Load the train and test splits
# (reads every annotation and image under both split folders)
dataset.split_generators()

# Print dataset summary
# (uses CustomFunsdDataset.__repr__, which reports the row count per split)
print(dataset)

# Access a specific sample
# NOTE: this prints the whole sample dict, including every token and box.
sample = dataset['train'][0]
print(sample)


CustomFunsdDataset:
DatasetDict({
    train: Dataset({features: ['id', 'tokens', 'line_boxes', 'bboxes', 'ner_tags','line_ids','linkings','image','image_name'], num_rows: 149}),
    test: Dataset({features: ['id', 'tokens', 'line_boxes', 'bboxes', 'ner_tags','line_ids','linkings','image','image_name'], num_rows: 50})
})
{'id': '0', 'tokens': ['R&D', ':', 'Suggestion:', 'Date:', 'Licensee', 'Yes', 'No', '597005708', 'R&D', 'QUALITY', 'IMPROVEMENT', 'SUGGESTION/', 'SOLUTION', 'FORM', 'Name', '/', 'Phone', 'Ext.', ':', 'M.', 'Hamann', 'P.', 'Harper,', 'P.', 'Martinez', '9/', '3/', '92', 'R&D', 'Group:', 'J.', 'S.', 'Wigand', 'Supervisor', '/', 'Manager', 'Discontinue', 'coal', 'retention', 'analyses', 'on', 'licensee', 'submitted', 'product', 'samples', '(Note', ':', 'Coal', 'Retention', 'testing', 'is', 'not', 'performed', 'by', 'most', 'licensees.', 'Other', 'B&W', 'physical', 'measurements', 'as', 'ends', 'stability', 'and', 'inspection', 'for', 'soft', 'spots', 'in', 'ciparettes', 'ar

In [233]:
# Spot check: last token of training sample 122 from the locally parsed dataset.
dataset['train'][122]['tokens'][-1]

'Telford'

In [234]:
# Same spot check against the `funsd` dataset loaded with load_dataset.
funsd['train'][122]['tokens'][-1]

'Telford'

In [3]:
# Display the first training sample in full (all of its fields).
dataset['train'][0]

{'id': '0',
 'tokens': ['R&D',
  ':',
  'Suggestion:',
  'Date:',
  'Licensee',
  'Yes',
  'No',
  '597005708',
  'R&D',
  'QUALITY',
  'IMPROVEMENT',
  'SUGGESTION/',
  'SOLUTION',
  'FORM',
  'Name',
  '/',
  'Phone',
  'Ext.',
  ':',
  'M.',
  'Hamann',
  'P.',
  'Harper,',
  'P.',
  'Martinez',
  '9/',
  '3/',
  '92',
  'R&D',
  'Group:',
  'J.',
  'S.',
  'Wigand',
  'Supervisor',
  '/',
  'Manager',
  'Discontinue',
  'coal',
  'retention',
  'analyses',
  'on',
  'licensee',
  'submitted',
  'product',
  'samples',
  '(Note',
  ':',
  'Coal',
  'Retention',
  'testing',
  'is',
  'not',
  'performed',
  'by',
  'most',
  'licensees.',
  'Other',
  'B&W',
  'physical',
  'measurements',
  'as',
  'ends',
  'stability',
  'and',
  'inspection',
  'for',
  'soft',
  'spots',
  'in',
  'ciparettes',
  'are',
  'thought',
  'to',
  'be',
  'sufficient',
  'measures',
  'to',
  'assure',
  'cigarette',
  'physical',
  'integrity.',
  'The',
  'proposed',
  'action',
  'will',
  'incre

In [240]:
# Preview training sample 25: file name, then token, line box, word box,
# line id and linked ids for at most the first 12 tokens.
sample_25 = dataset['train'][25]
print(sample_25['image_name'])
preview_count = min(len(sample_25['line_boxes']), 12)
for i in range(preview_count):
    print(sample_25['tokens'][i])
    print('line', sample_25['line_boxes'][i])
    print('box', sample_25['bboxes'][i])
    print('id', sample_25['line_ids'][i])
    print('link to', sample_25['linkings'][i])



0011906503.json
0465E
line [627, 51, 666, 65]
box [627, 51, 666, 65]
id 0
link to None
($000)
line [338, 126, 379, 142]
box [338, 126, 379, 142]
id 1
link to None
Title
line [49, 147, 83, 157]
box [49, 147, 83, 157]
id 2
link to [64]
Purpose
line [46, 176, 96, 188]
box [46, 176, 96, 188]
id 3
link to [65, 66, 67, 68, 69, 70, 71]
1
line [75, 193, 85, 204]
box [75, 193, 85, 204]
id 4
link to None
2
line [74, 207, 84, 219]
box [74, 207, 84, 219]
id 5
link to None
3
line [74, 221, 84, 235]
box [74, 221, 84, 235]
id 6
link to None
4
line [74, 236, 85, 247]
box [74, 236, 85, 247]
id 7
link to None
7
line [365, 237, 377, 252]
box [365, 237, 377, 252]
id 8
link to None
5
line [364, 197, 375, 211]
box [364, 197, 375, 211]
id 9
link to None
6
line [365, 212, 380, 222]
box [365, 212, 380, 222]
id 10
link to None
Status
line [46, 259, 91, 271]
box [46, 259, 91, 271]
id 11
link to [12, 13]
