In [5]:
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt_tab')
import spacy
nlp = spacy.load('en_core_web_sm')


[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\PX\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


OSError: [E050] Can't find model 'en_core_web_sm'. It doesn't seem to be a Python package or a valid path to a data directory.

In [49]:
import os
import json
from collections import defaultdict
from PIL import Image

class CustomFunsdDataset:
    def __init__(self, data_dir):
        self.data_dir = data_dir
        self.splits = defaultdict(list)  # For train/test split data
    
    def load_image(self, image_path):
        """Load an image from a given path."""
        image = Image.open(image_path)
        return image

    def parse_annotation(self, annotation_path):
        """Parse the JSON annotation file."""
        with open(annotation_path, "r", encoding="utf8") as f:
            data = json.load(f)
        return data

    def load_dataset(self, filepath):
        """Load and store each sample in the dataset."""
        ann_dir = os.path.join(filepath, "surya_results")
        img_dir = os.path.join(filepath, "images")

        samples = []
        for guid, file in enumerate(sorted(os.listdir(ann_dir))):
            tokens = []
            boxes = []

            # Load annotation
            file_path = os.path.join(ann_dir, file)
            data = self.parse_annotation(file_path)

            # Load corresponding image
            image_file = file.replace("json", "png")
            image_path = os.path.join(img_dir, image_file)
            image = self.load_image(image_path)

            for item in data['text_lines']: 
                wt = nlp.tokenizer(item['text'])    
                tokens.extend(wt)
                # Rejoin tokens that were split by slashes
                split_them = ['/', ':']  # Punctuation to merge with adjacent tokens
                not_split = ['&']        # Punctuation to remain unsplit
                for i in range(len(wt)):
                    boxes.append(item['bbox'])
            assert len(tokens) == len(boxes)  , "Lengths of ner_tags, tokens, and boxes must be equal."

            samples.append({
                "id": str(guid),
                "tokens": tokens,
                "bboxes": boxes,
                "image": image,
                'image_name':file
            })

        return samples

    def split_generators(self):
        """Return train and test splits."""
        train_dir = os.path.join(self.data_dir, "training_data")
        test_dir = os.path.join(self.data_dir, "testing_data")

        # Load train and test data
        self.splits["train"] = self.load_dataset(train_dir)
        self.splits["test"] = self.load_dataset(test_dir)

    def __repr__(self):
        """Customize the printed representation of the dataset."""
        train_size = len(self.splits["train"])
        test_size = len(self.splits["test"])

        return (
            f"CustomFunsdDataset:\n"
            f"DatasetDict({{\n"
            f"    train: Dataset({{features: ['id', 'tokens', 'line_boxes', 'bboxes', 'ner_tags','line_ids','linkings','image','image_name'], num_rows: {train_size}}}),\n"
            f"    test: Dataset({{features: ['id', 'tokens', 'line_boxes', 'bboxes', 'ner_tags','line_ids','linkings','image','image_name'], num_rows: {test_size}}})\n"
            f"}})"
        )

    def __getitem__(self, split):
        """Allow access to train or test splits like dataset['train']."""
        return self.splits[split]

# Example usage
data_dir = ""
dataset = CustomFunsdDataset(data_dir)

# Load the train and test splits
dataset.split_generators()

dataset

CustomFunsdDataset:
DatasetDict({
    train: Dataset({features: ['id', 'tokens', 'line_boxes', 'bboxes', 'ner_tags','line_ids','linkings','image','image_name'], num_rows: 149}),
    test: Dataset({features: ['id', 'tokens', 'line_boxes', 'bboxes', 'ner_tags','line_ids','linkings','image','image_name'], num_rows: 50})
})

In [50]:
dataset['train'][0]

{'id': '0',
 'tokens': [R&D,
  QUALITY,
  IMPROVEMENT,
  SUGGESTION,
  /,
  SOLUTION,
  FORM,
  Name,
  /,
  Phone,
  Ext,
  .,
  :,
   ,
  M.,
  Hamann,
  ,,
  P.,
  Harper,
  ,,
  P.,
  Martinez,
      ,
  Date,
  :,
   ,
  _,
  9/3/92,
  _,
  R&D,
  Group,
  :,
  _,
  Licensee,
  Supervisor,
  /,
  Manager,
  :,
  J.,
  S.,
  Wigand,
  Discontinue,
  coal,
  retention,
  analyses,
  on,
  licensee,
  submitted,
  Suggestion,
  :,
  product,
  samples,
  .,
  (,
  Note,
  :,
  Coal,
  Retention,
  testing,
  is,
  not,
  performed,
  by,
  most,
  licensees,
  .,
  Other,
  B&W,
  physical,
  measurements,
  as,
  ends,
  stability,
  and,
  inspection,
  for,
  soft,
  spots,
  in,
  cigarettes,
  are,
  thought,
  to,
  be,
  sufficient,
  measures,
  to,
  assure,
  cigarette,
  physical,
  integrity,
  .,
  The,
  proposed,
  action,
  will,
  Increase,
  laboratory,
  productivity,
  .,
  ),
  Suggested,
  Solution(s,
  ):,
   ,
  Delete,
  coal,
  retention,
  from,
  the,
  li