In [1]:
import json
import pandas as pd
import os
from PIL import Image

In [2]:
from glob import glob
from tqdm import tqdm

## Helper functions to extract bounding box and other data from the input annotations JSON 
Source: https://www.kaggle.com/code/alejopaullier/benetech-create-bounding-box-dataframe
(Added some comments)

In [3]:
def extract_plot_bb(data):
    # Extract the bounding box coordinates (x0, y0, width, height) of the plot from the data dictionary
    x0 = data["plot-bb"]["x0"]
    y0 = data["plot-bb"]["y0"]
    w = data["plot-bb"]["width"]
    h = data["plot-bb"]["height"]
    return [x0, y0, w, h, "plot-bb"]


def extract_label_bbox(data, rows, img_id):
    common = []
    common += [img_id, data["source"], data["chart-type"]]
    
    # Iterate over each box in the "text" key of the data dictionary
    for box in data["text"]:
        # Extract the bounding box coordinates (x0, y0, width, height) and label of the text box
        x0 = box["polygon"]["x0"]
        y0 = box["polygon"]["y0"]
        w = box["polygon"]["x1"] - box["polygon"]["x0"]
        h = box["polygon"]["y3"] - box["polygon"]["y0"]
        label = box["role"]
        
        # Create a list containing the common metadata along with the extracted bounding box information and label
        box_row = common + [x0, y0, w, h, label]
        rows.append(box_row)
    
    # Append the bounding box information of the plot itself to the rows list
    rows.append(common + extract_plot_bb(data)) 
    return rows


def extract_tick_bbox(data, rows, img_id):
    common = []
    common += [img_id, data["source"], data["chart-type"]]
    
    # Iterate over each axis (x-axis and y-axis) in the "axes" key of the data dictionary
    for axis in data["axes"].keys():
        # Iterate over each tick box in the "ticks" key of the axis
        for box in data["axes"][axis]["ticks"]:
            # Extract the bounding box coordinates (x0, y0, width, height) and label of the tick box
            x0 = box["tick_pt"]["x"] - 5
            y0 = box["tick_pt"]["y"] - 5
            w = 10
            h = 10
            label = axis + "-tick"
            
            # Create a list containing the common metadata along with the extracted bounding box information and label
            box_row = common + [x0, y0, w, h, label]
            rows.append(box_row)
    
    return rows

Read all the annotations in the given JSON format and create a dataframe of the format specified below before 
converting them to COCO format to be used in *detectron2* and *layoutparser*

In [4]:
data_dir = '/home/mhmdzm/projects/kaggle/benetech/data/'
ANNOTATION = data_dir + "/train/annotations/*.json"
rows = []
for file_name in tqdm(glob(ANNOTATION)):
    label_bbox = []
    tick_bbox = []
    with open(file_name) as f:
        data = json.load(f)
        img_id = file_name.split("/")[-1].split(".")[0]
        label_bbox = extract_label_bbox(data, label_bbox, img_id)
        tick_bbox = extract_tick_bbox(data, tick_bbox, img_id)
        data_rows = label_bbox + tick_bbox
    rows += data_rows

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 60578/60578 [00:17<00:00, 3370.87it/s]


In [5]:
df = pd.DataFrame(rows)
df.columns = ["image_id", "source", "chart", "x0", "y0", "w", "h", "label"]
df.sample(10)

Unnamed: 0,image_id,source,chart,x0,y0,w,h,label
1716206,02dcb0d20586,generated,line,72,173,10,10,y-axis-tick
440262,bfb4fa1c806b,generated,line,31,167,34,14,tick_label
805114,1b1ba8b282a8,generated,line,273,321,25,10,tick_label
2027738,4cc0ee28cdaa,generated,vertical_bar,460,163,10,10,x-axis-tick
2090529,f05c97d7bda0,generated,scatter,200,273,10,14,tick_label
1486028,d4a1189e5e93,generated,line,253,265,34,8,tick_label
194727,a35555c085b6,generated,vertical_bar,7,59,12,189,axis_title
1159238,717da9022523,generated,line,124,258,10,10,x-axis-tick
811049,2c2fd38bf111,generated,line,73,8,373,15,chart_title
738536,0fa0d0363e82,generated,scatter,454,306,28,11,tick_label


In [6]:
# Add the path to images so that we can read directly by going through the columns of the dataframe
df['image_id'] = 'images/' + df['image_id'] + '.jpg'

## Training samples 
The total number of graph images in the input dataset are ~60K. Lets select about 10K images at random at it 
should be enough for our training step. 

In [7]:
# Sample a specific number of groups
sample_num = 10000
sampled_groups = df['image_id'].drop_duplicates().sample(n=sample_num, random_state=42)

# Filter the DataFrame to retain all rows from the sampled groups
sampled_df = df[df['image_id'].isin(sampled_groups)]

# Reset the index of the sampled DataFrame
sampled_df = sampled_df.reset_index(drop=True)

In [8]:
sampled_df.label.unique()

array(['chart_title', 'axis_title', 'tick_label', 'plot-bb',
       'x-axis-tick', 'y-axis-tick', 'other', 'tick_grouping'],
      dtype=object)

In [9]:
new_df = sampled_df.copy()
new_df.rename(columns={'chart':'chart_type', 'label':'object_category'}, inplace=True)
new_df.head(5)

Unnamed: 0,image_id,source,chart_type,x0,y0,w,h,object_category
0,images/c8ef1f3b431b.jpg,generated,line,169,38,318,28,chart_title
1,images/c8ef1f3b431b.jpg,generated,line,20,112,22,264,axis_title
2,images/c8ef1f3b431b.jpg,generated,line,120,441,415,22,axis_title
3,images/c8ef1f3b431b.jpg,generated,line,47,92,23,13,tick_label
4,images/c8ef1f3b431b.jpg,generated,line,47,155,23,13,tick_label


The total number of rows in this dataframe. We selected ~10K images, but each image has multiple annotations for
every chart title, axes title, x-ticks, y-ticks, labels and so on. Each attribute is a row in this dataframe.

In [10]:
len(new_df.index)

429688

## Convert to COCO format

In [11]:
# DataFrame containing the bounding box details and image filenames
df = new_df.copy()

coco_data = {
    "info": {
        "description": "Benetech input images - random 10K sample",
        "version": "1.0",
        "year": 2023,
        "contributor": "",
        "date_created": "2023-06-11"
    },
    "licenses": [],
    "images": [],
    "categories": [],
    "annotations": []
}

# Mapping for chart types to COCO category IDs
chart_type_mapping = {
    "dot": 1,
    "horizontal_bar": 2,
    "vertical_bar": 3,
    "line": 4,
    "scatter": 5
}

# Create COCO categories
categories = df["object_category"].unique()
for i, category in enumerate(categories, start=1):
    category_entry = {
        "id": i,
        "name": category
    }
    coco_data["categories"].append(category_entry)

# Iterate over the unique image filenames in the DataFrame
for filename in tqdm(df['image_id'].unique()):
    # Construct the file path
    file_path = os.path.join('/home/mhmdzm/projects/kaggle/benetech/data/train/', filename)  # Replace with the actual path to the images directory

    # Open the image file
    image = Image.open(file_path)

    # Get the width and height of the image
    width, height = image.size

    # Create a dictionary for the current image
    image_data = {
        "id": len(coco_data["images"]) + 1,  # Unique image ID
        "file_name": filename,
        "width": width,
        "height": height
    }

    # Append the image dictionary to the COCO data
    coco_data["images"].append(image_data)

    # Filter the DataFrame for the current image
    image_df = df[df['image_id'] == filename]

    # Iterate over the bounding boxes for objects in the current image
    for _, row in image_df.iterrows():
        image_filename = row["image_id"]
        chart_type = row["chart_type"]
        category_id = chart_type_mapping[chart_type]

        # Create COCO image entry if not already present
        image_entry = next((img for img in coco_data["images"] if img["file_name"] == image_filename), None)
        if image_entry is None:
            image_entry = {
                "id": len(coco_data["images"]) + 1,
                "width": int(row["w"]),
                "height": int(row["h"]),
                "file_name": image_filename,
                "license": 0,  # Add license ID if applicable
                "date_captured": ""  # Add date if available
            }
            coco_data["images"].append(image_entry)
            
        # Extract the bounding box details
        x, y, w, h = row['x0'], row['y0'], row['w'], row['h']

        # Create COCO annotation entry
        annotation_entry = {
            "id": len(coco_data["annotations"]) + 1,
            "image_id": image_entry["id"],
            "category_id": category_id,
            "segmentation": [],
            "bbox": [int(row["x0"]), int(row["y0"]), int(row["w"]), int(row["h"])],
            "area": int(row["w"]) * int(row["h"]),
            "iscrowd": 0  # Update accordingly if needed
        }

        # Append the annotation dictionary to the COCO data
        coco_data["annotations"].append(annotation_entry)


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10000/10000 [11:09<00:00, 14.95it/s]


## Outputs
Let us view some of the outputs of the final JSON in COCO format

In [12]:
coco_data['categories']

[{'id': 1, 'name': 'chart_title'},
 {'id': 2, 'name': 'axis_title'},
 {'id': 3, 'name': 'tick_label'},
 {'id': 4, 'name': 'plot-bb'},
 {'id': 5, 'name': 'x-axis-tick'},
 {'id': 6, 'name': 'y-axis-tick'},
 {'id': 7, 'name': 'other'},
 {'id': 8, 'name': 'tick_grouping'}]

In [13]:
coco_data['annotations'][:5]

[{'id': 1,
  'image_id': 1,
  'category_id': 4,
  'segmentation': [],
  'bbox': [169, 38, 318, 28],
  'area': 8904,
  'iscrowd': 0},
 {'id': 2,
  'image_id': 1,
  'category_id': 4,
  'segmentation': [],
  'bbox': [20, 112, 22, 264],
  'area': 5808,
  'iscrowd': 0},
 {'id': 3,
  'image_id': 1,
  'category_id': 4,
  'segmentation': [],
  'bbox': [120, 441, 415, 22],
  'area': 9130,
  'iscrowd': 0},
 {'id': 4,
  'image_id': 1,
  'category_id': 4,
  'segmentation': [],
  'bbox': [47, 92, 23, 13],
  'area': 299,
  'iscrowd': 0},
 {'id': 5,
  'image_id': 1,
  'category_id': 4,
  'segmentation': [],
  'bbox': [47, 155, 23, 13],
  'area': 299,
  'iscrowd': 0}]

In [14]:
coco_data['info']

{'description': 'Benetech input images - random 10K sample',
 'version': '1.0',
 'year': 2023,
 'contributor': '',
 'date_created': '2023-06-11'}

In [15]:
coco_data['images'][:5]

[{'id': 1,
  'file_name': 'images/c8ef1f3b431b.jpg',
  'width': 640,
  'height': 480},
 {'id': 2,
  'file_name': 'images/9509499d26a7.jpg',
  'width': 488,
  'height': 294},
 {'id': 3,
  'file_name': 'images/84a341a29701.jpg',
  'width': 512,
  'height': 347},
 {'id': 4,
  'file_name': 'images/5b6b6c22b1a7.jpg',
  'width': 467,
  'height': 270},
 {'id': 5,
  'file_name': 'images/d8792f2b6fef.jpg',
  'width': 512,
  'height': 299}]

The individual components making up the COCO file looks good. Now we can create the final JSON file to be used 
in the training step next.

In [16]:
# Save the COCO JSON to a file
with open("coco_dataset.json", "w") as f:
    json.dump(coco_data, f, indent=4)