In [2]:
# Mount Google Drive at /content/drive so the Drive paths used by the
# following cells are reachable (relies on the Colab-only google.colab module).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import json
import os
import cv2
def extract_blocks_and_crop(json_path, image_folder, output_folder):
    """Crop every labelled block of one annotated image and save it with a label file.

    The annotation JSON is expected to provide an ``imagePath`` entry and a
    ``shapes`` list whose items carry a ``label`` and a list of ``points``
    (``[x, y]`` pairs). For each shape whose label is a known class, the
    axis-aligned bounding box of its points is cropped from the image and
    written to ``output_folder`` together with a one-line ``.txt`` file.

    Args:
        json_path: Path of the annotation JSON file.
        image_folder: Folder containing the image named by ``imagePath``.
        output_folder: Folder receiving the crops and ``.txt`` files
            (created if missing).

    Returns:
        dict mapping each processed label to a dict with the keys ``xmin``,
        ``ymin``, ``xmax``, ``ymax``, ``output_image_path``,
        ``output_txt_path`` and ``class_id``. When a label occurs several
        times in one file, the last occurrence overwrites the earlier ones
        (both on disk and in the returned dict).

    Raises:
        FileNotFoundError: If the referenced image cannot be read.
    """
    with open(json_path, 'r') as json_file:
        data = json.load(json_file)

    image_filename = data.get('imagePath', '')
    image_path = os.path.join(image_folder, image_filename)
    image = cv2.imread(image_path)
    if image is None:
        # cv2.imread reports failure by returning None instead of raising;
        # fail here with a clear message rather than later with a TypeError.
        raise FileNotFoundError(
            f"Could not read image '{image_path}' referenced by '{json_path}'"
        )

    # Page size used for normalisation: A4 at 300 dpi, converted to pixels.
    # NOTE(review): this assumes every image is an A4 scan at 300 dpi; if the
    # images have other sizes, normalising by the real image size is needed.
    a4_width_mm = 210
    a4_height_mm = 297
    dpi = 300
    a4_width_pixels = a4_width_mm * dpi / 25.4
    a4_height_pixels = a4_height_mm * dpi / 25.4

    class_mapping = {
        'Name': 0,
        'Profile': 1,
        'Contact': 2,
        'Experience': 3,
        'Education': 4,
        'Skills': 5,
        'Projects': 6,
        'Certifications': 7,
        'Community': 8,
        'Languages': 9,
        'Interests': 10
    }

    # Use only the file name for output naming so that an imagePath holding
    # directory components cannot produce a broken output path.
    image_name = os.path.basename(image_filename)
    image_stem = os.path.splitext(image_name)[0]

    os.makedirs(output_folder, exist_ok=True)

    labeled_blocks = {}
    for shape in data.get('shapes', []):
        label = shape.get('label')
        # Any label that is not a known class (e.g. 'bloc', 'Picture') is ignored.
        if label not in class_mapping:
            continue

        points = shape.get('points', [])
        if not points:
            print(f"Warning: shape '{label}' in {json_path} has no points; skipped")
            continue
        class_id = class_mapping[label]

        # Bounding box: x and y extremes are taken independently, so the
        # result does not depend on the order/direction the points were drawn.
        x_values = [point[0] for point in points]
        y_values = [point[1] for point in points]
        xmin, xmax = min(x_values), max(x_values)
        ymin, ymax = min(y_values), max(y_values)

        # Normalize coordinates
        normalized_xmin = xmin / a4_width_pixels
        normalized_ymin = ymin / a4_height_pixels
        normalized_xmax = xmax / a4_width_pixels
        normalized_ymax = ymax / a4_height_pixels

        # Crop; negative indices would wrap around in a slice, so clamp to 0.
        left, right = max(0, int(xmin)), max(0, int(xmax))
        top, bottom = max(0, int(ymin)), max(0, int(ymax))
        cropped_region = image[top:bottom, left:right]
        if cropped_region.size == 0:
            # An empty crop cannot be written by cv2.imwrite.
            print(f"Warning: shape '{label}' in {json_path} yields an empty crop; skipped")
            continue

        # Save the cropped block and its label file
        output_image_path = os.path.join(output_folder, f"{label}_{image_name}")
        cv2.imwrite(output_image_path, cropped_region)
        output_txt_path = os.path.join(output_folder, f"{label}_{image_stem}.txt")
        with open(output_txt_path, 'w') as txt_file:
            # NOTE(review): field order is "class xmin xmax ymin ymax", which is
            # not the usual YOLO "class x_center y_center width height" layout;
            # confirm what the consumer of these files expects.
            txt_file.write(f"{class_id} {normalized_xmin} {normalized_xmax} {normalized_ymin} {normalized_ymax}\n")

        labeled_blocks[label] = {
            'xmin': xmin,
            'ymin': ymin,
            'xmax': xmax,
            'ymax': ymax,
            'output_image_path': output_image_path,
            'output_txt_path': output_txt_path,
            'class_id': class_id
        }

    return labeled_blocks

In [4]:
# Folder scanned for the .json annotation files.
label_folder_path = "/content/drive/MyDrive/yolo-data-extraction/labels"
# Folder passed to extract_blocks_and_crop as the location of the source images.
image_folder_path = "/content/drive/MyDrive/yolo-data-extraction/cvs"
# Folder passed to extract_blocks_and_crop as the destination of crops and label files.
output_folder_path = "/content/drive/MyDrive/train_custom_dataset/custom_dataset"

In [5]:
# Run the extraction for every annotation file and report what was written.
for filename in os.listdir(label_folder_path):
    # Anything that is not a JSON annotation file is ignored.
    if not filename.endswith('.json'):
        continue

    json_file_path = os.path.join(label_folder_path, filename)
    blocks = extract_blocks_and_crop(json_file_path, image_folder_path, output_folder_path)

    # List the crop image and label file produced for each extracted block.
    print(f"Processing {filename}:")
    for label in blocks:
        info = blocks[label]
        print(f"{label}: {info['output_image_path']} and {info['output_txt_path']}")
    print("\n")

Processing 1ea3ba3e-217a-4ad2-8ec2-4d000330a61f.json:
Experience: /content/drive/MyDrive/train_custom_dataset/custom_dataset/Experience_1ea3ba3e-217a-4ad2-8ec2-4d000330a61f.jpg and /content/drive/MyDrive/train_custom_dataset/custom_dataset/Experience_1ea3ba3e-217a-4ad2-8ec2-4d000330a61f.txt
Certifications: /content/drive/MyDrive/train_custom_dataset/custom_dataset/Certifications_1ea3ba3e-217a-4ad2-8ec2-4d000330a61f.jpg and /content/drive/MyDrive/train_custom_dataset/custom_dataset/Certifications_1ea3ba3e-217a-4ad2-8ec2-4d000330a61f.txt
Languages: /content/drive/MyDrive/train_custom_dataset/custom_dataset/Languages_1ea3ba3e-217a-4ad2-8ec2-4d000330a61f.jpg and /content/drive/MyDrive/train_custom_dataset/custom_dataset/Languages_1ea3ba3e-217a-4ad2-8ec2-4d000330a61f.txt
Skills: /content/drive/MyDrive/train_custom_dataset/custom_dataset/Skills_1ea3ba3e-217a-4ad2-8ec2-4d000330a61f.jpg and /content/drive/MyDrive/train_custom_dataset/custom_dataset/Skills_1ea3ba3e-217a-4ad2-8ec2-4d000330a61f.t

In [6]:
# Make the folder that received the crops and label files the working directory.
%cd /content/drive/MyDrive/train_custom_dataset/custom_dataset

/content/drive/MyDrive/train_custom_dataset/custom_dataset


In [None]:
# Create the train.txt and test.txt files that list the dataset images.
def trainNtestFilesCreater(dataSetPath, mappingPath, train_size=0.80, seed=None):
    """Split the ``.jpg`` images of a dataset folder into train/test list files.

    ``train.txt`` and ``test.txt`` are written into ``mappingPath``; every line
    holds the full path of one image located in ``dataSetPath``.

    Args:
        dataSetPath: Folder containing the images (and their annotations).
        mappingPath: Folder in which ``train.txt`` and ``test.txt`` are written.
        train_size: Fraction of the images listed in ``train.txt``; the
            remaining images are listed in ``test.txt``.
        seed: Optional seed. When given, the images are shuffled reproducibly
            before splitting; when ``None`` they are split in sorted
            file-name order.
    """
    import random

    # splitext copes with names that have no dot or several dots, where
    # name.split(".")[1] would raise IndexError or pick the wrong part.
    # Sorting makes the split independent of the arbitrary os.listdir order.
    images = sorted(
        name for name in os.listdir(dataSetPath)
        if os.path.splitext(name)[1].lower() == ".jpg"
    )
    if seed is not None:
        random.Random(seed).shuffle(images)

    train_count = int(len(images) * train_size)
    train_path = os.path.join(mappingPath, "train.txt")
    test_path = os.path.join(mappingPath, "test.txt")

    # Context managers close both files even if writing fails part-way.
    with open(train_path, "wt") as train, open(test_path, "wt") as test:
        for index, name in enumerate(images):
            target = train if index < train_count else test
            # The images live in dataSetPath, so that is the folder to reference.
            target.write(f"{os.path.join(dataSetPath, name)}\n")

    print(f"{len(images)} files processed")

# Folder holding the custom dataset: the images together with their annotations.
# NOTE(review): the extraction cell writes its crops to output_folder_path
# (".../train_custom_dataset/custom_dataset"), which differs from the folder
# below - confirm which location is intended.
dataSetPath = "/content/drive/MyDrive/yolo-data-extraction/custom_dataset"
# Folder on the drive in which train.txt and test.txt are written.
mappingPath = "/content/drive/MyDrive/yolo-data-extraction/custom_dataset"
# Generate both list files, putting 80% of the images into train.txt.
trainNtestFilesCreater(dataSetPath=dataSetPath, mappingPath=mappingPath, train_size=0.80)