# Multiple Records Dataset Formatting with Parallel Processing

## Installing Packages

In [1]:
# !pip install pyinstaller
# !pip install openpyxl
# !pip install tqdm
# !pip install opencv-python-headless
# !pip install pandas
# !pip install google-cloud-vision

In [2]:
# import zipfile
# with zipfile.ZipFile('/root/NA0219/Dataset_Formatting/Training-Data-21-05-2024.zip', 'r') as zip_ref: zip_ref.extractall('i')

## Imporing Packages and Libraries

In [None]:
import concurrent.futures
from tqdm import tqdm
import xml.etree.ElementTree as ET
import os
import cv2
import pandas as pd
import json
from google.cloud import vision_v1
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = ''
client = vision_v1.ImageAnnotatorClient()

## Extracting the text from Image using Google Cloud Vision OCR

In [4]:
def google_cloud_vision_response(img_path):
    try:
        with open(img_path, 'rb') as img_file:
            content = img_file.read()
        imgocr = vision_v1.Image(content=content)
        image_context = vision_v1.ImageContext()
        with concurrent.futures.ThreadPoolExecutor() as executor:
            future = executor.submit(client.document_text_detection, image=imgocr, image_context=image_context)
            with tqdm(total=1, desc=f"Processing {img_path}") as pbar:
                while not future.done():
                    pbar.update(0.1)  # Arbitrary update to refresh the progress bar
            response = future.result()
        return response
    except Exception as e:
        print(f"Error in Google Cloud Vision API for {img_path}: {e}")
        return None

## Extracting coordinates of Bounding Box using xml file

In [5]:
def extract_bounding_boxes(xml_file):
    # Parse the XML file
    tree = ET.parse(xml_file)
    root = tree.getroot()

    # Initialize an empty list to store the bounding boxes
    bounding_boxes = []

    # Extract objects
    objects = root.findall('object')

    # Process objects in parallel
    with concurrent.futures.ThreadPoolExecutor() as executor, tqdm(total=len(objects), desc="Processing Objects") as pbar:
        # Define a function to process each object and extract its bounding box
        def process_object(obj):
            bndbox = obj.find('bndbox')
            xmin = int(bndbox.find('xmin').text)
            ymin = int(bndbox.find('ymin').text)
            xmax = int(bndbox.find('xmax').text)
            ymax = int(bndbox.find('ymax').text)
            return (xmin, ymin, xmax, ymax)

        # Submit each object processing task to the executor
        future_to_obj = {executor.submit(process_object, obj): obj for obj in objects}
        for future in concurrent.futures.as_completed(future_to_obj):
            obj = future_to_obj[future]
            try:
                result = future.result()
                bounding_boxes.append(result)
            except Exception as e:
                print(f"Error processing object: {e}")
            finally:
                pbar.update(1)

    return bounding_boxes

## Detecting the OCR from the respective bounding boxes

In [6]:
def get_response_inside_roi(response, image_file, xml_file):
    if response is None:
        return None

    # Extract bounding boxes from the XML file
    bounding_boxes = extract_bounding_boxes(xml_file)

    # Initialize a list to store the texts from each bounding box
    ocr_results = []

    # Define a function to process each bounding box and extract text
    def process_bbox(bbox):
        xmin, ymin, xmax, ymax = bbox
        words = []

        # Iterate through each page, block, paragraph, and word in the OCR response
        for page in response.full_text_annotation.pages:
            for block in page.blocks:
                for paragraph in block.paragraphs:
                    for word in paragraph.words:
                        # Combine the symbols to form the full word text
                        word_text = ''.join([symbol.text for symbol in word.symbols])

                        # Extract the word's bounding box vertices
                        word_bounding_box = word.bounding_box.vertices
                        x1 = word_bounding_box[0].x
                        y1 = word_bounding_box[0].y
                        x2 = word_bounding_box[2].x
                        y2 = word_bounding_box[2].y

                        # Check if the word's bounding box is within the current bounding box
                        if (xmin <= x1 <= xmax and xmin <= x2 <= xmax and ymin <= y1 <= ymax and ymin <= y2 <= ymax):
                            words.append(word_text)

        # If there are words found within the current bounding box, combine them
        # and add them as a paragraph to the list
        if words:
            paragraph_text = ' '.join(words)
            return paragraph_text
        else:
            return None

    # Process bounding boxes in parallel
    with concurrent.futures.ThreadPoolExecutor() as executor, tqdm(total=len(bounding_boxes), desc="Processing Bounding Boxes") as pbar:
        # Submit each bounding box processing task to the executor
        future_to_bbox = {executor.submit(process_bbox, bbox): bbox for bbox in bounding_boxes}
        for future in concurrent.futures.as_completed(future_to_bbox):
            bbox = future_to_bbox[future]
            try:
                result = future.result()
                if result:
                    ocr_results.append(result)
            except Exception as e:
                print(f"Error processing bounding box: {e}")
            finally:
                pbar.update(1)

    # Extract the image name without path and file extension
    image_name = image_file.split('/')[-1]  # Get the file name
    image_name = image_name.rsplit('.', 1)[0]  # Remove file extension

    # Create a dictionary with the image name as the key and the list of OCR results as the value
    result_dict = {image_name: ocr_results}

    # Return the dictionary
    return result_dict

## Mapping xml and image files

In [None]:
image_folder = "/root/NA0219/Dataset_Formatting/Training-Data-21-05-2024/Images"
xml_folder = "/root/NA0219/Dataset_Formatting/Training-Data-21-05-2024/xmlfile"

# Get lists of image and xml files in their respective folders
image_files = os.listdir(image_folder)
xml_files = os.listdir(xml_folder)

# Create a set of xml file names for quick lookup
xml_set = set(xml_files)

# Initialize a list to hold the results
results = []

# Iterate through the image files
for image_file in image_files:
    # Check if the file has an image extension (jpg, jpeg, png)
    if image_file.endswith('.jpg') or image_file.endswith('.jpeg') or image_file.endswith('.png'):
        # Get the base name without the extension
        base_name = os.path.splitext(image_file)[0]

        # Create the expected xml file name
        expected_xml_file = base_name + '.xml'

        # Check if the corresponding xml file exists in the xml folder
        if expected_xml_file in xml_set:
            # Full paths for the image and xml files
            image_file_path = os.path.join(image_folder, image_file)
            xml_file_path = os.path.join(xml_folder, expected_xml_file)

            # Call your existing functions with the image and XML file paths
            response = google_cloud_vision_response(image_file_path)
            p = get_response_inside_roi(response, image_file_path, xml_file_path)
            #print(p)
            # Create a dictionary with the base_name as the key and the list of strings as the value
            #result_dict = {base_name: p}

            # Add the result dictionary to the list
            results.append(p)
        else:
            print(f"Corresponding XML file not found for image: {image_file}")

# results

Processing /root/NA0219/Dataset_Formatting/Training-Data-21-05-2024/Images/111709242_00003.jpg: 226061.4000086706it [00:02, 94303.93it/s] 
Processing Objects: 100%|██████████| 2/2 [00:00<00:00, 1991.12it/s]
Processing Bounding Boxes: 100%|██████████| 2/2 [00:00<00:00, 12.62it/s]
Processing /root/NA0219/Dataset_Formatting/Training-Data-21-05-2024/Images/111709242_00004.jpg: 190385.10000659397it [00:02, 92384.59it/s]
Processing Objects: 100%|██████████| 4/4 [00:00<00:00, 5039.72it/s]
Processing Bounding Boxes: 100%|██████████| 4/4 [00:00<00:00, 10.32it/s]
Processing /root/NA0219/Dataset_Formatting/Training-Data-21-05-2024/Images/111709242_00005.jpg: 262153.6000107686it [00:02, 94943.07it/s] 
Processing Objects: 100%|██████████| 4/4 [00:00<00:00, 4391.94it/s]
Processing Bounding Boxes: 100%|██████████| 4/4 [00:00<00:00,  9.07it/s]
Processing /root/NA0219/Dataset_Formatting/Training-Data-21-05-2024/Images/111709242_00006.jpg: 215296.80000804403it [00:02, 93556.02it/s]
Processing Objects: 1

## Processing row and extracting data from the dictionary

In [16]:
def process_row(row, columns):
    image_id = row['Image']
    row_data = {col: str(row[col]) for col in columns}
    return image_id, row_data

def extract_data_to_dict(df, columns):
    """
    Extracts specified columns for each unique 'Image_ID' from a DataFrame, and
    stores them in a dictionary of dictionaries.

    :param df: DataFrame from which to extract data.
    :param columns: List of columns to be extracted.
    :return: Dictionary with 'Image_ID' as keys and corresponding data as values.
    """
    # Filter the dataframe to include only the specified columns and 'Image'
    columns_with_id = ['Image'] + columns
    #columns_with_id = ['Image_Id'] + columns
    filtered_df = df[columns_with_id].copy()  # Create a copy of the DataFrame slice

    # Replace NaN with empty strings
    filtered_df.fillna('', inplace=True)

    # Create a dictionary with 'Image' as keys and the other column data as values
    data_dict = {}
    
    # Process rows in parallel
    with concurrent.futures.ThreadPoolExecutor() as executor, tqdm(total=len(filtered_df), desc="Processing Rows") as pbar:
        # Submit each row processing task to the executor
        future_to_row = {executor.submit(process_row, row, columns): row for _, row in filtered_df.iterrows()}
        for future in concurrent.futures.as_completed(future_to_row):
            row = future_to_row[future]
            try:
                image_id, row_data = future.result()
                if image_id in data_dict:
                    # If the image_id already exists, append the new row data to it
                    if isinstance(data_dict[image_id], list):
                        data_dict[image_id].append(row_data)
                    else:
                        data_dict[image_id] = [data_dict[image_id], row_data]
                else:
                    # Create a new entry in the dictionary for the new image_id
                    data_dict[image_id] = row_data
            except Exception as e:
                print(f"Error processing row: {e}")
            finally:
                pbar.update(1)

    return data_dict

columns_to_extract = [
   'Event_Day_orig',
   'Event_Month_orig',
   'Event_Year_orig',
   'Given_Name_orig',
   'Surname_orig',
   'Sex_orig',
   'Age_orig',
   'Birth_Year_orig',
   'Death_Year_orig',
   'Father_Given_Name_orig',
   'Father_Surname_orig',
   'Mother_Given_Name_orig',
   'Mother_Surname_orig',
   'Spouse_Given_Name_orig',
   'Spouse_Surname_orig',
   'Spouse_Age_orig',
   'Spouse_Birth_Year_orig',
   'Spouse_Father_Given_Name_orig',
   'Spouse_Father_Surname_orig',
   'Spouse_Mother_Given_Name_orig',
   'Spouse_Mother_Surname_orig'
]


# Load the Excel file
file_path = "/root/NA0219/Dataset_Formatting/Training-Data-21-05-2024/Training-Data-21-05-2024.xlsx"

df = pd.read_excel(file_path, dtype=str)  # Convert all columns to strings while reading
data_dict = extract_data_to_dict(df, columns_to_extract)

# Convert the dictionary to a JSON string with indentation for formatting
formatted_data_json = json.dumps(data_dict, indent=2, sort_keys=False)

Processing Rows: 100%|██████████| 13383/13383 [00:03<00:00, 3730.04it/s]


## Processing OCR with mapping & combining data

In [17]:
## IF OCR & KEYED DATA MATCHES 

# def map_and_combine_data(ocr_data, keyed_data):
#     combined_data = []
#     with concurrent.futures.ThreadPoolExecutor() as executor:
#         # Submitting each OCR entry processing task to the executor
#         futures = [executor.submit(process_ocr_entry, ocr_entry, keyed_data) for ocr_entry in ocr_data]
        
#         # Create tqdm progress bar
#         with tqdm(total=len(futures), desc="Processing OCR Entries") as pbar:
#             # Retrieving results as they complete
#             for future in concurrent.futures.as_completed(futures):
#                 combined_data.extend(future.result())
#                 pbar.update(1)
    
#     return combined_data

# def process_ocr_entry(ocr_entry, keyed_data):
#     combined_data = []
#     for image_id, ocr_text_list in ocr_entry.items():
#         key = image_id.split('.')[0]  # Removing file extension
#         if key in keyed_data:
#             json_data_list = keyed_data[key]
#             for i, ocr_text in enumerate(ocr_text_list):
#                 if i < len(json_data_list):  # Ensure there is a corresponding JSON entry
#                     combined_entry = {
#                         'Input': ocr_text,
#                         'Output': json_data_list[i]  # Corresponding JSON entry for this OCR text
#                     }
#                     combined_data.append(combined_entry)
#                 else:
#                     print(f"Warning: Insufficient JSON data for OCR text {ocr_text} in image {image_id}")
#         else:
#             print(f"Warning: No keyed data found for image {image_id}")
#     return combined_data

## IF OCR & KEYED DATA DON'T MATCH

def map_and_combine_data(ocr_data, keyed_data):
    combined_data = []
    with concurrent.futures.ThreadPoolExecutor() as executor:
        # Submitting each OCR entry processing task to the executor
        futures = [executor.submit(process_ocr_entry, ocr_entry, keyed_data) for ocr_entry in ocr_data]
        # Create tqdm progress bar
        with tqdm(total=len(futures), desc="Processing OCR Entries") as pbar:
            # Retrieving results as they complete
            for future in concurrent.futures.as_completed(futures):
                combined_data.extend(future.result())
                pbar.update(1)
    return combined_data

def process_ocr_entry(ocr_entry, keyed_data):
    combined_data = []
    for image_id, ocr_text_list in ocr_entry.items():
        key = image_id.split('.')[0]  # Removing file extension
        if key in keyed_data:
            json_data_list = keyed_data[key]
            for i, ocr_text in enumerate(ocr_text_list):
                if i < len(json_data_list):
                    try:
                        # Ensure there is a corresponding JSON entry
                        combined_entry = {
                            'Input': ocr_text,
                            'Output': json_data_list[i]  # Corresponding JSON entry for this OCR text
                        }
                        combined_data.append(combined_entry)
                    except IndexError:
                        # Handle the case where there is no JSON entry for the current index
                        print(f"Warning: Insufficient JSON data for OCR text {ocr_text} in image {image_id}")
                    except KeyError:
                        # Handle the case where json_data_list is not a list
                        print(f"Warning: Invalid JSON data for image {image_id}")
                else:
                    # Skip this OCR text as there is no corresponding JSON data
                    pass
        else:
            # Skip all OCR texts for this image as there is no keyed data
            pass
    return combined_data

## Main Function

In [None]:
# keyed_data = json.loads(formatted_data_json)

# # Call map_and_combine_data function with the dictionary
# combined_data = map_and_combine_data(results, keyed_data)

# # Continue with the rest of your code
# df_combined = pd.DataFrame(combined_data)
# df_combined.to_csv('final_formatted_dataset_NA0219.csv', index=False)

In [34]:
keyed_data = json.loads(formatted_data_json)

# Call map_and_combine_data function with the dictionary
combined_data = map_and_combine_data(results, keyed_data)
# Continue with the rest of your code
df_combined = pd.DataFrame(combined_data)
df_combined.to_csv('final_formatted_dataset_NA0219.csv', index=False)



Processing OCR Entries: 100%|██████████| 3597/3597 [00:00<00:00, 178856.59it/s]


# Dataset Merging

In [35]:
df = pd.read_csv("/root/NA0219/Dataset_Formatting/final_formatted_dataset_NA0219.csv")
df.shape

(13183, 2)

In [36]:
df1 = pd.read_excel("/root/NA0219/Dataset_Formatting/Training-Data-21-05-2024/Training-Data-21-05-2024.xlsx")
df1.shape

(13383, 24)

In [39]:
df_new = pd.read_csv('/root/NA0219/Dataset_Formatting/final_formatted_dataset_NA0219.csv')
df_prev = pd.read_csv('/root/NA0219/Dataset_Formatting/NA0219_Finetuning_1315_Single_Records_Dataset.csv')
merged_df = pd.concat([df_new, df_prev], axis=0)
merged_df.reset_index(drop=True, inplace=True)
merged_df.to_csv('Combined_Dastset_14498.csv', index=False)

In [40]:
df_combined = pd.read_csv("/root/NA0219/Dataset_Formatting/Combined_Dastset_14498.csv")
df_combined.shape

(14498, 2)

In [41]:
df_combined.head(10)

Unnamed: 0,Input,Output
0,Nr A 15 Juni Bosman Willy Thillsmon Albert Het...,"{'Event_Day_orig': '8', 'Event_Month_orig': 'J..."
1,Nr # 1 H Juni Pertongen Francine Josephine Che...,"{'Event_Day_orig': '10', 'Event_Month_orig': '..."
2,Nr # 16 Juni De Moor Willy Georges en Het jaar...,"{'Event_Day_orig': '8', 'Event_Month_orig': 'J..."
3,Nr 4117 Juni Van den Steen Frans Gerardus vijf...,"{'Event_Day_orig': '8', 'Event_Month_orig': 'J..."
4,MAANDEN . EIGENNAMEN . 10 AKTEN VAN OVERLIJDEN...,"{'Event_Day_orig': '9', 'Event_Month_orig': 'F..."
5,12 sness a ..... Het jaar negentien honderd ve...,"{'Event_Day_orig': '24', 'Event_Month_orig': '..."
6,MAANDEN . EIGENNAMEN . 13 Februar Siner AKTEN ...,"{'Event_Day_orig': '2', 'Event_Month_orig': 'M..."
7,15 Sibaunt Loins . om Het jaar negentien honde...,"{'Event_Day_orig': '25', 'Event_Month_orig': '..."
8,11 Lodewyk Karel Het jaar negentien honderd ve...,"{'Event_Day_orig': '6', 'Event_Month_orig': 'F..."
9,14 Februari S1barm theresin ..oars ... 1 . Het...,"{'Event_Day_orig': '6', 'Event_Month_orig': 'F..."
