In [None]:
import json
import pandas as pd

# Load JSON data.
# The encoding is given explicitly so non-ASCII text in the file is decoded the
# same way on every platform instead of depending on the locale default.
with open('filtered-data.json', encoding='utf-8') as file:
    data = json.load(file)

# Inspect data structure (pretty-prints the entire parsed document)
print(json.dumps(data, indent=4))


In [None]:
from PIL import Image
import json
import os

def crop_image_using_yolo(image_path, yolo_result, image_size):
    """
    Crop an image to the bounding box described by a YOLO result string.

    Parameters:
        image_path: Path of the image file to open.
        yolo_result: Comma-separated integer coordinates in the order
            'x1,x2,y1,y2', e.g. "0,1010,563,1266". Only the first four values
            are used.
        image_size: Kept for backward compatibility and not used. The previous
            implementation divided each coordinate by the size and multiplied
            it back, which is an identity on paper but could lose one pixel to
            float truncation (and failed outright on a zero size).

    Returns:
        (cropped_image, (left, upper)) on success, where (left, upper) is the
        top-left corner of the crop box. (None, None) if the image cannot be
        opened or the coordinates cannot be parsed; the failure is reported
        with print() instead of being raised.
    """
    try:
        yolo_coords = [int(value) for value in yolo_result.split(',')]
        if len(yolo_coords) < 4:
            raise ValueError(
                f"expected 4 comma-separated coordinates, got {len(yolo_coords)}"
            )

        # The string is ordered x1,x2,y1,y2 while PIL expects
        # (left, upper, right, lower), hence the reordering below.
        left, right, upper, lower = yolo_coords[:4]

        # The context manager releases the underlying file handle.
        with Image.open(image_path) as image:
            cropped_image = image.crop((left, upper, right, lower))
            # Make sure the pixel data is materialised before the file is
            # closed (crop was a lazy operation in old Pillow releases).
            cropped_image.load()

        return cropped_image, (left, upper)
    except Exception as e:
        # Deliberate best-effort: report the problem and let the caller skip
        # this image.
        print(f"Error processing image {image_path}: {e}")
        return None, None

def process_images_from_json(json_file_path, image_folder):
    """
    Crop and display the image of every document listed in a JSON file.

    Parameters:
        json_file_path: Path of a JSON file mapping document ids to objects
            that provide the keys 'path', 'yolo_result' and 'size'.
        image_folder: Folder containing the images. Each image is looked up as
            `<image_folder>/<path>.jpeg`.

    Returns:
        None. Each successfully cropped image is opened with `show()` and a
        confirmation line is printed. Documents whose image could not be
        cropped are skipped.
    """
    # Explicit encoding keeps non-ASCII text independent of the locale default.
    with open(json_file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    for doc_id, doc_info in data.items():
        image_name = doc_info['path'] + '.jpeg'  # Assuming all images are in .jpeg format
        image_path = os.path.join(image_folder, image_name)

        # The helper returns (None, None) on failure, so the second element
        # must not be destructured into two names here: doing so raised
        # TypeError and aborted the whole run on the first bad image.
        cropped_image, _crop_origin = crop_image_using_yolo(
            image_path, doc_info['yolo_result'], doc_info['size']
        )
        if cropped_image is None:
            continue

        # Save or display the cropped image
        cropped_image.show()  # Or save it: cropped_image.save(f'cropped_{image_name}')
        print(f"Processed and cropped image for document {doc_id}")

# Example usage: crop every document image listed in the JSON file.
# Each image is looked up as <image_folder>/<path>.jpeg, and every successful
# crop is opened with show().
json_file_path = 'filtered-data.json'  # JSON file with per-document 'path', 'yolo_result' and 'size'
image_folder = 'Images'  # Folder where images are stored
process_images_from_json(json_file_path, image_folder)


In [5]:
import json
import pandas as pd
import os

def assign_category(text):
    """
    Map a piece of OCR text to a category label with keyword rules.

    The text is lower-cased and the rules are tried in order; the first rule
    with a keyword contained in the text decides the label. Text matching no
    rule is labelled 'unknown'.
    """
    lowered = text.lower()

    # (keywords, label) pairs, checked top to bottom. Can be expanded.
    keyword_rules = (
        (('name',), 'Name'),
        (('dob', 'birth'), 'Date of Birth'),
        (('number', 'document'), 'Document Number'),
        (('expiry',), 'Expiry Date'),
    )

    for keywords, label in keyword_rules:
        if any(keyword in lowered for keyword in keywords):
            return label

    # No keyword matched: treated as template information.
    return 'unknown'

def extract_ocr_data_and_assign_labels(data):
    """
    Flatten the OCR section of every document into a list of labelled records.

    `data` maps document ids to objects whose 'ocr' entry maps each recognised
    text to its coordinates. One record is produced per OCR text, holding the
    document id, the text, its coordinates and the category returned by
    `assign_category(text)`.
    """
    return [
        {
            'Document ID': doc_id,
            'Text': ocr_text,
            'Coordinates': box,
            'Category': assign_category(ocr_text),
        }
        for doc_id, doc_info in data.items()
        for ocr_text, box in doc_info['ocr'].items()
    ]


In [6]:
def process_json_and_assign_categories(json_file_path):
    """
    Load a JSON file and return its OCR texts with category labels.

    Parameters:
        json_file_path: Path of the JSON file to read.

    Returns:
        A pandas DataFrame built from the records produced by
        `extract_ocr_data_and_assign_labels`, one row per OCR text.
    """
    # Explicit encoding keeps non-ASCII OCR text independent of the locale
    # default used by open().
    with open(json_file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Extract data and assign category labels
    extracted_data = extract_ocr_data_and_assign_labels(data)

    # A DataFrame makes the records easier to manipulate and export.
    return pd.DataFrame(extracted_data)


In [7]:
def save_data_to_csv(df, output_csv_path):
    """
    Write `df` to `output_csv_path` as CSV without the index column, then
    print a confirmation message naming the output path.
    """
    df.to_csv(output_csv_path, index=False)
    confirmation = f"Data saved to {output_csv_path}"
    print(confirmation)

# Example usage: label every OCR text in the JSON file and export the result.
json_file_path = 'filtered-data.json'  # Path to your JSON file
output_csv_path = 'ocr_extracted_data.csv'  # Path where CSV will be saved

# Process the JSON and get the DataFrame (one row per OCR text)
df = process_json_and_assign_categories(json_file_path)

# Save to CSV (written without the index column; prints a confirmation)
save_data_to_csv(df, output_csv_path)


Data saved to ocr_extracted_data.csv


In [8]:
import json
import pandas as pd

def load_json(json_file_path):
    """
    Read a JSON file and return the parsed content.

    The file is decoded as UTF-8 explicitly, so non-ASCII text is read the same
    way on every platform instead of depending on the locale default.

    Parameters:
        json_file_path: Path of the JSON file to read.

    Returns:
        The Python object produced by `json.load`.
    """
    with open(json_file_path, 'r', encoding='utf-8') as f:
        return json.load(f)

def extract_features(data, default_size=(1024, 661)):
    """
    Build one feature row per OCR text found in `data`.

    Parameters:
        data: Mapping of document id to a document object. The keys read from
            each document are 'path', 'type', 'ocr', 'size' and 'llm'; all of
            them are optional.
        default_size: (width, height) used when a document has no 'size'.

    Returns:
        A pandas DataFrame with the columns 'Document ID', 'Image Path',
        'Document Type', 'Text', 'X Coordinates', 'Y Coordinates', 'Width',
        'Height', 'Relative Length', 'Slope', 'Normalized Coordinates' and
        'Category'. OCR entries with an empty coordinate list are skipped,
        because no geometry can be derived from them.
    """
    extracted_data = []

    for doc_id, doc_info in data.items():
        image_path = doc_info.get('path')
        document_type = doc_info.get('type')
        # `or` also covers keys that are present but null in the JSON, which
        # a plain .get(key, default) would pass through as None.
        ocr_data = doc_info.get('ocr') or {}
        image_size = doc_info.get('size') or default_size
        llm_response = doc_info.get('llm') or {}

        for text, coords in ocr_data.items():
            if not coords:
                # max()/min() and the [-1] lookups below need at least one point.
                continue

            x_coords = [coord['x'] for coord in coords]
            y_coords = [coord['y'] for coord in coords]

            # Extent of the box and its aspect ratio (0 when the box is flat).
            width = max(x_coords) - min(x_coords)
            height = max(y_coords) - min(y_coords)
            relative_length = width / height if height != 0 else 0

            # Slope of the segment from the first to the last listed point
            # (0 when that segment is vertical).
            # NOTE(review): whether first/last is the intended pair depends on
            # the vertex order in the OCR data - confirm.
            run = x_coords[-1] - x_coords[0]
            slope = (y_coords[-1] - y_coords[0]) / run if run != 0 else 0

            # Scale each point by the image width and height.
            normalized_coords = [
                (x / image_size[0], y / image_size[1])
                for x, y in zip(x_coords, y_coords)
            ]

            # The LLM response assists the classification.
            category = assign_category(text, llm_response)

            extracted_data.append({
                'Document ID': doc_id,
                'Image Path': image_path,
                'Document Type': document_type,
                'Text': text,
                'X Coordinates': x_coords,
                'Y Coordinates': y_coords,
                'Width': width,
                'Height': height,
                'Relative Length': relative_length,
                'Slope': slope,
                'Normalized Coordinates': normalized_coords,
                'Category': category
            })

    return pd.DataFrame(extracted_data)

def assign_category(text, llm_response=None):
    """
    Categorise an OCR text by its content, optionally assisted by an LLM response.

    Parameters:
        text: The OCR text to classify. Matching is case-insensitive.
        llm_response: Optional mapping whose text is read from
            llm_response['response']['text']. Missing, null or differently
            shaped values are treated as "no LLM text". The parameter is
            optional so that one-argument calls keep working.

    Returns:
        One of 'Surname', 'Name', 'Date of Birth', 'Document Number',
        'Expiry Date', 'User Data' or 'unknown'.
    """
    text = text.lower()

    response = llm_response.get('response') if isinstance(llm_response, dict) else None
    llm_text = response.get('text') if isinstance(response, dict) else None
    llm_text = str(llm_text).lower() if llm_text else ''

    # Rule-based classification using LLM and OCR content.
    # 'surname' contains 'name', so the surname rule has to run first;
    # otherwise it can never fire for that keyword.
    if 'surname' in text or 'perekonnanimi' in text:
        return 'Surname'
    if 'name' in text or 'eesnimi' in text or 'given' in text:
        return 'Name'
    # 'sünniaeg' is the correctly encoded keyword. The mis-decoded spelling is
    # kept as well, so text that went through the same mis-decoding still matches.
    if 'dob' in text or 'birth' in text or 'sünniaeg' in text or 's√ºnniaeg' in text:
        return 'Date of Birth'
    if 'document' in text or 'number' in text:
        return 'Document Number'
    if 'expiry' in text or 'kehtiv' in text:
        return 'Expiry Date'
    # An empty string is contained in every string, so blank text must not
    # count as a match against the LLM text.
    if text.strip() and text in llm_text:
        return 'User Data'  # Data that is part of the user-specific information
    return 'unknown'  # Template/static information

def save_to_csv(df, output_file):
    """
    Write the extracted features and categories in `df` to `output_file` as
    CSV (index column omitted) and print a confirmation naming the file.
    """
    df.to_csv(output_file, index=False)
    confirmation = f"Data saved to {output_file}"
    print(confirmation)

def main(json_file_path='filtered-data.json', output_csv_path='extracted_features.csv'):
    """
    Run the feature-extraction pipeline: load the JSON, extract features, save a CSV.

    Parameters:
        json_file_path: Path of the input JSON file. Defaults to the path that
            was previously hard-coded.
        output_csv_path: Path of the CSV file to write. Defaults to the path
            that was previously hard-coded.

    Returns:
        None. The features are written to `output_csv_path`.
    """
    # Load and process the JSON data
    json_data = load_json(json_file_path)
    features_df = extract_features(json_data)

    # Save the DataFrame to a CSV file
    save_to_csv(features_df, output_csv_path)

# Run the process when this code is executed as the main module. The recorded
# cell output shows the guard passing when the cell is run in the notebook.
if __name__ == '__main__':
    main()


Data saved to extracted_features.csv
