# In [None]:
import os
import json
import xml.etree.ElementTree as ET
from cryptography.fernet import Fernet
import re

# Paths
# Folder that process_json_files() scans for encrypted '*.json' metadata files.
json_folder = "C:/Users/User/Desktop/UltraSafeReview/deidentification_reidentification/reidentified_valid_images11"
# Folder that parse_xml_files() scans for '*.xml' files.
xml_folder = "C:/Users/User/Desktop/UltraSafeReview/image_validity/xml_files11"
# Destination of the summary written by process_json_files().
# NOTE(review): the summary holds the *decrypted* text fields in plain JSON —
# confirm that storing them unencrypted at this location is acceptable.
output_file = "C:/Users/User/Desktop/UltraSafeReview/metadata_summary.json"

# Encryption key for decryption
# NOTE(review): the Fernet key is embedded in the source, so anyone who can
# read this file can decrypt the JSON metadata. Consider loading it from an
# environment variable or a secrets store instead of committing it.
encryption_key = b'rv9QPS5m7IThOFMLUmJcECJ_5izPGPh2CxI7LJUuOGI='
# Shared Fernet instance used by decrypt_json_file().
handle_encryption = Fernet(encryption_key)

# Function to decrypt and read JSON metadata
def decrypt_json_file(json_path):
    """Decrypt the file at *json_path* and parse its content as JSON.

    The raw bytes are decrypted with the module-level ``handle_encryption``
    object, decoded as UTF-8 and passed to ``json.loads``.

    Returns:
        The parsed JSON value on success. On any failure (unreadable file,
        failed decryption, invalid JSON, ...) no exception is raised;
        instead a string starting with ``"ERROR"`` that describes the
        problem is returned, and callers are expected to check for it.
    """
    try:
        with open(json_path, 'rb') as encrypted_file:
            ciphertext = encrypted_file.read()
        plaintext = handle_encryption.decrypt(ciphertext).decode('utf-8')
        return json.loads(plaintext)
    except Exception as e:
        # Deliberately broad: every failure is reported through the return value.
        return f"ERROR: Failed to decrypt or parse JSON file '{json_path}': {e}"

# Function to process all JSON files and save results to a single JSON file
def process_json_files(json_folder, output_path=None):
    """Decrypt every '.json' file in *json_folder* and save the extracted text.

    For each file the decrypted metadata is expected to be a list of
    dictionaries; the ``'text'`` value of every entry is stripped,
    lower-cased and collected. Files that cannot be decrypted or parsed are
    reported on stdout and left out of the summary.

    Args:
        json_folder: Directory containing the encrypted '.json' files.
        output_path: Where to write the summary as JSON. Defaults to the
            module-level ``output_file`` so existing callers are unaffected.

    Returns:
        dict mapping each file's base name (file name without the trailing
        '.json') to its list of normalised text strings.
    """
    suffix = '.json'
    if output_path is None:
        output_path = output_file

    metadata_summary = {}
    for file_name in os.listdir(json_folder):
        if not file_name.endswith(suffix):
            continue

        json_file_path = os.path.join(json_folder, file_name)
        # Remove only the trailing extension; str.replace() would also strip
        # any '.json' that happens to occur earlier in the name.
        base_name = file_name[:-len(suffix)]
        print(f"Processing '{file_name}'...")

        metadata = decrypt_json_file(json_file_path)
        if isinstance(metadata, str) and metadata.startswith("ERROR"):
            print(metadata)
            continue

        if not isinstance(metadata, list):
            # Keep the file in the summary (with no text) so that validation
            # reports it as a failure instead of it silently disappearing.
            print(f"WARNING: '{file_name}' did not decrypt to a list of entries; no text extracted")
            metadata = []

        # Extract only the text fields; skip entries that are not dicts or
        # whose 'text' is not a string rather than crashing on them.
        metadata_summary[base_name] = [
            item['text'].strip().lower()
            for item in metadata
            if isinstance(item, dict) and isinstance(item.get('text'), str)
        ]

    # Save the summary to a JSON file
    with open(output_path, 'w', encoding='utf-8') as output:
        json.dump(metadata_summary, output, indent=4)
    print(f"Metadata summary saved to '{output_path}'")
    return metadata_summary

# Function to parse XML files and extract relevant data
def parse_xml_files(xml_folder):
    """Collect PHI fields from every '.xml' file in *xml_folder*.

    Each XML file may reference several images through ``FileName``
    elements. Every referenced name ending in '.tif' becomes a key (without
    that extension) in the result, and all keys originating from the same
    XML file map to the same dictionary of PHI values.

    XML files that cannot be parsed are reported on stdout and skipped, so
    one malformed file does not abort the whole run.

    Args:
        xml_folder: Directory containing the XML files.

    Returns:
        dict mapping image base name to a dict with the keys
        ``InstitutionName``, ``FamilyName``, ``GivenName``, ``Sex``,
        ``DicomPatientID`` and ``DicomProtocolName``. Values are stripped and
        lower-cased; a missing or empty element yields ``""``.
    """
    tif_suffix = '.tif'
    phi_fields = (
        "InstitutionName",
        "FamilyName",
        "GivenName",
        "Sex",
        "DicomPatientID",
        "DicomProtocolName",
    )

    xml_data = {}
    for file_name in os.listdir(xml_folder):
        if not file_name.endswith('.xml'):
            continue

        file_path = os.path.join(xml_folder, file_name)
        try:
            root = ET.parse(file_path).getroot()
        except ET.ParseError as e:
            print(f"ERROR: Failed to parse XML file '{file_path}': {e}")
            continue

        # Extract the filename without extension
        file_names_in_xml = []
        for elem in root.findall(".//FileName"):
            # elem.text is None for an empty element; strip before testing the
            # suffix so surrounding whitespace does not hide a '.tif' name.
            raw_name = (elem.text or "").strip()
            if raw_name.endswith(tif_suffix):
                # Drop only the trailing extension, not every '.tif' occurrence.
                file_names_in_xml.append(raw_name[:-len(tif_suffix)].strip())

        # Extract PHI fields (findtext gives None when the element is absent)
        file_data = {
            field: (root.findtext(f".//{field}") or "").strip().lower()
            for field in phi_fields
        }

        # All images listed in one XML file share the same dict object.
        for fname in file_names_in_xml:
            xml_data[fname] = file_data

    return xml_data

# Function to clean text by removing punctuation
def clean_text(text):
    """Return *text* lower-cased with every character outside a-z, A-Z, 0-9 removed."""
    return re.sub(r'[^a-zA-Z0-9]', '', text).lower()

# Function to validate if the DicomPatientID is an approximate match
def is_approximate_match(field_value, metadata_text):
    """Check whether *field_value* loosely matches any metadata entry.

    The field value is normalised with ``clean_text``; a match means the
    cleaned value contains a metadata entry or is contained in one.

    Args:
        field_value: Raw field value (e.g. the DicomPatientID from the XML).
        metadata_text: Iterable of metadata strings to compare against. They
            are compared as given; validate_reidentification passes entries
            already normalised with ``clean_text``.

    Returns:
        True if a non-empty entry matches, False otherwise. Empty strings on
        either side never count as a match.
    """
    cleaned_field = clean_text(field_value)
    if not cleaned_field:
        # '' is a substring of every string, so an all-punctuation value
        # would otherwise "match" any metadata at all.
        return False
    # Likewise skip empty metadata entries: '' in cleaned_field is always True.
    return any(
        bool(text) and (cleaned_field in text or text in cleaned_field)
        for text in metadata_text
    )

# Function to validate re-identification by comparing XML and JSON data
def validate_reidentification(metadata_summary, xml_data):
    """Compare decrypted metadata text with XML PHI fields and print a report.

    A file counts as successfully re-identified when every non-empty XML
    field is found in its metadata text:

    * ``DicomPatientID`` is checked with ``is_approximate_match``.
    * Every other field is split on whitespace; each resulting word (after
      ``clean_text``) must equal one of the cleaned metadata entries.

    Files without a corresponding XML entry are counted as failures.

    Args:
        metadata_summary: dict mapping file base name to a list of text
            strings (as returned by process_json_files).
        xml_data: dict mapping file base name to a dict of PHI field values
            (as returned by parse_xml_files).

    Returns:
        None. The counts and the list of failed files are printed to stdout.
    """
    success_count = 0
    failure_count = 0
    failed_files = []

    for file_name, metadata_text in metadata_summary.items():
        if file_name not in xml_data:
            failure_count += 1
            failed_files.append((file_name, ['No corresponding XML data']))
            continue

        # Clean metadata text. Entries that clean to '' (punctuation only)
        # carry no information and, being a substring of everything, would
        # make the approximate ID check succeed spuriously.
        cleaned_metadata = [
            cleaned for cleaned in map(clean_text, metadata_text) if cleaned
        ]
        # Set for O(1) word lookups; the list is kept for the substring check.
        metadata_tokens = set(cleaned_metadata)

        missing_fields = []
        for field, value in xml_data[file_name].items():
            if not value:
                continue

            if field == "DicomPatientID":
                # Special handling for approximate DicomPatientID match
                found = is_approximate_match(value, cleaned_metadata)
            else:
                # Clean and split XML field into words, ignoring tokens that
                # consist solely of punctuation (they clean to '' and could
                # never be found).
                # NOTE(review): each word must equal a whole cleaned metadata
                # entry; a multi-word entry such as "doe, john" cleans to
                # "doejohn" and will not match "doe" — confirm the metadata
                # is stored one word per entry.
                words = [word for word in map(clean_text, value.split()) if word]
                found = all(word in metadata_tokens for word in words)

            if not found:
                missing_fields.append(field)

        if missing_fields:
            failure_count += 1
            failed_files.append((file_name, missing_fields))
        else:
            success_count += 1

    # Print results
    print(f"Number of successful re-identifications: {success_count}")
    print(f"Number of failed re-identifications: {failure_count}")
    if failed_files:
        print("Failed files and missing fields:")
        for failed_file, fields in failed_files:
            print(f"{failed_file}: Missing fields - {fields}")

# Main execution
def main():
    """Run the full check: decrypt the JSON metadata, parse the XML, compare."""
    # The JSON summary is built (and written to disk) before the XML is read.
    summary = process_json_files(json_folder)
    phi_by_image = parse_xml_files(xml_folder)
    validate_reidentification(summary, phi_by_image)


# Only run the pipeline when executed as a script, not when imported.
if __name__ == "__main__":
    main()
