# In [None]:
import os
import json
import xml.etree.ElementTree as ET
from PIL import Image
import pytesseract
from sklearn.metrics import precision_score, recall_score, f1_score

# Paths
# Hard-coded absolute locations of the two input folders; main() reads both.
# Folder that parse_xml_files() scans for the '*.xml' metadata files.
xml_folder = "C:/Users/User/Desktop/UltraSafeReview/image_validity/xml_files"
# Folder that validate_images() scans for the '*.tif' images to OCR.
image_folder = "C:/Users/User/Desktop/UltraSafeReview/deidentification_reidentification/deidentified_valid_images"

# Step 1: Read and parse XML files
def parse_xml_files(xml_folder):
    """Collect per-image patient metadata from every XML file in a folder.

    Each ``*.xml`` file directly inside ``xml_folder`` is parsed. Every
    ``<FileName>`` element whose text ends with ``.tif`` becomes a key of the
    result, mapped to the metadata fields found in that same XML document.

    Args:
        xml_folder: Path of the directory containing the XML files.

    Returns:
        dict mapping an image file name to a dict with the keys
        ``InstitutionName``, ``FamilyName``, ``GivenName``, ``Sex``,
        ``DicomPatientID`` and ``DicomProtocolName``. A field that is absent
        or empty in the XML is stored as ``""``.

    Raises:
        OSError: If ``xml_folder`` cannot be listed or a file cannot be read.
        xml.etree.ElementTree.ParseError: If an XML file is malformed.
    """
    field_tags = (
        "InstitutionName",
        "FamilyName",
        "GivenName",
        "Sex",
        "DicomPatientID",
        "DicomProtocolName",
    )

    data = {}
    # Sorted so that, when two XML files mention the same image, which one
    # wins does not depend on the arbitrary order os.listdir returns.
    for filename in sorted(os.listdir(xml_folder)):
        if not filename.endswith('.xml'):
            continue

        root = ET.parse(os.path.join(xml_folder, filename)).getroot()

        # elem.text is None for an empty element such as <FileName/>, so it is
        # normalised to "" before any string method is called on it.
        # Surrounding whitespace (common in pretty-printed XML) is stripped so
        # the key matches the real file name on disk.
        candidate_names = (
            (elem.text or "").strip() for elem in root.findall(".//FileName")
        )
        file_names = [name for name in candidate_names if name.endswith('.tif')]

        # Stripped for the same reason: padded values would never be found by
        # a later substring search in OCR output.
        metadata = {
            tag: root.findtext(f".//{tag}", "").strip() for tag in field_tags
        }

        for file_name in file_names:
            # One independent dict per image so callers may mutate an entry
            # without affecting the others.
            data[file_name] = dict(metadata)
    return data

# Step 2: Extract text from images
def extract_text_from_image(image_path):
    """Run OCR on an image file and return the recognised text.

    Args:
        image_path: Path of the image file to open.

    Returns:
        The string produced by ``pytesseract.image_to_string`` for the image.
    """
    # Image.open holds on to the underlying file; the context manager closes
    # it as soon as OCR is done instead of leaving that to garbage collection,
    # which matters when this is called once per image over a whole folder.
    with Image.open(image_path) as image:
        return pytesseract.image_to_string(image)

# Step 3: Validate de-identification
def validate_images(data, image_folder):
    """Check every ``.tif`` image in a folder for leftover metadata text.

    For each image that has an entry in ``data``, the image is OCR'd and the
    text is searched for each non-empty metadata value (case-sensitive
    substring match). Any hit marks the image as a de-identification failure.
    Images without an entry in ``data`` are counted as successes without
    running OCR, because there is nothing to search for.

    NOTE(review): very short values (for example a one-letter ``Sex``) will
    match almost any OCR output under substring matching — confirm whether
    such fields should be compared differently.

    Args:
        data: Mapping of image file name to a dict of metadata field values,
            as produced by ``parse_xml_files``.
        image_folder: Path of the directory containing the images.

    Returns:
        Tuple ``(success_count, failure_count, failed_images, y_true, y_pred)``.
        ``failed_images`` lists the file names flagged as failures. ``y_true``
        holds 1 for an image that has a metadata entry and 0 otherwise;
        ``y_pred`` holds 1 for an image flagged as a failure and 0 otherwise.
        All lists follow sorted file-name order.
    """
    y_true = []  # Ground truth: 1 for sensitive data present, 0 for no sensitive data
    y_pred = []  # Prediction: 1 for de-identification failure, 0 for success

    success_count = 0
    failure_count = 0
    failed_images = []

    # Sorted so the report and the label lists are reproducible between runs;
    # os.listdir returns entries in arbitrary order.
    for file_name in sorted(os.listdir(image_folder)):
        if not file_name.endswith('.tif'):
            continue

        # No metadata from the XML files for this image: nothing could leak,
        # so skip the (expensive) OCR pass entirely.
        if file_name not in data:
            success_count += 1
            y_true.append(0)  # No sensitive data present
            y_pred.append(0)  # De-identification succeeded
            continue

        residual_text = extract_text_from_image(
            os.path.join(image_folder, file_name)
        )
        # Empty values are ignored: "" is a substring of every string and
        # would otherwise flag every image.
        leaked = any(
            value and value in residual_text
            for value in data[file_name].values()
        )

        y_true.append(1)  # Sensitive data present
        if leaked:
            failure_count += 1
            failed_images.append(file_name)
            y_pred.append(1)  # De-identification failed
        else:
            success_count += 1
            y_pred.append(0)  # De-identification succeeded

    return success_count, failure_count, failed_images, y_true, y_pred

# Main execution
def main():
    """Run the whole check and print a summary report.

    Parses the XML metadata from ``xml_folder``, validates the images in
    ``image_folder`` against it, then prints the success/failure counts, the
    names of any failed images, and the precision, recall and F1 score
    computed from the returned label lists.
    """
    outcome = validate_images(parse_xml_files(xml_folder), image_folder)
    success_count, failure_count, failed_images, y_true, y_pred = outcome

    print(f"Number of successful de-identifications: {success_count}")
    print(f"Number of failed de-identifications: {failure_count}")
    if failed_images:
        print("Failed images:")
        # One file name per line.
        print(*failed_images, sep="\n")

    # Calculate and display precision, recall, and F1-score.
    # zero_division=0 makes an undefined metric report 0 instead of warning.
    metric_options = {"zero_division": 0}
    precision = precision_score(y_true, y_pred, **metric_options)
    recall = recall_score(y_true, y_pred, **metric_options)
    f1 = f1_score(y_true, y_pred, **metric_options)

    print(f"Precision: {precision:.2f}")
    print(f"Recall: {recall:.2f}")
    print(f"F1 Score: {f1:.2f}")

# Run the validation only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()
