In [2]:
pip install ucimlrepo


Collecting ucimlrepo
  Downloading ucimlrepo-0.0.7-py3-none-any.whl.metadata (5.5 kB)
Downloading ucimlrepo-0.0.7-py3-none-any.whl (8.0 kB)
Installing collected packages: ucimlrepo
Successfully installed ucimlrepo-0.0.7


In [10]:
from ucimlrepo import fetch_ucirepo

# fetch dataset
regensburg_pediatric_appendicitis = fetch_ucirepo(id=938)

# data (as pandas dataframes)
X = regensburg_pediatric_appendicitis.data.features
y = regensburg_pediatric_appendicitis.data.targets

# metadata
print(regensburg_pediatric_appendicitis.metadata)

# variable information
print(regensburg_pediatric_appendicitis.variables)

{'uci_id': 938, 'name': 'Regensburg Pediatric Appendicitis', 'repository_url': 'https://archive.ics.uci.edu/dataset/938/regensburg+pediatric+appendicitis', 'data_url': 'https://archive.ics.uci.edu/static/public/938/data.csv', 'abstract': 'This repository holds the data from a cohort of pediatric patients with suspected appendicitis admitted with abdominal pain to Children’s Hospital St. Hedwig in Regensburg, Germany, between 2016 and 2021. Each patient has (potentially multiple) ultrasound (US) images, aka views, tabular data comprising laboratory, physical examination, scoring results and ultrasonographic findings extracted manually by the experts, and three target variables, namely, diagnosis, management and severity.', 'area': 'Health and Medicine', 'tasks': ['Classification'], 'characteristics': ['Tabular', 'Image'], 'num_instances': 782, 'num_features': 53, 'feature_types': ['Real', 'Categorical', 'Integer'], 'demographics': ['Age', 'Sex'], 'target_col': ['Management', 'Severity',

In [4]:
print(X)

       Age    BMI     Sex  Height  Weight  Length_of_Stay  Alvarado_Score  \
0    12.68  16.90  female   148.0    37.0             3.0             4.0   
1    14.10  31.90    male   147.0    69.5             2.0             5.0   
2    14.14  23.30  female   163.0    62.0             4.0             5.0   
3    16.37  20.60  female   165.0    56.0             3.0             7.0   
4    11.08  16.90  female   163.0    45.0             3.0             5.0   
..     ...    ...     ...     ...     ...             ...             ...   
777  12.41  25.25  female   166.5    70.0             4.0             8.0   
778  17.09  20.43  female   158.0    51.0             6.0             5.0   
779  14.99  19.91  female   152.0    46.0             4.0             5.0   
780   7.20  14.30    male   129.3    23.9             5.0             9.0   
781  11.51  18.17    male   146.5    39.0             4.0             2.0   

     Paedriatic_Appendicitis_Score Appendix_on_US  Appendix_Diameter  ...  

In [7]:
import zipfile, os, re, shutil
from PIL import Image # Import the Image module from PIL

# === 2️⃣ Path to your ZIP file on Google Drive ===
# 👉 Change this path to match your actual file location
zip_path = "/content/drive/MyDrive/US Pictures.zip"

# === 3️⃣ Create folder to extract contents ===
extract_path = "/content/extracted_zip"
os.makedirs(extract_path, exist_ok=True)

# === 4️⃣ Unzip the folder ===
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_path)
print("✅ Unzipped to:", extract_path)

# === 5️⃣ Create output folder for matched images ===
output_folder = "/content/Filtered_Images" # Output will be written here
os.makedirs(output_folder, exist_ok=True)
if os.path.exists(output_folder):
    shutil.rmtree(output_folder) # Remove the folder and its contents if it exists
    print(f"🗑️ Cleared existing output folder: {output_folder}")
os.makedirs(output_folder, exist_ok=True) # Recreate the empty folder
print(f"✅ Created (or ensured empty) output folder: {output_folder}")

# Initialize lists to store loaded images and their original filenames
selected_images = []
selected_filenames = []

# === 6️⃣ Define regex for App / Appendix (FROM YOUR PROVIDED SNIPPET) ===
# This regex matches 'app' or 'appendix' as the main word.
# It allows digits, periods, spaces, underscores, or hyphens BEFORE the keyword.
# It does NOT allow any other English letters or numbers AFTER the keyword.
pattern = re.compile(r'^(?:[\d\.\s_-]*)(app|appendix)$', re.IGNORECASE)

# === 7️⃣ Image extensions allowed (FROM YOUR PROVIDED SNIPPET) ===
allowed_exts = ('.bmp', '.png')

# === 8️⃣ Traverse extracted files and copy/load matches ===
matched_files_copied_paths = [] # This list will store the paths to the COPIED files in output_folder

for root, _, files in os.walk(extract_path):
    for file in files:
        if file.lower().endswith(allowed_exts): # Use the user-defined allowed_exts
            # Remove file extension
            name_without_ext = os.path.splitext(file)[0]
            # Remove leading/trailing spaces
            cleaned_name = name_without_ext.strip()

            # Match only if the cleaned name fits the pattern
            if pattern.match(cleaned_name):
                src = os.path.join(root, file)
                dst = os.path.join(output_folder, file) # Destination in the output_folder

                # Copy the file to the output folder
                shutil.copy2(src, dst)
                matched_files_copied_paths.append(dst) # Track the copied path
                print(f"✅ Copied: {file}")

                # Load the image using PIL and store it
                try:
                    img = Image.open(src).convert('RGB') # Open from source path
                    selected_images.append(img)
                    selected_filenames.append(file) # Store original filename
                except Exception as e:
                    print(f"⚠️ Could not load image {file}: {e}")

# === 9️⃣ Show summary ===
print("\n--- Summary of copied files ---")
print("Total copied images:", len(matched_files_copied_paths))
print("Saved in:", output_folder)

print("\n--- Summary of loaded images (into Python memory) ---")
print("Total images loaded into memory:", len(selected_images))
# print("Filenames of loaded images:", selected_filenames) # Uncomment to see all loaded filenames

✅ Unzipped to: /content/extracted_zip
🗑️ Cleared existing output folder: /content/Filtered_Images
✅ Created (or ensured empty) output folder: /content/Filtered_Images
✅ Copied: 887.3 App.bmp
✅ Copied: 96.12 App .bmp
✅ Copied: 747.10 App.bmp
✅ Copied: 152.2 App.bmp
✅ Copied: 918.2 App.bmp
✅ Copied: 130.6 App.bmp
✅ Copied: 458.1 App.bmp
✅ Copied: 277.4 App.bmp
✅ Copied: 919.3 App.bmp
✅ Copied: 4.2 Appendix.png
✅ Copied: 761.4 App.bmp
✅ Copied: 747.4 App.bmp
✅ Copied: 777.3 App.bmp
✅ Copied: 927.12 App.bmp
✅ Copied: 801.2 App.bmp
✅ Copied: 565.1 App.bmp
✅ Copied: 777.2 App.bmp
✅ Copied: 80.2 App.bmp
✅ Copied: 91.4 App.bmp
✅ Copied: 143.2 App.bmp
✅ Copied: 463.4 App.bmp
✅ Copied: 936.2 App.bmp
✅ Copied: 7.1 Appendix.bmp
✅ Copied: 927.8 App.bmp
✅ Copied: 503.2 app.bmp
✅ Copied: 49.2 App.bmp
✅ Copied: 278.2 App.bmp
✅ Copied: 443.1 App.bmp
✅ Copied: 697.1 App.bmp
✅ Copied: 841.1 App.bmp
✅ Copied: 928.2 App.bmp
✅ Copied: 117.1 App.bmp
✅ Copied: 99.2 App.bmp
✅ Copied: 249.1 App.bmp
✅ Copied: 77

In [8]:
numeric_data = {}
# Regex to find all sequences of digits (integers or floats with periods)
# This will capture numbers like "742", "2", "102", "19", etc.
number_pattern = re.compile(r'\d+\.?\d*') # Matches one or more digits, optionally followed by a period and more digits

for full_path in matched_files_copied_paths:
    filename = os.path.basename(full_path)
    name_without_ext = os.path.splitext(filename)[0]

    # Find all numbers in the cleaned filename
    numbers = number_pattern.findall(name_without_ext)

    if numbers:
        numeric_data[filename] = numbers
        print(f"File: {filename}, Numbers: {', '.join(numbers)}")
    else:
        print(f"File: {filename}, No numbers found.")

print(f"\nTotal files with extracted numbers: {len(numeric_data)}")

File: 887.3 App.bmp, Numbers: 887.3
File: 96.12 App .bmp, Numbers: 96.12
File: 747.10 App.bmp, Numbers: 747.10
File: 152.2 App.bmp, Numbers: 152.2
File: 918.2 App.bmp, Numbers: 918.2
File: 130.6 App.bmp, Numbers: 130.6
File: 458.1 App.bmp, Numbers: 458.1
File: 277.4 App.bmp, Numbers: 277.4
File: 919.3 App.bmp, Numbers: 919.3
File: 4.2 Appendix.png, Numbers: 4.2
File: 761.4 App.bmp, Numbers: 761.4
File: 747.4 App.bmp, Numbers: 747.4
File: 777.3 App.bmp, Numbers: 777.3
File: 927.12 App.bmp, Numbers: 927.12
File: 801.2 App.bmp, Numbers: 801.2
File: 565.1 App.bmp, Numbers: 565.1
File: 777.2 App.bmp, Numbers: 777.2
File: 80.2 App.bmp, Numbers: 80.2
File: 91.4 App.bmp, Numbers: 91.4
File: 143.2 App.bmp, Numbers: 143.2
File: 463.4 App.bmp, Numbers: 463.4
File: 936.2 App.bmp, Numbers: 936.2
File: 7.1 Appendix.bmp, Numbers: 7.1
File: 927.8 App.bmp, Numbers: 927.8
File: 503.2 app.bmp, Numbers: 503.2
File: 49.2 App.bmp, Numbers: 49.2
File: 278.2 App.bmp, Numbers: 278.2
File: 443.1 App.bmp, Number

In [19]:

import pandas as pd
import numpy as np # To handle NaN values
import os # For os.path.basename in print statements

# 👉 IMPORTANT: Update this path to your actual Excel file location on Google Drive.
excel_file_path = '/content/drive/MyDrive/Regensburg_Pediatric_Appendicitis_Dataset.xlsx'
# 👉 IMPORTANT: Update these to your actual column names in the Excel sheet.
us_number_col_name = 'US_Number'
diagnosis_col_name = 'Diagnosis'

# --- Initialize lists for categorized US numbers ---
appendicitis_us_numbers = []
no_appendicitis_us_numbers = []
nan_diagnosis_us_numbers = []
unmatched_us_numbers = [] # To track numbers not found in Excel

# --- Load the Excel sheet ---
try:
    df_diagnosis = pd.read_excel(excel_file_path)
    print(f"\n✅ Excel sheet '{os.path.basename(excel_file_path)}' loaded successfully.")
    # print(df_diagnosis) # Keep this commented unless you want to see the full DataFrame printed

    # Validate required columns
    if us_number_col_name not in df_diagnosis.columns:
        raise ValueError(f"Column '{us_number_col_name}' not found in Excel sheet. Please check the name.")
    if diagnosis_col_name not in df_diagnosis.columns:
        raise ValueError(f"Column '{diagnosis_col_name}' not found in Excel sheet. Please check the name.")

    # --- FIX START: Standardize Excel US_Number column by rounding down and converting to string ---
    df_diagnosis[us_number_col_name] = pd.to_numeric(df_diagnosis[us_number_col_name], errors='coerce')

    def standardize_and_round_down(num):
        if pd.isna(num):
            return str(np.nan) # Keep NaN as 'nan' string
        try:
            return str(int(float(num))) # Convert to float, then truncate to int (rounds down for positive), then to string
        except ValueError:
            return str(num) # Fallback in case of unexpected non-numeric value after coerce

    df_diagnosis[us_number_col_name] = df_diagnosis[us_number_col_name].apply(standardize_and_round_down)
    # --- FIX END ---


    # --- Combine all extracted US numbers from numeric_data (with rounding down) ---
    all_extracted_unique_us_numbers = set()

    # Iterate through the numeric_data dictionary to collect all unique US numbers
    if 'numeric_data' in locals() or 'numeric_data' in globals():
        for filename, numbers_list in numeric_data.items():
            for num_str in numbers_list:
                try:
                    # Convert to float, round down (truncate decimal), then convert to string
                    rounded_down_num_str = str(int(float(num_str)))
                    all_extracted_unique_us_numbers.add(rounded_down_num_str)
                except ValueError:
                    print(f"⚠️ Warning: Non-numeric value '{num_str}' found in filename '{filename}'. Skipping for standardization.")
        print(f"Total unique US numbers extracted from image filenames (rounded down): {len(all_extracted_unique_us_numbers)}")
    else:
        print("⚠️ Warning: 'numeric_data' variable not found. Ensure previous cells were run to extract numbers.")


    # --- Process each unique extracted US number against the Excel data ---
    print("\n--- Categorizing US Numbers based on Diagnosis ---")
    for us_num_str in all_extracted_unique_us_numbers:
        # Find rows in DataFrame where the 'US_Number' column matches the extracted number
        matching_rows = df_diagnosis[df_diagnosis[us_number_col_name] == us_num_str]
        print(f"Searching for US_Number (rounded down): '{us_num_str}'") # Added for better debugging

        if not matching_rows.empty:
            # Get the diagnosis from the first matching row (if multiple exist for one US_Number)
            diagnosis_value = matching_rows[diagnosis_col_name].iloc[0]

            # Convert diagnosis to string for case-insensitive comparison and strip whitespace
            diagnosis_str = str(diagnosis_value).lower().strip()

            if diagnosis_str == 'appendicitis':
                appendicitis_us_numbers.append(us_num_str)
            elif diagnosis_str == 'no appendicitis':
                no_appendicitis_us_numbers.append(us_num_str)
            elif diagnosis_str == 'nan' or pd.isna(diagnosis_value): # Check for string 'nan' or actual NaN
                nan_diagnosis_us_numbers.append(us_num_str)
            else:
                print(f"❓ Unrecognized diagnosis '{diagnosis_value}' for US number '{us_num_str}'. Skipping this number.")
        else:
            unmatched_us_numbers.append(us_num_str)
            print(f"⚠️ US number '{us_num_str}' from image not found in the Excel sheet after standardization.")

    # --- Display Final Categorization Results ---
    print("\n--- Final Categorized US Numbers ---")
    print(f"US Numbers with 'Appendicitis' diagnosis ({len(appendicitis_us_numbers)}): {appendicitis_us_numbers}")
    print(f"US Numbers with 'No Appendicitis' diagnosis ({len(no_appendicitis_us_numbers)}): {no_appendicitis_us_numbers}")
    print(f"US Numbers with 'NaN' or missing diagnosis ({len(nan_diagnosis_us_numbers)}): {nan_diagnosis_us_numbers}")
    if unmatched_us_numbers:
        print(f"US Numbers from images NOT found in Excel ({len(unmatched_us_numbers)}): {unmatched_us_numbers}")

except FileNotFoundError:
    print(f"\n❌ Error: Excel file not found at '{excel_file_path}'. Please verify the path.")
except ValueError as ve:
    print(f"\n❌ Configuration Error: {ve}")
except Exception as e:
    print(f"\n❌ An unexpected error occurred: {e}")


✅ Excel sheet 'Regensburg_Pediatric_Appendicitis_Dataset.xlsx' loaded successfully.
Total unique US numbers extracted from image filenames (rounded down): 325

--- Categorizing US Numbers based on Diagnosis ---
Searching for US_Number (rounded down): '875'
⚠️ US number '875' from image not found in the Excel sheet after standardization.
Searching for US_Number (rounded down): '747'
⚠️ US number '747' from image not found in the Excel sheet after standardization.
Searching for US_Number (rounded down): '22'
⚠️ US number '22' from image not found in the Excel sheet after standardization.
Searching for US_Number (rounded down): '619'
Searching for US_Number (rounded down): '490'
Searching for US_Number (rounded down): '26'
Searching for US_Number (rounded down): '771'
Searching for US_Number (rounded down): '7'
Searching for US_Number (rounded down): '826'
⚠️ US number '826' from image not found in the Excel sheet after standardization.
Searching for US_Number (rounded down): '72'
Search

In [20]:
print("\n--- Organizing Images by Diagnosis ---")

appendicitis_folder = os.path.join(output_folder, "Appendicitis")
no_appendicitis_folder = os.path.join(output_folder, "No_Appendicitis")
uncategorized_images_folder = os.path.join(output_folder, "Uncategorized_by_Diagnosis")

os.makedirs(appendicitis_folder, exist_ok=True)
os.makedirs(no_appendicitis_folder, exist_ok=True)
os.makedirs(uncategorized_images_folder, exist_ok=True) # For images found but no clear diagnosis

moved_count = 0
skipped_count = 0

# Ensure our reference lists are sets for efficient lookup
appendicitis_set = set(appendicitis_us_numbers)
no_appendicitis_set = set(no_appendicitis_us_numbers)
nan_diagnosis_set = set(nan_diagnosis_us_numbers) # Also include for completeness if needed later

for full_img_path in matched_files_copied_paths:
    filename = os.path.basename(full_img_path)
    name_without_ext = os.path.splitext(filename)[0]
    extracted_numbers_from_filename = number_pattern.findall(name_without_ext)

    target_diagnosis_folder = None
    current_us_num = None

    if extracted_numbers_from_filename:
        # We take the first extracted number and standardize it for comparison
        raw_num_str = extracted_numbers_from_filename[0]
        try:
            current_us_num = str(int(float(raw_num_str)))
        except ValueError:
            print(f"⚠️ Could not standardize number '{raw_num_str}' from file '{filename}'. Skipping image categorization.")
            continue # Skip to the next image if number can't be standardized

        if current_us_num in appendicitis_set:
            target_diagnosis_folder = appendicitis_folder
        elif current_us_num in no_appendicitis_set:
            target_diagnosis_folder = no_appendicitis_folder
        elif current_us_num in nan_diagnosis_set:
            # If you want to move NaN diagnosed images to a specific folder
            # target_diagnosis_folder = os.path.join(output_folder, "Diagnosis_NaN")
            # os.makedirs(target_diagnosis_folder, exist_ok=True)
            print(f"ℹ️ Image '{filename}' has US number '{current_us_num}' with 'NaN' diagnosis. Moving to uncategorized.")
            target_diagnosis_folder = uncategorized_images_folder # Or a dedicated NaN folder
        else:
            print(f"❓ Image '{filename}' (US: {current_us_num}) does not have a clear 'Appendicitis' or 'No Appendicitis' diagnosis in Excel. Moving to uncategorized.")
            target_diagnosis_folder = uncategorized_images_folder
    else:
        print(f"❓ No US number found in filename '{filename}'. Moving to uncategorized.")
        target_diagnosis_folder = uncategorized_images_folder


    if target_diagnosis_folder:
        destination_path = os.path.join(target_diagnosis_folder, filename)
        try:
            shutil.move(full_img_path, destination_path)
            print(f"✅ Moved: {filename} to {os.path.basename(target_diagnosis_folder)}")
            moved_count += 1
        except Exception as e:
            print(f"❌ Error moving {filename}: {e}")
    else:
        print(f"⚠️ Could not determine destination for {filename}. Image remains in original Filtered_Images.")
        skipped_count += 1

print(f"\n--- Image Organization Summary ---")
print(f"Total images moved: {moved_count}")
print(f"Images remaining uncategorized (or skipped): {len(matched_files_copied_paths) - moved_count}")
print(f"Images for 'Appendicitis' moved to: {appendicitis_folder}")
print(f"Images for 'No Appendicitis' moved to: {no_appendicitis_folder}")
print(f"Uncategorized images moved to: {uncategorized_images_folder}")


--- Organizing Images by Diagnosis ---
✅ Moved: 887.3 App.bmp to Appendicitis
✅ Moved: 96.12 App .bmp to Appendicitis
❓ Image '747.10 App.bmp' (US: 747) does not have a clear 'Appendicitis' or 'No Appendicitis' diagnosis in Excel. Moving to uncategorized.
✅ Moved: 747.10 App.bmp to Uncategorized_by_Diagnosis
✅ Moved: 152.2 App.bmp to Appendicitis
✅ Moved: 918.2 App.bmp to Appendicitis
✅ Moved: 130.6 App.bmp to Appendicitis
✅ Moved: 458.1 App.bmp to Appendicitis
❓ Image '277.4 App.bmp' (US: 277) does not have a clear 'Appendicitis' or 'No Appendicitis' diagnosis in Excel. Moving to uncategorized.
✅ Moved: 277.4 App.bmp to Uncategorized_by_Diagnosis
❓ Image '919.3 App.bmp' (US: 919) does not have a clear 'Appendicitis' or 'No Appendicitis' diagnosis in Excel. Moving to uncategorized.
✅ Moved: 919.3 App.bmp to Uncategorized_by_Diagnosis
✅ Moved: 4.2 Appendix.png to Appendicitis
✅ Moved: 761.4 App.bmp to Appendicitis
❓ Image '747.4 App.bmp' (US: 747) does not have a clear 'Appendicitis' o

In [21]:
# --- NEW: Print lengths of the diagnosis folders ---
try:
    print(f"Number of images in '{os.path.basename(appendicitis_folder)}' folder: {len(os.listdir(appendicitis_folder))}")
    print(f"Number of images in '{os.path.basename(no_appendicitis_folder)}' folder: {len(os.listdir(no_appendicitis_folder))}")
    print(f"Number of images in '{os.path.basename(uncategorized_images_folder)}' folder: {len(os.listdir(uncategorized_images_folder))}")
except Exception as e:
    print(f"❌ Error getting folder lengths: {e}")


Number of images in 'Appendicitis' folder: 493
Number of images in 'No_Appendicitis' folder: 79
Number of images in 'Uncategorized_by_Diagnosis' folder: 107
