<a href="https://colab.research.google.com/github/kunal-shetty/Chest-X-ray-disease-prediction/blob/main/DataEntry.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive/ (a forced remount is requested) so the
# dataset files under MyDrive can be read through ordinary filesystem paths.
drive.mount('/content/drive/', force_remount=True)

# IPython shell escape: list the dataset folder so the available CSV / zip
# files are visible in the cell output before anything is loaded.
!ls /content/drive/MyDrive/Colab\ Notebooks/real\ data

Mounted at /content/drive/
bbox_2.csv			    images_001.zip  images_007.zip
data_entry.csv			    images_002.zip  images_008.zip
data_entry_.csv			    images_003.zip  images_009.zip
HACKATHON_CORRUPTED_BBox_List.csv   images_004.zip  images_010.zip
HACKATHON_CORRUPTED_Data_Entry.csv  images_005.zip  images_011.zip
images_0013			    images_006.zip  images_012.zip


In [None]:
import pandas as pd

# Folder on the mounted Drive that holds every input CSV (trailing slash kept
# so file names can simply be appended).
BASE_PATH = "/content/drive/MyDrive/Colab Notebooks/real data/"

# Read the four input tables in one pass, in the same order as before:
#   bbox / data                 -> bbox_2.csv / data_entry.csv
#   corrupt_bbox / corrupt_data -> the HACKATHON_CORRUPTED_* files
bbox, data, corrupt_bbox, corrupt_data = (
    pd.read_csv(BASE_PATH + csv_name)
    for csv_name in (
        "bbox_2.csv",
        "data_entry.csv",
        "HACKATHON_CORRUPTED_BBox_List.csv",
        "HACKATHON_CORRUPTED_Data_Entry.csv",
    )
)


In [None]:
import os
import re
from datetime import datetime

import numpy as np
import pandas as pd

# (Re)load the corrupted data-entry table; from here on `data` is the frame
# that every cleaning cell below mutates.
data = pd.read_csv(
    "/content/drive/MyDrive/Colab Notebooks/real data/HACKATHON_CORRUPTED_Data_Entry.csv"
)

# Header names may carry stray surrounding whitespace; trim them once here.
data.columns = data.columns.map(str.strip)

# One timestamp shared by every error record produced in this run.
timestamp = format(datetime.now(), "%Y-%m-%d %H:%M:%S")

# Each cleaning cell appends one DataFrame of offending rows to this list.
error_log = []

def normalize_image_index(idx):
    """Return `idx` as a whitespace-trimmed string with the patient id zero-padded.

    Values shaped like ``<digits>_<3 digits>.png`` get their leading digit
    group left-padded with zeros to at least 8 characters. Anything else
    (already canonical, or an unrecognised format) is returned trimmed but
    otherwise untouched.
    """
    text = str(idx).strip()
    parsed = re.match(r"^(\d+)_([0-9]{3})\.png$", text)
    if parsed is None:
        # Unknown format: hand back the trimmed text as-is.
        return text
    return "{}_{}.png".format(parsed.group(1).zfill(8), parsed.group(2))



In [None]:
# Keep only the first run of digits in each age value
# (058Y → 58, -5 → 5, ?? → NaN); values without digits become NaN.
age_extracted = data["patient_age"].astype(str).str.extract(r"(\d+)", expand=False)
data["patient_age"] = pd.to_numeric(age_extracted, errors="coerce")

# An age is invalid when it is missing, at most 1, or above 100.
# NOTE(review): `<= 1` also rejects age 1 — confirm this lower bound is intended.
invalid_age_mask = (
    data["patient_age"].isna()
    | data["patient_age"].le(1)
    | data["patient_age"].gt(100)
)

# Snapshot the offending rows, tagged with the reason, column and run timestamp.
age_errors = data.loc[invalid_age_mask].assign(
    error_reason="INVALID_PATIENT_AGE",
    error_column="patient_age",
    logged_at=timestamp,
)
error_log.append(age_errors)



In [None]:
# Only these exact (case-sensitive, untrimmed) view codes are accepted.
valid_views = ["PA", "AP", "LATERAL"]

# Anything outside the whitelist, including missing values, becomes "UNKNOWN".
data["View Position"] = data["View Position"].mask(
    ~data["View Position"].isin(valid_views),
    "UNKNOWN",
)

invalid_view_mask = data["View Position"].eq("UNKNOWN")

# Snapshot the rows that ended up as UNKNOWN (taken after the replacement above).
view_errors = data.loc[invalid_view_mask].assign(
    error_reason="INVALID_VIEW_POSITION",
    error_column="View Position",
    logged_at=timestamp,
)
error_log.append(view_errors)


In [None]:
def clean_finding_labels(label, valid_findings=None):
    """Normalise a '|'-separated finding-label string.

    Each token is whitespace-trimmed and kept only if it is an exact member
    of the accepted vocabulary. The surviving tokens are de-duplicated,
    sorted and re-joined with '|'.

    Parameters
    ----------
    label : object
        Raw cell value; a missing value (per ``pd.isna``) is treated as unknown.
    valid_findings : collection of str, optional
        Accepted label names. Defaults to the module-level ``VALID_FINDINGS``,
        looked up at call time, so existing ``.apply(clean_finding_labels)``
        calls behave as before.

    Returns
    -------
    str
        The cleaned label string, or ``"UNKNOWN_LABEL"`` when the input is
        missing or no token is recognised.
    """
    if valid_findings is None:
        # Resolved lazily: the global may be defined after this function.
        valid_findings = VALID_FINDINGS

    if pd.isna(label):
        return "UNKNOWN_LABEL"

    tokens = (part.strip() for part in str(label).split("|"))
    kept = {token for token in tokens if token in valid_findings}

    if not kept:
        return "UNKNOWN_LABEL"

    # Sorted so the same set of findings always yields the same string.
    return "|".join(sorted(kept))


# Accepted finding labels. "Consolidation" and "Edema" were missing from the
# original whitelist, so those findings were silently stripped and rows that
# carried only them were turned into UNKNOWN_LABEL (and later deleted).
# NOTE(review): assumes the standard 14-class NIH ChestX-ray14 vocabulary plus
# "No Finding" — confirm against the hackathon's label specification.
VALID_FINDINGS = {
    "Atelectasis",
    "Cardiomegaly",
    "Consolidation",
    "Edema",
    "Effusion",
    "Infiltration",
    "Mass",
    "Nodule",
    "Pneumonia",
    "Pneumothorax",
    "Emphysema",
    "Fibrosis",
    "Pleural_Thickening",
    "Hernia",
    "No Finding"
}

# Keep the raw labels so rows altered by the cleaning can be identified.
original_labels = data["finding_labels"].copy()

data["finding_labels"] = data["finding_labels"].apply(clean_finding_labels)

# True wherever cleaning changed the value. This includes rows that were only
# re-sorted, de-duplicated or whitespace-trimmed, not just rows that lost an
# unrecognised token; missing labels also compare as changed.
label_changed_mask = original_labels != data["finding_labels"]

# Snapshot the changed rows (holding the cleaned label) for the error log.
label_errors = data[label_changed_mask].copy()
label_errors["error_reason"] = "INVALID_OR_OUTLIER_FINDING_LABEL"
label_errors["error_column"] = "finding_labels"
label_errors["logged_at"] = timestamp

error_log.append(label_errors)


In [None]:
# Lower-cased spellings accepted for each canonical gender code.
gender_map = {
    "m": "M",
    "male": "M",
    "f": "F",
    "female": "F"
}

# Canonicalise Gender in one pass (result is always M, F or UNKNOWN):
# stringify, trim, lower-case, translate through gender_map, and send every
# value the map does not know (including missing ones) to "UNKNOWN".
data["Gender"] = (
    data["Gender"]
    .replace([pd.NA, None], "UNKNOWN")
    .astype(str)
    .str.strip()
    .str.lower()
    .map(gender_map)
    .fillna("UNKNOWN")
)

invalid_gender_mask = data["Gender"].eq("UNKNOWN")

# Snapshot the rows whose gender could not be resolved.
gender_errors = data.loc[invalid_gender_mask].assign(
    error_reason="INVALID_OR_MISSING_GENDER",
    error_column="Gender",
    logged_at=timestamp,
)
error_log.append(gender_errors)


In [None]:
# Collapse every "empty" spelling of Temp_Notes into the single placeholder "N/A":
# real missing values first, then the blank / "nan"-like strings left after
# stringifying and trimming.
data["Temp_Notes"] = (
    data["Temp_Notes"]
    .replace([pd.NA, None], "N/A")
    .astype(str)
    .str.strip()
    .replace(dict.fromkeys(["", "nan", "NaN", "NA"], "N/A"))
)

# Snapshot every row that now holds the placeholder.
notes_errors = data.loc[data["Temp_Notes"].eq("N/A")].assign(
    error_reason="PLACEHOLDER_TEMP_NOTES",
    error_column="Temp_Notes",
    logged_at=timestamp,
)
error_log.append(notes_errors)


In [None]:
# Rows whose patient-id part has 1-7 digits, i.e. is shorter than the padded
# form that normalize_image_index produces.
image_fix_mask = data["Image Index"].astype(str).str.match(r"^\d{1,7}_\d{3}\.png$")

# Record the rows while they still hold the un-padded value...
image_fix_errors = data.loc[image_fix_mask].assign(
    error_reason="IMAGE_INDEX_NORMALIZED",
    error_column="Image Index",
    logged_at=timestamp,
)
error_log.append(image_fix_errors)

# ...then rewrite just those cells with the zero-padded form.
data.loc[image_fix_mask, "Image Index"] = data.loc[
    image_fix_mask, "Image Index"
].map(normalize_image_index)


In [None]:

# Sentinel values that mark a field as unusable (duplicate "UNKNOWN" removed).
UNKNOWN_VALUES = ["UNKNOWN", "UNKNOWN_IMAGE", "N/A", "UNKNOWN_LABEL", "invalid_image.png"]

# `invalid_age_mask` was computed in the age-cleaning cell. It only lines up
# with `data` while the frame still has the index it had at that point; once
# this cell has filtered and re-indexed `data`, re-running it would combine
# misaligned masks. Fail with a clear message instead.
if not invalid_age_mask.index.equals(data.index):
    raise RuntimeError(
        "invalid_age_mask is not aligned with data; "
        "re-run the notebook from the data-loading cell."
    )

# A row is dropped when its age is invalid or any checked field holds a sentinel.
delete_mask = (
    invalid_age_mask |
    data["Gender"].isin(UNKNOWN_VALUES) |
    data["View Position"].isin(UNKNOWN_VALUES) |
    data["Image Index"].isin(UNKNOWN_VALUES) |
    data["finding_labels"].isin(UNKNOWN_VALUES)
)

rows_before = len(data)

# Keep the surviving rows and renumber them from 0.
data = data[~delete_mask].reset_index(drop=True)

rows_after = len(data)

print(f"Deleted {rows_before - rows_after} rows containing UNKNOWN / invalid values")


Deleted 19409 rows containing UNKNOWN / invalid values


In [None]:
# Stack every per-check error frame into one table (one row per logged error).
error_log_df = pd.concat(error_log, ignore_index=True)

GROUP_KEY = "Image Index"


def _join_unique(values):
    """Collapse a group's values into one sorted, de-duplicated '; '-joined string."""
    return "; ".join(sorted(set(values)))


# One output row per image:
#   * the grouping key is NOT aggregated itself — `as_index=False` already
#     returns it as a column, so listing it in the dict was redundant;
#   * error_column is joined the same way as error_reason, so a row that
#     failed several checks reports every affected column, not just the first;
#   * every other column keeps the first value seen for that image.
aggregations = {
    col: (_join_unique if col == "error_column" else "first")
    for col in error_log_df.columns
    if col not in (GROUP_KEY, "error_reason")
}
aggregations["error_reason"] = _join_unique

final_error_log = error_log_df.groupby(GROUP_KEY, as_index=False).agg(aggregations)

# Define key columns to KEEP
KEY_COLUMNS = [
    "Image Index",
    "Patient ID",
    "Gender",
    "patient_age",
    "View Position",
    "finding_labels",
    "OriginalImageWidth",
    "OriginalImageHeight",
    "OriginalImagePixelSpacing_x",
    "OriginalImagePixelSpacing_y"
]

# Keep only key columns
final_model_data = data[KEY_COLUMNS].copy()

# Make sure the destination folder exists; to_csv does not create it.
OUTPUT_DIR = "/content/drive/MyDrive/Colab Notebooks/outputs/"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Save final dataset
final_model_data.to_csv(OUTPUT_DIR + "FINAL_clean_data.csv", index=False)

# Save the per-image error log next to it
final_error_log.to_csv(OUTPUT_DIR + "ERROR_LOG_clean_data.csv", index=False)
