# In [None]:
# Import libraries
import os
import numpy as np
import pandas as pd

class CBISDDSM_Preprocessor:
    """Build a flat image manifest from the CBIS-DDSM case-description CSVs.

    ``base_path`` is the dataset root; the CSV files are read from its
    ``csv/`` subfolder. ``output_path`` is the folder where ``manifest.csv``
    is written. ``manifest`` holds the DataFrame produced by the last
    successful call to :meth:`load_and_filter_manifest` (``None`` until then).
    """

    def __init__(self, base_path='/kaggle/input/cbis-ddsm-breast-cancer-image-dataset', output_path='/kaggle'):
        self.base_path = base_path
        self.output_path = output_path
        # Filled in by load_and_filter_manifest().
        self.manifest = None

    # 1. Manifest structuration and filtering
    def load_and_filter_manifest(self):
        """Load the training CSVs, attach image paths and save the manifest.

        Steps: read the calc/mass case descriptions and ``dicom_info.csv``,
        drop ``BENIGN_WITHOUT_CALLBACK`` rows, derive a binary ``label``
        (1 = ``MALIGNANT``, 0 = anything else), look up ``image_path`` in
        ``dicom_info`` through a composed key, rewrite the path columns, and
        write ``manifest.csv`` into ``output_path`` (created if missing).

        Returns:
            The manifest DataFrame with columns ``patient_id``,
            ``image_path``, ``mask_path``, ``view`` and ``label``; or ``None``
            when one of the input CSVs cannot be found (an error line is
            printed in that case and ``self.manifest`` is left untouched).
        """
        print(">> Loading and consolidating Manifest...")

        tables = self._read_source_tables()
        if tables is None:
            print(f'ERROR: Files not found at {self.base_path}')
            return None
        calc_df, mass_df, dicom_info = tables

        # Merge both diagnose datasets
        full_df = pd.concat([calc_df, mass_df], ignore_index=True)

        # Filter BENIGN_WITHOUT_CALLBACK data
        initial_count = len(full_df)
        full_df = full_df[full_df['pathology'] != 'BENIGN_WITHOUT_CALLBACK'].copy()
        print(f"Removed registers: {initial_count - len(full_df)} (Benign without callback)")

        # Create a binary label: 0: Benign, 1: Malignant
        full_df['label'] = (full_df['pathology'] == 'MALIGNANT').astype(int)

        # Create a "foreign key" to map correct image filepaths inside dicom_info
        full_df['dicom_fk'] = self._build_dicom_key(full_df)

        # Work on a two-column copy so the PatientID clean-up stays local.
        dicom_lookup = dicom_info[['PatientID', 'image_path']].copy()
        dicom_lookup['PatientID'] = dicom_lookup['PatientID'].astype(str).str.strip()

        # NOTE(review): a left merge emits one output row per matching
        # dicom_info row, so if PatientID is not unique in dicom_info.csv the
        # manifest will contain repeated cases -- verify against the data.
        full_df = full_df.merge(
            dicom_lookup,
            left_on='dicom_fk',
            right_on='PatientID',
            how='left',
        )

        # Verify if merge was possible
        print(f"Merge concluded. Found images: {full_df['image_path'].notnull().sum()}")

        self._normalize_paths(full_df)

        # Select essential columns (already in final order) and rename them
        # to fit project requisites.
        self.manifest = full_df[[
            'patient_id',
            'image_path',
            'ROI mask file path',
            'image view',
            'label',
        ]].rename(columns={
            'ROI mask file path': 'mask_path',
            'image view': 'view',
        })

        # Save file; make sure the target folder exists first.
        if self.output_path:
            os.makedirs(self.output_path, exist_ok=True)
        save_loc = os.path.join(self.output_path, 'manifest.csv')
        self.manifest.to_csv(save_loc, index=False)
        print(f">> Manifest saved at: {save_loc}")

        return self.manifest

    def _read_source_tables(self):
        """Read the three input CSVs; return ``None`` if any file is missing."""
        csv_dir = f'{self.base_path}/csv'
        try:
            calc_df = pd.read_csv(f'{csv_dir}/calc_case_description_train_set.csv')
            mass_df = pd.read_csv(f'{csv_dir}/mass_case_description_train_set.csv')
            # dicom_info contains the JPEG image paths
            dicom_info = pd.read_csv(f'{csv_dir}/dicom_info.csv')
        except FileNotFoundError:
            return None
        return calc_df, mass_df, dicom_info

    @staticmethod
    def _build_dicom_key(cases):
        """Compose the per-case key that is matched against ``PatientID``.

        Format: [Type-Set]_[PatientID]_[Side]_[View]_[AbnormalityID]
        Ex: Mass-Training_P_00001_LEFT_CC_1
        """
        # Any abnormality type mentioning "mass" (case-insensitive) gets the
        # Mass prefix; everything else, including missing values, gets Calc.
        is_mass = (
            cases['abnormality type']
            .astype(str)
            .str.lower()
            .str.contains('mass', regex=False, na=False)
        )
        prefix = is_mass.map({True: 'Mass-Training', False: 'Calc-Training'})
        return (
            prefix + "_" +
            cases['patient_id'] + "_" +
            cases['left or right breast'] + "_" +
            cases['image view'] + "_" +
            cases['abnormality id'].astype(str)
        )

    @staticmethod
    def _normalize_paths(frame):
        """Rewrite the image and ROI-mask path columns of ``frame`` in place."""
        # Drop the dataset folder prefix from both path columns.
        for column in ('image_path', 'ROI mask file path'):
            frame[column] = frame[column].str.replace('CBIS-DDSM/', '', regex=False)

        # Point the ROI mask paths at JPEG files instead of DICOM ones.
        frame['ROI mask file path'] = (
            frame['ROI mask file path']
            .str.replace('.dcm', '.jpg', regex=False)
            .str.replace('dicom', 'jpeg', regex=False)
        )

# Main function
def run_processing_pipeline():
    """Run the preprocessing steps end to end and preview the manifest.

    Builds a ``CBISDDSM_Preprocessor`` for the Kaggle dataset location,
    generates the manifest and prints its first rows. Returns ``None``.
    """
    # Initialize
    processor = CBISDDSM_Preprocessor(base_path='/kaggle/input/cbis-ddsm-breast-cancer-image-dataset')

    # 1. Manifest
    df = processor.load_and_filter_manifest()

    # The loader returns None (after printing an error) when the input CSVs
    # are missing; stop here instead of failing on a None attribute access.
    if df is None:
        return

    # A bare `df.head()` inside a function is discarded, so print the preview.
    print(df.head())

# Run main function: executes the whole pipeline as soon as this cell/script
# runs (reads the dataset CSVs and writes manifest.csv).
run_processing_pipeline()