# In [None]:
# Import libraries
import os
import numpy as np
import pandas as pd

class CBISDDSM_Preprocessor:
    """Build a filtered, binary-labelled manifest from the training case CSVs.

    The calcification and mass case-description CSVs found under
    ``<base_path>/csv`` are concatenated, rows whose pathology is
    ``BENIGN_WITHOUT_CALLBACK`` are dropped, and a ``label`` column is added
    (1 for ``MALIGNANT``, 0 otherwise). The result is written to
    ``<output_path>/manifest.csv``.
    """

    # Source column name -> manifest column name, in final column order.
    _MANIFEST_COLUMNS = {
        'patient_id': 'patient_id',
        'image file path': 'image_path',
        'ROI mask file path': 'mask_path',
        'image view': 'view',
        'label': 'label',
    }

    def __init__(self, base_path='/kaggle/input/cbis-ddsm-breast-cancer-image-dataset', output_path='/kaggle'):
        """Store the dataset and output locations.

        Args:
            base_path: Directory that contains the ``csv`` sub-directory with
                the case-description files.
            output_path: Directory in which ``manifest.csv`` is written.
        """
        self.base_path = base_path
        self.output_path = output_path
        # Populated by load_and_filter_manifest(); stays None until it succeeds.
        self.manifest = None

    # 1. Manifest structuring and filtering
    def load_and_filter_manifest(self):
        """Load both training CSVs, filter and label them, and save the manifest.

        Returns:
            The manifest DataFrame with columns ``patient_id``, ``image_path``,
            ``mask_path``, ``view`` and ``label``; or ``None`` when either
            input CSV is missing (an error line is printed and
            ``self.manifest`` is left untouched).
        """
        print(">> Loading and consolidating Manifest...")

        # Load CSVs
        try:
            calc_df = pd.read_csv(f'{self.base_path}/csv/calc_case_description_train_set.csv')
            mass_df = pd.read_csv(f'{self.base_path}/csv/mass_case_description_train_set.csv')
        except FileNotFoundError:
            print(f'ERROR: Files not found at {self.base_path}')
            return None

        # Consolidate: stack both datasets into one frame
        full_df = pd.concat([calc_df, mass_df], ignore_index=True)

        # Filter: remove BENIGN_WITHOUT_CALLBACK rows
        initial_count = len(full_df)
        full_df = full_df[full_df['pathology'] != 'BENIGN_WITHOUT_CALLBACK'].copy()
        print(f"Removed registers: {initial_count - len(full_df)} (Benign without callback)")

        # Binary label: 1 for MALIGNANT, 0 for every other remaining pathology.
        # A vectorised comparison replaces the row-wise apply and keeps an
        # integer dtype even when no rows are left.
        full_df['label'] = (full_df['pathology'] == 'MALIGNANT').astype(int)

        # Keep only the essential columns (already in their final order) and
        # rename them to the manifest schema.
        self.manifest = full_df[list(self._MANIFEST_COLUMNS)].rename(
            columns=self._MANIFEST_COLUMNS
        )

        # Make sure the destination exists; to_csv does not create directories.
        # An empty output_path means "current directory" and needs no creation.
        if self.output_path:
            os.makedirs(self.output_path, exist_ok=True)

        # Save file
        save_loc = os.path.join(self.output_path, 'manifest.csv')
        self.manifest.to_csv(save_loc, index=False)
        print(f">> Manifest saved at: {save_loc}")

        return self.manifest

# Main function
def run_processing_pipeline():
    """Run the preprocessing steps end to end.

    Builds and saves the manifest, then prints its first rows as a preview.
    Stops early, without raising, when the manifest could not be built.
    """
    # Initialize
    processor = CBISDDSM_Preprocessor(base_path='/kaggle/input/cbis-ddsm-breast-cancer-image-dataset')

    # 1. Manifest
    df = processor.load_and_filter_manifest()
    if df is None:
        # The loader returns None (after printing an error) when the input
        # CSVs are missing; calling .head() on it would raise AttributeError.
        print(">> Pipeline aborted: manifest could not be built.")
        return

    # A bare `df.head()` inside a function is evaluated and discarded, so the
    # preview has to be printed explicitly to be visible.
    print(df.head())

# Run the pipeline only when this file is executed directly (script or
# notebook cell), so that importing it does not trigger file I/O.
if __name__ == "__main__":
    run_processing_pipeline()