In [None]:
# Install the Kaggle CLI. %pip (rather than !pip) installs into the environment
# of the running kernel, so the package is importable from this notebook.
%pip install -q kaggle

In [None]:

# Mount Google Drive under /content/drive so files stored there
# (the Kaggle API token copied in a later cell) are reachable from Colab.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Create the Kaggle config directory; -p keeps the cell re-runnable when it already exists.
! mkdir -p ~/.kaggle

In [None]:

# Copy the Kaggle API token from the mounted Drive into ~/.kaggle, where the kaggle CLI looks for it.
! cp /content/drive/MyDrive/Stat_Docs/kaggle.json ~/.kaggle/kaggle.json

In [None]:
# Restrict the API token file to owner read/write only (mode 600).
! chmod 600 ~/.kaggle/kaggle.json


### Dataset Download
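Link to dataset: [CBIS-DDSM Breast Cancer Image Dataset](https://www.kaggle.com/datasets/awsaf49/cbis-ddsm-breast-cancer-image-dataset)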

In [None]:
# Download the CBIS-DDSM dataset archive with the Kaggle CLI (needs ~/.kaggle/kaggle.json).
! kaggle datasets download -d awsaf49/cbis-ddsm-breast-cancer-image-dataset

In [None]:
# Extract the downloaded archive into the directory that the CSV/JPEG paths used
# later in this notebook expect. -q silences the per-file listing and -o overwrites
# without prompting so the cell can be re-run.
# NOTE(review): the archive name is assumed to follow the dataset slug - confirm
# against the file produced by the download cell.
! unzip -q -o cbis-ddsm-breast-cancer-image-dataset.zip -d /kaggle/input/cbis-ddsm-breast-cancer-image-dataset

In [None]:
# Check disk usage per filesystem in human-readable units.
! df -h

In [None]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import random 
import tensorflow as tf
import re


In [None]:
# Case description tables: calcification and mass cases, each with a train and a test CSV.
calc_train = pd.read_csv('/kaggle/input/cbis-ddsm-breast-cancer-image-dataset/csv/calc_case_description_train_set.csv')
calc_test = pd.read_csv('/kaggle/input/cbis-ddsm-breast-cancer-image-dataset/csv/calc_case_description_test_set.csv')
mass_train = pd.read_csv('/kaggle/input/cbis-ddsm-breast-cancer-image-dataset/csv/mass_case_description_train_set.csv')
mass_test = pd.read_csv('/kaggle/input/cbis-ddsm-breast-cancer-image-dataset/csv/mass_case_description_test_set.csv')
# DICOM metadata table; its SeriesDescription, PatientName and image_path columns
# are used by later cells to locate the JPEG files.
dicom_data = pd.read_csv('/kaggle/input/cbis-ddsm-breast-cancer-image-dataset/csv/dicom_info.csv')

In [None]:
# Point the image paths recorded in dicom_info.csv at the local JPEG directory
# so the images can be loaded correctly.
image_dir = '/kaggle/input/cbis-ddsm-breast-cancer-image-dataset/jpeg/'


def _local_image_paths(series_description):
    """Return the rewritten image paths of all rows with the given SeriesDescription."""
    selected = dicom_data.loc[dicom_data['SeriesDescription'] == series_description, 'image_path']
    # Replace the prefix *including* its trailing slash: image_dir already ends
    # with '/', so replacing only 'CBIS-DDSM/jpeg' produced '.../jpeg//...'.
    # The .str accessor also passes missing paths through instead of raising.
    return selected.str.replace('CBIS-DDSM/jpeg/', image_dir, regex=False)


full_mammogram_images = _local_image_paths('full mammogram images')
cropped_images = _local_image_paths('cropped images')
roi_mask_images = _local_image_paths('ROI mask images')
full_mammogram_images.head()

In [None]:
# Working copy of the DICOM metadata whose image paths resolve under image_dir;
# dicom_data itself is left untouched.
dicom_cleaning_data = dicom_data.assign(
    image_path=dicom_data['image_path'].str.replace('CBIS-DDSM/jpeg/', image_dir)
)

In [None]:
# Drop DICOM metadata columns from the working frame.
# ('PatientBirthDate' was previously listed twice.)
unused_dicom_columns = [
    'PatientBirthDate', 'AccessionNumber', 'Columns', 'ContentDate', 'ContentTime',
    'PatientSex', 'ReferringPhysicianName', 'Rows', 'SOPClassUID', 'SOPInstanceUID',
    'StudyDate', 'StudyID', 'StudyInstanceUID', 'StudyTime', 'InstanceNumber',
    'SeriesInstanceUID', 'SeriesNumber',
]
# Rebind instead of mutating in place, and ignore already-dropped columns, so that
# re-running the cell does not raise a KeyError.
dicom_cleaning_data = dicom_cleaning_data.drop(columns=unused_dicom_columns, errors='ignore')
dicom_cleaning_data.info()

In [None]:
# Fill missing values by assigning the result back to the column. Calling
# fillna(..., inplace=True) on a column selection is chained assignment: it is not
# guaranteed to update the frame (and does not under pandas Copy-on-Write).
# NOTE(review): missing SeriesDescription values are filled with the integer 0 in an
# otherwise textual column - confirm this placeholder is intended.
dicom_cleaning_data['SeriesDescription'] = dicom_cleaning_data['SeriesDescription'].fillna(0)
# Series.bfill() replaces the deprecated fillna(method='bfill').
dicom_cleaning_data['Laterality'] = dicom_cleaning_data['Laterality'].bfill()

# Remaining missing values per column.
dicom_cleaning_data.isna().sum()

In [None]:
# Encode the pathology column of every case table as an integer class label.
label_mapping = {'BENIGN': 0, 'MALIGNANT': 1, 'BENIGN_WITHOUT_CALLBACK': 2}
for case_table in (calc_train, calc_test, mass_train, mass_test):
    # Pathology values absent from label_mapping become NaN.
    case_table['label'] = case_table['pathology'].map(label_mapping)

In [None]:
# Functions to load and process mammograms, covering all 3 image types
# (full mammogram, cropped images, ROI mask).

# Copy of the raw DICOM metadata with image paths taken from dicom_cleaning_data.
# NOTE(review): dicom_cleaning_data's paths were already rewritten when it was
# created, so this replace presumably matches nothing - confirm it is still needed.
dicom_model = dicom_data.assign(
    image_path=dicom_cleaning_data['image_path'].str.replace('CBIS-DDSM/jpeg/', image_dir)
)

# image loading and processing fxn to numpy array
def load_and_process_image(image_path, target_size=(224, 224)):
    """Load an image file as a grayscale array scaled to the [0, 1] range.

    Args:
        image_path: Path of the image file to read.
        target_size: (height, width) the image is resized to on load.

    Returns:
        Array of shape ``target_size + (1,)`` with pixel values divided by 255.
    """
    # load_img / img_to_array were never imported in this notebook; reach them
    # through the existing `tf` import instead of relying on undefined names.
    image = tf.keras.utils.load_img(image_path, target_size=target_size, color_mode="grayscale")
    return tf.keras.utils.img_to_array(image) / 255.0


def _first_image_path(file_path, series_description):
    """Return the first image path of the given series type for the path's patient, or None.

    The patient identifier is the first '/'-separated component of ``file_path``;
    it is compared against the PatientName column of ``dicom_cleaning_data``.
    """
    if not isinstance(file_path, str):
        # Missing paths (e.g. NaN read from the CSV) cannot be matched.
        return None
    patient_id = file_path.split('/')[0]
    is_match = ((dicom_cleaning_data['SeriesDescription'] == series_description)
                & (dicom_cleaning_data['PatientName'] == patient_id))
    candidates = dicom_cleaning_data.loc[is_match, 'image_path']
    return None if candidates.empty else candidates.iloc[0]


def match1(file_path):
    """Return the 'full mammogram images' path matching ``file_path``, or None."""
    return _first_image_path(file_path, 'full mammogram images')


def match2(file_path):
    """Return the 'cropped images' path matching ``file_path``, or None."""
    return _first_image_path(file_path, 'cropped images')


def match3(file_path):
    """Return the 'ROI mask images' path matching ``file_path``, or None."""
    return _first_image_path(file_path, 'ROI mask images')

# data loading
def load_data(df):
    """Load the full, cropped and ROI images plus labels for the rows of ``df``.

    A row is kept only when all three image paths can be matched (match1,
    match2, match3) and every matched file exists on disk; other rows are
    skipped silently.

    Returns:
        Tuple ``(full_imgs, cropped_imgs, roi_imgs, labels)`` of numpy arrays
        holding one entry per kept row, in row order.
    """
    full_imgs, cropped_imgs, roi_imgs, labels = [], [], [], []

    for _, record in df.iterrows():
        # Resolve the three paths in order and give up on the first miss.
        full_path = match1(record['image file path'])
        if full_path is None:
            continue
        crop_path = match2(record['cropped image file path'])
        if crop_path is None:
            continue
        mask_path = match3(record['ROI mask file path'])
        if mask_path is None:
            continue

        # Every matched file must actually be present before anything is loaded.
        if not all(os.path.exists(path) for path in (full_path, crop_path, mask_path)):
            continue

        full_imgs.append(load_and_process_image(full_path))
        cropped_imgs.append(load_and_process_image(crop_path))
        roi_imgs.append(load_and_process_image(mask_path))
        labels.append(record['label'])

    return np.array(full_imgs), np.array(cropped_imgs), np.array(roi_imgs), np.array(labels)

In [None]:
# First, for calc_train - identify and remove duplicate image file paths.
# Only the last expression of a cell is displayed, so the first count must be
# printed explicitly (it was silently discarded before).
print(calc_train['image file path'].nunique())
calc_train_model = calc_train.drop_duplicates(subset=['image file path']).reset_index(drop=True)
calc_train_model['image file path'].nunique()

In [None]:
# mass_train: keep the first row for each distinct image file path.
print(mass_train['image file path'].nunique())
mass_train_model = mass_train.drop_duplicates(subset='image file path', ignore_index=True)
mass_train_model['image file path'].nunique()

In [None]:
# mass_test: keep the first row for each distinct image file path.
print(mass_test['image file path'].nunique())
mass_test_model = mass_test.drop_duplicates(subset='image file path', ignore_index=True)
mass_test_model['image file path'].nunique()

In [None]:
# calc_test: keep the first row for each distinct image file path.
print(calc_test['image file path'].nunique())
calc_test_model = calc_test.drop_duplicates(subset='image file path', ignore_index=True)
calc_test_model['image file path'].nunique()

In [None]:
# Summarise each de-duplicated case table, separated by blank lines.
# (The separator was previously the literal text "/n" instead of a newline.)
for position, case_table in enumerate(
        (calc_train_model, mass_train_model, calc_test_model, mass_test_model)):
    if position:
        print("\n")
    case_table.info()

In [None]:
# Spot-check the three path matchers and the label on a single row.
sample_row = 1000
print(match1(calc_train_model.loc[sample_row, 'image file path']))
print(match2(calc_train_model.loc[sample_row, 'cropped image file path']))
print(match3(calc_train_model.loc[sample_row, 'ROI mask file path']))
calc_train_model.loc[sample_row, 'label']

In [None]:
# Load the calcification training images and labels. (The empty-list placeholders
# that used to precede this call were overwritten immediately and served no purpose.)
x_calc_full_train, x_calc_cropped_train, x_calc_roi_train, y_calc_train = load_data(calc_train_model)

x_calc_full_train.shape

In [None]:
# Hold out every calc sample after the first CALC_TRAIN_SIZE as the calc test split.
# NOTE(review): calc_test_model is prepared in an earlier cell but is never loaded in
# the cells visible here - confirm that carving the test split out of the training
# data (instead of loading the calc test set) is intentional.
CALC_TRAIN_SIZE = 1000

# Re-running this cell used to slice the already-truncated training arrays and
# silently replace the test split with empty arrays. Split only while the training
# arrays still hold more than CALC_TRAIN_SIZE samples, or when no split exists yet.
if len(x_calc_full_train) > CALC_TRAIN_SIZE or 'x_calc_full_test' not in globals():
    x_calc_full_test = x_calc_full_train[CALC_TRAIN_SIZE:]
    x_calc_cropped_test = x_calc_cropped_train[CALC_TRAIN_SIZE:]
    x_calc_roi_test = x_calc_roi_train[CALC_TRAIN_SIZE:]
    y_calc_test = y_calc_train[CALC_TRAIN_SIZE:]

    x_calc_full_train = x_calc_full_train[:CALC_TRAIN_SIZE]
    x_calc_cropped_train = x_calc_cropped_train[:CALC_TRAIN_SIZE]
    x_calc_roi_train = x_calc_roi_train[:CALC_TRAIN_SIZE]
    y_calc_train = y_calc_train[:CALC_TRAIN_SIZE]

In [None]:
# Display the shape of the calc ROI training array.
x_calc_roi_train.shape

In [None]:
# Load the mass training images and labels. (The empty-list placeholders that used
# to precede this call were overwritten immediately and served no purpose.)
x_mass_full_train, x_mass_cropped_train, x_mass_roi_train, y_mass_train = load_data(mass_train_model)
x_mass_cropped_train.shape

In [None]:
# Load the mass test images and labels. (The empty-list placeholders that used to
# precede this call were overwritten immediately and served no purpose.)
x_mass_full_test, x_mass_cropped_test, x_mass_roi_test, y_mass_test = load_data(mass_test_model)
x_mass_cropped_test.shape

In [None]:
# Stack the calc and mass training samples (calc first) along the sample axis.
x_full = np.concatenate((x_calc_full_train, x_mass_full_train))
x_cropped = np.concatenate((x_calc_cropped_train, x_mass_cropped_train))
x_roi = np.concatenate((x_calc_roi_train, x_mass_roi_train))
y = np.concatenate((y_calc_train, y_mass_train))


In [None]:
# Stack the calc and mass test samples (calc first) along the sample axis.
x_full_test = np.concatenate((x_calc_full_test, x_mass_full_test))
x_cropped_test = np.concatenate((x_calc_cropped_test, x_mass_cropped_test))
x_roi_test = np.concatenate((x_calc_roi_test, x_mass_roi_test))
y_test = np.concatenate((y_calc_test, y_mass_test))

In [None]:
# Check the shapes of the combined arrays, one per line.
# (The f-strings previously contained the literal text "/n" instead of a newline.)
for array_name, array in (
        ("x_full", x_full), ("x_cropped", x_cropped), ("x_roi", x_roi), ("y", y),
        ("x_full_test", x_full_test), ("x_cropped_test", x_cropped_test),
        ("x_roi_test", x_roi_test), ("y_test", y_test)):
    print(f"{array_name} shape: {array.shape}")

In [None]:
# One-hot encode the integer class labels.
from tensorflow.keras.utils import to_categorical  # was misspelled 'tensoflow'

NUM_CLASSES = 3  # BENIGN, MALIGNANT, BENIGN_WITHOUT_CALLBACK

# Encode only while the labels are still a 1-D vector of class ids, so that
# re-running the cell does not one-hot encode already encoded labels.
if y.ndim == 1:
    y = to_categorical(y, num_classes=NUM_CLASSES)
if y_test.ndim == 1:
    y_test = to_categorical(y_test, num_classes=NUM_CLASSES)

In [None]:
from tensorflow.keras.layers import Input, Conv2D, MaxPooling2D, Flatten, Dense, Dropout, BatchNormalization, concatenate
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2
from tensorflow.keras.callbacks import ReduceLROnPlateau

In [None]:
def _conv_branch(branch_input):
    """Conv -> BatchNorm -> MaxPool -> Dropout feature extractor for one image input."""
    features = Conv2D(32, (3, 3), activation='relu', padding='same')(branch_input)
    features = BatchNormalization()(features)
    features = MaxPooling2D(pool_size=(2, 2))(features)
    return Dropout(0.25)(features)


def build_multi_input_model(input_shape = (224, 224, 3)):
    """Build and compile a three-input CNN classifier with a 3-class softmax output.

    Each input ('full_input', 'cropped_input', 'roi_input') goes through its own
    convolutional branch; the branch outputs are concatenated, flattened and fed
    to two L2-regularised dense layers.

    Args:
        input_shape: Shape (height, width, channels) shared by all three inputs.
            NOTE(review): load_and_process_image in this notebook reads images in
            grayscale (one channel); pass input_shape=(224, 224, 1) when feeding
            those arrays - the default expects three channels.

    Returns:
        A compiled Keras Model whose output has shape (batch, 3).
    """
    full_input = Input(shape=input_shape, name='full_input')
    cropped_input = Input(shape=input_shape, name='cropped_input')
    roi_input = Input(shape=input_shape, name='roi_input')
    inputs = [full_input, cropped_input, roi_input]

    merged = concatenate([_conv_branch(branch_input) for branch_input in inputs])

    # Dense only transforms the last axis, so without flattening the 4-D feature
    # map the model output was (batch, H/2, W/2, 3) and could not be trained
    # against one-hot labels of shape (batch, 3).
    flattened = Flatten()(merged)

    # fully connected layers with L2 regularization
    x = Dense(256, activation='relu', kernel_regularizer=l2(0.001))(flattened)
    x = Dropout(0.5)(x)
    x = Dense(128, activation='relu', kernel_regularizer=l2(0.001))(x)
    x = Dropout(0.4)(x)

    # Output layer
    output = Dense(3, activation='softmax')(x)

    model = Model(inputs=inputs, outputs=output)

    # Compile model with Adam optimizer and custom learning rate
    model.compile(optimizer=Adam(learning_rate=1e-4), loss='categorical_crossentropy', metrics=['accuracy'])

    return model