In [2]:
import os
import math
import keras
import pydicom
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
from tensorflow.keras import Sequential
from tensorflow.keras.utils import Sequence
from sklearn.preprocessing import minmax_scale
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import (Input, 
                                     Conv2D,
                                     GlobalMaxPooling2D,
                                     MaxPooling2D,
                                     Dropout, 
                                     Dense)

Reference: `tf.keras.utils.PyDataset` API documentation — https://www.tensorflow.org/api_docs/python/tf/keras/utils/PyDataset

In [10]:
# (height, width, channels) every image is resized to before it is handed to the network
imgShape = (256, 256, 1)

# Root folder of the dataset; the batch generator looks for "<folder>/<train|test>_images/..." below it.
# Set the MAMMOGRAPHY_DIR environment variable to run the notebook on another machine without
# editing this cell; the original local path is kept as the fallback.
folder = os.environ.get(
    'MAMMOGRAPHY_DIR',
    '/mnt/c/Users/vange/OneDrive - Tennessee Tech University/Desktop/Fall 23, Spring 24/4260/mammography',
)


class BatchCreator(Sequence):
    """Keras `Sequence` that reads DICOM images from disk one batch at a time.

    Each row of `data` must provide `patient_id` and `image_id`; the file is read from
    `<folder>/<train|test>_images/<patient_id>/<image_id>.dcm`. In train mode the row
    must also provide the `cancer` label.

    Parameters
    ----------
    data : pandas.DataFrame
        One row per image.
    dataType : int
        1 selects train mode (batches are `(images, labels)`); any other value selects
        test mode (batches are `images` only).
    batchSize : int
        Maximum number of images per batch; the last batch may be smaller.
    workers : int
        Accepted for backward compatibility.
        NOTE(review): this value is never forwarded to the base class - confirm whether
        it was meant to control parallel loading.
    **kwargs
        Passed through to the base-class constructor.
    """

    def __init__(self, data, dataType, batchSize, workers = 8, **kwargs):
        super().__init__(**kwargs)
        self.data = data
        self.batchSize = batchSize
        self.dataType = 'train' if dataType == 1 else 'test'

    # returns the number of batches in the dataset
    def __len__(self):
        return math.ceil(len(self.data) / self.batchSize)

    def __getitem__(self, index):
        """Return batch number `index`.

        Returns `(images, labels)` in train mode and `images` in test mode, where `images`
        has shape `(n, ) + imgShape`, `labels` has shape `(n, 1)` and `n` is the number of
        rows that actually belong to this batch.

        Raises
        ------
        IndexError
            If `index` is outside `[0, len(self))`.
        """
        if not 0 <= index < len(self):
            raise IndexError(f"batch index {index} is out of range for {len(self)} batches")

        # first and one-past-last row of this batch
        start = index * self.batchSize
        end = min(start + self.batchSize, len(self.data))

        # Size the arrays by the real number of rows: padding the last batch up to batchSize
        # would feed all-black images labelled 0 into training and evaluation.
        rowCount = end - start
        inputImages = np.zeros((rowCount, ) + imgShape)
        labels = np.zeros((rowCount, 1))

        for i, idx in enumerate(range(start, end)):
            # read each value from its own column so the column dtype is preserved
            patientID = self.data['patient_id'].iloc[idx]
            imgID = self.data['image_id'].iloc[idx]

            fileName = os.path.join(folder, f"{self.dataType}_images", str(patientID), f"{imgID}.dcm")
            inputImages[i, :, :, 0] = self._loadImage(fileName)

            if self.dataType == 'train':
                labels[i] = self.data['cancer'].iloc[idx]

        if self.dataType == 'train':
            return (inputImages, labels)

        return inputImages

    @staticmethod
    def _loadImage(fileName):
        """Read one DICOM file and return its pixels as a 2-D array in [0, 1] of size `imgShape[:-1]`."""
        dicom = pydicom.dcmread(fileName)
        pixels = dicom.pixel_array.astype(np.float32)

        # Normalise with the min/max of the whole image. sklearn's minmax_scale defaults to
        # axis=0 and would rescale every pixel column on its own, distorting the image.
        lowest = pixels.min()
        highest = pixels.max()
        if highest > lowest:
            pixels = (pixels - lowest) / (highest - lowest)
        else:
            # constant image: nothing to scale, avoid dividing by zero
            pixels = np.zeros_like(pixels)

        # MONOCHROME1 stores 0 as white; invert so that, as in MONOCHROME2, 0 is black and
        # bright areas have high values.
        if dicom.PhotometricInterpretation == "MONOCHROME1":
            pixels = 1 - pixels

        # resize with nearest-neighbor interpolation (tf expects a channel axis) and drop the channel again
        resized = tf.image.resize(images = np.expand_dims(pixels, axis = -1), size = imgShape[:-1], method = 'nearest').numpy()
        return resized[:, :, 0]

In [4]:
# train.csv sits in the dataset root used by the batch generator
trainData = pd.read_csv(os.path.join(folder, "train.csv"))

# counts each unique cancer value
cancerCount = trainData.cancer.value_counts()

# Balance the classes by undersampling: every class is cut down to the size of the smallest
# one, so all rows of the rarer class are kept and the same number is drawn from the other.
# A fixed random_state makes the drawn subset identical on every run.
samplesPerClass = cancerCount.min()
newTrainData = [
    trainData[trainData['cancer'] == label].sample(samplesPerClass, random_state = 1)
    for label in cancerCount.index
]

trainDF = pd.concat(newTrainData)

# gives 2316: 1158 rows labelled cancer plus 1158 sampled rows labelled non-cancer
len(trainDF)

2316

In [11]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the balanced rows for validation. Stratifying on the label keeps the
# cancer / non-cancer ratio identical in both splits instead of leaving it to chance.
# NOTE(review): the split is row-wise, so rows sharing a patient_id can land in both
# splits - consider a group-aware split if that leakage matters.
xTrain, xVal = train_test_split(
    trainDF,
    test_size = 0.20,
    random_state = 1,
    stratify = trainDF['cancer'],
)
len(xTrain), len(xVal)

(1852, 464)

In [12]:
# One generator per split; dataType = 1 selects train mode so every batch comes back as
# (images, labels), and each batch holds up to 15 images.
gendTrain, gendVal = (
    BatchCreator(data = split, dataType = 1, batchSize = 15)
    for split in (xTrain, xVal)
)

## TODO: try a dropout rate of 0.35–0.4 and compare it with 0.3; a rate of 0.5 did not work well

In [8]:
imgShape = (256, 256, 1)

# Filter counts of the (first, second) convolution in each of the three convolutional stages
convStages = ((32, 64), (64, 128), (128, 256))

model = Sequential()
model.add(Input(shape = imgShape))

# Every stage is: two 5x5 ReLU convolutions -> max pooling (downsamples, which cuts computation)
# -> dropout (zeroes 30% of the activations at random to fight overfitting)
for firstFilters, secondFilters in convStages:
    model.add(Conv2D(firstFilters, (5, 5), activation = "relu"))
    model.add(Conv2D(secondFilters, (5, 5), activation = "relu"))
    model.add(MaxPooling2D())
    model.add(Dropout(0.3))

# Collapse each feature map to its maximum value before the dense head
model.add(GlobalMaxPooling2D())
model.add(Dropout(0.3))

# Dense layer learns non-linear combinations of the pooled features
model.add(Dense(32, activation = 'relu'))
model.add(Dropout(0.3))

# Single sigmoid unit: output layer for binary classification
model.add(Dense(1, activation = 'sigmoid'))

# Recall and precision are tracked at three decision thresholds to compare performance
model.compile(
    optimizer = Adam(learning_rate = 0.0005),
    loss = 'binary_crossentropy',
    metrics = [
        tf.keras.metrics.BinaryAccuracy(threshold = 0.5),
        tf.keras.metrics.Recall(thresholds = [0.4, 0.5, 0.6]),
        tf.keras.metrics.Precision(thresholds = [0.4, 0.5, 0.6]),
    ],
)

model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d_6 (Conv2D)           (None, 252, 252, 32)      832       
                                                                 
 conv2d_7 (Conv2D)           (None, 248, 248, 64)      51264     
                                                                 
 max_pooling2d_3 (MaxPoolin  (None, 124, 124, 64)      0         
 g2D)                                                            
                                                                 
 dropout_5 (Dropout)         (None, 124, 124, 64)      0         
                                                                 
 conv2d_8 (Conv2D)           (None, 120, 120, 64)      102464    
                                                                 
 conv2d_9 (Conv2D)           (None, 116, 116, 128)     204928    
                                                      

In [None]:
# How to read the metrics reported for each epoch:
#   loss            - how far the predictions are from the actual values; lower is better
#   binary accuracy - share of predictions that are correct
#   precision       - TP / (TP + FP): of everything predicted positive, how much really is positive
#   recall          - TP / (TP + FN): of all real positives, how many the model found
#   f1 score        - balance between precision and recall; higher is better

history = model.fit(
    x = gendTrain,
    validation_data = gendVal,
    epochs = 4,
)

In [None]:
# Keras may append a running counter to the metric names (precision_3, recall_3, ...), so the
# training keys are looked up by prefix instead of being hard-coded. Validation keys start
# with "val_" and therefore never match.
precisionKey = next(key for key in history.history if key.startswith('precision'))
recallKey = next(key for key in history.history if key.startswith('recall'))

# One entry per epoch; every entry holds one value per decision threshold configured in compile()
precisions = np.asarray(history.history[precisionKey], dtype = float)
recalls = np.asarray(history.history[recallKey], dtype = float)

# F1 = 2 * P * R / (P + R), defined as 0 where P + R == 0 (no true positives) to avoid dividing by zero
denominator = precisions + recalls
f1_scores = np.divide(
    2 * precisions * recalls,
    denominator,
    out = np.zeros_like(denominator),
    where = denominator > 0,
)

# best training F1 over all epochs and thresholds
f1_scores.max()

In [None]:
# One x position per epoch that was actually recorded, so the plot keeps working when the
# number of epochs passed to model.fit changes
epochs = range(1, len(history.history['loss']) + 1)

plt.plot(epochs, history.history['loss'], 'go', label = 'Training Loss')
plt.plot(epochs, history.history['val_loss'], 'b', label = 'Validation Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Training vs. validation loss')
plt.legend()

plt.show()

In [None]:
trainAccuracy = history.history['binary_accuracy']
valAccuracy = history.history['val_binary_accuracy']

# Derive the x axis from the recorded history so this cell does not depend on a fixed epoch count
accuracyEpochs = range(1, len(trainAccuracy) + 1)

plt.plot(accuracyEpochs, trainAccuracy, 'bo', label = 'Training Accuracy')
plt.plot(accuracyEpochs, valAccuracy, 'r', label = 'Validation Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Binary accuracy')
plt.title('Training vs. validation accuracy')
plt.legend()
plt.show()

In [None]:
# Write the model to 'model.keras' (relative to the current working directory)
model.save(filepath = 'model.keras')

In [None]:
# Read the model back from the file written by the save cell, replacing the in-memory `model`
model = keras.models.load_model(filepath = 'model.keras')