In [1]:
import os
import pydicom
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from skimage.transform import resize
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
import torch
from torchvision.transforms import Resize

### `Creating 2 functions, grabTrainingImages and grabTestingImages, both will be used to obtain image data`

In [32]:
# Check if CUDA is available and set the device accordingly
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Define a resize transform
resizeTransform = Resize((128, 128))
trainImages = []
trainLabels = []

def grabTrainingImages(rootFolder, subsetAmount=0) -> None:
    """Grabs all the training images up to an amount

    Args:
        rootFolder (string): The root folder to start the grab from
        subsetAmount (int): How many folders of mammograms to process
        
    This function, at least in our case, will never process all folders since it takes roughly 10
    hours to process all 60,000 mammograms. Therefore, a subsetAmount can be passed to allow a 
    certain amount to be processed. If no subsetAmount is passed, the function will process all.
    """
    
    count = 1
    
    # Iterate through each folder in the root folder
    for folderName in os.listdir(rootFolder):
        folderPath = os.path.join(rootFolder, folderName)

        # Check if the current item is a folder
        if os.path.isdir(folderPath):
            print(f"Processing Folder Number {count}: {folderName}")
            # Call the function recursively to process subfolders
            grabTrainingImages(folderPath, subsetAmount)
            
            for filename in os.listdir(folderPath):                
                if filename.endswith(".dcm"):                    
                    ds = pydicom.dcmread(os.path.join(folderPath, filename))
                    image = ds.pixel_array.astype(float)
                    
                    # Convert the image to a PyTorch tensor and move it to the GPU if available
                    imgTensor = torch.tensor(image).unsqueeze(0).to(device)
                    # Resize the image
                    imgTensor = resizeTransform(imgTensor)
                    # Flatten to create feature vector
                    trainImages.append(imgTensor.view(-1).cpu().numpy())


                    # To view the image, uncomment next line
                    # plt.imshow(image, cmap='gray'); plt.title(f"Patient ID: {folderName}"); plt.show()
            
            # If count reaches subsetAmount, it breaks the loop
            # If no subsetAmount is passed, it will never break since count start at 1, and subset is 0
            if count == subsetAmount:
                break
            
            # Print count either way to show progress
            count += 1


In [33]:
# Check if CUDA is available and set the device accordingly
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Define a resize transform
resizeTransform = Resize((128, 128))
testImages = []
testLabels = []

def grabTestingImages(rootFolder) -> None:
    """Function for grabbing testing images

    Args:
        rootFolder (string): The root folder to start the grab from
        
    Although similar to grabTrainingImages, this function lacks the subset amount since the
    amount of testing image folders is not large, therefore a subset is never needed.
    Also, incorporating a case for if the folder is train_images or test_images would result
    in a lot of repeated code, so this function is created instead.
    """
    
    # Iterate through each folder in the root folder
    for folderName in os.listdir(rootFolder):
        folderPath = os.path.join(rootFolder, folderName)

        # Check if the current item is a folder
        if os.path.isdir(folderPath):
            print(f"Processing Folder: {folderName}")
            
            # Call the function recursively to process subfolders
            grabTestingImages(folderPath)
                        
            for filename in os.listdir(folderPath):                
                if filename.endswith(".dcm"):                    
                    ds = pydicom.dcmread(os.path.join(folderPath, filename))
                    image = ds.pixel_array.astype(float)
                    
                    # Convert the image to a PyTorch tensor and move it to the GPU if available
                    imgTensor = torch.tensor(image).unsqueeze(0).to(device)
                    # Resize the image
                    imgTensor = resizeTransform(imgTensor)
                    # Flatten to create feature vector
                    testImages.append(imgTensor.view(-1).cpu().numpy())


### `Running the functions to grab the needed data`

In [35]:
# Specify the root folder with or without subfolders 
trainImagesPath = "C:\\Users\\vange\\OneDrive - Tennessee Tech University\\Desktop\\Fall 23, Spring 24\\4260\\mammography\\train_images"
testImagesPath = "C:\\Users\\vange\\OneDrive - Tennessee Tech University\\Desktop\\Fall 23, Spring 24\\4260\\mammography\\test_images"

# Change this to the amount of folders you want to process
subsetAmount = 20

# Used for grabbing training data (many folders, needs subset, if you don't pass a subset amount, it will process all folders)
grabTrainingImages(trainImagesPath, subsetAmount)

# Used for grabbing testing data (less folders, no need for subset)
grabTestingImages(testImagesPath)

Processing Folder Number 1: 10006
Processing Folder Number 2: 10011
Processing Folder Number 3: 10025
Processing Folder Number 4: 10038
Processing Folder Number 5: 10042
Processing Folder Number 6: 10048
Processing Folder Number 7: 10049
Processing Folder Number 8: 10050
Processing Folder Number 9: 10051
Processing Folder Number 10: 10086
Processing Folder Number 11: 10095
Processing Folder Number 12: 10097
Processing Folder Number 13: 10102
Processing Folder Number 14: 10106
Processing Folder Number 15: 10116
Processing Folder Number 16: 10119
Processing Folder Number 17: 10122
Processing Folder Number 18: 10124
Processing Folder Number 19: 10126
Processing Folder Number 20: 10130
Processing Folder: 10008


### `Convert the images to a numpy array, this allows the svm to learn based off`
### `the patterns in the images related to cancer`

In [34]:
xTrain = np.array(trainImages)
xTest = np.array(testImages)

trainData = pd.read_csv("C:\\Users\\vange\\OneDrive - Tennessee Tech University\\Desktop\\Fall 23, Spring 24\\4260\\mammography\\Mammography\\train.csv")
testData = pd.read_csv("C:\\Users\\vange\\OneDrive - Tennessee Tech University\\Desktop\\Fall 23, Spring 24\\4260\\mammography\\Mammography\\test.csv")

# yTrain is the label -> cancer
# Change number after : to 1 less than the number of xTrain.shape[1]
yTrain = np.array(trainData.loc[:93, "cancer"])

# What labels will be used for yTest?
# yTest = np.array(testData.loc[:,])

print(f"xTrain.shape: {xTrain.shape}, yTrain.shape: {yTrain.shape}")
print(f"xTest.shape: {xTest.shape}") # yTest.shape: {yTest.shape}

xTrain.shape: (0,), yTrain.shape: (94,)
xTest.shape: (0,)


#### `Following, we train the svm using a linear kernel since we are trying to find if a breast is cancerous`
#### `or not, meaning a linearlly seperable answer, being cancerous or non cancerous. Then, we predict using`
#### `the test data from above, and then that will be used on the yTest data to check for accuracy`

In [18]:
# Train SVM model
svm = SVC(kernel='linear')
svm.fit(xTrain, yTrain)

In [19]:
# Predict on the xTest data and store it in yPred for scoring later
yPred = svm.predict(xTest)

In [21]:
# Evaluate model

# Not quite working yet!

accuracy = accuracy_score(yTest, yPred)
print(f"Accuracy: {accuracy}")

ValueError: Classification metrics can't handle a mix of continuous-multioutput and binary targets

## `Once the model works properly, I wish to add visualizations below`