In [5]:
import torch
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
import torchvision.transforms as transforms
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.inspection import DecisionBoundaryDisplay
from sklearn.utils import shuffle
import sklearn.feature_selection
from sklearn import metrics
import gc
import os
import sys

In [6]:
from my_modules.nsclc import NSCLCDataset
import torchvision.transforms.v2 as tvt
# Prepare data
data = NSCLCDataset('E:/NSCLC_Data_for_ML',
                    ['orr', 'g', 's', 'photons', 'tau1', 'tau2', 'alpha1', 'alpha2', 'taumean', 'boundfraction'],
                    device='cpu', label='Metastases', mask_on=False)
data.normalize_channels('preset')
data.transforms = tvt.Compose([tvt.RandomVerticalFlip(p=0.25),
                               tvt.RandomHorizontalFlip(p=0.25),
                               tvt.RandomRotation(degrees=(-180, 180))])
data.augment()

In [7]:
from sklearn.neighbors import KNeighborsClassifier

names = [
    "Nearest Neighbors",
    "Linear SVM",
    "RBF SVM",
    "Gaussian Process",
    "Decision Tree",
    "Random Forest",
    "Neural Net",
    "AdaBoost",
    "Naive Bayes",
    "QDA",
]

classifiers = [
    KNeighborsClassifier(3),
    SVC(kernel="linear", C=0.025),
    SVC(gamma=2, C=1),
    GaussianProcessClassifier(1.0 * RBF(1.0)),
    DecisionTreeClassifier(max_depth=5),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    MLPClassifier(alpha=1, max_iter=1000),
    AdaBoostClassifier(),
    GaussianNB(),
    QuadraticDiscriminantAnalysis(),
]

In [28]:
import random

# Prepare folded data samplers
# Get random indices of patients
subsampler = torch.utils.data.sampler.SubsetRandomSampler(range(data.patient_count))
idx = [i for i in subsampler]

# Get the image indices for all patients as nested lists
patient_subsets = [data.get_patient_subset(i) for i in idx]

# Find and remove any patients with no image indices
idx_for_removal = []
for i, subset in enumerate(patient_subsets):
    if len(subset) == 0:
        idx_for_removal.append(idx[i])
for ix in idx_for_removal:
    idx.remove(ix)

# Get labels for all remaining patients
labels = [data.get_patient_label(i).item() for i in idx]

# Separate 0 and 1 labels (still shuffled)
shuffled_zeros = [i for i, l in zip(idx, labels) if l == 0]
shuffled_ones = [i for i, l in zip(idx, labels) if l == 1]
print('Number of non-metastatic patients: {}'.format(len(shuffled_ones)))
print('Number of metastatic patients: {}'.format(len(shuffled_zeros)))

train_subjects = shuffled_ones[3:] + shuffled_zeros[3:]
train_subsets = [data.get_patient_subset(i) for i in train_subjects]  # Get all patient indices
train_indices = [i for sub in train_subsets for i in sub]  # Un-nest
random.shuffle(train_indices)

test_subjects = shuffled_zeros[:3] + shuffled_ones[:3]  # Get a set of patients from both classes
test_subsets = [data.get_patient_subset(i) for i in test_subjects]  # Get all patient indices
test_indices = [i for sub in test_subsets for i in sub]  # Un-nest
random.shuffle(test_indices)

x_train = np.empty(((len(train_indices),) + data.shape))
y_train = np.empty(len(train_indices))
for i, idx in enumerate(train_indices):
    x_train[i] = data[idx][0].numpy()
    y_train[i] = data[idx][1].item()

x_test = np.empty(((len(test_indices),) + data.shape))
y_test = np.empty(len(test_indices))
for i, idx in enumerate(test_indices):
    x_test[i] = data[idx][0].numpy()
    y_test[i] = data[idx][1].item()

Number of non-metastatic patients: 12
Number of metastatic patients: 11


In [36]:
print(x_train.shape, y_train.shape)
x_train = x_train.reshape((x_train.shape[0], -1))
x_train[np.isnan(x_train)] = 0
x_test = x_test.reshape((x_test.shape[0], -1))
x_test[np.isnan(x_test)] = 0
print(x_train.shape)

(485, 655360) (485,)
(485, 655360)


In [39]:
# Fit each model and test
score = {}
for name, clf in zip(names, classifiers):
    pipe = make_pipeline(StandardScaler(), clf)
    pipe.fit(x_train, y_train)
    score[name] = pipe.score(x_test, y_test)
    print(score[name])

0.4222222222222222
0.4666666666666667
0.5
0.5
0.5166666666666667
0.45
0.6111111111111112




0.45555555555555555
0.3611111111111111




0.48333333333333334


In [40]:
print(score)

{'Nearest Neighbors': 0.4222222222222222, 'Linear SVM': 0.4666666666666667, 'RBF SVM': 0.5, 'Gaussian Process': 0.5, 'Decision Tree': 0.5166666666666667, 'Random Forest': 0.45, 'Neural Net': 0.6111111111111112, 'AdaBoost': 0.45555555555555555, 'Naive Bayes': 0.3611111111111111, 'QDA': 0.48333333333333334}


In [46]:
from sklearn.decomposition import PCA
pca = PCA(n_components=data.stack_height)
pca.fit(x_train)
print(pca.explained_variance_ratio_)
print(pca.singular_values_)

[0.30460095 0.10698153 0.07496978 0.02175497 0.02089734 0.01909765
 0.0166035  0.01534505 0.01344136 0.01224451]
[5300256.38409434 3141128.08711531 2629507.22853761 1416480.27078047
 1388279.12398873 1327153.76970146 1237459.83259843 1189639.50428946
 1113404.08142886 1062678.81823199]


MemoryError: Unable to allocate 3.12 TiB for an array with shape (655360, 655360) and data type float64

In [51]:
# Fit each model and test
score = {}
for name, clf in zip(names, classifiers):
    pipe = make_pipeline(StandardScaler(), PCA(n_components=data.stack_height), clf)
    pipe.fit(x_train, y_train)
    score[name] = pipe.score(x_test, y_test)
    print(name, score[name])

Nearest Neighbors 0.4777777777777778
Linear SVM 0.46111111111111114
RBF SVM 0.5
Gaussian Process 0.5
Decision Tree 0.40555555555555556
Random Forest 0.37222222222222223
Neural Net 0.5333333333333333




AdaBoost 0.48333333333333334
Naive Bayes 0.46111111111111114
QDA 0.45555555555555555


In [50]:
# Fit each model and test
score = {}
for name, clf in zip(names, classifiers):
    pipe = make_pipeline(StandardScaler(), PCA(n_components=3), clf)
    pipe.fit(x_train, y_train)
    score[name] = pipe.score(x_test, y_test)
    print(name, score[name])

Nearest Neighbors 0.4222222222222222
Linear SVM 0.5
RBF SVM 0.5
Gaussian Process 0.42777777777777776
Decision Tree 0.3888888888888889
Random Forest 0.45555555555555555
Neural Net 0.42777777777777776




AdaBoost 0.46111111111111114
Naive Bayes 0.4722222222222222
QDA 0.5777777777777777


In [None]:
# Fit each model and test
score = {}
for name, clf in zip(names, classifiers):
    pipe = make_pipeline(StandardScaler(), PCA(n_components=3), clf)
    pipe.fit(x_train, y_train)
    score[name] = pipe.score(x_test, y_test)
    print(name, score[name])