In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import pydicom
import os

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler
from PIL import Image
from imblearn.over_sampling import SMOTE
from tensorflow.keras.applications.vgg16 import VGG16
from tensorflow.keras.layers.experimental.preprocessing import RandomFlip, RandomRotation, RandomZoom
from imblearn.over_sampling import RandomOverSampler

import matplotlib.pyplot as plt
%matplotlib inline

pd.set_option('display.max_columns', 500)



# Approach

1. Fine-tune VGG16 on OASIS dataset 
2. Use fine-tuned VGG16 to extract features from ADNI images
3. Pool the feature vectors for each patient into a single vector
4. Concatenate the pooled vector with the clinical/genetic data
5. Train model

# What is in this notebook?
1. Fine-tuned VGG16 model
2. Neural network for ADNI prediction (our task)
3. SVC for ADNI prediction (our task)
4. Image-only neural network for ADNI prediction (our task)

Note that all 4 models in this notebook are trained on data that was passed through the VGG16 feature extractor.

## Fine-tune VGG16 on OASIS dataset

Note: We must retrain a portion of the trunk since we will remove the head we attach here to generate embeddings later on.

In [2]:
NUM_CLASSES = 1
IMG_SIZE = (224,224) # Expected size for VGG16
NUM_EPOCHS = 15
BATCH_SIZE = 64
LR = 0.01

In [3]:
# Load and convert images to .jpeg
img_dir = "/Users/johngalvin/Desktop/OASIS/0/"

for file in os.listdir(img_dir):
    if file.endswith(".JPG") or file.endswith(".jpg"):
        img = Image.open(img_dir + file)
        file_name, file_ext = os.path.splitext(file)
        new_name = file_name + ".jpeg"
        img.save(img_dir + new_name)
        
img_dir = "/Users/johngalvin/Desktop/OASIS/1/"

for file in os.listdir(img_dir):
    if file.endswith(".JPG") or file.endswith(".jpg"):
        img = Image.open(img_dir + file)
        file_name, file_ext = os.path.splitext(file)
        new_name = file_name + ".jpeg"
        img.save(img_dir + new_name)

In [4]:
# Delete the .JPG and.jpg files
img_dir = "/Users/johngalvin/Desktop/OASIS/0/"
for file in os.listdir(img_dir):
    if file.endswith(".JPG") or file.endswith(".jpg"):
        path_to_file = os.path.join("/Users/johngalvin/Desktop/OASIS/0/", file)
        os.remove(path_to_file)
        
img_dir = "/Users/johngalvin/Desktop/OASIS/1/"
for file in os.listdir(img_dir):
    if file.endswith(".JPG") or file.endswith(".jpg"):
        path_to_file = os.path.join("/Users/johngalvin/Desktop/OASIS/1/", file)
        os.remove(path_to_file)

In [5]:
targets = []
arrays = []

img_dir = "/Users/johngalvin/Desktop/OASIS/0/"
for file in os.listdir(img_dir):
    fpath = os.path.join("/Users/johngalvin/Desktop/OASIS/0/", file)
    img = Image.open(fpath).convert("L")  # Convert the image to grayscale
    resized_image = img.resize((224, 224), Image.BILINEAR)  # Resize the image
    resized_array = np.expand_dims(np.array(resized_image, dtype=np.uint8), axis=-1)  # Convert to NumPy array and add channel dimension
    targets.append(0)
    arrays.append(resized_array)

In [6]:
img_dir = "/Users/johngalvin/Desktop/OASIS/1/"
for file in os.listdir(img_dir):
    fpath = os.path.join("/Users/johngalvin/Desktop/OASIS/1/", file)
    img = Image.open(fpath).convert("L")  # Convert the image to grayscale
    resized_image = img.resize((224, 224), Image.BILINEAR)  # Resize the image
    resized_array = np.expand_dims(np.array(resized_image, dtype=np.uint8), axis=-1)  # Convert to NumPy array and add channel dimension
    targets.append(1)
    arrays.append(resized_array)
    
X = np.array(arrays)
y = np.array(targets)

In [7]:
# Balance positive and negative class
y = y[-y.sum()*2:]
X = X[-y.sum()*2:]

In [9]:
# Split data
X_train_set, X_test, y_train_set, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Reduce size to fit in memory
X_train = X_train_set[:6000]
y_train = y_train_set[:6000]
X_val = X_train_set[6000:7000]
y_val = y_train_set[6000:7000]
X_test = X_test[:1000]
y_test = y_test[:1000]

# Scale data
scaler = StandardScaler()

num_samples, height, width, channels = X_train.shape
X_train_reshaped = X_train.reshape(num_samples, -1)
X_train_scaled_2d = scaler.fit_transform(X_train_reshaped)
X_train_scaled = X_train_scaled_2d.reshape(num_samples, height, width, channels)

num_samples, height, width, channels = X_val.shape
X_val_reshaped = X_val.reshape(num_samples, -1)
X_val_scaled_2d = scaler.transform(X_val_reshaped)
X_val_scaled = X_val_scaled_2d.reshape(num_samples, height, width, channels)

num_samples, height, width, channels = X_test.shape
X_test_reshaped = X_test.reshape(num_samples, -1)
X_test_scaled_2d = scaler.transform(X_test_reshaped)
X_test_scaled = X_test_scaled_2d.reshape(num_samples, height, width, channels)

In [10]:
# Add channels (VGG16 expects 3 channels)
X_train_rgb = np.repeat(X_train_scaled, 3, axis=-1)
X_val_rgb = np.repeat(X_val_scaled, 3, axis=-1)
X_test_rgb = np.repeat(X_test_scaled, 3, axis=-1)

In [11]:
def build_base_model():
    
    base_model = VGG16(include_top=False,
                       input_shape= IMG_SIZE + (3,),
                       weights="imagenet")
    base_model.trainable = False
    
    inputs = tf.keras.layers.Input(shape=IMG_SIZE + (3,))
    x = base_model(inputs, training=False)
    x = tf.keras.layers.GlobalAveragePooling2D()(x)
    x = tf.keras.layers.Dense(512, activation="relu")(x)
    x = tf.keras.layers.Dropout(0.2)(x)
    x = tf.keras.layers.Dense(256, activation="relu")(x)
    x = tf.keras.layers.Dropout(0.2)(x)
    outputs = tf.keras.layers.Dense(NUM_CLASSES)(x)
    
    model = tf.keras.Model(inputs, outputs)
    model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
                  optimizer=tf.keras.optimizers.Adam(learning_rate=LR),
                  metrics=["accuracy"])
    
    return model

In [12]:
base = build_base_model()

2023-10-09 13:07:10.945549: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2023-10-09 13:07:10.945618: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


Metal device set to: Apple M2 Pro


In [13]:
base.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 224, 224, 3)]     0         
                                                                 
 vgg16 (Functional)          (None, 7, 7, 512)         14714688  
                                                                 
 global_average_pooling2d (G  (None, 512)              0         
 lobalAveragePooling2D)                                          
                                                                 
 dense (Dense)               (None, 512)               262656    
                                                                 
 dropout (Dropout)           (None, 512)               0         
                                                                 
 dense_1 (Dense)             (None, 256)               131328    
                                                             

In [15]:
tf.config.run_functions_eagerly(True)
base_history = base.fit(X_train_rgb,
                        y_train,
                        validation_data=[X_val_rgb, y_val],
                        epochs=NUM_EPOCHS,
                        batch_size=BATCH_SIZE)



Epoch 1/15


2023-10-09 13:08:20.667055: W tensorflow/tsl/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [18]:
base.trainable = True
early_stopping = tf.keras.callbacks.EarlyStopping(monitor="val_loss",
                                                  patience=3,
                                                  restore_best_weights=True)

base.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
             optimizer=tf.keras.optimizers.Adam(learning_rate=1e-5),
             metrics=["accuracy"])

base_history_2 = base.fit(X_train_rgb,
                          y_train,
                          validation_data=[X_val_rgb, y_val],
                          epochs=10,
                          batch_size=BATCH_SIZE,
                          callbacks=[early_stopping])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [19]:
base.evaluate(X_test_rgb, y_test)



[0.14064964652061462, 0.956000030040741]

In [21]:
feature_extractor = tf.keras.Model(base.input, base.get_layer("global_average_pooling2d").output)

In [22]:
feature_extractor.summary()

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 224, 224, 3)]     0         
                                                                 
 vgg16 (Functional)          (None, 7, 7, 512)         14714688  
                                                                 
 global_average_pooling2d (G  (None, 512)              0         
 lobalAveragePooling2D)                                          
                                                                 
Total params: 14,714,688
Trainable params: 14,714,688
Non-trainable params: 0
_________________________________________________________________


In [23]:
# Save the entire model as a SavedModel.
!mkdir -p feature_extractor
feature_extractor.save("feature_extractor/feature_extractor_model")





INFO:tensorflow:Assets written to: feature_extractor/feature_extractor_model/assets


INFO:tensorflow:Assets written to: feature_extractor/feature_extractor_model/assets


## Generate embeddings for ADNI images

In [2]:
pt_ids = []
pixels = []

directory_path = '/Users/johngalvin/Downloads/ADNI 2'

# Iterate through level 2 subdirectories
for level_2_foldername in os.listdir(directory_path):
    level_2_folder_path = os.path.join(directory_path, level_2_foldername)
    
    if os.path.isdir(level_2_folder_path):
        # Iterate through DICOM files in level 5 (bottom-most level) of each level 2 folder
        for root, _, files in os.walk(level_2_folder_path):
            for file in files:
                try:
                    file_path = os.path.abspath(os.path.join(root, file))
                    
                    # Attempt to read DICOM file
                    dcm = pydicom.dcmread(file_path)
                    
                    # Check if the file has PixelData (to avoid non-image DICOM files)
                    if hasattr(dcm, 'PixelData'):
                        # Append both level 2 folder name and pixel array to the lists
                        pt_ids.append(file[5:15])
                        pixels.append(dcm.pixel_array)
                except Exception as e:
                    # Handle exceptions (e.g., files without 'TransferSyntaxUID')
                    print(f"Error processing file {file_path}: {e}")

# Create a DataFrame from the lists
mri_df = pd.DataFrame({'PTID': pt_ids, 'Pixel Array': pixels})

Error processing file /Users/johngalvin/Downloads/ADNI 2/068_S_0473/.DS_Store: File is missing DICOM File Meta Information header or the 'DICM' prefix is missing from the header. Use force=True to force reading.
Error processing file /Users/johngalvin/Downloads/ADNI 2/068_S_0473/MPRAGE/.DS_Store: File is missing DICOM File Meta Information header or the 'DICM' prefix is missing from the header. Use force=True to force reading.
Error processing file /Users/johngalvin/Downloads/ADNI 2/032_S_0677/.DS_Store: File is missing DICOM File Meta Information header or the 'DICM' prefix is missing from the header. Use force=True to force reading.
Error processing file /Users/johngalvin/Downloads/ADNI 2/032_S_0677/MPRAGE/.DS_Store: File is missing DICOM File Meta Information header or the 'DICM' prefix is missing from the header. Use force=True to force reading.
Error processing file /Users/johngalvin/Downloads/ADNI 2/032_S_0677/MPRAGE/2016-07-22_09_23_31.0/.DS_Store: File is missing DICOM File Met

In [3]:
# Resize image arrays with Bilinear Interpolation
resized_arrays = []

for val in mri_df["Pixel Array"]:
    image = Image.fromarray(val, mode='L')
    resized_image = image.resize((224, 224), Image.BILINEAR)
    resized_array = np.expand_dims(np.array(resized_image, dtype=np.uint8), axis=-1) #TF expects channel dim
    resized_arrays.append(resized_array)
    
mri_df["Pixel Array"] = resized_arrays

In [4]:
# Keep just the first 160 images for each patient (size/speed)
mri_df = mri_df.groupby("PTID").head(160)
mri_df.reset_index(inplace=True)

In [5]:
# Add 3 channels for VGG16 (num_samples, 224, 224, 3) - after running this cell
resized_arrays = []

for i in range(len(mri_df["Pixel Array"])):
    resized_arrays.append(np.repeat(mri_df["Pixel Array"][i], 3, axis=-1))
    
mri_df["Pixel Array"] = resized_arrays

In [6]:
# Read in med/famhist
mf_hist = pd.read_csv('../data/clinical_training_data_with_medhist_famhist.csv')

# Handle Nan
mf_hist["Family_History_of_AD"] = mf_hist["Family_History_of_AD"].fillna(0)
mf_hist["Family_History_of_Dementia"] = mf_hist["Family_History_of_Dementia"].fillna(0)

# For converting categorical variables to ints
label_encoder = LabelEncoder()
scaler = MinMaxScaler()

# Split features / target
X = mf_hist.drop(columns=['AD_dx_in_5_yrs'])
y = mf_hist['AD_dx_in_5_yrs']

# Encode features
X["Diagnosis_at_Baseline"] = label_encoder.fit_transform(X["Diagnosis_at_Baseline"])
X["Gender"] = label_encoder.fit_transform(X["Gender"])
X["Ethnicity"] = label_encoder.fit_transform(X["Ethnicity"])
X["Race"] = label_encoder.fit_transform(X["Race"])

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [7]:
# Scale pixel data
train_unscaled_mri_df = pd.merge(mri_df, X_train[["PTID"]], on="PTID", how="inner")
test_unscaled_mri_df = pd.merge(mri_df, X_test[["PTID"]], on="PTID", how="inner")

train_unscaled_arrays = np.array(train_unscaled_mri_df["Pixel Array"].tolist())
test_unscaled_arrays = np.array(test_unscaled_mri_df["Pixel Array"].tolist())

mean = np.mean(train_unscaled_arrays, axis=(0, 1, 2))
std = np.std(train_unscaled_arrays, axis=(0, 1, 2))

train_scaled_array = (train_unscaled_arrays - mean) / std
test_scaled_array = (test_unscaled_arrays - mean) / std

In [8]:
# Reorder labels (order changed when forming unscaled_mri_df)
y_train_final = []
for val in train_unscaled_mri_df["PTID"].value_counts().index.values:
    y_train_final.append(mf_hist.loc[mf_hist["PTID"] == val, "AD_dx_in_5_yrs"].values[0])
y_train_final = np.array(y_train_final)

y_test_final = []
for val in test_unscaled_mri_df["PTID"].value_counts().index.values:
    y_test_final.append(mf_hist.loc[mf_hist["PTID"] == val, "AD_dx_in_5_yrs"].values[0])
y_test_final = np.array(y_test_final)

In [9]:
print (f'Mean: {mean}')
print (f'STD: {std}')

Mean: [23.54082765 23.54082765 23.54082765]
STD: [31.03262227 31.03262227 31.03262227]


In [10]:
# Load model
feature_extractor = tf.keras.models.load_model("feature_extractor/feature_extractor_model")

Metal device set to: Apple M2 Pro


2023-10-11 09:50:49.627817: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2023-10-11 09:50:49.627855: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)




## Generate embeddings and pool

In [11]:
# Generate embeddings and pool
patient_ids = []
latent_reps = []

start = 0
end = 160
for i in range(len(train_unscaled_mri_df["PTID"].value_counts().index.values)):
    p_id = train_unscaled_mri_df["PTID"].value_counts().index.values[i]
    patient_ids.append(p_id)
    features = feature_extractor(train_scaled_array[start:end]).numpy() # Each patient has 160 images (160, 512)
    latent_rep = np.mean(np.stack(features, axis=-1), axis=1) # (512,), max pooling
    latent_reps.append(latent_rep)
    start += 160
    end += 160

train_embeddings = pd.DataFrame()
train_embeddings["PTID"] = patient_ids
train_embeddings["embedding"] = latent_reps

In [12]:
# Generate embeddings and pool
patient_ids = []
latent_reps = []

start = 0
end = 160
for i in range(len(test_unscaled_mri_df["PTID"].value_counts().index.values)):
    p_id = test_unscaled_mri_df["PTID"].value_counts().index.values[i]
    patient_ids.append(p_id)
    features = feature_extractor(test_scaled_array[start:end]).numpy() # Each patient has 160 images (160, 512)
    latent_rep = np.mean(np.stack(features, axis=-1), axis=1) # (512,), max pooling
    latent_reps.append(latent_rep)
    start += 160
    end += 160

test_embeddings = pd.DataFrame()
test_embeddings["PTID"] = patient_ids
test_embeddings["embedding"] = latent_reps

In [13]:
# Join back into X_train, X_test
X_train = pd.merge(train_embeddings, X_train, on="PTID")
X_test = pd.merge(test_embeddings, X_test, on="PTID")

# Scale remaining data (clinical/genetic)
embedding_column_train = X_train["embedding"]
embedding_column_test = X_test["embedding"]

columns_to_scale_train = X_train.drop(columns=["embedding", "PTID"])
columns_to_scale_test = X_test.drop(columns=["embedding", "PTID"])

scaled_train = scaler.fit_transform(columns_to_scale_train)
scaled_test = scaler.transform(columns_to_scale_test)

X_train_scaled = pd.DataFrame(data=scaled_train, columns=columns_to_scale_train.columns)
X_test_scaled = pd.DataFrame(data=scaled_test, columns=columns_to_scale_test.columns)

X_train_scaled["embedding"] = embedding_column_train
X_test_scaled["embedding"] = embedding_column_test

In [14]:
# Oversample
ros = RandomOverSampler(random_state=42)
X_train_resampled, y_train_resampled = ros.fit_resample(X_train_scaled, y_train_final)

## Concatenate pooled embedding with clinical/genetic feature vector

In [15]:
# Structure for training
vector_column = X_train_resampled["embedding"].values
other_columns = X_train_resampled.drop(columns=["embedding"]).values
X_train_final = np.hstack((other_columns, np.vstack(vector_column)))

# Structure for test
vector_column = X_test_scaled["embedding"].values
other_columns = X_test_scaled.drop(columns=["embedding"]).values
X_test_final = np.hstack((other_columns, np.vstack(vector_column)))

In [56]:
IN_FEATURES = X_train_final[0].shape[0]

In [91]:
def create_model(dropout=0.3, learning_rate=0.0001, l2_penalty=0.1):
    """Builds classification model"""
    
    model = tf.keras.Sequential()
    inputs = tf.keras.layers.Input(shape=(IN_FEATURES,), name="input_layer") # (Batch, num_features)
    
    hidden_1 = tf.keras.layers.Dense(256, activation="relu", kernel_regularizer=tf.keras.regularizers.l2(l2_penalty), name="hidden_1")(inputs)
    hidden_1 = tf.keras.layers.Dropout(dropout)(hidden_1)
    hidden_2 = tf.keras.layers.Dense(128, activation="relu", kernel_regularizer=tf.keras.regularizers.l2(l2_penalty), name="hidden_2")(hidden_1)
    hidden_2 = tf.keras.layers.Dropout(dropout)(hidden_2)
    
    classification = tf.keras.layers.Dense(1, activation="sigmoid", name="classification_layer")(hidden_1)
    classification_model = tf.keras.Model(inputs=[inputs], outputs=[classification])
    
    classification_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
                                 loss=tf.keras.losses.BinaryCrossentropy(from_logits=True), 
                                 metrics="accuracy")

    return classification_model

In [92]:
model = create_model()

In [93]:
tf.config.run_functions_eagerly(True)
history = model.fit(X_train_final,
                    y_train_resampled,
                    #validation_data=[X_test_final, y_test_final],
                    batch_size=16,
                    epochs=100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


In [94]:
model.evaluate(X_test_final, y_test_final)



[0.6152703762054443, 0.7460317611694336]

In [90]:
print ("If simply guessing all 0, can get this accuracy:", 1 - y_test_final.sum() / y_test_final.shape[0])

If simply guessing all 0, can get this accuracy: 0.746031746031746


# Try SVC

In [36]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV

In [37]:
# Instantiate Model
svc = SVC()

In [38]:
# Grid search
param_grid = {
    "C": [0.1, 1, 10],                
    "kernel": ["linear", "poly", "rbf", "sigmoid"],
    "gamma": ["scale", "auto", 0.1],
    "degree": [1, 2, 3, 4, 5]}

In [39]:
grid_search = GridSearchCV(estimator=svc, param_grid=param_grid, cv=5, scoring="accuracy")

In [40]:
grid_search.fit(X_train_final, y_train_resampled)

GridSearchCV(cv=5, estimator=SVC(),
             param_grid={'C': [0.1, 1, 10], 'degree': [1, 2, 3, 4, 5],
                         'gamma': ['scale', 'auto', 0.1],
                         'kernel': ['linear', 'poly', 'rbf', 'sigmoid']},
             scoring='accuracy')

In [41]:
# Get the best parameters and best accuracy score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

# Print the best parameters and best score
print("Best Parameters:", best_params)
print("Best Accuracy:", best_score)

Best Parameters: {'C': 10, 'degree': 2, 'gamma': 0.1, 'kernel': 'poly'}
Best Accuracy: 0.8941176470588236


In [42]:
# Instantiate and fit model
svc = SVC(C=10, degree=2, gamma=0.1, kernel="poly")

svc.fit(X_train_final, y_train_resampled)

SVC(C=10, degree=2, gamma=0.1, kernel='poly')

In [43]:
# Assess
y_pred = svc.predict(X_test_final)

# evaluate predictions
print(classification_report(y_test_final, y_pred))

              precision    recall  f1-score   support

           0       0.82      0.85      0.83        47
           1       0.50      0.44      0.47        16

    accuracy                           0.75        63
   macro avg       0.66      0.64      0.65        63
weighted avg       0.74      0.75      0.74        63



# What if we do image-only?

Note: this model below accepts the pooled embeddings formed after VGG16 feature extraction. The only difference from above is that it does **not** concatenate each final pooled embedding with the clinical/genetic feature vector.

In [77]:
X_train_img_only = np.vstack(X_train_resampled["embedding"])
y_train_img_only = y_train_resampled

X_test_img_only = np.vstack(X_test_scaled["embedding"])
y_test_img_only = y_test_final

In [81]:
def create_model(dropout=0.3, learning_rate=0.0001, l2_penalty=0.01):
    """Builds classification model"""
    
    model = tf.keras.Sequential()
    inputs = tf.keras.layers.Input(shape=(512,), name="input_layer") # (Batch, num_features)
    
    hidden_1 = tf.keras.layers.Dense(256, activation="relu", kernel_regularizer=tf.keras.regularizers.l2(l2_penalty), name="hidden_1")(inputs)
    hidden_1 = tf.keras.layers.Dropout(dropout)(hidden_1)
    hidden_2 = tf.keras.layers.Dense(128, activation="relu", kernel_regularizer=tf.keras.regularizers.l2(l2_penalty), name="hidden_2")(hidden_1)
    hidden_2 = tf.keras.layers.Dropout(dropout)(hidden_2)
    
    classification = tf.keras.layers.Dense(1, activation="sigmoid", name="classification_layer")(hidden_2)
    classification_model = tf.keras.Model(inputs=[inputs], outputs=[classification])
    
    classification_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
                                 loss=tf.keras.losses.BinaryCrossentropy(from_logits=True), 
                                 metrics="accuracy")

    return classification_model

In [82]:
img_only_model = create_model()

In [83]:
tf.config.run_functions_eagerly(True)
img_only_history = img_only_model.fit(X_train_img_only,
                                      y_train_img_only,
                                      validation_data=[X_test_img_only, y_test_img_only],
                                      batch_size=16,
                                      epochs=100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78