In [90]:
import pandas as pd
import numpy as np
import tensorflow as tf
import pydicom
import os

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler
from PIL import Image
from imblearn.over_sampling import SMOTE

import matplotlib.pyplot as plt
%matplotlib inline

pd.set_option('display.max_columns', 500)

## Combine pixel arrays for each patient into a single vector

In [2]:
pt_ids = []
pixels = []

directory_path = '/Users/johngalvin/Downloads/ADNI 2'

# Iterate through level 2 subdirectories
for level_2_foldername in os.listdir(directory_path):
    level_2_folder_path = os.path.join(directory_path, level_2_foldername)
    
    if os.path.isdir(level_2_folder_path):
        # Iterate through DICOM files in level 5 (bottom-most level) of each level 2 folder
        for root, _, files in os.walk(level_2_folder_path):
            for file in files:
                try:
                    file_path = os.path.abspath(os.path.join(root, file))
                    
                    # Attempt to read DICOM file
                    dcm = pydicom.dcmread(file_path)
                    
                    # Check if the file has PixelData (to avoid non-image DICOM files)
                    if hasattr(dcm, 'PixelData'):
                        # Append both level 2 folder name and pixel array to the lists
                        pt_ids.append(file[5:15])
                        pixels.append(dcm.pixel_array)
                except Exception as e:
                    # Handle exceptions (e.g., files without 'TransferSyntaxUID')
                    print(f"Error processing file {file_path}: {e}")

# Create a DataFrame from the lists
mri_df = pd.DataFrame({'PTID': pt_ids, 'Pixel Array': pixels})

Error processing file /Users/johngalvin/Downloads/ADNI 2/068_S_0473/.DS_Store: File is missing DICOM File Meta Information header or the 'DICM' prefix is missing from the header. Use force=True to force reading.
Error processing file /Users/johngalvin/Downloads/ADNI 2/068_S_0473/MPRAGE/.DS_Store: File is missing DICOM File Meta Information header or the 'DICM' prefix is missing from the header. Use force=True to force reading.
Error processing file /Users/johngalvin/Downloads/ADNI 2/032_S_0677/.DS_Store: File is missing DICOM File Meta Information header or the 'DICM' prefix is missing from the header. Use force=True to force reading.
Error processing file /Users/johngalvin/Downloads/ADNI 2/032_S_0677/MPRAGE/.DS_Store: File is missing DICOM File Meta Information header or the 'DICM' prefix is missing from the header. Use force=True to force reading.
Error processing file /Users/johngalvin/Downloads/ADNI 2/032_S_0677/MPRAGE/2016-07-22_09_23_31.0/.DS_Store: File is missing DICOM File Met

In [3]:
# Resize image arrays with Bilinear Interpolation
resized_arrays = []

for val in mri_df["Pixel Array"]:
    image = Image.fromarray(val, mode='L')
    resized_image = image.resize((224, 224), Image.BILINEAR)
    resized_array = np.expand_dims(np.array(resized_image, dtype=np.uint8), axis=-1) #TF expects channel dim
    resized_arrays.append(resized_array)
    
mri_df["Pixel Array"] = resized_arrays

In [12]:
def max_pooling(vectors):
    """Reduces dimensionality of a group of vectors"""
    return np.max(np.stack(vectors, axis=-1), axis=-1)[:, :, :]

In [35]:
# Pool each patient's pixel arrays
pooled_data = mri_df.groupby("PTID").apply(lambda x: (max_pooling(x["Pixel Array"]), x["PTID"].values[0]))
pooled_vectors, ptids = zip(*pooled_data)
pooled_vectors = np.array(pooled_vectors)
ptids = np.array(ptids)

In [37]:
# Create dataframe
ids = []
vectors = []

for i in range(len(pooled_vectors)):
    ids.append(ptids[i])
    vectors.append(pooled_vectors[i])
    
pooled_df = pd.DataFrame()
pooled_df["PTID"] = ids
pooled_df["Pooled_Vector"] = vectors

In [40]:
pooled_df.head(2)

Unnamed: 0,PTID,Pooled_Vector
0,002_S_0295,"[[[2], [1], [1], [1], [1], [1], [0], [2], [1],..."
1,002_S_0413,"[[[2], [0], [1], [1], [1], [1], [0], [1], [0],..."


## Join image data with clinical/genetic data

In [115]:
mf_hist = pd.read_csv('../data/clinical_training_data_with_medhist_famhist.csv')
df = pd.merge(pooled_df, mf_hist, on='PTID')

## Preprocess

In [163]:
# Reshape Pooled_Vectors
pooled_vectors = np.array(df["Pooled_Vector"].tolist())
flattened_vectors = pooled_vectors.reshape(pooled_vectors.shape[0], -1)
df["Pooled_Vector"] = list(flattened_vectors)

# Handle Nan
df["Family_History_of_AD"] = df["Family_History_of_AD"].fillna(0)
df["Family_History_of_Dementia"] = df["Family_History_of_Dementia"].fillna(0)

# For converting categorical variables to ints
label_encoder = LabelEncoder()
sequences_scaler = StandardScaler()
scaler = StandardScaler()

# Split features / target
X = df.drop(columns=["AD_dx_in_5_yrs", "PTID"])
y = df["AD_dx_in_5_yrs"]

# Encode features
X["Diagnosis_at_Baseline"] = label_encoder.fit_transform(X["Diagnosis_at_Baseline"])
X["Gender"] = label_encoder.fit_transform(X["Gender"])
X["Ethnicity"] = label_encoder.fit_transform(X["Ethnicity"])
X["Race"] = label_encoder.fit_transform(X["Race"])

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale data
tmp_train = sequences_scaler.fit_transform(np.vstack(X_train["Pooled_Vector"].values).astype(float))
tmp_test = sequences_scaler.transform(np.vstack(X_test["Pooled_Vector"].values).astype(float))

X_train = X_train.drop(columns=["Pooled_Vector"])
X_test = X_test.drop(columns=["Pooled_Vector"])

X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

X_train = np.concatenate((X_train, tmp_train), axis=1)
X_test = np.concatenate((X_test, tmp_test), axis=1)

# SMOTE
smote = SMOTE(sampling_strategy="auto")
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

In [169]:
IN_FEATURES = X_resampled[0].shape[0]

In [170]:
def create_model(dropout=0.3, learning_rate=0.0001):
    
    """Builds classification model"""
    
    model = tf.keras.Sequential()
    inputs = tf.keras.layers.Input(shape=(IN_FEATURES,), name="input_layer") # (Batch, num_features)
    
    hidden_1 = tf.keras.layers.Dense(512, activation="relu", name="hidden_1")(inputs)
    hidden_1 = tf.keras.layers.Dropout(dropout)(hidden_1)
    hidden_2 = tf.keras.layers.Dense(256, activation="relu", name="hidden_2")(hidden_1)
    hidden_2 = tf.keras.layers.Dropout(dropout)(hidden_2)
    
    classification = tf.keras.layers.Dense(1, activation="sigmoid", name="classification_layer")(hidden_2)
    classification_model = tf.keras.Model(inputs=[inputs], outputs=[classification])
    
    classification_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
                                 loss=tf.keras.losses.BinaryCrossentropy(from_logits=True), 
                                 metrics="accuracy")

    return classification_model

In [171]:
model = create_model()

In [173]:
tf.config.run_functions_eagerly(True)
history = model.fit(X_resampled,
                    y_resampled,
                    batch_size=16,
                    epochs=20)

Epoch 1/20




Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [174]:
model.evaluate(X_test, y_test)



[1.7419241666793823, 0.6935483813285828]