https://www.nitrc.org/plugins/mwiki/index.php/neurobureau:AthenaPipeline

In [1]:
import warnings
warnings.filterwarnings("ignore")

import sys
sys.path.append("..")

from Code.data_generator import FMRIDataGenerator

import numpy as np
import pandas as pd
import os
import sys
from datetime import datetime

import tensorflow as tf

from tensorflow.keras.layers import Conv3D, MaxPool3D, TimeDistributed, Flatten, LSTM, Dense
from tensorflow.keras import Sequential
from tensorflow.keras import optimizers
from tensorflow.keras.callbacks import CSVLogger

import tensorflow.keras as keras

import logging

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  # FATAL
logging.getLogger('tensorflow').setLevel(logging.FATAL)

#### Extract file names for your data in an excel sheet

In [11]:
import argparse
from pathlib import Path
import openpyxl # For writing .xlsx files

def create_excel_of_nii_gz_files(folder_path_str: str, output_excel_name: str = "nii_gz_file_list.xlsx"):
    """
    Scans a folder for .nii.gz files and creates an Excel sheet listing them.

    Args:
        folder_path_str (str): The path to the folder to scan.
        output_excel_name (str): The name for the output Excel file.
                                 This will be created inside the specified folder_path_str.
    """
    folder_path = Path(folder_path_str)

    # 1. Validate the folder path
    if not folder_path.is_dir():
        print(f"Error: Folder not found at '{folder_path_str}'")
        return

    # 2. Find all .nii.gz files
    #    glob("*.nii.gz") finds files matching the pattern directly in the folder.
    #    We use a list comprehension to get just the names.
    #    sorted() ensures the list is in a predictable order.
    nii_gz_files = sorted([file.name for file in folder_path.glob("*.nii.gz") if file.is_file()])

    if not nii_gz_files:
        print(f"No '.nii.gz' files found in '{folder_path_str}'.")
        return

    # 3. Create an Excel workbook and select the active sheet
    workbook = openpyxl.Workbook()
    sheet = workbook.active
    sheet.title = "NII.GZ Files" # You can customize the sheet name

    # 4. Add a header to the first column
    sheet['A1'] = "Filename"

    # 5. Write the filenames to the sheet, starting from the second row
    for i, filename in enumerate(nii_gz_files, start=2): # start=2 means row 2 (A2, A3, ...)
        sheet[f'A{i}'] = filename
        # If you wanted full paths, you could use:
        # full_path_file = folder_path / filename
        # sheet[f'A{i}'] = str(full_path_file)


    # 6. Define the output path for the Excel file
    output_excel_path = folder_path / output_excel_name

    # 7. Save the workbook
    try:
        workbook.save(output_excel_path)
        print(f"Successfully created '{output_excel_path}' with {len(nii_gz_files)} file names.")
    except Exception as e:
        print(f"Error saving Excel file: {e}")
        print(f"Please ensure you have write permissions to '{folder_path}' or that the file is not open.")

if __name__ == "__main__":
    # --- Method 1: Hardcode the folder path (simple for quick use) ---
    target_folder = r"D:/adhd-fmri/model_data"   
    create_excel_of_nii_gz_files(target_folder)

Successfully created 'D:\adhd-fmri\model_data\nii_gz_file_list.xlsx' with 222 file names.


In [16]:
import re
import os
import pandas as pd

def parse_dx(dx):
    if int(dx) == 0:
        return 0
    else:
        return 1
    
# base_dir = "../data"
# dataset_dir = "../data/model_data"

base_dir = r"D:/adhd-fmri"
dataset_dir = r"D:/adhd-fmri/model_data"  

files_list = []
for file in os.listdir(dataset_dir):
    nums = re.findall(r'\d+', file)
    file_id = None
    for num in nums: 
        if len(num) > 1: 
            file_id = int(num)
            
    files_list.append({"ScanDir ID": file_id, "Image": file} )

images_df = pd.DataFrame(files_list)
tsv_path = r"D:\Joe Workspace\Codespace\Github\Diagnosing-ADHD-With-ConvLSTM\References\adhd200_preprocessed_phenotypics.tsv"
references_path = r"D:\Joe Workspace\Codespace\Github\Diagnosing-ADHD-With-ConvLSTM\References"

adhd_info = pd.read_csv(tsv_path, delimiter="\t")[['ScanDir ID','DX']]

model_data = adhd_info.merge(images_df, on='ScanDir ID')

for index,row in model_data.iterrows():
    if row['DX'] == 'pending':
        model_data.drop(index,axis=0,inplace=True)

model_data['DX'] = model_data['DX'].apply(parse_dx)

model_data.to_csv(os.path.join(references_path, "model_data.csv"), index=False)

#### Data Generator

In [25]:
import numpy as np
import tensorflow.keras as keras
from scipy.ndimage import zoom
import os
import nibabel as nib

class FMRIDataGenerator(keras.utils.Sequence):
    'Generates data for Keras'
    def __init__(self, list_IDs, labels, dataset_dir, batch_size):
        'Initialization'
        self.x_dim = 49
        self.y_dim = 58
        self.z_dim = 47
        self.time_length = 177
        self.img_dim = (self.x_dim, self.y_dim, self.z_dim) 
        self.dim = [self.time_length, 28, 28, 28, 1] # [time, x, y, z, c]
        self.dataset_dir = dataset_dir
        self.batch_size = batch_size
        self.labels = labels
        self.list_IDs = list_IDs
        self.n_channels = 1
        self.n_classes = 1
        self.shuffle = True
        self.on_epoch_end()

    def __len__(self):
        'Denotes the number of batches per epoch'
        return int(np.floor(len(self.list_IDs) / self.batch_size))

    def __getitem__(self, index):
        'Generate one batch of data'
        # Generate indexes of the batch
        indexes = self.indexes[index*self.batch_size:(index+1)*self.batch_size]

        # Find list of IDs
        list_IDs_temp = [self.list_IDs[k] for k in indexes]

        # Generate data
        X, y = self.__data_generation(list_IDs_temp)

        return X, y

    def on_epoch_end(self):
        'Updates indexes after each epoch'
        self.indexes = np.arange(len(self.list_IDs))
        if self.shuffle == True:
            np.random.shuffle(self.indexes)

    def __data_generation(self, list_IDs_temp):
        'Generates data containing batch_size samples' # X : (n_samples, *dim, n_channels)
        # Initialization
        X = np.empty((self.batch_size, *self.dim))
        y = np.empty((self.batch_size), dtype=int)
                
        for i, img_path in enumerate(list_IDs_temp):
            X[i,] = self.preprocess_image(img_path)
            y[i] = self.labels[img_path]

        return X,y

    # Image Preprocessing Methods
    def preprocess_image(self, img_path):
        img = nib.load(os.path.join(self.dataset_dir, img_path))

        pp_img = None
        if img.shape[3] > self.time_length:
            pp_img = self.truncate_image(img)
        elif img.shape[3] < self.time_length:
            pp_img = self.pad_image(img)
        else:
            pp_img = img.get_fdata()

        # For each image at the index-th time step, do this
        new_x=28/49
        new_y=28/58
        new_z=28/47
        
        new_img = []
        for index in range(self.time_length):
            z_img = zoom(pp_img[:,:,:,index], (new_x,new_y,new_z), order=1)
            new_img.append(z_img.reshape((28,28,28,1)))
        
        f_img = np.array(new_img)
        return f_img
    
    def truncate_image(self, img):
        return img.get_fdata()[:,:,:,:self.time_length]

    def pad_image(self, img):
        img_padding = np.expand_dims(np.zeros((self.x_dim,self.y_dim,self.z_dim)), axis=3)
        amt_to_fill = self.time_length - img.get_fdata().shape[3]
        padded_img = img.get_fdata()
        for _ in range(amt_to_fill):
            padded_img = np.append(arr=padded_img, values=img_padding, axis=3)

        return padded_img

#### Lets Do It

In [None]:
# ============================ DATA WORK ============================

file_num = sys.argv[1]

# Dataframes
dataset_dir =  r"D:/adhd-fmri/model_data"  
model_train_data = pd.read_csv(r"D:/adhd-fmri/training_data.csv".format(file_num) )
model_val_data = pd.read_csv(r"D:/adhd-fmri/validation_data.csv".format(file_num) )

# Dictionary of data values
partition = {'train': model_train_data['Image'].values, 
             'validation': model_val_data['Image'].values}

# Training Data
train_labels = {}
for index, row in model_train_data.iterrows():
    train_labels[row['Image']] = row['DX']
    
# Validation Data
val_labels = {}
for index, row in model_val_data.iterrows():
    val_labels[row['Image']] = row['DX']

In [19]:
# ============================ MODEL META ============================

epochs = 500
batch_size = 6
input_shape=(177,28,28,28,1)

train_steps_per_epoch = model_train_data.shape[0] // batch_size
validate_steps_per_epoch = model_val_data.shape[0] // batch_size

# Generators
training_generator = FMRIDataGenerator(partition['train'], train_labels, dataset_dir, batch_size)
validation_generator = FMRIDataGenerator(partition['validation'], val_labels, dataset_dir, batch_size)

curr_time = f'{datetime.now():%H-%M-%S%z_%m%d%Y}'
logger_path = "/pylon5/cc5614p/deopha32/Saved_Models/adhd-fmri-history_cv{num}_{time}.csv".format(num=file_num,time=curr_time)

csv_logger = CSVLogger(logger_path, append=True)

callbacks = [csv_logger]

In [30]:
# ============================ MODEL ARCHITECTURE ============================

with tf.device('/gpu:0'):
    cnn_lstm_model = Sequential()

    cnn_lstm_model.add(TimeDistributed(Conv3D(filters=64,kernel_size=(3,3,3),activation='relu'),
                                  input_shape=input_shape, name="Input_Conv_Layer"))

    cnn_lstm_model.add(TimeDistributed(MaxPool3D(
                                    pool_size=(2, 2, 2),
                                    strides=(2, 2, 2),
                                    padding='valid'
                                    ), name="Pool_Layer_1"))

    cnn_lstm_model.add(TimeDistributed(Flatten(), name="Flatten_Layer"))
    
with tf.device('/cpu:0'):

    cnn_lstm_model.add(LSTM(10, dropout = 0.3, recurrent_dropout = 0.3, name="LSTM_Layer"))

with tf.device('/gpu:0'):

    cnn_lstm_model.add(Dense(1, activation = 'sigmoid', name="Output_Dense_Layer"))

    cnn_lstm_model.compile(optimizer=optimizers.Adam(learning_rate=0.0001),
                  loss='binary_crossentropy',
                  metrics=['accuracy'])

# cnn_lstm_model.fit_generator(generator=training_generator,
#     steps_per_epoch=train_steps_per_epoch, verbose=1, callbacks=callbacks,
#     validation_data=validation_generator, validation_steps=validate_steps_per_epoch,
#     epochs=epochs)

# cnn_lstm_model.fit(
#     x=training_generator,
#     steps_per_epoch=train_steps_per_epoch,
#     validation_data=validation_generator,
#     validation_steps=validate_steps_per_epoch,
#     epochs=epochs,
#     verbose=1,
#     callbacks=callbacks,
# )

# Misc. 

In [44]:
# https://stackoverflow.com/a/46216013/9221241
def get_model_memory_usage(batch_size, model):
    import numpy as np
    from keras import backend as K

    shapes_mem_count = 0
    internal_model_mem_count = 0
    for l in model.layers:
        layer_type = l.__class__.__name__
        if layer_type == 'Model':
            internal_model_mem_count += get_model_memory_usage(batch_size, l)
        # Ensure output shape exists
        if hasattr(l, 'output_shape'):
            out_shape = l.output_shape
        elif hasattr(l, 'compute_output_shape'):
            out_shape = l.compute_output_shape(l.input_shape)
        else:
            continue  # skip layers without shape information

        single_layer_mem = 1
        # for s in l.output_shape:
        #     if s is None:
        #         continue
        #     single_layer_mem *= s
        # shapes_mem_count += single_layer_mem

    trainable_count = 2
    non_trainable_count = 1
    # trainable_count = np.sum([K.count_params(p) for p in set(model.trainable_weights)])
    # non_trainable_count = np.sum([K.count_params(p) for p in set(model.non_trainable_weights)])

    number_size = 4.0
    if K.floatx() == 'float16':
         number_size = 2.0
    if K.floatx() == 'float64':
         number_size = 8.0

    total_memory = number_size*(batch_size*shapes_mem_count + trainable_count + non_trainable_count)
    gbytes = np.round(total_memory / (1024.0 ** 3), 3) + internal_model_mem_count
    return gbytes

In [45]:
get_model_memory_usage(32, cnn_lstm_model)

np.float64(0.0)

In [29]:
cnn_lstm_model.summary()