<a href="https://colab.research.google.com/github/iSarahSajjad/neuro-chronical-prediction/blob/Models/Batch_Processing_Preprocess_Data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:

from google.colab import drive
# Mount Google Drive at /content/drive; the dataset paths used later in this
# notebook all start with '/content/drive/My Drive/...'.
drive.mount('/content/drive')


Mounted at /content/drive


In [6]:
import os
import pandas as pd
import numpy as np
import nibabel as nib
from skimage.transform import resize
import pickle

# Participant table: tab-separated file, one row per participant
participant_df = pd.read_csv(
    '/content/drive/My Drive/ChronicAI/dataset/participants.tsv',
    sep='\t',
)

# Integer code assigned to each value of the `group` column
label_mapping = dict(chronic=0, healthy=1, subacute=2)

# Batch layout: participants per batch and the resulting number of batches
batch_size = 32  # Adjust based on available RAM
num_samples = participant_df.shape[0]
num_batches = (num_samples + batch_size - 1) // batch_size  # ceiling division

# Destination folder for the pickled batches (created on first run)
output_dir = '/content/drive/My Drive/ChronicAI/dataset/preprocessed_batches'
os.makedirs(output_dir, exist_ok=True)

# Reference volume shape for resizing; stays None until the first MRI is loaded
target_shape = None

def preprocess_and_save_batch(batch_idx, batch_size):
    """Load, resize, normalise and pickle the MRI volumes of one participant batch.

    Participants are taken from rows ``batch_idx * batch_size`` up to (but not
    including) ``(batch_idx + 1) * batch_size`` of ``participant_df``, clipped
    to ``num_samples``. For every participant, visits 1-5 are probed on disk;
    each T1w volume that exists is loaded, brought to ``target_shape`` and
    collected together with the participant's encoded ``group`` label.

    The stacked batch is min-max scaled using the batch-wide minimum and
    maximum and written to ``batch_<batch_idx>.pkl`` inside ``output_dir`` as a
    ``(data, labels)`` tuple.

    Args:
        batch_idx: Zero-based index of the batch to process.
        batch_size: Number of participants (rows of ``participant_df``) per
            batch.

    Returns:
        The number of volumes written for this batch. ``0`` means no MRI file
        was found for any participant in the batch and no pickle was created.

    Raises:
        KeyError: If a participant with an existing MRI file has a ``group``
            value that is not a key of ``label_mapping``.

    Note:
        ``target_shape`` is a module-level global that is fixed by the first
        volume ever loaded, so the reference shape depends on which batch is
        processed first.
    """
    global target_shape  # Access the global target_shape variable
    start_idx = batch_idx * batch_size
    end_idx = min((batch_idx + 1) * batch_size, num_samples)

    batch_data = []
    batch_labels = []

    for index in range(start_idx, end_idx):
        row = participant_df.iloc[index]
        participant_id = row['participant_id']
        folder_name = participant_id
        for visit in range(1, 6):
            mri_path = f'/content/drive/My Drive/ChronicAI/dataset/{folder_name}/ses-visit{visit}/anat/{folder_name}_ses-visit{visit}_T1w.nii'
            if not os.path.exists(mri_path):
                print(f"Warning: MRI file not found at {mri_path}")
                continue

            # Encode the label before the expensive load so an unknown group
            # fails immediately instead of after the whole batch was read.
            label = label_mapping[row['group']]

            img_data = nib.load(mri_path).get_fdata()

            if target_shape is None:
                target_shape = img_data.shape

            # Only resample volumes whose shape differs from the reference.
            if img_data.shape != target_shape:
                img_data = resize(img_data, target_shape, order=1, preserve_range=True)

            batch_data.append(img_data)
            batch_labels.append(label)

    # np.stack raises ValueError on an empty list; a batch without any MRI
    # file is reported and skipped instead of aborting the whole run.
    if not batch_data:
        print(f"Warning: no MRI volumes found for batch {batch_idx}; nothing saved.")
        return 0

    batch_data = np.stack(batch_data)

    # Batch-wide min-max scaling to [0, 1]. The extrema are computed once and
    # applied in place to avoid extra full-size temporaries.
    data_min = batch_data.min()
    value_range = batch_data.max() - data_min
    if value_range == 0:
        # Constant batch: 0/0 would fill the array with NaN, so store zeros.
        batch_data = np.zeros_like(batch_data)
    else:
        batch_data -= data_min
        batch_data /= value_range

    batch_labels = np.array(batch_labels)

    # Save batch data and labels
    batch_filename = os.path.join(output_dir, f'batch_{batch_idx}.pkl')
    with open(batch_filename, 'wb') as f:
        pickle.dump((batch_data, batch_labels), f)

    return len(batch_labels)

# Process data in batches and save to disk
for batch_idx in range(num_batches):
    saved_volumes = preprocess_and_save_batch(batch_idx, batch_size)
    if saved_volumes:
        print(f"Batch {batch_idx} processed and saved.")
    else:
        print(f"Batch {batch_idx} skipped: no MRI volumes found.")

print("All batches preprocessed and saved.")

Batch 0 processed and saved.
Batch 1 processed and saved.
Batch 2 processed and saved.
Batch 3 processed and saved.
All batches preprocessed and saved.
