In [2]:
import pandas as pd

# Load the two CSV files
df1 = pd.read_csv('output_1.csv')
df2 = pd.read_csv('output.csv')

# Merge the DataFrames on 'ID' from df1 and 'subject_id' from df2
merged_df = pd.merge(df1, df2, left_on='ID', right_on='subject_id', how='inner')

# Save the merged DataFrame to a new CSV file
merged_df.to_csv('merged_output.csv', index=False)

print("The files have been merged and saved as 'merged_output.csv'.")



The files have been merged and saved as 'merged_output.csv'.


In [3]:
import pandas as pd

# Load the dataset from merged_output.csv
df = pd.read_csv('merged_output.csv')

# Display the number of missing values in each column (optional, for verification)
print("Missing values before removal:")
print(df.isnull().sum())

# Remove rows where 'dRecov' has missing values
df_cleaned = df.dropna(subset=['dRecov'])

# Display the number of missing values in each column after removal (optional, for verification)
print("\nMissing values after removal:")
print(df_cleaned.isnull().sum())

# Save the cleaned DataFrame back to a CSV file
df_cleaned.to_csv('cleaned_output.csv', index=False)

print("Rows with missing 'dRecov' values have been removed. Cleaned data saved as 'cleaned_output.csv'.")


Missing values before removal:
ID                 0
age                0
sex                0
HOC                0
dInj               0
dRecov            10
subject_id         0
file_name          0
mean_intensity     0
std_intensity      0
max_intensity      0
min_intensity      0
shape              0
dtype: int64

Missing values after removal:
ID                0
age               0
sex               0
HOC               0
dInj              0
dRecov            0
subject_id        0
file_name         0
mean_intensity    0
std_intensity     0
max_intensity     0
min_intensity     0
shape             0
dtype: int64
Rows with missing 'dRecov' values have been removed. Cleaned data saved as 'cleaned_output.csv'.


In [5]:
import pandas as pd
import h5py

# Step 1: Load the merged_output.csv and identify IDs with missing dRecov
df = pd.read_csv('merged_output.csv')

# Identify the IDs where 'dRecov' is missing
missing_ids = df[df['dRecov'].isnull()]['ID']

# Save the missing IDs to a CSV for reference (optional)
missing_ids.to_csv('removed_ids.csv', index=False)

# Step 2: Load the original output.h5 file
with h5py.File('output.h5', 'r') as h5_file:
    # Copy data to a new HDF5 file, excluding the rows with missing IDs
    with h5py.File('cleaned_output.h5', 'w') as cleaned_h5:
        for subject_id in h5_file.keys():
            # Only copy data if the subject_id is not in the missing_ids list
            if subject_id not in missing_ids.astype(str).values:
                # Copy the group corresponding to this subject_id
                h5_file.copy(subject_id, cleaned_h5)

print("Cleaned HDF5 file saved as 'cleaned_output.h5'.")


Cleaned HDF5 file saved as 'cleaned_output.h5'.


In [2]:
import h5py
import numpy as np

# Function to overlay a mask with MRI data
def overlay_mask(mri_data, mask_data):
    reduced_data = mri_data[mask_data == 1]
    
    return reduced_data

# Load the brain and GM masks
with h5py.File('brain_mask.h5', 'r') as brain_mask_file, \
     h5py.File('gm_mask.h5', 'r') as gm_mask_file:

    brain_mask = brain_mask_file['image_array'][:]
    gm_mask = gm_mask_file['image_array'][:]

    # Load the cleaned MRI data and create new HDF5 files for overlayed data
    with h5py.File('cleaned_output.h5', 'r') as cleaned_file, \
         h5py.File('brain_overlayed_new.h5', 'w') as brain_overlayed_file, \
         h5py.File('gm_overlayed_new.h5', 'w') as gm_overlayed_file:

        for subject_id in cleaned_file.keys():
            # Load the MRI data for the current subject
            mri_data = cleaned_file[subject_id]['image_array'][:]

            # Overlay the brain mask
            brain_overlay = overlay_mask(mri_data, brain_mask)
            brain_overlayed_file.create_dataset(f'{subject_id}/image_array', data=brain_overlay)

            # Overlay the GM mask
            gm_overlay = overlay_mask(mri_data, gm_mask)
            gm_overlayed_file.create_dataset(f'{subject_id}/image_array', data=gm_overlay)

print("Overlayed data saved in 'brain_overlayed_new.h5' and 'gm_overlayed_new.h5'.")


Overlayed data saved in 'brain_overlayed_new.h5' and 'gm_overlayed_new.h5'.


In [13]:
import numpy as np
import pandas as pd
import h5py
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Load CSV Data
df = pd.read_csv('cleaned_output.csv')

# Define input features and target variable
X = df[['age', 'sex', 'HOC', 'dInj']]
y = df['dRecov']

# Normalize or standardize the numerical features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Load MRI and Mask Data from HDF5
with h5py.File('brain_overlayed.h5', 'r') as brain_h5, \
     h5py.File('gm_overlayed.h5', 'r') as gm_h5:

    # Example: Extract features for each subject and concatenate with X_scaled
    mri_features = []
    for subject_id in brain_h5.keys():
        brain_data = brain_h5[subject_id]['image_array'][:]
        gm_data = gm_h5[subject_id]['image_array'][:]
        
        # Flatten the MRI data or use a feature extractor here
        brain_flatten = brain_data.flatten()
        gm_flatten = gm_data.flatten()
        
        # Combine all features into one vector
        combined_features = np.concatenate((brain_flatten, gm_flatten))
        mri_features.append(combined_features)

    mri_features = np.array(mri_features)

# Combine the features from the CSV with the MRI features
X_combined = np.hstack((X_scaled, mri_features))

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_combined, y, test_size=0.2, random_state=42)

# Define the neural network model
model = tf.keras.Sequential([
    tf.keras.layers.Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(1)  # Output layer for regression
])

# Compile the model
model.compile(optimizer='adam', loss='mean_squared_error')

# Train the model
history = model.fit(X_train, y_train, validation_split=0.2, epochs=50, batch_size=32)

# Evaluate the model
test_loss = model.evaluate(X_test, y_test)
print(f"Test Loss (RMSE): {np.sqrt(test_loss)}")

# Make predictions
y_pred = model.predict(X_test)


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/50
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 620ms/step - loss: 34284036.0000 - val_loss: 369005.7188
Epoch 2/50
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 327ms/step - loss: 1120861.8750 - val_loss: 183958.7812
Epoch 3/50
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 319ms/step - loss: 960735.5000 - val_loss: 24012.4961
Epoch 4/50
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 300ms/step - loss: 1053184.2500 - val_loss: 656512.1250
Epoch 5/50
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 364ms/step - loss: 1107223.0000 - val_loss: 140808.7969
Epoch 6/50
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 332ms/step - loss: 1186439.6250 - val_loss: 2310.1797
Epoch 7/50
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 324ms/step - loss: 1279148.1250 - val_loss: 91696.7500
Epoch 8/50
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 313ms/step - loss: 1378528.0

In [17]:
import numpy as np
import pandas as pd
import h5py
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Load CSV Data
df = pd.read_csv('cleaned_output.csv')

# Convert IDs to strings and pad zeros to make sure they are of length 3 (e.g., '001' instead of '1')
csv_ids = set(df['ID'].astype(str).str.zfill(3))

# Define input features and target variable
X = df[['age', 'sex', 'HOC', 'dInj']]
y = df['dRecov']

# Normalize or standardize the numerical features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Load MRI and Mask Data from HDF5
mri_features = []


with h5py.File('brain_overlayed.h5', 'r') as brain_h5, \
     h5py.File('gm_overlayed.h5', 'r') as gm_h5:

    for subject_id in brain_h5.keys():
        if subject_id in csv_ids:
            # Load the MRI data for the current subject
            brain_data = brain_h5[subject_id]['image_array'][:]
            gm_data = gm_h5[subject_id]['image_array'][:]

            # Flatten the MRI data or use a feature extractor here
            brain_flatten = brain_data.flatten()
            gm_flatten = gm_data.flatten()

            # Combine all features into one vector
            combined_features = np.concatenate((brain_flatten, gm_flatten))
            mri_features.append(combined_features)

    mri_features = np.array(mri_features)

# Check if the dimensions match after filtering
if X_scaled.shape[0] != mri_features.shape[0]:
    raise ValueError(f"Dimension mismatch: X_scaled has {X_scaled.shape[0]} rows, but mri_features has {mri_features.shape[0]} rows.")

# Combine the features from the CSV with the MRI features
X_combined = np.hstack((X_scaled, mri_features))

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_combined, y, test_size=0.2, random_state=42)

# Define the neural network model
model = tf.keras.Sequential([
    tf.keras.layers.Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(1)  # Output layer for regression
])

# Compile the model
model.compile(optimizer='adam', loss='mean_squared_error')

# Train the model
history = model.fit(X_train, y_train, validation_split=0.2, epochs=50, batch_size=32)

# Evaluate the model
test_loss = model.evaluate(X_test, y_test)
print(f"Test Loss (RMSE): {np.sqrt(test_loss)}")

# Make predictions
y_pred = model.predict(X_test)


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/50
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 485ms/step - loss: 45776900.0000 - val_loss: 1158715.7500
Epoch 2/50
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 307ms/step - loss: 3546350.0000 - val_loss: 2034875.7500
Epoch 3/50
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 292ms/step - loss: 2230781.2500 - val_loss: 890452.6875
Epoch 4/50
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 309ms/step - loss: 2821116.7500 - val_loss: 806163.3125
Epoch 5/50
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 310ms/step - loss: 1280334.5000 - val_loss: 128545.2188
Epoch 6/50
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 304ms/step - loss: 807340.9375 - val_loss: 1613796.8750
Epoch 7/50
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 333ms/step - loss: 1590319.0000 - val_loss: 1290348.7500
Epoch 8/50
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 330ms/step - loss: 1

k-fold method


In [18]:
import numpy as np
import pandas as pd
import h5py
import tensorflow as tf
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler

# Load CSV Data
df = pd.read_csv('cleaned_output.csv')

# Select input features and target variable
X = df[['age', 'sex', 'HOC', 'dInj']]
y = df['dRecov']

# Normalize or standardize the numerical features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Load MRI Data from HDF5
mri_features = []

# Extract MRI data from HDF5 and match it by subject ID
with h5py.File('cleaned_output.h5', 'r') as h5_file:
    for subject_id in df['ID'].astype(str).str.zfill(3):
        if subject_id in h5_file.keys():
            # Load MRI data for the current subject
            mri_data = h5_file[subject_id]['image_array'][:]
            
            # Flatten the MRI data or use a feature extractor here
            mri_flatten = mri_data.flatten()
            
            # Append the flattened MRI data to the features list
            mri_features.append(mri_flatten)

mri_features = np.array(mri_features)

# Combine the features from the CSV with the MRI features
X_combined = np.hstack((X_scaled, mri_features))

# Discretize y to create bins for stratification
y_binned = pd.qcut(y, q=5, labels=False)  # Create 5 bins for stratification

# Define the Stratified K-Fold Cross-Validator
n_splits = 5
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

# Store results for each fold
fold_rmse = []

# K-Fold Cross-Validation
for train_index, test_index in skf.split(X_combined, y_binned):
    X_train, X_test = X_combined[train_index], X_combined[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    # Define the neural network model
    model = tf.keras.Sequential([
        tf.keras.layers.Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dense(1)  # Output layer for regression
    ])

    # Compile the model
    model.compile(optimizer='adam', loss='mean_squared_error')

    # Train the model
    history = model.fit(X_train, y_train, validation_split=0.2, epochs=50, batch_size=32, verbose=0)

    # Evaluate the model
    test_loss = model.evaluate(X_test, y_test, verbose=0)
    rmse = np.sqrt(test_loss)
    fold_rmse.append(rmse)
    print(f"Fold RMSE: {rmse}")

# Calculate the mean RMSE across all folds
mean_rmse = np.mean(fold_rmse)
print(f"Mean RMSE across all folds: {mean_rmse}")


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Fold RMSE: 61.9045522072842


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Fold RMSE: 59.519913618783924


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Fold RMSE: 77.42995826874764


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Fold RMSE: 90.60838748599657


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Fold RMSE: 51.27667227054253
Mean RMSE across all folds: 68.14789677027098


gridsearchCV, repeated stratified k fold

In [21]:
import numpy as np
import pandas as pd
import h5py
import tensorflow as tf
from scikeras.wrappers import KerasRegressor
from sklearn.model_selection import RepeatedStratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import make_scorer, mean_squared_error

# Load CSV Data
df = pd.read_csv('cleaned_output.csv')

# Select input features and target variable
X = df[['age', 'sex', 'HOC', 'dInj']]
y = df['dRecov']

# Normalize or standardize the numerical features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Load MRI Data from HDF5
mri_features = []

# Extract MRI data from HDF5 and match it by subject ID
with h5py.File('cleaned_output.h5', 'r') as h5_file:
    for subject_id in df['ID'].astype(str).str.zfill(3):
        if subject_id in h5_file.keys():
            # Load MRI data for the current subject
            mri_data = h5_file[subject_id]['image_array'][:]
            
            # Flatten the MRI data or use a feature extractor here
            mri_flatten = mri_data.flatten()
            
            # Append the flattened MRI data to the features list
            mri_features.append(mri_flatten)

mri_features = np.array(mri_features)

# Combine the features from the CSV with the MRI features
X_combined = np.hstack((X_scaled, mri_features))

# Define a function to create a Keras model for use in GridSearchCV
def create_model(neurons=64, learning_rate=0.001, dropout_rate=0.0):
    model = tf.keras.Sequential([
        tf.keras.layers.Dense(neurons, activation='relu', input_shape=(X_combined.shape[1],)),
        tf.keras.layers.Dropout(dropout_rate),
        tf.keras.layers.Dense(neurons // 2, activation='relu'),
        tf.keras.layers.Dense(1)  # Output layer for regression
    ])
    
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    model.compile(optimizer=optimizer, loss='mean_squared_error')
    
    return model

# Wrap the model for use in scikit-learn
model = KerasRegressor(model=create_model, verbose=0)

# Define the grid of hyperparameters to search
param_grid = {
    'model__neurons': [32, 64, 128],  # Number of neurons in the first hidden layer
    'model__learning_rate': [0.001, 0.01, 0.1],  # Learning rate for Adam optimizer
    'model__dropout_rate': [0.0, 0.2, 0.5],  # Dropout rate
    'batch_size': [16, 32, 64],  # Batch size for training
    'epochs': [50, 100]  # Number of epochs
}

# Set up the stratified k-fold cross-validation
rskf = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=42)

# Define the scoring function (RMSE)
scorer = make_scorer(mean_squared_error, squared=False)

# Set up GridSearchCV to search for the best hyperparameters
grid = GridSearchCV(estimator=model, param_grid=param_grid, scoring=scorer, cv=rskf, n_jobs=-1)

# Fit GridSearchCV
grid_result = grid.fit(X_combined, y)

# Display the best parameters and the best RMSE score
print(f"Best parameters: {grid_result.best_params_}")
print(f"Best RMSE: {grid_result.best_score_}")

# Get the best model
best_model = grid_result.best_estimator_

# Evaluate the best model on a hold-out test set if available, or using cross-validation scores
print(f"Best model score (RMSE): {grid_result.best_score_}")




In [1]:
import tensorflow as tf


In [16]:
import numpy as np
import pandas as pd
import h5py
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Load CSV Data
df = pd.read_csv('cleaned_output.csv')

# Convert IDs to strings and pad zeros to make sure they are of length 3 (e.g., '001' instead of '1')
csv_ids = set(df['ID'].astype(str).str.zfill(3))

# Select input features and target variable
X = df[['age', 'sex', 'HOC', 'dInj']]
y = df['dRecov']

# Normalize or standardize the numerical features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Step 2: Load MRI Data from HDF5
mri_features = []

# Extract MRI data from HDF5 and match it by subject ID
with h5py.File('cleaned_output.h5', 'r') as h5_file:
    for subject_id in csv_ids:
        if subject_id in h5_file.keys():
            # Load MRI data for the current subject
            mri_data = h5_file[subject_id]['image_array'][:]
            
            # Flatten the MRI data or use a feature extractor here
            mri_flatten = mri_data.flatten()
            
            # Append the flattened MRI data to the features list
            mri_features.append(mri_flatten)

mri_features = np.array(mri_features)

# Ensure the dimensions match after processing
if X_scaled.shape[0] != mri_features.shape[0]:
    raise ValueError(f"Dimension mismatch: X_scaled has {X_scaled.shape[0]} rows, but mri_features has {mri_features.shape[0]} rows.")

# Step 3: Combine Features
X_combined = np.hstack((X_scaled, mri_features))

In [7]:
import h5py
import pandas as pd
import numpy as np

# Open the HDF5 file
with h5py.File('cleaned_output.h5', 'r') as h5_file:
    # Create a list to store the data to be converted to CSV
    data_list = []

    # Iterate through each subject in the HDF5 file
    for subject_id in h5_file.keys():
        # Load MRI data for the current subject
        mri_data = h5_file[subject_id]['image_array'][:]
        
        # Flatten the MRI data (or perform any desired feature extraction)
        mri_flattened = mri_data.flatten()
        
        # Convert the flattened MRI data into a list
        mri_flattened_list = mri_flattened.tolist()
        
        # Append the subject ID and the flattened MRI data to the data list
        data_list.append([subject_id] + mri_flattened_list)

# Convert the data list to a DataFrame
df_mri = pd.DataFrame(data_list)

# Define the column names (subject_id + MRI pixel values)
df_mri.columns = ['subject_id'] + [f'pixel_{i}' for i in range(df_mri.shape[1] - 1)]

# Save the DataFrame to a CSV file
df_mri.to_csv('cleaned_output_from_h5.csv', index=False)

print("The HDF5 data has been successfully converted to 'cleaned_output.csv'.")


The HDF5 data has been successfully converted to 'cleaned_output.csv'.


In [12]:
import h5py
import pandas as pd

# Step 1: Extract IDs from brain_overlayed.h5
with h5py.File('brain_overlayed.h5', 'r') as brain_h5:
    brain_ids = set(brain_h5.keys())

# Step 2: Extract IDs from gm_overlayed.h5
with h5py.File('gm_overlayed.h5', 'r') as gm_h5:
    gm_ids = set(gm_h5.keys())

# Step 3: Extract IDs from cleaned_output.h5
with h5py.File('cleaned_output.h5', 'r') as cleaned_h5:
    cleaned_h5_ids = set(cleaned_h5.keys())

# Step 4: Extract and format IDs from cleaned_output.csv
df = pd.read_csv('cleaned_output.csv')

# Convert IDs to strings and pad zeros to make sure they are of length 3 (e.g., '001' instead of '1')
csv_ids = set(df['ID'].astype(str).str.zfill(3))

# Step 5: Compare the IDs
# Find common IDs
common_ids = brain_ids & gm_ids & cleaned_h5_ids & csv_ids

# Find unique IDs in each set
unique_brain_ids = brain_ids - common_ids
unique_gm_ids = gm_ids - common_ids
unique_cleaned_h5_ids = cleaned_h5_ids - common_ids
unique_csv_ids = csv_ids - common_ids

# Print the results
print("Common IDs across all files:", common_ids)
print("\nUnique IDs in brain_overlayed.h5:", unique_brain_ids)
print("\nUnique IDs in gm_overlayed.h5:", unique_gm_ids)
print("\nUnique IDs in cleaned_output.h5:", unique_cleaned_h5_ids)
print("\nUnique IDs in cleaned_output.csv:", unique_csv_ids)


Common IDs across all files: {'113', '119', '044', '008', '040', '067', '075', '083', '020', '080', '043', '115', '002', '107', '110', '004', '050', '073', '069', '101', '068', '003', '047', '071', '082', '057', '064', '120', '072', '005', '009', '088', '079', '085', '097', '011', '112', '049', '055', '042', '010', '028', '127', '061', '102', '103', '118', '015', '014', '022', '033', '024', '063', '046', '123', '017', '091', '051', '018', '109', '036', '126', '125', '013', '089', '093', '062', '114', '081', '023', '006', '099', '007', '104', '034', '021', '016', '052', '031', '012', '105', '032', '076', '111', '106', '090', '039', '019', '056', '094', '092', '027', '078'}

Unique IDs in brain_overlayed.h5: set()

Unique IDs in gm_overlayed.h5: set()

Unique IDs in cleaned_output.h5: set()

Unique IDs in cleaned_output.csv: set()


In [11]:
import h5py
import os

# Define the IDs to be removed
ids_to_remove = {'086', '035', '060', '066', '084'}

# Function to remove IDs from an HDF5 file and overwrite the original file
def remove_ids_from_h5(file_path, ids_to_remove):
    # Open the existing HDF5 file
    with h5py.File(file_path, 'r') as h5_file:
        # Create a temporary file to store cleaned data
        temp_file_path = 'temp_' + file_path
        with h5py.File(temp_file_path, 'w') as cleaned_h5_file:
            # Copy groups that are not in ids_to_remove
            for subject_id in h5_file.keys():
                if subject_id not in ids_to_remove:
                    h5_file.copy(subject_id, cleaned_h5_file)
    
    # Replace the original file with the cleaned temporary file
    os.remove(file_path)
    os.rename(temp_file_path, file_path)

# Remove IDs from brain_overlayed.h5 and overwrite the file
remove_ids_from_h5('brain_overlayed.h5', ids_to_remove)

# Remove IDs from gm_overlayed.h5 and overwrite the file
remove_ids_from_h5('gm_overlayed.h5', ids_to_remove)

# Remove IDs from cleaned_output.h5 and overwrite the file
remove_ids_from_h5('cleaned_output.h5', ids_to_remove)

print("Specified IDs have been removed and original files have been updated.")


Specified IDs have been removed and original files have been updated.
