# In [1]:
#!/usr/bin/env python
# coding: utf-8

"""
NAPLS3 BIDS Converter

High-level Overview:
1) Extract subject IDs and session information (dates → session numbers)
   from folder names in a 'source_images' directory.
2) Unzip DICOMs (from .tgz archives) into an intermediate folder in
   /scratch/ to avoid filesystem overload with small files.
3) Convert DICOMs to NIfTI using dcm2niix.
4) Move resulting .nii.gz (and .json) files into their proper BIDS format
   folders (anat, func, dwi, fmap, etc.) within designated site-specific
   directories.
5) Provide additional utility and debugging functions to facilitate
   working with metadata (e.g., IntendedFor, EchoTimes, TaskNames, etc.).
"""

import os
import subprocess
import tarfile
import shutil
import sys
from datetime import datetime, timedelta
import json
import logging
import numpy as np
import pandas as pd
import re
from collections import defaultdict, Counter
import pprint
from pathlib import Path

# ----------------------------------------------------------------------
# CONSTANTS & PATHS
# ----------------------------------------------------------------------

# Source data: directory whose entries (folders / .tgz archives) are scanned
# for subject IDs and session dates by the functions below.
source_images = "/projects/f_ah1491_1/open_data/NAPLS3/sourcedata/image03"

# BIDS output root, and the scratch location intended for temporarily
# unzipped DICOMs (see the module docstring, step 2).
bids_directory = '/projects/f_ah1491_1/open_data/NAPLS3/bidsdata'
dicom_directory = '/scratch/kj537'


# # Create the main BIDS directory if it doesn't already exist
# if not os.path.exists(bids_directory):
#     os.makedirs(bids_directory, exist_ok=True)
#     print(f"Created bids_directory at {bids_directory}")

# ----------------------------------------------------------------------
# PERSONALIZED FUNCTIONS: Extract subject IDs, dates and mode 
#      !! Change based on your dataset !!
# ----------------------------------------------------------------------

def return_subject_id_given_session_id(session_id):
    """
    Derive the subject ID from a session/folder name.

    The name is split on underscores. When it has exactly three pieces, the
    first two are concatenated to form the subject ID; any other number of
    pieces yields None.

    Adapt this to wherever subject IDs live in your own dataset (they may be
    in the filename or in the associated JSON).

    Example:
        "01_S0300_20230701"  -> "01S0300"
        "01S0300_2023-07-01" -> None (only two underscore-separated pieces)
    """
    pieces = session_id.split('_')
    if len(pieces) != 3:
        return None
    first, second, _ = pieces
    return first + second

def return_date_given_filename(session_id):
    """
    Try to extract a date or datetime from a file/folder name.

    Several regex patterns are tried in order; the first candidate that
    parses successfully wins.

    Returns:
        A 2-tuple ``(datetime_object, format_string)`` for the first
        parseable candidate, or ``(None, None)`` when nothing in the name
        parses as a date. The failure value is a pair (not a bare None) so
        that callers can always unpack two values.
    """
    # Order matters: earlier patterns take precedence over later ones.
    patterns = [
        r'\d{8}',         # YYYYMMDD
        r'\d{6}',         # YYYYMM
        r'\d{10,14}',     # YYYYMMDDHH, YYYYMMDDHHMM, YYYYMMDDHHMMSS
        r'\d{4}-\d{2}-\d{2}',  # YYYY-MM-DD
        r'\d{2}-\d{4}'    # MM-YYYY
    ]

    def _format_for(candidate):
        # Choose the strptime format from the candidate's length (and dashes).
        size = len(candidate)
        if size == 10:
            return "%Y-%m-%d" if '-' in candidate else "%Y%m%d%H"
        if size == 7:
            # Only the MM-YYYY pattern can produce a 7-character match.
            return "%m-%Y"
        by_length = {
            6: "%Y%m",
            8: "%Y%m%d",
            12: "%Y%m%d%H%M",
            14: "%Y%m%d%H%M%S",
        }
        return by_length.get(size)

    for pattern in patterns:
        for candidate in re.findall(pattern, session_id):
            date_format = _format_for(candidate)
            if date_format is None:
                # e.g. 11- or 13-digit runs: no sensible format, try the next
                continue
            try:
                return datetime.strptime(candidate, date_format), date_format
            except ValueError:
                continue
    return None, None


def find_mode(file_name): # this may also be a folder name after unzipping
    """
    Guess the BIDS modality folder (anat, func, dwi, fmap) from a name.

    The name is lower-cased and checked against keyword groups in priority
    order; the first group with a keyword present decides the result.
    !!Change the keywords!! to suit your own dataset. If your names carry
    no modality markers you will need to read the JSON sidecars instead.

    Returns "UnrecognizedFileType" when no keyword matches.
    """
    lowered = file_name.lower()
    # (keywords, mode) pairs, checked top to bottom
    keyword_table = (
        (("fieldmap",), "fmap"),
        (("bold",), "func"),
        (("t1", "t2", "flash", "flash2d"), "anat"),
        (("diff",), "dwi"),
    )
    for keywords, mode in keyword_table:
        if any(keyword in lowered for keyword in keywords):
            return mode
    return "UnrecognizedFileType"



# ----------------------------------------------------------------------
# CREATE SUBJECT-SESSIONS DICTIONARY
#   These functions parse folder names and file dates to build a 
#   structure of subjects, their sessions, and the associated dates.
# ----------------------------------------------------------------------


def create_subject_sessions_dict2(source_images, time_threshold):
    """
    Build ``{subject_id: [(subject_id, session_num, date, name), ...]}``.

    Two passes are made over ``source_images``:
      1) Every sub-folder whose name yields both a subject ID and a date
         becomes a session for that subject.
      2) Every .nii.gz / .json file found below a session folder has the
         date in its own file name checked; if no existing session of that
         subject lies within ``time_threshold`` days, a new session is
         added for it (handles scan types acquired on different days).

    Finally each subject's sessions are sorted by date and numbered
    "1", "2", ... as strings.

    Args:
        source_images: directory containing the session folders.
        time_threshold: number of days (int) within which a file is
            considered part of an existing session.

    Returns:
        dict mapping subject ID to a date-sorted list of 4-tuples
        ``(subject_id, session_num, datetime, folder_or_file_name)``.
    """
    def _date_from(name):
        # The date parser may signal failure with None or a (None, None)
        # pair; accept both and only pass real datetimes on.
        parsed = return_date_given_filename(name)
        found = parsed[0] if parsed else None
        return found if isinstance(found, datetime) else None

    subject_sessions = {}
    tolerance = timedelta(days=time_threshold)

    # First pass: Gather folder info
    for subject_folder in os.listdir(source_images):
        folder_path = os.path.join(source_images, subject_folder)
        if not os.path.isdir(folder_path):
            print(f"Ignoring folder {subject_folder} as it is not a folder")
            continue

        subject_id = return_subject_id_given_session_id(subject_folder)
        if subject_id is None:
            # Without this guard such folders would pile up under a None key.
            print(f"Ignoring folder {subject_folder} as no subject ID could be extracted")
            continue

        session_date = _date_from(subject_folder)
        if session_date is None:
            print(f"Ignoring folder {subject_folder} as the date extracted was not a datetime variable")
            continue

        subject_sessions.setdefault(subject_id, []).append(
            (subject_id, "1", session_date, subject_folder)
        )

    # Second pass: add a session for any file whose date is not within
    # time_threshold days of an existing session of the same subject.
    for dirpath, _, filenames in os.walk(source_images):
        relative_path = os.path.relpath(dirpath, source_images)
        # First path component below source_images ("." for the root itself,
        # which has no underscores and is therefore skipped below).
        first_subdir = relative_path.split(os.sep)[0]
        session_parts = first_subdir.split('_')
        if len(session_parts) <= 2:
            continue

        subject_id = session_parts[0] + session_parts[1]
        if subject_id not in subject_sessions:
            continue
        sessions = subject_sessions[subject_id]

        for file in filenames:
            if not file.endswith(('.nii.gz', '.json')):
                continue
            # Use the date in the file's own name (not a leftover loop
            # variable from the first pass).
            file_date = _date_from(file)
            if file_date is None:
                continue

            already_covered = any(
                abs(sess_date - file_date) <= tolerance
                for _, _, sess_date, _ in sessions
            )
            if not already_covered:
                sessions.append((subject_id, "1", file_date, file))

    # Sort sessions by date and assign session_nums
    for sessions in subject_sessions.values():
        sessions.sort(key=lambda x: x[2])
        for i, session in enumerate(sessions):
            sessions[i] = session[:1] + (str(i + 1),) + session[2:]
    return subject_sessions


def extract_ses_given_session_id(subject_sessions, session_id):
    """
    Look up the session number (e.g., '1') recorded for ``session_id``.

    Every subject's session tuples are scanned; the number (index 1) of the
    first tuple whose name (index 3) equals ``session_id`` is returned, or
    None when no tuple matches.
    """
    hits = (
        entry[1]
        for entries in subject_sessions.values()
        for entry in entries
        if entry[3] == session_id
    )
    return next(hits, None)


def extract_id_given_session_id(subject_sessions, session_id):
    """
    Look up which subject owns ``session_id``.

    Returns the dictionary key (subject ID) of the first subject that has a
    session tuple whose name (index 3) equals ``session_id``, or None when
    no subject has one.
    """
    owners = (
        subject
        for subject, entries in subject_sessions.items()
        for entry in entries
        if entry[3] == session_id
    )
    return next(owners, None)


def extract_subject_ids(subject_sessions):
    """
    List every subject ID (the keys of ``subject_sessions``) in dictionary order.
    """
    return list(subject_sessions.keys())


def extract_session_ids(subject_sessions):
    """
    Collect the session name (index 3 of each session tuple) for every
    session of every subject, in dictionary/list order.
    """
    return [
        entry[3]
        for entries in subject_sessions.values()
        for entry in entries
    ]


def extract_session_nums(subject_sessions):
    """
    Collect the session number (index 1 of each session tuple, e.g. '1', '2')
    for every session of every subject, in dictionary/list order.
    """
    return [
        entry[1]
        for entries in subject_sessions.values()
        for entry in entries
    ]


def extract_date_given_sub_ses(subject_sessions, subject_id, session_num):
    """
    Return the date stored for one subject's session.

    Session tuples have the layout (subject_id, session_num, date, folder);
    the date (index 2) of the first tuple of ``subject_id`` whose number
    equals ``session_num`` is returned. None is returned when the subject
    or the session number is unknown.
    """
    for entry in subject_sessions.get(subject_id, ()):
        if entry[1] == session_num:
            return entry[2]
    return None


def extract_ses_given_date(subject_sessions, filename, *, time_threshold=50):
    """
    Find the session number a file belongs to, based on the date in its name.

    The subject ID and date are both parsed from ``filename``. The subject's
    sessions are then scanned in order and the number of the first session
    whose date lies within ``time_threshold`` days of the file date is
    returned.

    Args:
        subject_sessions: dict of subject ID -> list of
            (subject_id, session_num, date, name) tuples.
        filename: file or folder name carrying subject ID and date.
        time_threshold: tolerance in days (keyword-only, default 50).

    Returns:
        The matching session number, or a string starting with "Error:"
        describing why no session could be determined.
    """
    # The date parser may signal failure with None or with a (None, None) pair.
    parsed = return_date_given_filename(filename)
    file_date, date_format = parsed if parsed else (None, None)
    subject_id = return_subject_id_given_session_id(filename)

    if isinstance(file_date, str):
        # Attempt to parse the file_date string
        try:
            file_date_obj = datetime.strptime(file_date, date_format)
        except ValueError:
            return f"Error: '{file_date}' is not a valid date string."
    elif isinstance(file_date, datetime):
        file_date_obj = file_date
    else:
        return f"Error: {file_date} is not a str or datetime"

    tolerance = timedelta(days=time_threshold)
    for item in subject_sessions.get(subject_id, []):
        date = item[2]
        if isinstance(date, str):
            # Stored session dates may be strings; parse with the same format
            try:
                session_date = datetime.strptime(date, date_format)
            except ValueError:
                return f"Error: '{date}' is not a valid date string."
        else:
            session_date = date

        if abs(session_date - file_date_obj) <= tolerance:
            return item[1]

    return (
        f"Error: no sessions found within {time_threshold} days "
        f"for {subject_id}, file_date {file_date_obj}"
    )


def extract_dates_given_sub(subject_sessions, subject_id):
    """
    List the session dates of one subject, taken from the session names.

    For each session tuple of ``subject_id`` the text after the last
    underscore of the name (index 3) is parsed as YYYY-MM-DD. Entries that
    parse become datetime objects; entries that do not are kept as the raw
    string. An unknown subject gives an empty list.
    """
    def _tail_as_date(folder_name):
        tail = folder_name.split('_')[-1]
        try:
            return datetime.strptime(tail, "%Y-%m-%d")
        except ValueError:
            return tail

    return [_tail_as_date(entry[3]) for entry in subject_sessions.get(subject_id, [])]

# ----------------------------------------------------------------------
# BIDS FOLDER CREATION & FILE MOVE FUNCTIONS
# ----------------------------------------------------------------------



def create_subject_folder(subject_id):
    """
    Ensure ``<bids_directory>/sub-<subject_id>`` exists.

    Nothing is done when a filesystem entry with that path already exists.
    """
    target = os.path.join(bids_directory, f"sub-{subject_id}")
    if os.path.exists(target):
        return
    os.makedirs(target, exist_ok=True)

        
def create_session_folder(session_id, subject_sessions):
    """
    Creates the BIDS session folder structure for a given session ID.

    The subject ID and session number are looked up in ``subject_sessions``.
    If ``session_id`` is not present there, a message is printed and no
    folders are created (otherwise a bogus ``sub-None/ses-None`` tree would
    appear in the BIDS directory).

    Example folder structure:
      sub-<subject_id>/
        ses-<session_num>/
          anat/
          func/
          fmap/
          dwi/
    """
    session_num = extract_ses_given_session_id(subject_sessions, session_id)
    subject_id = extract_id_given_session_id(subject_sessions, session_id)
    if session_num is None or subject_id is None:
        print(f"Session {session_id} not found in subject_sessions, no folders created")
        return

    session_path = os.path.join(bids_directory, f"sub-{subject_id}", f"ses-{session_num}")
    for directory in ("anat", "func", "fmap", "dwi"):
        os.makedirs(os.path.join(session_path, directory), exist_ok=True)



def copy_file_with_new_name(filepath, destination_dir, new_filename):
    """
    Copy a file from filepath to destination_dir with a new name (new_filename).

    The destination directory is created if needed. An existing destination
    file is never overwritten.

    Returns:
        A message string indicating success, that the destination already
        exists, or what went wrong. Exceptions are reported in the message
        rather than raised.
    """
    # Bound before the try so the except handlers can always reference it.
    dest_file = None
    try:
        dest_file = os.path.join(destination_dir, new_filename)
        os.makedirs(destination_dir, exist_ok=True)

        if not os.path.isfile(filepath):
            return "SOURCE FILEPATH DOES NOT EXIST"
        if os.path.isdir(dest_file):
            return f"DESTINATION FILEPATH IS A DIRECTORY {dest_file}"
        if os.path.exists(dest_file):
            return f"{new_filename} already exists, skipping"

        shutil.copy2(filepath, dest_file)
        if os.path.isfile(dest_file):
            return f"{new_filename} confirmed move"
        # Copy raised nothing but the file is not there: say so explicitly
        # instead of falling through and returning None.
        return f"{new_filename} attempted to be moved but was not successful"
    except FileNotFoundError:
        return "THE SOURCE FILE NOT FOUND."
    except PermissionError:
        return f"PERMISSION DENIED {dest_file or destination_dir}"
    except Exception as e:
        return f"ERROR OCCURRED: {e}"


def copy_file_to_bids(subject_id, session_num, file, filepath, bids_directory, parent_folder):
    """
    Copy a file into the correct BIDS folder (anat/func/fmap/dwi).

    The modality is guessed from the file name first and, if that fails,
    from the name of its parent folder. The file keeps its name and is
    placed in ``<bids_directory>/sub-<subject_id>/ses-<session_num>/<mode>``.
    An existing destination file is never overwritten. Outcomes are
    reported with print(); nothing is returned.

    Args:
        subject_id: subject label (without the ``sub-`` prefix).
        session_num: session label (without the ``ses-`` prefix).
        file: file name, used for modality detection and as destination name.
        filepath: full path of the source file.
        bids_directory: root of the BIDS dataset.
        parent_folder: name of the folder containing the file (fallback for
            modality detection).
    """
    mode = find_mode(file)
    if mode == 'UnrecognizedFileType':
        # If the file name doesn't clarify, check the parent folder
        mode = find_mode(parent_folder)

    if mode == 'UnrecognizedFileType':
        print(f"Mode cannot be found for {file}")
        return

    destination_dir = os.path.join(
        bids_directory, f"sub-{subject_id}", f"ses-{session_num}", mode
    )
    # Computed before the try so the error message below can always use it.
    dest_file = os.path.join(destination_dir, file)

    try:
        os.makedirs(destination_dir, exist_ok=True)
        if not os.path.isfile(filepath):
            # Report the problem rather than skipping silently.
            print(f"{file} could not be moved: source {filepath} is not a file")
        elif os.path.isdir(dest_file):
            print(f"Is directory: {dest_file}")
        elif os.path.exists(dest_file):
            print(f"{file} already exists, can't be moved to {destination_dir}")
        else:
            shutil.copy2(filepath, dest_file)
            if os.path.isfile(dest_file):
                print(f"{file} confirmed move")
            else:
                print(f"{file} attempted to be moved but was not successful")
    except Exception as e:
        print(f"Error: {e} from moving {file} to {dest_file}")

def is_folder_empty(folder_path):
    """Return True when ``folder_path`` contains no entries at all."""
    entries = os.listdir(folder_path)
    return not entries


# ----------------------------------------------------------------------
# DICOM UNZIP & CONVERSION
# ----------------------------------------------------------------------

def unzip_and_convert_dicom(source_images, dicom_directory):
    """
    Convert every .tgz DICOM archive in ``source_images`` to NIfTI.

    For each archive ``<name>.tgz`` whose name yields a subject ID:
      1) Extract it into ``<dicom_directory>/<subject_id>/<name>``.
      2) Run dcm2niix on that folder, writing compressed NIfTI + JSON into
         ``<source_images>/<subject_id>/<name>``.
      3) Remove the extracted DICOM folder again (also when a step fails).

    Archives whose output folder already exists and is non-empty are
    skipped, so the function can be re-run safely.

    Args:
        source_images: directory holding the .tgz archives; also the parent
            of the NIfTI output folders.
        dicom_directory: scratch directory for temporarily extracted DICOMs.
    """
    for file in os.listdir(source_images):
        if not file.endswith('.tgz'):
            continue

        filepath = os.path.join(source_images, file)
        subject_id = return_subject_id_given_session_id(file)
        if subject_id is None:
            # os.path.join would raise on a None component
            print(f"Skipping {file}: no subject ID could be extracted from its name")
            continue

        archive_stem = file.split('.')[0]
        dcms_folder = os.path.join(dicom_directory, subject_id, archive_stem)
        nii_folder = os.path.join(source_images, subject_id, archive_stem)

        if os.path.exists(nii_folder) and not is_folder_empty(nii_folder):
            continue  # already converted

        os.makedirs(nii_folder, exist_ok=True)
        os.makedirs(dcms_folder, exist_ok=True)
        try:
            with tarfile.open(filepath, 'r:gz') as tar:
                # NOTE(review): extractall trusts the member paths inside the
                # archive; only use this on archives from a trusted source
                # (or pass an extraction filter on Python versions that
                # support it).
                tar.extractall(dcms_folder)

            # Convert DICOMs
            result = subprocess.run([
                'dcm2niix',
                '-z', 'y',            # Compress as .nii.gz
                '-f', '%s_%b',        # Output filename format
                '-o', nii_folder,     # Output folder
                dcms_folder
            ])
        finally:
            # Remove the DICOM folder after conversion. shutil.rmtree avoids
            # building a shell command from a path (spaces, metacharacters).
            shutil.rmtree(dcms_folder, ignore_errors=True)

        if result.returncode == 0:
            print(f"Unzipped/converted dicoms for subject {subject_id} into {nii_folder}")
        else:
            print(f"dcm2niix exited with status {result.returncode} for {file}")


def find_delete_substring(original_string, substring):
    """
    Strip every occurrence of ``substring`` from ``original_string``.

    The input string is returned unchanged when the substring is absent.
    """
    if substring not in original_string:
        return original_string
    return original_string.replace(substring, '')


# ----------------------------------------------------------------------
# DATE & FILENAME UTILITIES
# ----------------------------------------------------------------------




def date_conversion(date_string, date_format):
    """
    Parse ``date_string`` with the strptime pattern ``date_format``.

    Returns the resulting datetime. If the string does not fit the format,
    a message is printed and None is returned.
    """
    try:
        parsed = datetime.strptime(date_string, date_format)
    except ValueError as err:
        print(f"{date_string} couldn't be datetime'd: {err}")
        return None
    return parsed


# ----------------------------------------------------------------------
# METADATA GATHERING & RUN INDEXING
# ----------------------------------------------------------------------

def _session_num_for_date(sessions, file_date, tolerance_days=50):
    """Return the number of the first session within tolerance_days of file_date, else None."""
    tolerance = timedelta(days=tolerance_days)
    for _, session_num, session_date, _ in sessions:
        if isinstance(session_date, datetime) and abs(session_date - file_date) <= tolerance:
            return session_num
    return None


def gather_filenames_and_metadata(bidsdata_raw, subject_sessions=None):
    """
    Build a dictionary describing files discovered in 'bidsdata_raw':
      { folder_path: [ (file, datetime_object, run-#), ... ] }

    The tree is expected to be laid out as <sub>/<ses>/<mode>/<file>;
    non-directory entries at the first three levels are ignored.

    1) For each file, a date is parsed from the file name and matched to a
       session of that subject (within 50 days). Files with no date or no
       matching session are left out.
    2) Within each resulting folder, files are grouped by calendar date;
       the groups are numbered run-1, run-2, ... in date order, so all
       files from the same date share a run label.

    Args:
        bidsdata_raw: root directory of the sub-*/ses-*/<mode> tree.
        subject_sessions: dict of subject ID -> list of
            (subject_id, session_num, date, name) tuples. When omitted, the
            module-level variable ``subject_sessions`` is used.

    Returns:
        dict mapping folder path to a list of (filename, datetime, run) tuples.

    Raises:
        NameError: if ``subject_sessions`` is neither passed nor defined at
            module level.
    """
    if subject_sessions is None:
        # Backward compatibility with callers that rely on a module global
        try:
            subject_sessions = globals()['subject_sessions']
        except KeyError:
            raise NameError(
                "subject_sessions was not passed and is not defined at module level"
            ) from None

    def _subdirs(path):
        # Sorted for deterministic traversal; plain files are skipped
        return sorted(
            name for name in os.listdir(path)
            if os.path.isdir(os.path.join(path, name))
        )

    file_metadata = {}

    # Walk through the directory
    for sub in _subdirs(bidsdata_raw):
        subject_id = sub.replace('sub-', '')
        sessions = subject_sessions.get(subject_id, [])
        sub_dir = os.path.join(bidsdata_raw, sub)
        for ses in _subdirs(sub_dir):
            ses_dir = os.path.join(sub_dir, ses)
            for mode in _subdirs(ses_dir):
                for file in sorted(os.listdir(os.path.join(ses_dir, mode))):
                    # The date parser may signal failure with None or a
                    # (None, None) pair; handle both.
                    parsed = return_date_given_filename(file)
                    file_date_obj = parsed[0] if parsed else None
                    if not file_date_obj:
                        print(f"No date extracted for {file}")
                        continue

                    file_session_num = _session_num_for_date(sessions, file_date_obj)
                    if file_session_num is None:
                        continue

                    # The folder is keyed by the session deduced from the date
                    folder_path = os.path.join(
                        bidsdata_raw, sub, f'ses-{file_session_num}', mode
                    )
                    entries = file_metadata.setdefault(folder_path, [])
                    file_info = (file, file_date_obj)
                    if file_info not in entries:
                        entries.append(file_info)

    # Assign runs within each folder path
    for folder, files in file_metadata.items():
        # Group files by calendar date
        date_groups = defaultdict(list)
        for filename, datetime_obj in files:
            date_groups[datetime_obj.date()].append((filename, datetime_obj))

        # One run index per date group, in chronological order
        sorted_files = []
        for run_idx, date in enumerate(sorted(date_groups), start=1):
            for filename, datetime_obj in sorted(date_groups[date], key=lambda x: x[1]):
                sorted_files.append((filename, datetime_obj, f'run-{run_idx}'))

        file_metadata[folder] = sorted_files

    return file_metadata


def return_run_given_file(file_metadata, filepath, filename):
    """
    Look up the run label assigned to ``filename`` inside folder ``filepath``.

    Entries in ``file_metadata[filepath]`` have the layout
    (filename, datetime_obj, run-#); the run (index 2) of the first entry
    with a matching name is returned. None is returned when the folder is
    unknown or holds no such file.
    """
    if filepath not in file_metadata:
        return None
    runs = (entry[2] for entry in file_metadata[filepath] if entry[0] == filename)
    return next(runs, None)


def print_duplicates_filedict(file_dict):
    """
    Debug helper: report repeated (filename, run) pairs per folder.

    For every folder in ``file_dict`` the entries of the form
    (filename, datetime_obj, run) are tallied by "filename, run"; pairs seen
    more than once are printed with their count. Entries with fewer than
    three fields are reported as shorter than expected and not tallied.
    """
    for folder, entries in file_dict.items():
        print(f"Folder: {folder}")

        tally = Counter()
        for entry in entries:
            if len(entry) <= 2:
                print(f"{entry} - shorter than expected")
                continue
            tally[f"{entry[0]}, {entry[2]}"] += 1

        repeated = [(label, seen) for label, seen in tally.items() if seen > 1]
        if not repeated:
            print("No duplicates found.")
        else:
            print("Duplicates found:")
            for label, seen in repeated:
                print(f"{label}: {seen} times")
        print()


def nested_dict_head(nested_dict, num_entries=5):
    """
    Debug helper: pretty-print the first few values of a nested dictionary.

    Each top-level key is printed. If its value is a dict, every sub key and
    sub value is printed; otherwise the value itself is printed. Printing
    stops as soon as ``num_entries`` values have been shown (the check runs
    after each value, so at least one value is printed for a non-empty
    input).
    """
    printer = pprint.PrettyPrinter(indent=2)
    shown = 0
    for top_key, payload in nested_dict.items():
        print(f"Key: {top_key}")
        if isinstance(payload, dict):
            labelled = [(f"  Sub Key: {k}", v) for k, v in payload.items()]
        else:
            labelled = [(None, payload)]

        for label, item in labelled:
            if label is not None:
                print(label)
            printer.pprint(item)
            shown += 1
            if shown >= num_entries:
                return


def count_keys(nested_dict):
    """
    Print the total number of keys in a nested dictionary.

    Keys are counted at every level: each dict contributes its own key
    count plus the counts of any dict values it contains. Non-dict values
    contribute nothing. The total is printed, not returned.
    """
    def _tally(node):
        if not isinstance(node, dict):
            return 0
        return len(node) + sum(_tally(node[key]) for key in node)

    print(f"Total number of keys: {_tally(nested_dict)}")


# ----------------------------------------------------------------------
# METADATA-EDITING FUNCTIONS (INTENDEDFOR, ECHOTIMES, ETC.)
# ----------------------------------------------------------------------

def index_bids_directory(directory):
    """
    Walk ``directory`` recursively and index every file found.

    The result maps each full file path to a record:
      layout[file_path] = { 'entities': <dict> }                      (any file)
      layout[file_path] = { 'entities': <dict>, 'metadata': <dict> }  (.json files)
    where 'entities' comes from extract_bids_entities and 'metadata' is the
    parsed JSON content. The returned mapping is a defaultdict(dict).
    """
    layout = defaultdict(dict)
    for root, _, files in os.walk(directory):
        for name in files:
            full_path = os.path.join(root, name)
            record = {'entities': extract_bids_entities(full_path)}
            if name.endswith('.json'):
                record['metadata'] = parse_json(full_path)
            layout[full_path] = record
    return layout


def update_intendedfor(layout, sub, ses, overwrite=False):
    """
    Assign 'IntendedFor' (and, where missing, 'TaskName') in JSON sidecars.

    Steps, for every .nii.gz of the given subject and session:
      1) Skip files whose path contains '.bidsignore' and files that have
         no JSON sidecar next to them (there is nothing to update, and
         trying would abort the whole run).
      2) If the file name contains 'task' and the indexed metadata has no
         'TaskName', write TaskName = 'Rest' to the sidecar.
      3) For fieldmaps (name contains 'phase' or 'magnitude'): write
         'IntendedFor' listing the session's _bold/_dwi scans, but only
         when it is missing from the indexed metadata or ``overwrite`` is
         True.
      4) For _bold/_dwi scans: always write 'IntendedFor' listing the
         session's fieldmaps.

    Args:
        layout: index as built by index_bids_directory.
        sub: subject string to match in file paths.
        ses: session string to match in file paths.
        overwrite: replace an existing fieldmap 'IntendedFor' when True.

    Note:
        ``layout`` is not refreshed after sidecars are written; re-index if
        later steps need the updated metadata.
    """
    niftis = query_files(layout, sub=sub, ses=ses, suffix='.nii.gz')

    def _write_intended_for(nifti_filepath, nifti_filename, json_filepath):
        # Show the previous value, then compute and store the new list
        intended_prev = get_element_raw('IntendedFor', nifti_filepath)
        print(f"For {nifti_filename} - previous: {intended_prev}")
        IntendedFor_list = intended_for_gen(nifti_filepath, niftis, layout)
        print(f'Updated IntendedFor: {IntendedFor_list}')
        update_json_key(json_filepath, 'IntendedFor', IntendedFor_list)

    for nifti_filepath in niftis:
        if '.bidsignore' in nifti_filepath:
            continue
        nifti_filename = os.path.basename(nifti_filepath)

        # JSON sidecar
        json_filepath = nifti_filepath.replace('.nii.gz', '.json')
        json_filename = nifti_filename.replace('.nii.gz', '.json')
        if not os.path.isfile(json_filepath):
            print(f"No JSON sidecar found for {nifti_filename}, skipping")
            continue

        # Add TaskName if the name carries a task entity but no 'TaskName' field
        task = get_element('TaskName', layout, json_filepath)
        if 'task' in nifti_filename and (task is None):
            update_json_key(json_filepath, 'TaskName', 'Rest')
            print(f'Added TaskName to {json_filename}')

        # For fieldmap: add IntendedFor
        if ('phase' in nifti_filename or 'magnitude' in nifti_filename):
            if overwrite or get_element('IntendedFor', layout, json_filepath) is None:
                _write_intended_for(nifti_filepath, nifti_filename, json_filepath)

        # For bold/dwi: add IntendedFor
        if '_bold' in nifti_filename or '_dwi' in nifti_filename:
            _write_intended_for(nifti_filepath, nifti_filename, json_filepath)


def intended_for_gen(matchto_json_path, niftis, layout):
    """
    Pair fieldmaps with BOLD/DWI scans (or the reverse) among ``niftis``.

    If ``matchto_json_path`` looks like a fieldmap (contains 'phase' or
    'magnitude'), the BOLD/DWI paths in ``niftis`` are collected; otherwise,
    if it looks like a BOLD/DWI scan (contains '_bold' or '_dwi'), the
    fieldmap paths are collected. Each collected path is shortened with
    extract_IntendedFor_path; paths that cannot be shortened are dropped.

    Returns a sorted list without duplicates (empty when the input path is
    neither kind). ``layout`` is accepted but not used.
    """
    fieldmap_markers = ('phase', 'magnitude')
    scan_markers = ('_bold', '_dwi')

    def _has_any(text, markers):
        return any(marker in text for marker in markers)

    # Decide once which kind of partner file we are looking for
    if _has_any(matchto_json_path, fieldmap_markers):
        wanted = scan_markers
    elif _has_any(matchto_json_path, scan_markers):
        wanted = fieldmap_markers
    else:
        wanted = ()

    collected = set()
    for candidate in niftis:
        if not _has_any(candidate, wanted):
            continue
        piece = extract_IntendedFor_path(candidate)
        if piece:
            collected.add(piece)

    return sorted(collected)


# Helper: Query files in the layout that match sub, ses, and suffix
def query_files(layout, sub=None, ses=None, suffix=None):
    """
    Return the file paths in ``layout`` that match the given filters.

    Args:
        layout: mapping whose keys are file paths.
        sub: substring that must occur in the path (e.g. a subject label);
            None means "any subject".
        ses: substring that must occur in the path (e.g. a session label);
            None means "any session".
        suffix: required path ending (e.g. '.nii.gz'); None means "any".

    Returns:
        List of matching paths, in the layout's iteration order.

    Note:
        sub/ses are plain substring tests, so 'sub-1' also matches
        'sub-10'; pass labels that are unambiguous in your dataset.
    """
    results = []
    for file_path in layout:
        # A filter left at None is a wildcard ("None in str" would raise)
        if sub is not None and sub not in file_path:
            continue
        if ses is not None and ses not in file_path:
            continue
        if suffix is not None and not file_path.endswith(suffix):
            continue
        results.append(file_path)
    return results


# Helper: Extract an element (e.g., 'TaskName') from the layout's metadata
def get_element(element, layout, file_path):
    """
    Read ``element`` from the metadata indexed for ``file_path``.

    Returns None when the layout record has no 'metadata' or the metadata
    lacks the element.
    """
    record = layout[file_path]
    if 'metadata' not in record:
        return None
    return record['metadata'].get(element, None)


# Helper: Extract an element directly from the JSON sidecar (bypassing the layout dict)
def get_element_raw(element, nifti_file_path):
    """
    Read ``element`` straight from a JSON sidecar on disk (bypassing the layout).

    Args:
        element: key to look up in the JSON object.
        nifti_file_path: path containing '.nii.gz' (the sidecar path is
            derived by replacing it with '.json') or a path containing
            '.json' (used as is).

    Returns:
        The element's value, or None if the JSON lacks it. When the path is
        of neither kind, or the sidecar does not exist, a descriptive
        message string is returned instead.
    """
    if '.nii.gz' in nifti_file_path:
        json_file_path = nifti_file_path.replace('.nii.gz', '.json')
    elif '.json' in nifti_file_path:
        json_file_path = nifti_file_path
    else:
        return f"Not a valid filepath: {nifti_file_path}"

    if not os.path.isfile(json_file_path):
        return f"No JSON file associated with {nifti_file_path}"

    # Explicit UTF-8, matching how the other JSON helpers read and write
    with open(json_file_path, 'r', encoding='utf-8') as file:
        metadata = json.load(file)
    return metadata.get(element, None)


# Helper: BIDS Entities from filepath (e.g., sub-04S0300, ses-1)
def extract_bids_entities(file_path):
    """
    Extract the subject and session labels from a BIDS path or file name.

    The first ``sub-<label>`` in the string is used. A session is picked up
    when ``ses-<label>`` follows it directly, separated by an underscore
    (file-name form, ``sub-01_ses-1_...``) or by a path separator
    (directory form, ``sub-01/ses-1/...``).

    Returns:
        {'subject': <label>, 'session': <label or None>}, or an empty dict
        when the string contains no ``sub-`` entity.
    """
    # [/\\_] : accept directory layout (either slash) as well as file names
    pattern = r"sub-(?P<subject>[a-zA-Z0-9]+)(?:[/\\_]ses-(?P<session>[a-zA-Z0-9]+))?"
    match = re.search(pattern, file_path)
    if match:
        return match.groupdict()
    return {}


# Helper: Parse JSON
def parse_json(file_path):
    """Read the UTF-8 encoded JSON file at ``file_path`` and return its parsed content."""
    with open(file_path, 'r', encoding='utf-8') as handle:
        text = handle.read()
    return json.loads(text)


# Helper: Add or update a JSON element
def update_json_key(file_path, key, value):
    """
    Set ``key`` to ``value`` in the JSON file at ``file_path``.

    The file is read, the key is added or overwritten, and the whole object
    is written back with 4-space indentation. A confirmation is printed.
    """
    with open(file_path, 'r', encoding='utf-8') as source:
        payload = json.load(source)

    payload[key] = value

    with open(file_path, 'w', encoding='utf-8') as target:
        json.dump(payload, target, indent=4)

    print(f'New field added/updated [{key}]: {value}')


def extract_IntendedFor_path(full_path):
    """
    Shorten a full path to the part that follows the subject directory.

    For
      /projects/.../sub-04S0300/ses-1/func/sub-04S0300_ses-1_task-rest_bold.nii.gz
    the result is
      ses-1/func/sub-04S0300_ses-1_task-rest_bold.nii.gz
    i.e. everything after the first '/sub-<label>/' component, which is the
    form stored in `IntendedFor`. Returns None when the path contains no
    such component.
    """
    subject_dir = re.search(r'/sub-[^/]+/', full_path)
    return full_path[subject_dir.end():] if subject_dir else None


# ----------------------------------------------------------------------
# ECHO TIMES / PHASE ENCODING DIRECTIONS, JSON STUFF ETC.
# ----------------------------------------------------------------------

def add_echotimes(layout, sub, ses):
    """
    Fill in EchoTime1 / EchoTime2 on the 'phasediff' sidecars of a session.

    For every session NIfTI whose path contains 'phasediff.', the sidecar's
    EchoTime1 and EchoTime2 are read. Unless both are already numbers, each
    'magnitude1' / 'magnitude2' image of the same session that reports a
    truthy EchoTime has that value written into the phasediff sidecar as
    EchoTime1 / EchoTime2 (a later match overwrites an earlier one).
    Afterwards EchoNumber is set to 2 when both echo times are present, or
    to 1 when only one of them is.
    """
    # (path fragment identifying the magnitude image, sidecar key it supplies)
    echo_sources = (('magnitude1', 'EchoTime1'), ('magnitude2', 'EchoTime2'))

    niftis = query_files(layout, sub=sub, ses=ses, suffix='.nii.gz')
    for phasediff_path in niftis:
        if 'phasediff.' not in phasediff_path:
            continue

        json_path = phasediff_path.replace('.nii.gz', '.json')
        first_echo = get_element('EchoTime1', layout, json_path)
        second_echo = get_element('EchoTime2', layout, json_path)

        # Nothing to do when both echo times are already numeric.
        if isinstance(first_echo, (int, float)) and isinstance(second_echo, (int, float)):
            continue

        # Borrow the echo times from the session's magnitude images.
        for candidate in niftis:
            for fragment, echo_key in echo_sources:
                if fragment not in candidate:
                    continue
                candidate_json = candidate.replace('.nii.gz', '.json')
                e_time = get_element('EchoTime', layout, candidate_json)
                if e_time:
                    update_json_key(json_path, echo_key, e_time)
                    print(f"Updated {echo_key} in phasediff from {candidate_json}")

        # Re-read the sidecar to see which echo times it holds now.
        has_first = get_element('EchoTime1', layout, json_path)
        has_second = get_element('EchoTime2', layout, json_path)
        if has_first and has_second:
            update_json_key(json_path, 'EchoNumber', 2)
        elif has_first or has_second:
            update_json_key(json_path, 'EchoNumber', 1)


def add_direction(layout, sub, ses):
    """
    Give 'dir-PA' / 'dir-AP' images a PhaseEncodingDirection when it is absent.

    For each session NIfTI whose path contains 'dir-PA' or 'dir-AP', the
    sidecar is checked; when PhaseEncodingDirection is missing (falsy) it is
    written as 'j' for 'dir-PA' files and 'j-' for 'dir-AP' files. Sidecars
    that already carry a value are left alone.
    """
    for nifti_path in query_files(layout, sub=sub, ses=ses, suffix='.nii.gz'):
        # 'dir-PA' is tested first, so it wins if a path contains both tags.
        if 'dir-PA' in nifti_path:
            encoding = 'j'
        elif 'dir-AP' in nifti_path:
            encoding = 'j-'
        else:
            continue

        json_path = nifti_path.replace('.nii.gz', '.json')
        if not get_element('PhaseEncodingDirection', layout, json_path):
            update_json_key(json_path, 'PhaseEncodingDirection', encoding)


def scale_SliceTiming(numbers):
    """
    Rescale SliceTiming values by a power of ten.

    Example helper for sidecars whose SliceTiming is drastically off (not
    necessarily used in all pipelines). The power of ten is chosen so that
    the largest absolute value ends up in [0.05, 1): values >= 1 are divided
    down, values < 0.05 are multiplied up.

    Parameters
    ----------
    numbers : list of int/float
        The SliceTiming values.

    Returns
    -------
    (scaled_numbers, scale_factor)
        `scaled_numbers[i] == numbers[i] / scale_factor`. All-zero input
        cannot be rescaled and is returned with scale_factor 1.

    Raises
    ------
    ValueError
        If `numbers` is empty or contains an infinite value.
    """
    if not numbers:
        raise ValueError("Input list cannot be empty.")

    max_abs_value = max(abs(num) for num in numbers)
    # inf / 10 is still inf, so the scaling loop below would never finish.
    if max_abs_value == float('inf'):
        raise ValueError("Input list cannot contain infinite values.")

    scale_factor = 1
    while max_abs_value >= 1:
        max_abs_value /= 10
        scale_factor *= 10
    # The `0 <` guard matters: zero can never be multiplied up to 0.05, so
    # an all-zero list used to spin here forever.
    while 0 < max_abs_value < 0.05:
        max_abs_value *= 10
        scale_factor /= 10

    scaled_numbers = [num / scale_factor for num in numbers]
    return scaled_numbers, scale_factor





# ----------------------------------------------------------------------
# BIDS filenaming, for files which don't have a BidsGuess field in JSON
# ----------------------------------------------------------------------


# Move and rename those files with no "BidsGuess" in JSON
def filename_no_bidsguess(filepath, filename, sub, ses):
    """
    Build a BIDS-style file name for a file whose sidecar has no BidsGuess.

    The name is assembled from metadata read out of the JSON sidecar next to
    `filepath` (via `get_element_raw`) in the order
    sub, ses, task, acq, dir, run, suffix; entities that could not be
    determined are left out.

    Parameters
    ----------
    filepath : str
        Path of the source file (NIfTI or its JSON sidecar).
    filename : str
        Ignored; the name is always taken from `filepath`. Kept so existing
        callers keep working.
    sub, ses : str
        Already-prefixed entities, e.g. 'sub-04S0300' and 'ses-1'.

    Returns
    -------
    str
        The new file name including its extension, or a message starting
        with 'Unable to extract ...' when a required piece is missing
        (callers detect failure by looking for 'Unable' in the result).

    Notes
    -----
    Missing sidecar fields are treated as missing. Wrapping the lookups in
    `str()` used to turn None into the truthy text 'None', which disabled
    every "field is missing" branch and produced names such as 'run-None'
    or 'magnitudeNone'; the error strings returned by `get_element_raw`
    were likewise read as if they were metadata.
    """
    path = os.path.normpath(filepath)
    filename = os.path.basename(path)
    task, acq, direction, run, end = "", "", "", "", ""

    ##### extension
    if filename.endswith('.nii.gz'):
        file_ext = 'nii.gz'
    else:
        file_ext = filename.split('.')[-1]

    dir_name = os.path.basename(os.path.dirname(path))

    ##### MODE: SeriesDescription first, then folder name, then file name
    unrecognized = "UnrecognizedFileType"
    protocol = _sidecar_text("SeriesDescription", path)
    mode_of_folder = find_mode(dir_name)
    mode = find_mode(protocol) if protocol else unrecognized
    if mode == unrecognized:
        mode = mode_of_folder
    if mode == unrecognized:
        mode = find_mode(filename)
    if mode == unrecognized:
        return f'Unable to extract mode for: {path}'
    if unrecognized in mode:
        return f"Unable to extract mode for {path}"

    ###### DIR-: PhaseEncodingDirection, falling back to PhaseEncodingAxis
    direction_extracted = _sidecar_text("PhaseEncodingDirection", path)
    axis_extracted = _sidecar_text("PhaseEncodingAxis", path)
    phase_source = direction_extracted or axis_extracted
    if phase_source:
        # 'j-' must be tested before 'j', which it contains.
        if 'j-' in phase_source:    # 'j-': Anterior-Posterior
            direction = 'dir-AP'
        elif 'j' in phase_source:   # 'j': Posterior-Anterior
            direction = 'dir-PA'
    elif 'dwi' in mode or 'func' in mode:
        return (f"Unable to extract dir (direction: {direction_extracted or None}) "
                f"for {path}; mode is {mode}")

    ####### RUN-: SeriesNumber; only fmap files may go without one
    run_num = _sidecar_text("SeriesNumber", path)
    if run_num:
        run = 'run-' + run_num
    elif 'fmap' not in mode:
        return f"Unable to extract run for {path}; mode is {mode}"

    ####### ACQ-, END & TASK-
    if mode == 'anat':
        protocol_lower = protocol.lower()
        # NOTE(review): a protocol naming neither T1 nor T2 leaves the
        # suffix empty, so the resulting name has no BIDS suffix -- confirm
        # this is wanted for e.g. flash scans.
        if 't1' in protocol_lower:
            end = 'T1w'
        elif 't2' in protocol_lower:
            end = 'T2w'

        if 'SPC' in protocol:
            acq = 'acq-spc3'
        elif 'MPR' in protocol:
            acq = 'acq-tfl3'
        elif 'flash' in protocol:
            # elif chain: with two independent `if`s the 3.5mm label was
            # always overwritten by the final else.
            if '3.5mm' in protocol:
                acq = 'acq-flash3'
            elif '2mm' in protocol:
                acq = 'acq-flash2'
            else:
                acq = 'acq-flash'
        else:
            return f"Unable to extract acq for {path}; mode is {mode}"

    if mode == 'func':
        task = 'task-rest'
        end = 'bold'
        if 'MB_bold' in protocol:
            acq = 'acq-epfid2m5'
        elif 'ep2d_bold' in protocol:
            acq = 'acq-epfid2'
        else:
            return f"Unable to extract acq for {path}; mode is {mode}"

    if mode == 'fmap':
        e = _sidecar_text("EchoNumber", path)   # '' when the field is absent
        image = _sidecar_value("ImageType", path)
        if not image:
            return f"Unable to extract acq for {path}, no ImageType"
        if len(image) <= 2:
            return f"Unable to extract acq for {path}, ImageType is {image}"

        image_kind = image[2]
        if image_kind == "OTHER":
            if len(image) > 3:
                if image[3] == "REAL":
                    acq = 'acq-real' + e
                    end = 'magnitude' + e
                elif image[3] == "IMAGINARY":
                    acq = 'acq-imaginary' + e
                    end = 'magnitude' + e
                elif image[3] == "PHASE":
                    end = 'phase' + e
            else:
                end = 'magnitude' + e
        elif "M" in image_kind or "P" in image_kind:
            acq = 'acq-fm2'
            if image_kind == "M":
                end = 'magnitude' + e
            elif image_kind == "P":
                end = 'phasediff'

    if mode == 'dwi':
        end = 'dwi'
        sequence = _sidecar_text("SequenceName", path)
        scanseq = _sidecar_text("ScanningSequence", path)
        if sequence:
            if "ep_b0" in sequence:
                acq = 'acq-epb0'
        elif scanseq:
            if "EP\\RM" in scanseq:
                acq = 'acq-eprm'
        else:
            return f"Unable to extract acq for {path}"

    # Join the entities that were found, then tidy doubled separators.
    parts = [part for part in (sub, ses, task, acq, direction, run, end) if part]
    new_filename = '_'.join(parts).replace('__', '_')
    name_final = (new_filename + '.' + file_ext).replace('..', '.')
    return name_final


def _sidecar_value(element, path):
    """Return sidecar field `element` for `path`, or None when it cannot be read."""
    value = get_element_raw(element, path)
    # get_element_raw reports a bad path / missing sidecar as a message
    # string; that is "no value", not metadata.
    if isinstance(value, str) and value.startswith(
            ("Not a valid filepath", "No JSON file associated")):
        return None
    return value


def _sidecar_text(element, path):
    """Return sidecar field `element` as text, or '' when it is missing."""
    value = _sidecar_value(element, path)
    return "" if value is None else str(value)



In [12]:
# Make subject_sessions list
# `subject_sessions` is indexed by subject ID below and is read again by the
# later cell that copies files into the BIDS tree, so this cell has to run first.
subject_sessions = create_subject_sessions_dict2(source_images)

# Extract sublists
ids = extract_subject_ids(subject_sessions)
#session_ids = extract_session_ids(subject_sessions)
#session_nums = extract_session_nums(subject_sessions)


# Make filenames metadata list
bids_raw = '/projects/f_ah1491_1/open_data/NAPLS3/bidsdata_raw'
#file_metadata = gather_filenames_and_metadata(bids_raw)

# Example, subject sessions
# Sanity check: look up the session for one known subject/date pair.

session_num = extract_ses_given_date(subject_sessions, '01S0301', '2015-03-17')
print(f'Session for sub 01S0301, date 20150317, is: {session_num}')

print(count_keys(subject_sessions))

# Raises KeyError if this subject is not present in the source data.
print("\nSubject sessions, 06S0333: \n")
print(subject_sessions['06S0333'])


None


In [None]:
## Step 1: Unzip and Convert DICOMS
# NOTE(review): both names are re-assigned here; `dicom_directory` now points
# somewhere other than the value set in the constants section at the top of
# the file, and every later cell sees this new value.
source_images = "/projects/f_ah1491_1/open_data/NAPLS3/sourcedata/image03"
dicom_directory ="/scratch/f_ah1491_1/open_data/NAPLS3/sourcedata/image03" #This should be somewhere that can hold a lot of files, like a scratch folder
unzip_and_convert_dicom(source_images, dicom_directory)



In [None]:
########### Move unzipped source files to bidsdata ##################
# Copies every converted file found under `source_images` into
# <target_dir>/<sub>/<ses>/<mode>/ under a BIDS-style name. The name comes from
# the sidecar's BidsGuess when there is one, otherwise from
# filename_no_bidsguess(). Files that cannot be placed are recorded in the
# bookkeeping lists below instead of stopping the run.
# Requires `subject_sessions` from the earlier cell.
source_folder = source_images
target_dir = '/projects/f_ah1491_1/open_data/NAPLS3/bidsdata'
if not os.path.exists(target_dir):
    os.makedirs(target_dir, exist_ok=True)

# Bookkeeping lists, read by the summary code that follows.
exists_already = []                   # '<src_path> -> <new name>' for names already taken
no_session_number = []                # no date in the name / no session for that date
failed_to_extract = []                # other session-lookup errors
no_bids_guess = []                    # sidecar present but without a BidsGuess field
bids_guess_invalid = []               # BidsGuess present but unusable
not_valid_source_file = []            # neither .nii.gz nor .json (e.g. .bval/.bvec)
no_return_filename_no_bidsguess = []  # 'Unable to extract ...' messages
no_json_file = []                     # NIfTI without a sidecar on disk
moved = []                            # target paths that were actually written
count_moved = 0                       # number of files actually copied


def _iter_session_files(session_dir, subject_id):
    """
    Yield (src_path, mode_folder, filename) for every file of one session folder.

    Files are expected either directly inside a mode folder
    (<session>/<mode>/<file>) or one level deeper inside a folder named after
    the subject (<session>/<x>/<subject_id>/<mode>/<file>). Every file is
    yielded; processing after the nested loops had finished meant that only
    the last nested file was ever handled.
    """
    for mode_folder1 in os.listdir(session_dir):
        mode_dir = os.path.join(session_dir, mode_folder1)
        if not os.path.isdir(mode_dir):
            continue
        for entry in os.listdir(mode_dir):
            entry_path = os.path.join(mode_dir, entry)
            if not os.path.isdir(entry_path):
                yield entry_path, mode_folder1, entry
            elif entry == subject_id:
                for mode_folder2 in os.listdir(entry_path):
                    nested_dir = os.path.join(entry_path, mode_folder2)
                    if not os.path.isdir(nested_dir):
                        continue
                    for nested_file in os.listdir(nested_dir):
                        nested_path = os.path.join(nested_dir, nested_file)
                        if os.path.isfile(nested_path):
                            yield nested_path, mode_folder2, nested_file


def _copy_if_new(src_path, target_folder, name_final):
    """Copy `src_path` to target_folder/name_final unless that name is taken; return True when copied."""
    target_path = os.path.join(target_folder, name_final)
    if os.path.exists(target_path):
        exists_already.append(f'{src_path} -> {name_final}')
        return False
    os.makedirs(target_folder, exist_ok=True)
    shutil.copy2(src_path, target_path)
    moved.append(target_path)
    return True


for session_folder in os.listdir(source_folder):
    session_dir = os.path.join(source_folder, session_folder)
    if not (os.path.isdir(session_dir) and session_folder.startswith('0')):
        continue

    # Return subject_id from folder name
    subject_id = return_subject_id_given_session_id(session_folder)
    sub = 'sub-' + subject_id

    for src_path, mode_folder, filename in _iter_session_files(session_dir, subject_id):
        # --- session number, looked up from the date in the file name ---
        file_date_obj = return_date_given_filename(filename)
        if not file_date_obj:
            print(f"No file_date extracted from {filename}\n")
            no_session_number.append(filename)
            continue

        file_session_number = extract_ses_given_date(subject_sessions, subject_id, file_date_obj)
        # Compare as text so a numeric session number cannot break the `in` tests.
        session_text = str(file_session_number)
        if "Error" in session_text:
            if "no sessions found within" in session_text:
                print(f"File date could not be found for sub-{subject_id}, {file_date_obj}, {filename}")
                print(f"All dates for {subject_id}:\n {extract_dates_given_sub(subject_sessions, subject_id)}\n\n")
                no_session_number.append(filename)
            else:
                print(f"No session number extracted from {filename}: {file_session_number}\n")
                failed_to_extract.append(filename)
            continue

        ses = f'ses-{session_text}'

        # Keep the extension
        if filename.endswith('.nii.gz'):
            file_ext = 'nii.gz'
        else:
            file_ext = filename.split('.')[-1]

        # Extract mode
        series = get_element_raw("SeriesDescription", src_path)
        mode = find_mode(series)

        # Extract filename via BidsGuess
        bids_guess = get_element_raw("BidsGuess", src_path)
        print(bids_guess)
        guess_message = bids_guess if isinstance(bids_guess, str) else ''

        # NO BidsGuess: get_element_raw returns None for a missing key (the
        # message text is still accepted for compatibility). Testing
        # `'...' in None` used to raise TypeError here.
        if bids_guess is None or 'BidsGuess not found in the metadata' in guess_message:
            no_bids_guess.append(src_path)
            name_final = filename_no_bidsguess(src_path, filename, sub, ses)
            print(f'function filename_no_bidsguess-> {name_final}')

            # Was `not 'Unable' in x or 'unable' in x`, which parses as
            # `(not ...) or (...)` and let lower-case messages through.
            if 'unable' in name_final.lower():
                no_return_filename_no_bidsguess.append(name_final)
                print('appended to no_return_filename_no_bidsguess')
            else:
                # NOTE(review): `mode` here comes from SeriesDescription only,
                # without the folder-name fallback used for valid BidsGuess
                # entries below -- confirm that is intended.
                target_folder = os.path.join(target_dir, sub, ses, mode)
                if _copy_if_new(src_path, target_folder, name_final):
                    print(f"MOVED {name_final} from {filename}")
                    count_moved += 1

        elif 'Not a valid filepath' in guess_message:
            not_valid_source_file.append(src_path)
            print('appended to not_valid_source_file')

        elif 'No JSON file associated' in guess_message:
            no_json_file.append(src_path)
            print('appended to no_json_file')

        # VALID BidsGuess: a [mode, name-fragment] pair
        elif isinstance(bids_guess, (list, tuple)) and len(bids_guess) > 1:
            guess_name = bids_guess[1]
            if not ('acq' in guess_name or 'run' in guess_name):
                # Previously skipped without a trace; record it so the file
                # shows up in the summary.
                bids_guess_invalid.append(f'BidsGuess: {bids_guess}, file: {src_path}')
                continue

            # Extract mode
            mode = bids_guess[0]
            if mode == 'UnrecognizedFileType':
                mode = find_mode(mode_folder)
            target_folder = os.path.join(target_dir, sub, ses, mode)

            # Create filename: make sure sub and ses are present exactly
            # once, and give functional runs their task entity. All
            # branches work on the name fragment (bids_guess[1]); calling
            # .replace on the list itself raised AttributeError.
            session_entities = f'{ses}_task-rest' if 'func' in mode else ses
            if sub in guess_name and ses in guess_name:
                new_filename = guess_name.replace(ses, session_entities)
            elif sub in guess_name:
                new_filename = guess_name.replace(sub, f'{sub}_{session_entities}')
            elif ses in guess_name:
                new_filename = '_'.join([sub, guess_name.replace(ses, session_entities)])
            else:
                new_filename = '_'.join([sub, session_entities, guess_name])

            # Clean and add extension
            new_filename = new_filename.replace('__', '_')
            name_final = new_filename + '.' + file_ext
            print('name_final is:', name_final)

            if _copy_if_new(src_path, target_folder, name_final):
                print(f"MOVED {name_final} from {filename}")
                count_moved += 1

        else:
            bids_guess_invalid.append(f'BidsGuess: {bids_guess}, file: {src_path}')
                                                  
                            
                                        
                                        
# Summary of the copy step: one line per bookkeeping list.
summary_lines = (
    f'\n\n\nNo session number found for the date of {len(no_session_number)} files',
    f'The function to extract_session_number didnt work for {len(failed_to_extract)} files',
    f'There was no BidsGuess in the .json file for {len(no_bids_guess)} files',
    f'BidsGuess didnt have valid format for {len(bids_guess_invalid)} files',
    f'Couldnt convert source to .json to find BidsGuess for {len(not_valid_source_file)} files',
    f'Couldnt find JSON file for {len(no_json_file)} files',
    f'Couldnt move, exists already-- {len(exists_already)} files',
    f'Function filename_no_bidsguess didnt return anything for {len(no_return_filename_no_bidsguess)} files',
)
for line in summary_lines:
    print(line)

# Split the '<src_path> -> <new name>' records of name collisions back into
# their parts so source files and target names can be counted separately.
src_path_not_moved = []
src_name_not_moved = []
name_final_already_exists = []

for string in exists_already:  # len 2801 files
    parts = string.split(' -> ')
    if len(parts) < 2:
        # No ' -> ' separator in this record; nothing to split.
        continue

    src_path, name_final = parts[0], parts[1]
    path = os.path.normpath(src_path)
    filename = os.path.basename(path)

    src_path_not_moved.append(src_path)
    src_name_not_moved.append(filename)
    name_final_already_exists.append(name_final)

print(f'Unique source name not moved {len(set(src_name_not_moved))}')
print(f'Unique  name_final which already_exists {len(set(name_final_already_exists))}')

# Distinct 'Unable to extract ...' messages (1969 in total, 1915 unique)
print(len(set(no_return_filename_no_bidsguess)))

# Distinct source files without a JSON sidecar (30 files)
print(len(set(no_json_file)))
          

In [None]:
####### COUNTING BEFORE & AFTER ###########
# Compare how many imaging-related files exist in the source tree and in the
# BIDS tree, by file name (total and distinct names).
counted_extensions = ('.nii.gz', '.json', '.bval', '.bvec')

source_files = [
    filename
    for dirpath, dirnames, filenames in os.walk(source_images)
    for filename in filenames
    if filename.endswith(counted_extensions)
]

bids_files = [
    filename
    for dirpath, dirnames, filenames in os.walk(bids_directory)
    for filename in filenames
    if filename.endswith(counted_extensions)
]

print('Count of source dir files:', len(source_files), 'bidsdata_all files:', len(bids_files))
# Last recorded run: Count of source dir files: 127757 bidsdata_all files: 41155
print('Unique source dir files:', len(set(source_files)), 'Unique bidsdata_all files:', len(set(bids_files)))
# Last recorded run: Unique source dir files: 83910 Unique bidsdata_all files: 41155

In [None]:
###### ADD INTENDEDFOR TO THE JSON ##########
# Index one dataset and update IntendedFor for each of its subject/session
# folders. Point `directory` at the dataset to process.
directory = '/projects/f_ah1491_1/open_data/NAPLS3/fmap_test'

layout = index_bids_directory(directory)
# Walk the same tree the layout was built from. The loop used to list
# `bids_directory`, so the subjects/sessions it visited did not have to
# exist in `layout` at all.
for sub in os.listdir(directory):
    sub_dir = os.path.join(directory, sub)
    if not os.path.isdir(sub_dir):
        continue
    for ses in os.listdir(sub_dir):
        if os.path.isdir(os.path.join(sub_dir, ses)):
            update_intendedfor(layout, sub, ses, overwrite=True)
 

In [None]:
######### Generate dataset_description.json #########
# Write the dataset-level description file into the root of the BIDS dataset.
dataset_description = {
    "Name": "1/9 to 9/9 Predictors and Mechanisms of Conversion to Psychosis (NAPLS3)",
    "BIDSVersion": "1.4.0",
    "License": "License info here",  # TODO: replace the placeholder with the real license
    "Authors": ["Jean Addington", "Kristin Cadenhead", "Tyrone Cannon", "Barbara Cornblatt", "Daniel Mathalon", "Diana Perkins", "Larry Seidman", "Elaine Walker", "Scott Woods"],
}
# The file belongs next to the sub-* folders, i.e. in the bidsdata directory
# the copy step writes to -- not in its parent, where it was written before.
bidsdata = '/projects/f_ah1491_1/open_data/NAPLS3/bidsdata'
with open(os.path.join(bidsdata, "dataset_description.json"), "w", encoding="utf-8") as json_file:
    json.dump(dataset_description, json_file, indent=4)

print(f"Dataset description created at {bidsdata}.")


### DEBUGGING & UTILITY CELLS

In [None]:
# Print/Check NIfTI header (dimensions, volumes, etc.)
import os
import nibabel as nib

# Dump the header of every .nii.gz found below one subject folder.
for dirpath, dirnames, filenames in os.walk('/home/kj537/_f_ah1491_1/open_data/NAPLS3/synmap_test/sub-09S0348/'):
    for file in filenames:
        print('yes')  # progress marker, printed once per file of any type
        if not file.endswith('.nii.gz'):
            continue

        # Load by full path; the bare file name is not resolvable from here.
        filepath = os.path.join(dirpath, file)
        img = nib.load(filepath)
        header = img.header

        dim = header['dim']  # 'dim' field of the header
        print(f"{file}: \n Header = \n{header}")


In [None]:
########### RENAMING files ###############
# Remove empty 'acq-' entities ('acq-_') from file names in the BIDS tree.

#subjIDs_list = ids
#subjIDs_list= ['sub-04S0318','sub-04S0350','sub-05S0326','sub-06S0309','sub-06S0341','sub-07S0341','sub-07S0399']
bids_directory = "/projects/f_ah1491_1/open_data/NAPLS3/bidsdata"

# Walk the directory set just above. `target_dir` is re-pointed by other
# cells, so walking it made the result depend on which cell ran last.
for root, _, files in os.walk(bids_directory):
    for file in files:
        if 'acq-_' not in file:
            continue

        new_filename = file.replace('acq-_', '')
        src_path = os.path.join(root, file)
        target_path = os.path.join(root, new_filename)

        # os.rename can silently replace an existing file; never overwrite.
        if os.path.exists(target_path):
            print(f"Could not rename {file}, error: {target_path} already exists")
            continue

        try:
            os.rename(src_path, target_path)
        except OSError as e:
            print(f"Could not rename {file}, error: {e}")
        else:
            print(f'Renamed to {new_filename} from {file}')

In [None]:
########### Moving groups of files to .bidsignore ################
# Move every file of the selected mode folders to
# <target_dir>/.bidsignore/<sub>/<ses>/<mode>/, and delete selected folders
# whose name contains 'discard'.
#
# NOTE(review): the cell did not compile -- `if 'fmap' in mode:` had no
# indented body. It is repaired in the least destructive way: everything
# below is restricted to folders whose name contains 'fmap'. Confirm that
# this is the intended selection before running.
# NOTE(review): the BIDS validator reads `.bidsignore` as a pattern file;
# here it is used as a holding directory -- confirm that is intended.

#subjIDs_list = ids
subjIDs_list= [] # 'sub-09S0348' 'sub-04S0300'
target_dir = "/projects/f_ah1491_1/open_data/NAPLS3/synmap_test"

for sub in os.listdir(target_dir):
    sub_dir = os.path.join(target_dir, sub)
    # Skip hidden entries so the '.bidsignore' holding folder is never
    # treated as a subject.
    if sub.startswith('.') or not os.path.isdir(sub_dir):
        continue

    for ses in os.listdir(sub_dir):
        ses_dir = os.path.join(sub_dir, ses)
        if not os.path.isdir(ses_dir):
            continue

        for mode in os.listdir(ses_dir):
            mode_dir = os.path.join(ses_dir, mode)
            if 'fmap' not in mode or not os.path.isdir(mode_dir):
                continue

            if 'discard' in mode:
                # shutil.rmtree raises on failure; os.system('rm -r ...')
                # only returned an exit code, so its try/except never fired
                # (and the path was passed to the shell unquoted).
                try:
                    shutil.rmtree(mode_dir)
                except OSError as e:
                    print(f"Could not delete 'discard': {e}")
                # Deleted (or undeletable) folder: nothing to list or move.
                continue

            target_folder = os.path.join(target_dir, '.bidsignore', sub, ses, mode)
            for file in os.listdir(mode_dir):
                src_path = os.path.join(mode_dir, file)
                os.makedirs(target_folder, exist_ok=True)
                target_path = os.path.join(target_folder, file)
                try:
                    shutil.move(src_path, target_path)
                    print("Moved to:", target_path)
                except OSError as e:
                    print(f"Could not rename {file}, error: {e}")

                        

In [None]:
############## Move a single file to .bidsignore #############
# Edit the three values below, then run the cell to move that one file.
src_path = '/projects/f_ah1491_1/open_data/NAPLS3/synmap_test/sub-09S0348/ses-3/dwi/sub-09S0348_ses-3_acq-epb2_dir-AP_run-9_dwi.nii.gz'
file = 'sub-09S0348_ses-3_acq-epb2_dir-AP_run-9_dwi.nii.gz'
target_folder = '/projects/f_ah1491_1/open_data/NAPLS3/synmap_test/.bidsignore/sub-09S0348/ses-3/dwi/'

# Create the destination folder on first use.
if not os.path.exists(target_folder):
    os.makedirs(target_folder, exist_ok=True)

target_path = os.path.join(target_folder, file)
try:
    shutil.move(src_path, target_path)
except Exception as e:
    print(f"Could not rename {file}, error: {e}")
else:
    print("Moved to:", target_path)

                        

In [None]:
########### Check filenames: every .nii.gz should have a corresponding .json #########
# Collects (and prints) the NIfTI files under bids_directory/<sub>/<ses>/<mode>/
# that have no sidecar with the same base name.
no_json = []
bids_directory = '/projects/f_ah1491_1/open_data/NAPLS3/bidsdata'

for subject in os.listdir(bids_directory):
    subject_fp = os.path.join(bids_directory, subject)
    if not os.path.isdir(subject_fp) or 'bidsignore' in subject:
        continue

    for ses in os.listdir(subject_fp):
        ses_fp = os.path.join(subject_fp, ses)
        if not os.path.isdir(ses_fp):
            continue

        for mode in os.listdir(ses_fp):
            mode_fp = os.path.join(ses_fp, mode)
            if not os.path.isdir(mode_fp):
                continue

            for file in os.listdir(mode_fp):
                # Only files whose last extension is 'gz' are checked.
                if file.split('.')[-1] != 'gz':
                    continue
                file_base = file.replace('.nii.gz', '')
                if os.path.exists(os.path.join(mode_fp, f'{file_base}.json')):
                    continue
                print(f"No .json for {os.path.join(mode_fp,file)}")
                no_json.append(file)


In [None]:
########### Check filenames: To check all filetypes are in the appropriate mode folder ############
 

import shutil

# Dry-run audit of <bids_directory>/<subject>/<session>/<modality>/: for each
# file, the text between the last '_' and the next '.' is looked up to find
# the folder it belongs in, and a line is printed when the file sits elsewhere
# or when that text is not in the table. Nothing is moved: the shutil.move
# call below is intentionally left disabled.
bids_directory = "/projects/f_ah1491_1/open_data/NAPLS3/bidsdata"

# File-name ending -> folder the file is expected to live in.
suffix_to_mode = {
    'bold': 'func',
    'T1w': 'anat',
    'T2w': 'anat',
    'FLASH': 'anat',
    'dwi': 'dwi',
    'magnitude1': 'fmap',
    'magnitude2': 'fmap',
    'phase2': 'fmap',
    'phase1': 'fmap',
    'phasediff': 'fmap',
}

for subject in os.listdir(bids_directory):
    subject_fp = os.path.join(bids_directory, subject)
    # Only real subject folders; names containing a '.' are skipped.
    if not os.path.isdir(subject_fp) or '.' in subject:
        continue
    for ses in os.listdir(subject_fp):
        ses_fp = os.path.join(subject_fp, ses)
        if not os.path.isdir(ses_fp):
            continue
        for mode in os.listdir(ses_fp):
            mode_fp = os.path.join(ses_fp, mode)
            if not os.path.isdir(mode_fp):
                continue
            for file in os.listdir(mode_fp):
                if file == '.ipynb_checkpoints':
                    continue
                current_filepath = os.path.join(mode_fp, file)
                # Separate the filename into the '_' separated parts, then
                # split the last part on '.' to isolate the ending.
                filename_parts = file.split('_')
                file_end = filename_parts[-1].split('.')

                mode_should_be = suffix_to_mode.get(file_end[0])
                if mode_should_be is None:
                    print(f"MISSING MODE:-- file_end[0] is {file_end[0]} \n {current_filepath}: ")
                    continue
                if mode == mode_should_be:
                    continue

                destination_path = os.path.join(bids_directory, subject, ses, mode_should_be, file)
                try:
                    #shutil.move(current_filepath, destination_path)
                    print(f"File moved from {current_filepath} to {destination_path} successfully.")
                except Exception as e:
                    print(f"An error occurred file from {current_filepath} to {destination_path}: {e}")
                                

In [None]:
############ Make sure all JSONS are valid and UTF-8 Encoded #############


import os
import json

# Root folder scanned by the JSON validation below. This rebinds the
# bids_directory name assigned earlier in the file to the "bids_problem" folder.
bids_directory = "/projects/f_ah1491_1/open_data/NAPLS3/bids_problem"


def validate_json(file_path):
    """
    Check that a file can be read as UTF-8 and parsed as JSON.

    Nothing is returned; problems are reported with print():
      - Text that is not valid UTF-8 is reported, then encode_to_utf8() is
        asked to rewrite the file. A failure of that rewrite is reported too.
      - Text that is UTF-8 but not valid JSON is reported and left untouched.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as handle:
            json.loads(handle.read())
    except json.JSONDecodeError:
        print(f"Invalid JSON file: {file_path}")
    except UnicodeDecodeError:
        print(f"Encoding error: {file_path} is not UTF-8 encoded.")
        try:
            encode_to_utf8(file_path)
        except Exception as e:
            print(f"Failed to encode {file_path} to UTF-8: {e}\n\n")

def encode_to_utf8(file_path, fallback_encodings=None):
    """
    Rewrite a text file in place as UTF-8.

    The file is read with each fallback encoding in turn. The first encoding
    that decodes the whole file is used, and the file is then overwritten
    with the same text encoded as UTF-8.

    Args:
        file_path: Path of the file to re-encode.
        fallback_encodings: Optional iterable of codec names to try, in
            order. Defaults to ISO-8859-1, latin1, cp1252.

    Raises:
        UnicodeDecodeError: If none of the encodings can decode the file.
        ValueError: If fallback_encodings is empty.
    """
    if fallback_encodings is None:
        # NOTE: ISO-8859-1 (alias latin1) assigns a character to every byte
        # value, so with these defaults the first entry always succeeds and
        # the later entries are never reached.
        fallback_encodings = ['ISO-8859-1', 'latin1', 'cp1252']  # Add other encodings if necessary

    last_error = None
    for encoding in fallback_encodings:
        # Only the read can fail to decode, so only the read is guarded.
        try:
            with open(file_path, 'r', encoding=encoding) as f:
                content = f.read()
        except UnicodeDecodeError as err:
            last_error = err
            continue
        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(content)
        print(f"Successfully re-encoded {file_path} to UTF-8 from {encoding}.")
        return

    reason = f"Unable to decode {file_path} with fallback encodings."
    if last_error is None:
        raise ValueError(f"{reason} No fallback encodings were given.")
    # UnicodeDecodeError needs all five fields; building it from a single
    # message string raises TypeError instead of the intended exception.
    raise UnicodeDecodeError(
        last_error.encoding,
        last_error.object,
        last_error.start,
        last_error.end,
        reason,
    ) from last_error

def find_and_validate_json_files(directory):
    """
    Run validate_json() on every '.json' file found anywhere under directory.

    The tree is traversed with os.walk; files whose names do not end in
    '.json' are ignored. Nothing is returned.
    """
    for folder, _subdirs, names in os.walk(directory):
        json_names = [name for name in names if name.endswith('.json')]
        for name in json_names:
            validate_json(os.path.join(folder, name))

# Start the scan only when this code runs as the top-level program
# (__name__ is "__main__"); importing it as a module does not trigger it.
if __name__ == "__main__":
    find_and_validate_json_files(bids_directory)

In [None]:
###### Adding or updating specific parts of the JSON metadata

bids_directory = '/home/kj537/_f_ah1491_1/open_data/NAPLS3/bidata'
layout = index_bids_directory(bids_directory)
rm_list = []
skip_files = ['dataset_description.json', 'participants.tsv', 'task-rest_bold.json']

# Walk through the source_directory files
for dirpath, dirnames, filenames in os.walk(bids_directory):
    # Extract the first level subdirectory (subject id & session date)
    relative_path = os.path.relpath(dirpath, source_images)
    first_subdir = relative_path.split(os.sep)[0] if os.sep in relative_path else ""
    # Go through files
    for filename in filenames:
        if filename.endswith('.json') and filename not in skip_files: #Extract the files
            json_path = os.path.join(dirpath, filename)
            nii_path = json_path.replace('.json','.nii.gz')
            # Look for PhaseEncodingDirection element
            ped = get_element('PhaseEncodingDirection', json_path)  # Look for 'PhaseEncodingDirection' in phasediff
            allowed_ped = [ "i", "i-", "j", "j-", "k", "k-"]

            # If it isn't there, add it
            if not ped:
                if 'dir-PA' in filename:
                    update_json_key(json_path, 'PhaseEncodingDirection', 'j')
                    print(f'Added PhaseEncodingDirection to {filename}')
                elif 'dir-AP' in filename:
                    update_json_key(json_path, 'PhaseEncodingDirection', 'j-')
                    print(f'Added PhaseEncodingDirection to {filename}')
                    
            #If PhaseEncodingDirection there but not valid
            elif ped not in allowed_ped:
                if 'dir-PA' in filename:
                    update_json_key(json_path, 'PhaseEncodingDirection', 'j')
                elif 'dir-AP' in filename:
                    update_json_key(json_path, 'PhaseEncodingDirection', 'j-')
                
            # Check if slicetiming is in seconds or milliseconds by comparing SliceTiming & RepetitionTime
            st = get_element('SliceTiming', json_path)
            
            nifti_img = nib.load(nii_path)
            header = nifti_img.header
            # Access Repetition Time (TR) from 'pixdim' field
            try:
                tr = header.get_zooms()[-1]  # TR is usually the 4th element of pixdim
            except Exception as e: 
                print(f"An error occurred: {e}\n {header.get_zooms()}")
    
            if st:
                if tr:
                    print(f'tr is {tr}')
                    #print(f'Sum of slicetimings is: {st}')
                    scaled_numbers, factor = scale_SliceTiming(st)
                    print(f'Scaled list by {factor} --> {scaled_numbers}\n\n\n')