In [5]:
#! pip install neuroHarmonize
import pandas as pd
import csv
import os
import numpy as np
from neuroHarmonize import harmonizationLearn
import matplotlib.pyplot as plt
import seaborn as sns

In [6]:
from matplotlib.font_manager import FontProperties

# Set path to your font.
# NOTE: this is an absolute, machine-specific path; change it to wherever the
# EB Garamond variable-font .ttf lives on your system before running.
font_path = '/Users/jpillai/Downloads/EB_Garamond/EBGaramond-VariableFont_wght.ttf'
# Font handle passed as `fontproperties=` / `prop=` by the plotting cells below.
eb_garamond_prop = FontProperties(fname=font_path)

In [7]:
############ UTILS ##################
def process_fMRI_csv_files(csv_directory, atlas):
    """
    Load every CSV matrix in a directory and flatten each one into a row.

    Files are processed in sorted filename order, so row ``i`` of the result
    corresponds to the ``i``-th ``.csv`` filename alphabetically.

    Parameters:
    -----------
    csv_directory : str
        Directory containing one header-less CSV matrix per participant.
        Files that do not end in ``.csv`` are ignored.

    atlas : str
        Atlas name. For ``'BNA'`` the row and the column labelled 232
        (the 233rd of each, zero-based label 232) are removed before
        flattening. Any other value leaves the matrix untouched.

    Returns:
    --------
    pandas.DataFrame
        One row per CSV file holding the row-major flattened matrix.
        An empty DataFrame is returned when the directory has no CSV files.

    Raises:
    -------
    KeyError:
        If ``atlas == 'BNA'`` and a matrix has no row/column labelled 232.
    ValueError:
        If the flattened matrices do not all have the same length.
    """
    # Sorted so the row order is deterministic and can be matched to covariates.
    csv_filenames = sorted(
        filename for filename in os.listdir(csv_directory) if filename.endswith(".csv")
    )

    flattened_rows = []
    expected_length = None

    for filename in csv_filenames:
        matrix_df = pd.read_csv(os.path.join(csv_directory, filename), header=None)

        if atlas == 'BNA':
            # Remove the 233rd row and 233rd column (label 232).
            matrix_df = matrix_df.drop(index=232, columns=232)

        flattened = matrix_df.to_numpy().ravel()

        # Every participant must contribute the same number of features;
        # fail loudly here rather than silently padding with NaN.
        if expected_length is None:
            expected_length = flattened.size
        elif flattened.size != expected_length:
            raise ValueError(
                f"{filename} flattens to {flattened.size} values, "
                f"expected {expected_length} (matrix sizes are inconsistent)."
            )

        flattened_rows.append(flattened)

    if not flattened_rows:
        return pd.DataFrame()

    # Build the frame once instead of concatenating inside the loop,
    # which re-copies all previous rows on every iteration.
    return pd.DataFrame(np.vstack(flattened_rows))

In [4]:
import pandas as pd

# Load the ROI thickness table from disk (absolute, machine-specific path).
df = pd.read_csv('/Users/jpillai/Desktop/ROI_BN_Atlas_thickness.csv')

# Keep only the columns that are fully populated: any column with at least
# one NaN is discarded.
df_cleaned = df.dropna(axis='columns', how='any')

# Write the cleaned table out as a new CSV, without the index column.
df_cleaned.to_csv(
    '/Users/jpillai/Desktop/cleaned_33906_BNA.csv',
    index=False,
)

In [8]:
def harmonizeFunctionalMatrices(data, covariates, atlas, output_dir):

    """
    Harmonizes functional connectivity matrices using ComBat (neuroHarmonize).

    Reads one CSV matrix per participant from a directory, flattens each
    matrix into a feature vector, runs ``harmonizationLearn`` across
    participants, reshapes every harmonized vector back into a square matrix
    and writes it to ``<output_dir>/matrix_<PIDN>.csv``.

    Parameters:
    -----------
    data : str
        Path to a directory containing one header-less CSV matrix per
        participant. Files are read in sorted filename order.

    covariates : str
        Path to a CSV file containing covariates. It must contain a 'PIDN'
        column (used only to name the output files) and the mandatory 'SITE'
        column; optional covariates include 'AGE_M' & 'SEX'. Rows must be in
        the same order as the sorted matrix files in `data`.

    atlas : str, one of ['Schaeffer400', 'BNA']
        Specifies the atlas used for the neuroimaging data and therefore the
        shape each harmonized vector is reshaped to: (400, 400) for
        'Schaeffer400' and (244, 244) for 'BNA'. For 'BNA' one row and one
        column are removed while loading, so the files on disk are expected
        to be 245 x 245.

    output_dir : str
        Path to the desired output directory where harmonized data matrices
        will be saved as CSV files. It is created if it does not exist.

    Returns:
    --------
    funcMatrix : numpy.ndarray
        The un-harmonized data, one flattened matrix per row.
    func_adj : array-like
        The harmonized data returned by ``harmonizationLearn``, same layout.

    Raises:
    -------
    ValueError:
        If an unsupported atlas is provided, if the number of covariate rows
        differs from the number of matrices, or if the flattened matrices do
        not have the number of elements the atlas requires.

    Example:
    --------
    > harmonizeFunctionalMatrices('./data/', './covariates.csv', 'BNA', './output/')
    """

    # Validate the atlas first so a typo fails immediately instead of after
    # loading all the data and running ComBat.
    atlas_shapes = {
        'Schaeffer400': (400, 400),
        'BNA': (244, 244),
    }
    if atlas not in atlas_shapes:
        raise ValueError("This atlas has not been integrated into this harmonization yet.")
    reshape_dims = atlas_shapes[atlas]

    # Load in data
    modelData = pd.read_csv(covariates)
    funcMatrix = process_fMRI_csv_files(data, atlas=atlas)

    # 'funcMatrix' now contains the flattened data from all CSV files in the directory.
    # Each row corresponds to a flattened vector for each participant.
    funcMatrix = funcMatrix.astype(float).values

    # PIDN values name the output files; rows pair up with matrices by position.
    pidn_list = modelData['PIDN'].tolist()
    if len(pidn_list) != funcMatrix.shape[0]:
        raise ValueError(
            f"Covariates file has {len(pidn_list)} rows but {funcMatrix.shape[0]} "
            "matrices were loaded; they must match one-to-one."
        )

    expected_features = reshape_dims[0] * reshape_dims[1]
    if funcMatrix.shape[1] != expected_features:
        raise ValueError(
            f"Each flattened matrix has {funcMatrix.shape[1]} values but atlas "
            f"'{atlas}' requires {expected_features} ({reshape_dims[0]} x {reshape_dims[1]})."
        )

    # Run ComBat (neuroHarmonize) on the data.
    # PIDN is an identifier, not a covariate: leaving it in would make ComBat
    # model it as a biological effect, so it is removed here (as is done in
    # harmonizeCorticalThickness).
    func_model, func_adj = harmonizationLearn(funcMatrix, modelData.drop(columns='PIDN'))

    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Unflatten each harmonized row into the desired matrix and save it
    for pidn, vector in zip(pidn_list, np.asarray(func_adj, dtype=float)):
        matrix = vector.reshape(reshape_dims)

        # Construct the full path for the output file in the specified directory
        file_name = os.path.join(output_dir, f'matrix_{pidn}.csv')

        # Save each matrix with PIDN in the filename to the specified directory
        pd.DataFrame(matrix).to_csv(file_name, index=False, header=False)

    print("Harmonization and saving processes are complete!")

    return funcMatrix, func_adj

In [9]:
def harmonizeCorticalThickness(data, dx, covariates, output_dir, atlas):
    """
    Run ComBat (neuroHarmonize package) on Cortical Thickness data

    Parameters:
    ---------
    data : str
        Path to a csv with a 'PIDN' column and neuroimaging data separated by
        the ROI columns to correct with ComBat (e.g., cortical thickness data).
        Every column other than 'PIDN' is treated as an ROI feature.
    dx : str
        Label (e.g. a diagnosis/group name) inserted into the output filename.
    covariates : str
        Path to a csv with a 'PIDN' column and covariates. The mandatory
        covariate is SITE. Optional ones include AGE_M & SEX. Rows must be in
        the same order as the rows of `data`.
    output_dir : str
        Path to the desired output directory. It is created if it does not exist.
    atlas : str
        Name of the atlas being used; only inserted into the output filename.

    Returns:
    -------
    structData : pandas.DataFrame
        The original table as read from `data`.
    harmonized_cortical_data : pandas.DataFrame
        'PIDN' followed by the harmonized ROI columns. The same table is
        written to
        ``<output_dir>/Harmonized_<dx>_<atlas>_CorticalThickness_Data.csv``.

    Raises:
    -------
    ValueError:
        If `data` and `covariates` do not have the same number of rows.
    """

    # Load data
    modelData = pd.read_csv(covariates).drop('PIDN', axis=1)
    structData = pd.read_csv(data)
    roi_data = structData.drop('PIDN', axis=1)

    # ComBat pairs data rows with covariate rows by position.
    if len(roi_data) != len(modelData):
        raise ValueError(
            f"Data file has {len(roi_data)} rows but covariates file has "
            f"{len(modelData)}; they must match one-to-one."
        )

    # Run ComBat (neuroHarmonize) on the data
    cort_model, cort_data_adj = harmonizationLearn(roi_data.values, modelData)

    # Format the corrected data for output. Column names come from the frame
    # that was actually harmonized, so labels stay correct wherever 'PIDN'
    # sits in the input file.
    harmonized_cortical_data = pd.DataFrame(cort_data_adj, columns=roi_data.columns)
    harmonized_cortical_data.insert(0, 'PIDN', structData['PIDN'].to_numpy())  # Add PIDN to the front

    # Save the corrected data to the desired directory
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    output_filename = f"Harmonized_{dx}_{atlas}_CorticalThickness_Data.csv"
    output_path = os.path.join(output_dir, output_filename)
    harmonized_cortical_data.to_csv(output_path, index=False)

    return structData, harmonized_cortical_data

In [18]:
# Placeholder inputs: replace each ' ' with a real path before running this cell.
ratl_data = ' '          # passed as `data` to harmonizeCorticalThickness
ratl_covariates = ' '    # passed as `covariates`
output_struc = ' '       # passed as `output_dir`

# Harmonize the cortical-thickness table; the call returns both the original
# table and the harmonized one.
original_data, harmonized_data = harmonizeCorticalThickness(
    data=ratl_data,
    dx='CTRL',
    covariates=ratl_covariates,
    output_dir=output_struc,
    atlas='DK40',
)

# Functional-connectivity counterpart, currently disabled (`ratl_func` and
# `output_func` are not defined in this cell):
#original_func, harmonized_func = harmonizeFunctionalMatrices(data=ratl_func, covariates=ratl_covariates, atlas='BNA', output_dir=output_func)

In [None]:
# # Select a feature to visualize
# feature_name = "lbankssts"

# # Extract data for the selected feature
# before = original_data[feature_name]
# after = harmonized_data[feature_name]

# # Create a new DataFrame for plotting
# plot_data = pd.DataFrame({
#     "Value": pd.concat([before, after]),
#     "Condition": ["Before"] * len(before) + ["After"] * len(after)
# })

# # Plot the boxplots
# plt.figure(figsize=(10, 6))
# sns.boxplot(x="Condition", y="Value", data=plot_data, width=0.4)
# plt.title(f"Distribution of {feature_name} before and after harmonization", fontproperties=eb_garamond_prop, fontsize=20)
# plt.xlabel("Feature", fontproperties=eb_garamond_prop, fontsize=14)
# plt.ylabel("Value", fontproperties=eb_garamond_prop, fontsize=14)
# plt.xticks(fontproperties=eb_garamond_prop)
# plt.yticks(fontproperties=eb_garamond_prop)
# plt.show()

# # TODO: check the correlation of each feature with age and sex before vs. after harmonization --> the biological effects should remain the same

In [None]:
# roi_index = 10  # for example, the 11th region of interest

# # Extract connectivity values for the selected ROI from the original and harmonized matrices
# before_connectivity = original_func[roi_index, :]
# after_connectivity = harmonized_func[roi_index, :]

# plt.figure(figsize=(14, 7))

# # Plot the connectivity values
# plt.plot(before_connectivity, label='Before Harmonization', color='blue', marker='o')
# plt.plot(after_connectivity, label='After Harmonization', color='red', marker='x')

# plt.title(f"Functional Connectivity of ROI {roi_index} before and after Harmonization", fontproperties=eb_garamond_prop, fontsize=20)
# plt.ylabel("Connectivity Strength", fontproperties=eb_garamond_prop, fontsize=14)
# plt.xlabel("Connected Regions", fontproperties=eb_garamond_prop, fontsize=14)
# plt.legend(prop=eb_garamond_prop)
# plt.xticks(fontproperties=eb_garamond_prop)
# plt.yticks(fontproperties=eb_garamond_prop)
# plt.grid(True, which='both', linestyle='--', linewidth=0.5)

# plt.tight_layout()
# plt.show()