In [96]:
import numpy as np
import pandas as pd
from junifer.storage import HDF5FeatureStorage
from statsmodels.imputation.mice import MICEData

In [97]:
# Read the behavioral table
df_behavioral = pd.read_csv('Behavioral_Data')

# Behavioral measures to keep for HCP ('Subject' is the ID column used for the later merge)
HCP_behavioal_measures_columns = [
    'Subject', 'PicSeq_Unadj', 'CardSort_Unadj', 'Flanker_Unadj', 'PMAT24_A_CR',
    'ReadEng_Unadj', 'PicVocab_Unadj', 'ProcSpeed_Unadj', 'VSPLOT_TC', 'SCPT_SEN',
    'SCPT_SPEC', 'IWRD_TOT', 'ListSort_Unadj', 'MMSE_Score', 'PSQI_Score',
    'Endurance_Unadj', 'Dexterity_Unadj', 'Strength_Unadj', 'Odor_Unadj',
    'PainInterf_Tscore', 'Taste_Unadj', 'Mars_Final', 'Emotion_Task_Face_Acc',
    'Language_Task_Math_Avg_Difficulty_Level', 'Language_Task_Story_Avg_Difficulty_Level',
    'Relational_Task_Acc', 'Social_Task_Perc_Random', 'Social_Task_Perc_TOM',
    'WM_Task_Acc', 'NEOFAC_A', 'NEOFAC_O', 'NEOFAC_C', 'NEOFAC_N', 'NEOFAC_E',
    'ER40_CR', 'ER40ANG', 'ER40FEAR', 'ER40HAP', 'ER40NOE', 'ER40SAD',
    'AngAffect_Unadj', 'AngHostil_Unadj', 'AngAggr_Unadj', 'FearAffect_Unadj',
    'FearSomat_Unadj', 'Sadness_Unadj', 'LifeSatisf_Unadj', 'MeanPurp_Unadj',
    'PosAffect_Unadj', 'Friendship_Unadj', 'Loneliness_Unadj', 'PercHostil_Unadj',
    'PercReject_Unadj', 'EmotSupp_Unadj', 'InstruSupp_Unadj', 'PercStress_Unadj',
    'SelfEff_Unadj', 'DDisc_AUC_40K', 'GaitSpeed_Comp',
]
HCP_behavioal_measures = df_behavioral[HCP_behavioal_measures_columns]

# MICE (Multiple Imputation by Chained Equations) is stochastic and MICEData
# takes no seed argument, so seed NumPy's global generator right before the
# imputation to make the exported values reproducible on Restart & Run All.
MICE_SEED = 42
# update_all() runs a single cycle by default; run several cycles so the
# chained equations can settle before the imputed values are read.
N_MICE_ITERATIONS = 10
np.random.seed(MICE_SEED)

# NOTE(review): 'Subject' is part of the frame handed to MICEData, so the ID
# presumably enters the imputation models as a predictor -- confirm this is
# intended or exclude it from the conditional formulas.
mice_data = MICEData(HCP_behavioal_measures)
# Perform the imputation process iteratively
mice_data.update_all(N_MICE_ITERATIONS)
# Get the imputed DataFrame
imputed_HCP_behavioal_measures = mice_data.data

# Convert 'Subject' to int so it matches the integer 'subject' key of the features table
imputed_HCP_behavioal_measures['Subject'] = imputed_HCP_behavioal_measures['Subject'].astype(int)

In [98]:
# Load the stored functional-connectivity features from the HDF5 file
storage = HDF5FeatureStorage("features.hdf5")
df_features = storage.read_df('BOLD_Schaefer400x17_functional_connectivity')
# Move the index levels into ordinary columns so they can be selected by name
df_features_reset = df_features.reset_index()

# Feature columns are the ones whose name contains '~'
columns_with_tilde = [name for name in df_features_reset.columns if '~' in name]

# Ignore ROI1==ROI2: keep only columns whose first two '~'-separated parts differ
off_diagonal_columns = []
for name in columns_with_tilde:
    first_part, second_part, *_ = name.split('~')
    if first_part != second_part:
        off_diagonal_columns.append(name)

columns_to_keep = ['phase_encoding', 'subject'] + off_diagonal_columns
df_filtered_features = df_features_reset[columns_to_keep]

# Only numeric columns take part in the mean
numeric_columns = df_filtered_features.select_dtypes(include='number').columns

# One row per subject: average the numeric columns over all of that subject's
# rows (REST1/LR, REST1/RL, REST2/LR, REST2/RL)
features_by_subject = df_filtered_features.groupby('subject', as_index=False)
# Cast 'subject' to int so it can be matched against the behavioral 'Subject' key
averaged_features = features_by_subject[numeric_columns].mean().astype({'subject': int})

In [105]:
# Merge features and behavioral measures on the subject ID.
# The join is an inner join, so only subjects present in both tables are kept.
# validate='one_to_one' makes pandas raise a MergeError if either table has a
# repeated subject ID, instead of silently multiplying rows in the output.
merged_df = pd.merge(
    averaged_features,
    imputed_HCP_behavioal_measures,
    how='inner',
    left_on='subject',
    right_on='Subject',
    validate='one_to_one',
)
# Drop the 'Subject' column; it duplicates 'subject' after the merge
final_df = merged_df.drop(columns='Subject')


# Output the merged dataframe
final_df.to_csv('HCP_data.csv', index=False)