In [None]:
import pandas as pd
import logging

In [None]:
# Nested lookup of input tables: hemisphere -> parcellation -> measure -> CSV path.
# The parcellation key is the short name used when suffixing columns; the
# second element of each pair is the directory name that holds its CSVs.
file_paths = {
    hemi: {
        parc: {
            measure: f"../stats/aparc/{hemi}/{folder}/{measure}_stats.csv"
            for measure in ("volume", "thickness", "meancurv")
        }
        for parc, folder in (
            ("aparc", "aparc"),
            ("a2009s", "aparc.a2009s"),
            ("pial", "aparc.pial"),
        )
    }
    for hemi in ("lh", "rh")
}

# Tab-separated participants table; read for its participant_id and group columns.
participants_file = "../ds004199/participants.tsv"

In [None]:
class DataProcessor:
    """Merge per-hemisphere stats tables, an aseg table and participant groups.

    Every input table is joined on a ``subject`` column. All joins use
    ``pd.merge`` with its default inner join, so a subject missing from any
    single input is absent from the final table.
    """

    def __init__(self, file_paths, participants_file, v=False,
                 aseg_file="../stats/aseg_stats.csv"):
        """Store the input locations; nothing is read until ``process()``.

        Args:
            file_paths: Nested mapping ``hemisphere -> parcellation ->
                measure -> CSV path``. Must contain the keys ``"lh"`` and
                ``"rh"``.
            participants_file: Tab-separated file with at least the columns
                ``participant_id`` and ``group``.
            v: When true, log each stats file as it is loaded.
            aseg_file: CSV with the aseg statistics, one row per subject.
                Defaults to the location that used to be hard-coded in
                ``process()``.
        """
        # basicConfig does nothing if the root logger already has handlers,
        # so constructing several processors is harmless.
        logging.basicConfig(level=logging.INFO)
        self.__logger = logging.getLogger(__name__)

        self.__verbose = v

        self.__file_paths = file_paths
        self.__participants_file = participants_file
        self.__aseg_file = aseg_file

        # Populated by process(); None until then.
        self.__final_data = None

    def get_final_data(self):
        """Return the merged table, or ``None`` if ``process()`` has not run."""
        return self.__final_data

    def _reads_n_standardizes(self, file_path):
        """Read a CSV and normalise its header.

        The first column is renamed to ``subject`` whatever it was called;
        the remaining names have ``-`` replaced by ``_`` and are lower-cased.
        """
        df = pd.read_csv(file_path)
        # Assign a fresh list of names rather than writing into
        # ``df.columns.values``: an Index is meant to be immutable and
        # mutating its backing array in place is not a supported operation.
        df.columns = ["subject"] + [
            col.replace("-", "_").lower() for col in df.columns[1:]
        ]
        return df

    def _rename_columns(self, df, hemisphere, parcellation, measure):
        """Suffix every non-key column with its measure, hemisphere and parcellation.

        The first column is left untouched. Columns of the ``"aparc"``
        parcellation get ``_<measure>_<hemisphere>``; every other parcellation
        additionally gets ``_<parcellation>`` so names stay unique after
        merging. ``df`` is modified in place and also returned.
        """
        suffix = f"{measure}_{hemisphere}"
        if parcellation != "aparc":
            suffix = f"{suffix}_{parcellation}"
        df.columns = [df.columns[0]] + [f"{col}_{suffix}" for col in df.columns[1:]]
        return df

    def _merge_hemisphere_data(self, hemisphere_data, hemisphere):
        """Load every table of one hemisphere and join them on ``subject``.

        Args:
            hemisphere_data: Mapping ``parcellation -> measure -> CSV path``.
            hemisphere: Hemisphere label used in the column suffixes.

        Returns:
            One DataFrame holding the columns of all tables, in iteration
            order of ``hemisphere_data``.

        Raises:
            ValueError: If ``hemisphere_data`` lists no files at all.
        """
        merged = None
        for parcellation, measures in hemisphere_data.items():
            for measure, file_path in measures.items():
                if self.__verbose:
                    self.__logger.info(
                        "Loading %s for %s in %s", file_path, measure, parcellation
                    )
                df = self._reads_n_standardizes(file_path)
                df = self._rename_columns(df, hemisphere, parcellation, measure)
                # Fold each table in as soon as it is loaded instead of
                # keeping every intermediate frame alive in a list.
                merged = df if merged is None else pd.merge(merged, df, on="subject")

        if merged is None:
            raise ValueError(f"No stats files configured for hemisphere {hemisphere!r}")
        return merged

    def _load_and_merge_hemisphere_data(self, hemisphere):
        """Look up the configured files of ``hemisphere`` and merge them."""
        hemisphere_data = self.__file_paths[hemisphere]
        return self._merge_hemisphere_data(hemisphere_data, hemisphere)

    def _load_participant_data(self):
        """Read the participants file and return its ``participant_id`` and ``group`` columns."""
        participants = pd.read_csv(self.__participants_file, sep="\t")
        return participants[["participant_id", "group"]]

    def process(self):
        """Build the final table and store it for ``get_final_data()``.

        Joins, in this order: left-hemisphere tables, right-hemisphere tables,
        the aseg table, and the participants' ``group`` column (matched via
        ``participant_id``, which is dropped afterwards).
        """
        lh_data = self._load_and_merge_hemisphere_data("lh")
        rh_data = self._load_and_merge_hemisphere_data("rh")
        merged = pd.merge(lh_data, rh_data, on="subject")

        aseg_data = self._reads_n_standardizes(self.__aseg_file)
        merged = pd.merge(merged, aseg_data, on="subject")

        group = self._load_participant_data()
        merged = pd.merge(merged, group, left_on="subject", right_on="participant_id")

        # participant_id duplicates subject after the join.
        self.__final_data = merged.drop(columns=["participant_id"])

    def save_data(self, file_path):
        """Write the final table to ``file_path`` as CSV without the index.

        Raises:
            RuntimeError: If ``process()`` has not been run yet.
        """
        if self.__final_data is None:
            raise RuntimeError("No data to save: call process() before save_data().")
        self.__final_data.to_csv(file_path, index=False)

In [None]:
# Build the processor with verbose loading logs enabled, then run the full
# merge pipeline; the merged table is kept on the processor.
processor = DataProcessor(file_paths, participants_file, v=True)
processor.process()

In [None]:
# Retrieve the merged table and preview its first rows (the trailing bare
# expression is what the notebook cell displays).
final_data = processor.get_final_data()
final_data.head()

In [None]:
# Write the merged table to disk as CSV (save_data omits the row index).
processor.save_data("../stats/merged_data.csv")