In [None]:
from pathlib import Path
import numpy as np
from fmri_decoder.data import DataConfig, ModelConfig, SurfaceData, TimeseriesData
from fmri_decoder.model import MVPA
from fmri_decoder.preprocessing import (
    FeatureSelection,
    TimeseriesPreproc,
    TimeseriesSampling,
)

# user-supplied arguments
CONFIG_IN = ""  # file name of yaml configuration file
DIR_OUT = ""  # path of output directory to which all resulting files are written

# create the output directory (and any missing parents); a no-op if it exists
dir_out = Path(DIR_OUT)
dir_out.mkdir(parents=True, exist_ok=True)

# paths of the sub-directories below the output directory
# (only the path objects are built here, nothing is created on disk)
dir_sample, dir_label, dir_model = (
    dir_out / subdir for subdir in ("sample", "label", "model")
)

# build the data containers and the configuration objects, all of them from
# the same yaml configuration file
time_data = TimeseriesData.from_yaml(CONFIG_IN)
surf_data = SurfaceData.from_yaml(CONFIG_IN)
config_data = DataConfig.from_yaml(CONFIG_IN)
config_model = ModelConfig.from_yaml(CONFIG_IN)

In [None]:
import pandas as pd
from src.data import Data
import functools
from sklearn.feature_selection import f_classif

class Univariate:
    """Compute the univariate profile for different number of features.

    On construction the label and hemisphere arrays are loaded once and then,
    for every layer index in ``range(N_LAYER)``, re-ordered by descending
    ANOVA F-statistic of that layer's sample data.

    The class relies on names defined outside of it: ``SESSION``, ``N_LAYER``,
    ``Data``, ``SurfaceData``, ``np``, ``pd`` and ``f_classif``.

    Parameters
    ----------
    subj, sess, day
        Keys used to look up ``SESSION[subj][sess][day]``; the result is
        appended to ``sess`` to build the second argument of ``Data``.
    """

    def __init__(self, subj, sess, day):
        self.subj = subj
        self.sess = sess
        self.day = day
        self.data = Data(self.subj, f"{self.sess}{SESSION[self.subj][self.sess][self.day]}")
        self.label, self.hemi = self.get_label
        # one (label, hemi) pair per layer, split into two tuples
        self.label_sorted, self.hemi_sorted = zip(*[self.sort_features(i) for i in range(N_LAYER)])

    @property
    def get_label(self):
        """Get label and hemisphere.

        Returns
        -------
        tuple
            ``(label, hemi)``: ``label`` is the left-hemisphere label
            intersection followed by the right-hemisphere one; ``hemi`` has
            the same length and is 0 for left and 1 for right entries.
        """
        # The result is cached on the instance. A functools.lru_cache on the
        # method would key a module-level cache on ``self`` and thereby keep
        # every instance alive for the lifetime of the process.
        cached = getattr(self, "_label_hemi", None)
        if cached is None:
            surf_data = SurfaceData(self.data.file_layer, None, self.data.file_label)

            label_left = surf_data.load_label_intersection("lh")
            label_right = surf_data.load_label_intersection("rh")

            hemi = np.zeros(len(label_left) + len(label_right))
            hemi[len(label_left):] = 1
            label = np.append(label_left, label_right)

            cached = (label, hemi)
            self._label_hemi = cached

        return cached

    @staticmethod
    def _descending_order(scores):
        """Return the indices that sort ``scores`` from largest to smallest.

        Equal scores are ordered by descending index, which is the order
        ``sorted(zip(scores, index), reverse=True)`` produces. NaN scores are
        treated as the worst possible value and therefore end up last; plain
        ``sorted`` yields an input-order-dependent result when NaNs are present.
        """
        scores = np.asarray(scores, dtype=float)
        scores = np.where(np.isnan(scores), -np.inf, scores)
        index = np.arange(len(scores))
        # lexsort uses the LAST key as primary key and sorts ascending;
        # reversing gives descending score with ties broken by descending index
        return np.lexsort((index, scores))[::-1]

    def sort_features(self, layer):
        """Sort label and hemi array based on features.

        Parameters
        ----------
        layer
            Passed to ``self.data.get_sample_data`` to locate the parquet file
            of the sample data.

        Returns
        -------
        tuple
            ``(label_sorted, hemi_sorted)``: ``self.label`` and ``self.hemi``
            re-ordered by descending F-statistic of the feature columns.
        """
        dtf = pd.read_parquet(self.data.get_sample_data(layer))

        # choose subset of features: every column from the third one onwards
        # NOTE(review): assumes the first two columns are non-feature columns
        # and that the feature columns are ordered like self.label -- confirm
        features = dtf.columns[2:]

        X = np.array(dtf.loc[:, features])
        y = np.array(dtf.loc[:, "label"])

        # f_classif returns (F-statistic, p-value); only the statistic is used.
        # It can be NaN (e.g. for constant features), hence the NaN-safe ranking.
        f_statistic = f_classif(X, y)[0]
        index_sorted = self._descending_order(f_statistic)

        label_sorted = self.label[index_sorted]
        hemi_sorted = self.hemi[index_sorted]

        return label_sorted, hemi_sorted

In [None]:
# feature selection: only performed when a localizer file is configured,
# otherwise the selection stays empty
has_localizer = surf_data.file_localizer is not None
features_selected = {}
if has_localizer:
    features = FeatureSelection.from_yaml(CONFIG_IN)
    features_selected = features.sort_features(config_model.radius, config_model.nmax)

# time series preprocessing: detrend first, then crop
preproc = TimeseriesPreproc.from_yaml(CONFIG_IN)
# the return value of the detrending step is not needed here
_ = preproc.detrend_timeseries(config_data.tr, config_data.cutoff_sec)
data_vol, events = preproc.crop_data(config_data.n_skip)

# control condition: shuffle every element of events in place
# NOTE(review): no random seed is set in this cell, so the shuffled control is
# not reproducible unless a seed is set elsewhere -- confirm
if config_model.randomize_labels:
    for i in events:
        np.random.shuffle(i)

# process one surface (layer) after the other; the number of layers is taken
# from the left-hemisphere file list
n_surf = len(surf_data.file_layer["lh"])
for i in range(n_surf):
    data_sampled = {}
    for hemi in ("lh", "rh"):
        vtx, fac = surf_data.load_layer(hemi, i)
        sampler = TimeseriesSampling(vtx, fac, data_vol)
        # sample the time series onto the surface
        file_deformation = config_data.file_deformation
        file_reference = time_data.file_series[0]
        data_sampled[hemi] = sampler.sample_timeseries(file_deformation, file_reference)
        # optional filtering, skipped when filter_size is falsy
        if config_data.filter_size:
            label = surf_data.load_label_intersection(hemi)
            data_sampled[hemi] = sampler.filter_timeseries(
                label, config_data.filter_size
            )

    if has_localizer:
        mvpa = MVPA.from_selected_data(data_sampled, features_selected, events)
    else:
        # without localizer: restrict every element of the sampled data to the
        # rows given by the label intersection of the hemisphere
        for hemi in ("lh", "rh"):
            label = surf_data.load_label_intersection(hemi)
            data_sampled[hemi] = [run[label, :] for run in data_sampled[hemi]]
        mvpa = MVPA.from_data(
            data_sampled, events, nmax=config_model.nmax, remove_nan=True
        )

    # model preparation and fitting
    # optional scaling of features and samples
    if config_model.feature_scaling:
        mvpa.scale_features(config_model.feature_scaling)
    if config_model.sample_scaling:
        mvpa.scale_samples(config_model.sample_scaling)
    # NOTE(review): evaluate is accessed without being called -- presumably a
    # property that runs the evaluation as a side effect; confirm
    _ = mvpa.evaluate

    # save results
    # NOTE(review): the same file names are used in every layer iteration --
    # presumably the save methods append rather than overwrite; confirm
    for metric in ("accuracy", "sensitivity", "specificity", "f1"):
        mvpa.save_results(dir_out / f"{metric}.csv", metric)

    # compute p-value by permutation sampling and save to disk
    N_ITER = 1000
    for metric in ("accuracy", "sensitivity", "specificity", "f1"):
        mvpa.save_stats(dir_out / f"pval_{metric}.csv", N_ITER, metric)

print("Done.")