# Ciric QC-FC Computation

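Here we calculate the Quality Control – Functional Connectivity (QC-FC) relation as described in Ciric et al. 2018. For every edge $(i, j)$ this is the Pearson correlation, across scans, between mean framewise displacement (FD) and the functional connectivity (FC) of that edge: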

$$QCFC_{i,j} \sim FD \times FC_{i,j} $$

In [8]:
import pandas as pd
import numpy as np
import os
import re

from itertools import groupby

import scipy.stats as scistats

from pandarallel import pandarallel
# Start pandarallel so that pandas objects gain the `parallel_apply` method
# used later on the per-edge groupby in `compute_qcfcs`.
# - nb_workers=12: run on 12 workers
# - progress_bar=True: show a progress widget while the parallel apply runs
# - use_memory_fs=True: use the memory file system to transfer data between
#   the main process and the workers
pandarallel.initialize(
    nb_workers=12,
    progress_bar=True,
    use_memory_fs=True
)

INFO: Pandarallel will run on 12 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [3]:
# Root of the project scratch space; inputs are read from its "output" folder.
scratch = "/scratch/jjeyachandra/multiband-censoring-comparisons/"

# Per-scan connectivity matrices (TSV files whose name contains "sub-").
# os.listdir returns entries in arbitrary order, so sort the listing to make
# the processing order identical from run to run.
subject_csvs = sorted(
    f for f in
    os.listdir(os.path.join(scratch, "output"))
    if "sub-" in f and f.endswith("tsv")
)

# Table of mean framewise displacement; built with os.path.join so the
# trailing slash on `scratch` does not produce a doubled separator.
f_meanfd = os.path.join(scratch, "output", "meanFD.csv")

In [91]:
# One compiled pattern per BIDS entity; each matches the "<key>-<label>"
# chunk of a file name, where the label is made of letters and digits.
SUB_REGEX, SES_REGEX, TASK_REGEX, RUN_REGEX, DESC_REGEX = (
    re.compile(pattern)
    for pattern in (
        "sub-[A-Za-z0-9]+",
        "ses-[A-Za-z0-9]+",
        "task-[A-Za-z0-9]+",
        "run-[A-Za-z0-9]+",
        "desc-[A-Za-z0-9]+",
    )
)

# Entity columns used as join keys between the connectivity and FD tables.
MATCH_ENTITIES = ['sub', 'ses', 'task', 'run']

def extract_csv(csv, entities, base_dir=None):
    '''
    Read one connectivity matrix and return it in long (edge-list) format.

    Steps:
    - Read the tab-separated matrix (first column is used as the index)
    - Keep the upper triangle, diagonal included
    - Remove every edge that touches a "???" parcel
    - Add one constant column per entity
    - Store row/column labels as categorical data

    Parameters
    ----------
    csv : str
        File name of the matrix, relative to `base_dir`.
    entities : iterable of (str, str)
        (column name, value) pairs written to every row of the result.
    base_dir : str, optional
        Directory containing `csv`. Defaults to the "output" folder of the
        notebook-level `scratch` directory.

    Returns
    -------
    pandas.DataFrame
        Columns "row", "column", "value" followed by one column per entity.
    '''

    if base_dir is None:
        base_dir = os.path.join(scratch, "output")

    df = pd.read_csv(
        os.path.join(base_dir, csv),
        sep="\t",
        index_col=0
    )

    # Pivot to long format, keeping only the upper triangle.
    # The builtin `bool` is used because the `np.bool` alias no longer exists
    # in current NumPy releases.
    upper_triangle = np.triu(np.ones(df.shape, dtype=bool))
    # The masked-out lower triangle is NaN after `where`; drop it explicitly
    # instead of relying on `stack` to discard missing values.
    df = df.where(upper_triangle).stack().dropna().reset_index()
    df.columns = ["row", "column", "value"]

    # Remove ??? parcels (literal match, so no regex escaping is needed)
    remove_mask = (
        df["row"].str.contains("???", regex=False)
        | df["column"].str.contains("???", regex=False)
    )
    df = df[~remove_mask].reset_index(drop=True)

    # Set filename information
    for k, v in entities:
        df[k] = v

    # Convert row/column to categorical to save memory
    df['row'] = df['row'].astype('category')
    df['column'] = df['column'].astype('category')
    return df

def remap_dfs2long(subject_list, outdir):
    '''
    Convert connectivity matrices to long form and cache them as parquet.

    For every file name in `subject_list`:
    - parse the sub/ses/task/run/desc entities from the name
    - derive the output path "<outdir>/<entities>_connectivity.parquet"
    - if that file does not exist yet, convert the matrix with
      `extract_csv` and write it (without the index)

    Parameters
    ----------
    subject_list : iterable of str
        Connectivity file names; each must contain all five entities.
    outdir : str
        Directory for the parquet files. Created if it does not exist.

    Returns
    -------
    list of str
        Output paths, one per input file, in input order (including files
        that were already cached).

    Raises
    ------
    ValueError
        If a file name is missing one of the required entities.
    '''

    entity_patterns = (
        ("sub", SUB_REGEX),
        ("ses", SES_REGEX),
        ("task", TASK_REGEX),
        ("run", RUN_REGEX),
        ("desc", DESC_REGEX),
    )

    # Writing a parquet file does not create missing parent directories
    os.makedirs(outdir, exist_ok=True)

    result_list = []
    for csv in subject_list:
        parsed = []
        for key, pattern in entity_patterns:
            match = pattern.search(csv)
            if match is None:
                # Fail with the offending file name instead of an opaque
                # "'NoneType' object is not subscriptable"
                raise ValueError(
                    f"Could not find a '{key}-' entity in file name: {csv}"
                )
            parsed.append((key, match[0]))
        sub_entities = tuple(parsed)

        out_file = "_".join([v for _, v in sub_entities])
        out_csv = f"{outdir}/{out_file}_connectivity.parquet"
        result_list.append(out_csv)

        # Already converted on a previous run: reuse the cached file
        if os.path.exists(out_csv):
            continue
        df = extract_csv(csv, sub_entities)
        df.to_parquet(out_csv,
                      index=False)
    return result_list

def construct_df(parquets):
    '''
    Load several long-format parquet files into a single table.

    - Read every file in `parquets` and stack them row-wise
    - Cast the entity columns (sub, ses, task, run, desc) to category
    '''
    entity_columns = ['sub','ses','task','run','desc']

    frames = [pd.read_parquet(path) for path in parquets]
    combined = pd.concat(frames)

    as_category = combined[entity_columns].astype('category')
    combined[entity_columns] = as_category
    return combined

def split_entities(row):
    '''
    Add one field per BIDS entity to `row`, parsed from `row.entity`.

    Each of sub/ses/task/run/desc is set to the matching "<key>-<label>"
    text, or to NaN when the entity string does not contain that key.
    The (modified) row is returned.
    '''
    entity_string = row.entity

    patterns = (
        ("sub", SUB_REGEX),
        ("ses", SES_REGEX),
        ("task", TASK_REGEX),
        ("run", RUN_REGEX),
        ("desc", DESC_REGEX),
    )

    for key, pattern in patterns:
        match = pattern.search(entity_string)
        if match:
            row[key] = match[0]
        else:
            row[key] = np.nan
    return row


def _filter_by_entities(frame, filters):
    '''Return the rows of `frame` whose columns equal every value in `filters`.'''
    keep = np.ones(len(frame), dtype=bool)
    for entity, wanted in filters.items():
        keep &= np.asarray(frame[entity] == wanted, dtype=bool)
    return frame.iloc[keep, :]


def compute_qcfcs(df_list, fd, filters=None):
    '''
    Compute QC-FC statistics for each method.

    - Group the parquet files by method (their "desc-" entity)
    - Restrict connectivity and FD tables to the rows matching `filters`
    - Left-join the FD table onto the connectivity table on MATCH_ENTITIES
    - For each (row, column) edge compute the statistics returned by `qcfc`
    - Tag each result table with its method and concatenate them

    Uses pandarallel to parallelize across edge groups
    to speed up computation significantly.

    Parameters
    ----------
    df_list : iterable of str
        Paths of long-format parquet files; each must contain a "desc-"
        entity. They do not need to be pre-sorted.
    fd : pandas.DataFrame
        FD table holding the MATCH_ENTITIES columns, every column named in
        `filters`, and the `mean_fd` column read by `qcfc`.
    filters : dict, optional
        Mapping of column name to required value. When omitted (or empty)
        the defaults ses-01 / task-rest / run-1 are used.

    Returns
    -------
    pandas.DataFrame
        One row per edge and method, with the fields produced by `qcfc`
        plus a `method` column.
    '''

    FILTERS = {
    'ses': 'ses-01',
    'task': 'task-rest',
    'run': 'run-1'
    }

    if not filters:
        filters = FILTERS

    def method_of(path):
        return DESC_REGEX.search(path)[0]

    # itertools.groupby only merges *adjacent* items with equal keys, so sort
    # by method first; otherwise a method could be processed in several
    # partial batches.
    df_by_method = groupby(sorted(df_list, key=method_of), key=method_of)

    # The FD subset is the same for every method: filter it once, and leave
    # the caller's `fd` name untouched.
    fd_filtered = _filter_by_entities(fd, filters)

    results = []
    for m, dfs in df_by_method:
        print(f"Processing method: {m}")
        df = _filter_by_entities(construct_df(list(dfs)), filters)

        fcqc = df.merge(fd_filtered, how='left',
                        left_on=MATCH_ENTITIES,
                        right_on=MATCH_ENTITIES)

        # observed=True: row/column are categorical, so only form groups for
        # edges that actually occur rather than every label combination.
        res = pd.DataFrame(
                list(
                    fcqc.groupby(['row','column'], observed=True)\
                      .parallel_apply(qcfc))
        )
        res['method'] = m
        results.append(res)

    return pd.concat(results)
      
def qcfc(g):
    '''
    Compute the Pearson correlation between mean FD and edge value.

    Parameters
    ----------
    g : pandas.DataFrame
        Rows belonging to a single edge. Must provide the columns
        `row`, `column`, `mean_fd` and `value`.

    Returns
    -------
    dict
        Keys `source`, `target`, `pearson`, `pvalue`. The statistics are NaN
        for self-connections (row == column) and for edges with fewer than
        two complete (mean_fd, value) pairs.
    '''

    results = {
        'source': np.nan,
        'target': np.nan,
        'pearson': np.nan,
        'pvalue': np.nan,
    }

    # Nothing to describe for an empty group
    if len(g) == 0:
        return results

    # Positional access: a label-based lookup returns a Series (not a scalar)
    # whenever the group's index contains duplicate labels.
    row = g['row'].iloc[0]
    col = g['column'].iloc[0]
    results.update({
        'source': row,
        'target': col
    })

    if row == col:
        return results

    # Rows without an FD value (or without an edge value) cannot contribute
    # to the correlation; keeping them would turn the whole edge into NaN.
    pairs = g[['mean_fd', 'value']].dropna()

    # pearsonr needs at least two observations and raises otherwise
    if len(pairs) < 2:
        return results

    r, p = scistats.pearsonr(pairs['mean_fd'], pairs['value'])
    results.update({
        'pearson': r,
        'pvalue': p
    })
    return results

In [5]:
# Convert every connectivity matrix to a long-format parquet file (files that
# already exist are reused), then order the resulting paths by their "desc-"
# entity so that files of the same method sit next to each other.
df_list = sorted(
    remap_dfs2long(subject_csvs, "../../data/intermediate/"),
    key=lambda path: DESC_REGEX.search(path)[0],
)

# Mean FD table: parse the entity string of every row into separate columns,
# then drop every column that contains at least one missing value.
fd_with_entities = pd.read_csv(f_meanfd).apply(split_entities, axis=1)
fd = fd_with_entities.dropna(axis=1)

In [92]:
# Compute the per-edge QC-FC statistics for every method found in df_list.
# No filters are passed, so the defaults defined inside compute_qcfcs apply.
res = compute_qcfcs(df_list, fd)

Processing method: desc-base


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=2930), Label(value='0 / 2930'))), …

Processing method: desc-dct


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=2930), Label(value='0 / 2930'))), …

Processing method: desc-fourier


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=2930), Label(value='0 / 2930'))), …

Processing method: desc-powers


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=2930), Label(value='0 / 2930'))), …

In [95]:
# Write the combined QC-FC table to the "output" folder of the scratch directory
res_file = os.path.join(scratch,'output','QCFC_results.parquet')
res.to_parquet(res_file)