In [None]:
from operator import itemgetter
import pandas as pd
import numpy as np
import statsmodels as stt
import scipy.stats as sst
import scipy.linalg as lin
import os.path as osp
from statsmodels import api as sm
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
import json
from time import gmtime, strftime
%matplotlib inline

In [None]:
# Show the working directory so the relative data path below can be checked.
print(osp.realpath(osp.curdir))

# Query export analysed in this run. Earlier exports, kept for reference:
#   './data/2019-12-03-simple2_query_output.csv'
#   './data/2019-12-21-simple2_query_output2.csv'
#   './data/2020-02-11-simple2_query_output.csv'
relative_dir = './data'
csv_filename = '2020-03-26_simple2_query_output.csv'

relative_path_filename = osp.join(relative_dir, csv_filename)
# Stop here if the export is missing instead of failing later while reading it.
assert osp.exists(relative_path_filename)
print(relative_path_filename)

In [None]:

# Load the query export; the literal string 'nd' is read as a missing value.
hie = pd.read_csv(relative_path_filename, na_values='nd') #, low_memory=False)
original_col_names = list(hie)
# Column names must be unique (duplicates would make column selection ambiguous).
assert len(original_col_names) == len(set(original_col_names))
print(list(hie))
# Rename 'federatedLabel' to 'structure', the column name used by the condition masks below.
col_rename = {'federatedLabel':'structure'}
hie.rename(columns=col_rename, inplace=True)
print(list(hie))

In [None]:
# NOTE(review): only the last expression of a cell is displayed, so this set is
# computed and then discarded.
set(hie['structure'])
# softwareLabel values that contain both 'CC_' and 'Volume_mm3'.
[roi for roi in set(hie['softwareLabel']) if ('CC_' in roi and 'Volume_mm3' in roi)]

In [None]:
# Read the JSON mapping file; the loop in the next cell uses the 'isAbout' and
# 'label' fields of its entries.

# mapping_file = '../segstats_jsonld/segstats_jsonld/mapping_data/freesurfermap.json'
mapping_file = '../segstats_jsonld/segstats_jsonld/mapping_data/freesurfer-cdes.json'
# Fail early if the mapping file is not at the expected relative location.
assert osp.exists(mapping_file)
with open(mapping_file, "r") as read_file:
    roi_map = json.load(read_file)

In [None]:
# Build two lookup tables from the mapping file:
#   label2ube : data-element label -> 'isAbout' identifier
#   ube2h     : 'isAbout' identifier -> human-readable ROI name, i.e. the first
#               label seen for that identifier with side/unit markers removed
#               eg: {'http://purl.obolibrary.org/obo/UBERON_0001874': 'Putamen'}
# ube2h is used to select the same ROI (eg Putamen) across software.

ube2h = {}
label2ube = {}
countok = 0
has_no_isAbout = []
has_no_label = []

for (k, v) in roi_map.items():

    # 'count' sits at the same level as the data elements and is not one of them;
    # entries without an 'isAbout' link are ignored.
    if k == 'count' or 'isAbout' not in v:
        continue

    countok += 1

    if 'label' not in v:
        continue

    # An empty label or the literal text 'None'/'none' counts as "no label".
    if v['label'] == '' or v['label'] in ('None', 'none'):
        has_no_label.append(k)
        continue

    label2ube[v['label']] = v['isAbout']

    # Keep only the first label met for a given identifier.
    if v['isAbout'] in ube2h.keys():
        continue

    # Strip the side and unit markers, in this order.
    no_right_or_left = v['label']
    for token in ('Right-', 'Right ', 'Left-', 'Left ', ' NVoxels', ' (mm^3)'):
        no_right_or_left = no_right_or_left.replace(token, '')
    ube2h[v['isAbout']] = no_right_or_left

# NOTE(review): nothing ever appends to has_no_isAbout, so this assertion always passes.
assert has_no_isAbout == []
# Every entry that has an 'isAbout' must have produced a distinct, non-empty label.
assert countok == len(label2ube)

In [None]:
# NOTE(review): the two bare expressions below display nothing (the first ends
# with ';', the second is not the last statement of the cell).
label2ube;
ube2h
# Inverse lookup: ROI name -> identifier. If two identifiers share a name,
# the one iterated last wins.
h2ube = {v: k for k, v in ube2h.items()}

In [None]:
print(ube2h)

In [None]:
def split_op_merge(df, index='ID', col='softwareLabel', 
                 values='volume', op='+', newcol='newCol', keepcols=[], verbose=False):
    """
    Combine, for each `index` value, the `values` of all categories found in `col`.

    The long table `df` is pivoted to one row per `index` value and one column
    per distinct value of `col`; rows with a missing value for any of these
    columns are dropped, and the remaining columns are combined with `op` into
    the single column `newcol`. The columns listed in `keepcols` are then
    merged back on `index`.

    The input dataframe is left untouched.

    input
    -------
    df: dataframe in long format
    index: str
        column identifying a row of the returned df (returned as a column)
    col: str
        column whose distinct values are the categories to combine
    values: str
        column holding the numbers to combine
    op: str
        only '+' (sum) is implemented
    newcol: str
        name of the column that receives the combined value
    keepcols: list
        column names of df to carry over to the result; must not contain `index`
    verbose: bool
        print the intermediate lengths

    returns
    -------
    dataframe with columns [index, newcol] + keepcols, without missing values

    raises
    -------
    NotImplementedError if `op` is not '+'
    ValueError (from pivot) if an (index, col) pair has several different values
    """
    keepcols = list(keepcols)  # private copy, the default list is never modified

    assert set(keepcols).issubset(list(df)) # 
    # `index` is always part of the result, it must not be requested twice
    assert index not in keepcols

    if op != '+':
        raise NotImplementedError("not implemented: {}".format(op))

    if verbose: print('len(df): ', len(df))

    tmp_cols = list(set(df[col])) # find values in columns : on what we split
    if verbose: print('tmp_cols: ', tmp_cols)

    # work on a de-duplicated copy so that the caller's dataframe is not modified
    dedup = df.drop_duplicates([index, col, values])
    if verbose: print('len(df.dropduplicates): ', len(dedup))

    # one row per `index`, one column per category; keep complete rows only
    wide = dedup.pivot(index=index, columns=col, values=values).dropna()
    if verbose: print('len(newdf: ', len(wide))

    # sum across categories and put `index` back in a regular column
    newdf = wide.sum(axis=1).rename(newcol).reset_index()

    # now, create a df with keepcols:
    keepdf = dedup[[index] + keepcols].drop_duplicates()
    newdf = pd.merge(left=newdf, right=keepdf, on=index, how='inner').dropna()
    if verbose: print('after merge and dropna len(newdf): ', len(newdf))

    return newdf

# new_tmp = split_op_merge(tmp, 'ID', 'softwareLabel', 'volume', 'CC_vol', 
#                        ['study', 'Age', 'Gender']) # , 'Age', 'dx', 'Gender', 'FIQ'])

In [None]:
# A few lookup tables used by define_conditions.

# Short tool name -> value found in the 'tool' column.
tooldic = {'surfer':'https://surfer.nmr.mgh.harvard.edu/', 
       'fsl':'http://purl.org/nidash/fsl#',
       'ants':'http://stnava.github.io/ANTs/'}
# Values of the 'dx' column that select each group.
normalDev = (2, '2', 'Typically Developing Children')
patient = (1, '1', 'ADHD-Combined', 'ADHD-Hyperactive/Impulsive', 'ADHD-Inattentive')

In [None]:
def define_conditions(df, tooldic={}, normalDev=(), patient=(), h2ube={}):
    """
    Create the dict of row conditions associated with dataframe df.

    Each value is a boolean Series with the index of df that selects some
    **rows** of the dataframe based on the values of the columns
    (eg: all normal participants). Missing values never satisfy a condition,
    except 'latNan' which is true where laterality is neither left nor right.

    input
    -------
    df: dataframe with the columns 'laterality', 'Age', 'tool', 'FIQ', 'dx',
        'softwareLabel', 'structure', 'study' and 'Gender'
    tooldic: dict
        must have the keys 'surfer', 'ants' and 'fsl'; the values are compared
        with the 'tool' column
    normalDev, patient: tuple
        values of the 'dx' column that define each group; an empty tuple gives
        an all-False condition
    h2ube: dict
        ROI name -> value of the 'structure' column; must have the keys
        'Caudate', 'Putamen', 'Estimated Total Intracranial Volume',
        'hemisphere cerebral white matter volume', 'Total gray matter volume',
        'CSF', 'CC_Anterior', 'CC_Central' and 'CC_Posterior'

    returns
    -------
    dict: condition name -> boolean Series

    raises
    -------
    KeyError if a required column or dictionary key is missing
    """

    def _equals_any(column, values):
        # Rows where `column` equals at least one of `values`. Starting from an
        # all-False Series (not the scalar False) keeps the result a Series
        # even when `values` is empty.
        mask = pd.Series(False, index=column.index, dtype=bool)
        for value in values:
            mask = mask | (column == value)
        return mask

    diccond = {}

    #========== laterality
    laterality = df['laterality']
    diccond['left'] = ((laterality == 'L') | (laterality == 'Left'))
    diccond['right'] = ((laterality == 'R') | (laterality == 'Right'))
    # neither left nor right (this includes missing laterality)
    diccond['latNan'] = ~(diccond['left'] | diccond['right'])

    #========== age 
    diccond['age<=20'] = (df['Age'] <= 20)
    diccond['age<20'] = (df['Age'] < 20)
    diccond['age<12'] = (df['Age'] < 12)
    diccond['age>=12'] = (df['Age'] >= 12)

    #========== tool conditions
    diccond['fs'] = (df['tool'] == tooldic['surfer'])
    diccond['ants'] = (df['tool'] == tooldic['ants'])
    diccond['fsl'] = (df['tool'] == tooldic['fsl'])

    #========== IQ conditions
    diccond['fiq>0'] = (df['FIQ'] > 0 )

    #========== disease conditions
    diccond['normDev'] = _equals_any(df['dx'], normalDev)
    diccond['patient'] = _equals_any(df['dx'], patient)

    #=========== ROIs
    labels = df['softwareLabel']
    structure = df['structure']
    diccond['bvol'] = (labels == 'BVOL (mm^3)')
    diccond['brainseg'] = (labels == 'Brain Segmentation Volume (mm^3)')
    diccond['brainsegwov'] = (labels == 'Brain Segmentation Volume Without Ventricles (mm^3)')
    diccond['caudate'] = (structure == h2ube['Caudate'])
    diccond['putamen'] = (structure == h2ube['Putamen'])
    diccond['TIV'] = (structure == h2ube['Estimated Total Intracranial Volume'])
#
    diccond['wm'] =  (structure == h2ube['hemisphere cerebral white matter volume'])
    diccond['gm'] =  (structure == h2ube['Total gray matter volume'])
    diccond['csf'] =  (structure == h2ube['CSF'])
#
    diccond['wmfsl'] =  (labels == 'white (mm^3)')
    diccond['gmfsl'] =  (labels == 'gray (mm^3)')
    diccond['csffsl'] =  (labels == 'csf (mm^3)')
#
    diccond['ccant'] =  (structure == h2ube['CC_Anterior'])
    diccond['cccen'] =  (structure == h2ube['CC_Central'])
    diccond['ccpos'] =  (structure == h2ube['CC_Posterior'])

    #=========== CC all subcomponents
    # Labels that contain both 'CC_' and 'Volume_mm3'. Non-string entries
    # (missing labels) are skipped instead of raising a TypeError, and isin
    # returns an all-False Series when no such label exists.
    CC_rois = [roi for roi in labels.dropna().unique()
                           if isinstance(roi, str) and 'CC_' in roi and 'Volume_mm3' in roi]
    diccond['ccfs'] = labels.isin(CC_rois)

    #=========== site
    # na=False: a missing study is "not in this site" rather than NaN, so the
    # result is a plain boolean mask
    diccond['abide'] = (df['study'].str.contains("ABIDE", na=False))
    diccond['adhd200'] = (df['study'].str.contains("ADHD", na=False))

    #=========== Gender
    diccond['male'] = (df['Gender']=='Male')
    diccond['female'] = (df['Gender']=='Female')

    return diccond

In [None]:
# Boolean row masks for the full table, keyed by condition name.
condic = define_conditions(hie, tooldic=tooldic, normalDev=normalDev, patient=patient, h2ube=h2ube)
condic.keys()

In [None]:
def apply_cond(df, cndc, conditions, dropnaset=[],columns={}):
    """
    Select the rows of df that satisfy all the named conditions.

    input:
    -------

    df: dataframe
    conditions: list of strings, each str should be a key of the cndc dict
    cndc: dict
        a dict (eg returned by define_conditions on this same df) that maps a
        condition name to a boolean mask with one entry per row of df;
        missing entries in a mask count as False
    dropnaset: list
        the list of column names in which NaN should be dropped
    columns: dict
        the dict given to df.rename to rename columns eg: {'oldname':'newname'}

    returns
    -------
    dataframe (a copy), containing only rows that satisfy the list of
    conditions 'conditions'; an empty list of conditions keeps every row.
    A warning is printed when no row is left.

    raises
    -------
    KeyError if a name in `conditions` is not a key of cndc

    """
    unknown = [c for c in conditions if c not in cndc]
    if unknown:
        raise KeyError('unknown condition(s): ' + ', '.join(str(c) for c in unknown))

    # initialize array of True 
    cond = np.full((len(df),), True, dtype=bool)

    for c in conditions:
        mask = cndc[c]
        assert len(cond) == len(mask)
        if isinstance(mask, pd.Series):
            # a missing entry means "condition not met"
            mask = mask.where(mask.notna(), False).astype(bool)
        cond = cond & mask

    # make a copy, drop na if there are some
    tmp = df.loc[cond].dropna(subset=list(dropnaset))
    if columns:
        tmp = tmp.rename(columns=columns)

    if len(tmp) == 0:
        print('Warning, len(df)==0 for conditions: ' + ' '.join(conditions))

    return tmp

In [None]:
# Select the rows matched by the 'ccfs' condition and list their labels.
cond = ['ccfs']
condic = define_conditions(hie, tooldic=tooldic, 
                           normalDev=normalDev, patient=patient, h2ube=h2ube)
tmp = apply_cond(hie, condic, cond)
set(tmp['softwareLabel'])

In [None]:
# Sum, per ID, the volumes of the selected labels into a single 'CC_vol' column.
new_tmp = split_op_merge(tmp, index='ID', col='softwareLabel', values='volume', newcol='CC_vol', 
                        keepcols=['study', 'Age', 'Gender'], verbose=True) # , 'Age', 'dx', 'Gender', 'FIQ'])

In [None]:
softw = 'surfer'

In [None]:
# Sanity check: apply_cond with named conditions must select as many rows as
# the same filter written by hand (softw is set in the previous cell).
# hyp1 = ['bvol',softw,'left','age<=20','fiq>0','normDev']
# hyp1 = ['fiq>0','abide','caudate', 'putamen']
hyp1 = ['female', 'caudate', 'fs', 'fiq>0'] # ,'abide']
condic = define_conditions(hie, tooldic=tooldic, normalDev=normalDev, patient=patient, h2ube=h2ube)
tmp = apply_cond(hie, condic, hyp1)
# print('apply:', len(tmp.dropna(subset=['FIQ', 'volume', 'Gender'])))

manual = hie.loc[(hie['Gender']=='Female') & 
#                 (hie['study'].str.contains("ABIDE")) &
                 (hie['structure'] == h2ube['Caudate']) &
                 (hie['tool'] == tooldic[softw]) & 
                 (hie['FIQ'] > 0) ] 
manual = manual.dropna(subset=['FIQ', 'volume', 'Gender']) #,inplace=True)
print('manual: ',len(manual))
# Both selections are compared after dropping rows with a missing FIQ, volume or Gender.
assert len(tmp.dropna(subset=['FIQ', 'volume', 'Gender'])) == len(manual)

In [None]:
# Number of distinct IDs among the rows matching the 'adhd200' condition.
#cond = ['abide'] #,'fiq>0'] # ,'abide']
cond = ['adhd200'] #,'fiq>0'] # ,'abide']
dropnaset = [] #'FIQ']
tmp_df = apply_cond(hie, condic, cond, dropnaset=dropnaset, columns={'volume':'brainvol'})
len(set(tmp_df['ID']))

In [None]:
structures = set(hie.structure)


Hypotheses

PIET-1: Total Brain Volume will positively correlate with IQ (in both sexes across the complete age range).

MAC-1: Left striatum volume (caudate + putamen) will positively correlate with IQ in the total (male + female) child (age < 20) group.

MAC-2: Left striatum volume (caudate + putamen) will positively correlate with IQ in the male children group.

MAC-3: Left striatum volume (caudate + putamen) will not correlate with IQ in the female children group.

GANJ-1: Total Corpus Callosum midsagittal area, after correcting for total brain volume, will negatively correlate with IQ.

GANJ-2: Total Corpus Callosum midsagittal area, after correcting for total brain volume, will negatively correlate with IQ in the young (age < 12) group.

GANJ-3: Total Corpus Callosum midsagittal area, after correcting for total brain volume, will not significantly correlate with IQ in the adolescent (age > 12) group.

GANJ-4: Total Corpus Callosum midsagittal area, after correcting for total brain volume, will negatively correlate with IQ in the male (age < 12) group.

GANJ-5: Total Corpus Callosum midsagittal area, after correcting for total brain volume, will not significantly correlate with IQ in the female (age < 12) group.


Kennedy.Dave: Yes, please add then ants (and FSL for the first 4 hypotheses) when you get the chance...

#### Can you also do the following extra analyses:  

GANJ-0a 
Corpus Callosum Area vs. Total Brain Volume covary for site

GANJ-0b
Corpus Callosum Area vs. Age, covary for site

Re-run the MAC hypotheses using total brain volume as a covariate

Correct hypotheses : 
[hypotheses](https://docs.google.com/spreadsheets/d/1IYbDCvisOjblUkuBs8GN6--gjVSe_qBeR0eGIlTShYU/edit#gid=1945227897)

summary variables for all models: mean, sd, min, max?

In [None]:

def md2dic(varnames, hyp_name, cond, mdf=None, debug=False):
    """
    Pull the statistics of one regressor out of a fitted statsmodels model
    (as returned by the fit of a statsmodels.formula model, imported as smf).

    The t and p values are read from the coefficient table of mdf.summary()
    (second table of the summary), in the row whose first cell is varnames[1].

    input
    -------
    varnames: list of string (variables in the model)
        varnames[1] is the name of the variable for which we want p, t, nobs, etc
    hyp_name: string
        name of the hypothesis, used as the key of the returned dict
    cond: list
        list of string representing the conditions, stored as is
    mdf: fitted model
        must provide summary(), rsquared_adj and nobs (required, despite the
        None default)
    debug: bool
        print the intermediate values

    returns
    -------
    dict 
        single key hyp_name; its value is a dict with the keys 'P>|t|', 't',
        'rsquared_adj', 'nobs', 'conditions' and 'variables'

    raises
    -------
    IndexError if no row of the coefficient table is named varnames[1]
    """

    target = varnames[1]
    coef_table = mdf.summary().tables[1].data
    header = coef_table[0]
    if debug:
        print('varname', target)
        print('hyp_name', hyp_name)
        print('table2[0] :',header)

    # all the rows named after the variable of interest; the first one is used
    matches = []
    for row in coef_table:
        if row[0] == target:
            matches.append(row)
    if debug:
        print(coef_table)
        print('the row: ', matches)
    row = matches[0]

    stats = {}
    stats['P>|t|'] = row[header.index('P>|t|')]
    stats['t'] = row[header.index('t')]
    stats['rsquared_adj'] = "{:4.3}".format(mdf.rsquared_adj)
    stats['nobs'] = "{:3d}".format(int(mdf.nobs))
    stats['conditions'] = cond
    stats['variables'] = varnames

    return {hyp_name: stats}

In [None]:
def run_hyp(datadf, hyp_name, hyp, ynxs, resdic=None, correctfor=None, debug=False):
    """
    construct the formula and runs a specific hypothesis on dataframe datadf
    the ynxs contain the variables names. The result is added to resdic
    (a new dictionary is created when resdic is None).

    datadf: pandas dataframe
    hyp_name: str
        describe the hypothesis, key of the new entry
    hyp: list
        list of string describing the hyp
    ynxs: list of str (Ys and Xs)
        [0] : explained var Y
        [1] : region X1 (for which we want the stats)
        [2] : X2, ... additional covariables of the form : ' X2 + X3 ... '
    resdic: dict
        current dictionary of results to which these results are added
    correctfor: list of strings
        before computing the correlation, remove from Y what is explained by
        the design columns whose names contain one of the strings

    returns
    --------
    updated result dictionary. On top of the md2dic fields the entry holds
        'corrcoef': Pearson correlation (a scalar) between Y and X1, with Y
                    corrected as described above when correctfor is given
        'exogcorrectionshape': shape of the matched design columns, or 'NA'
        'exogcorrectionnames': names of the matched design columns
    """

    assert len(ynxs) == 3 # 
    if resdic is None:
        # a fresh dict per call: a shared default would leak results between calls
        resdic = {}

    md = smf.ols(ynxs[0] + " ~ " + ynxs[1] + " + " + ynxs[2], data=datadf)
    mdf = md.fit()
    resdicupdate = md2dic(ynxs, hyp_name, hyp, mdf=mdf, debug=debug)
    entry = resdicupdate[hyp_name]

    # Take X1 from the design matrix when it is one of its columns: it is then
    # row-aligned with md.endog even if rows with missing values were dropped
    # when the model was built.
    if ynxs[1] in md.exog_names:
        x_values = md.exog[:, md.exog_names.index(ynxs[1])]
    else:
        x_values = datadf[ynxs[1]]

    if correctfor is None:
        entry['corrcoef'] = np.corrcoef(md.endog, x_values)[0,1]
        entry['exogcorrectionshape'] = 'NA'
        entry['exogcorrectionnames'] = []
    # get columns in correctfor    # correctfor = ['study']
    else:
        indx = sorted({md.exog_names.index(col) for correct in correctfor
                       for col in md.exog_names if correct in col})

        # C for confound. The intercept is projected out together with the
        # matched columns: the columns of a categorical variable do not cover
        # its reference level, so without the intercept that level would keep
        # its own mean while all the others are centred.
        proj = list(indx)
        if 'Intercept' in md.exog_names:
            intercept_idx = md.exog_names.index('Intercept')
            if intercept_idx not in proj:
                proj.append(intercept_idx)

        if proj:
            C = md.exog[:, proj]
            endog_corrected = md.endog - C @ (lin.pinv(C) @ md.endog)
        else:
            endog_corrected = md.endog

        # NOTE(review): only Y is corrected, X1 is used as is - confirm that a
        # partial correlation (both sides corrected) is not what is wanted.
        entry['corrcoef'] = np.corrcoef(endog_corrected, x_values)[0,1]
        entry['exogcorrectionshape'] = md.exog[:,indx].shape
        entry['exogcorrectionnames'] = [md.exog_names[idx] for idx in indx]

    # update resdic with resdicupdate dictionary
    resdic.update(resdicupdate)
    return resdic
    
"""    
iq = 'FIQ'
tmp = PIET_1_fs_df
# md = smf.ols(iq + " ~ Q('volume') + Gender + Age + study ", data=tmp) #  
md = smf.ols(iq + " ~ " + volume_of + " + study ", data=tmp) #  
mdf = md.fit()
print(mdf.summary())
resdic.update(md2dic(volume_of, hyp_name, hyp1, mdf=mdf))
""";

### PIET-1: Total Brain Volume will positively correlate with IQ (in both sexes across the complete age range).


In [None]:
# Recompute the row masks for the full table.
condic = define_conditions(hie, tooldic=tooldic, normalDev=normalDev, patient=patient, h2ube=h2ube)
print(condic.keys())
# Columns passed as dropnaset to apply_cond in the selections below.
dropnaset = ['FIQ', 'volume', 'Gender']


In [None]:
"""volume_of = 'brainseg'
hyp_name = 'PIET-1_fs'

print(list(hie))

PIET_1_fs = ['brainseg', 'fiq>0','normDev', 'fs'] # 'bvol','ants','abide']
PIET_1_fs_df = apply_cond(hie, condic, PIET_1_fs, dropnaset=dropnaset, 
                          columns={'volume':volume_of})

print('len(PIET_1_fs_df)',len(PIET_1_fs_df))
print(list(PIET_1_fs_df))

modeldata = smf.ols('FIQ ~ brainseg + study', data=PIET_1_fs_df)
modeldatafit = modeldata.fit()

print(modeldatafit)
# resdic = run_hyp(PIET_1_fs_df, hyp_name, PIET_1_fs, ['FIQ', volume_of, 'study'], 
#                 resdic={}, debug=False)
""";

In [None]:
#plt.plot(PIET_1_fs_df['FIQ'],PIET_1_fs_df['brainseg'],'.')
"""
print(modeldata.endog_names, modeldata.exog_names)

# get columns in correctfor
print(modeldata.exog.shape)
correctfor = ['study']
indx = []
for correct in correctfor:
    indx += [modeldata.exog_names.index(col) for col in modeldata.exog_names if correct in col]

print(list(set(indx)))
print(modeldata.exog[:,indx].shape)

assert np.alltrue(modeldata.exog[:,0]==1)
assert np.alltrue(modeldata.endog[:] == PIET_1_fs_df['FIQ'])

#modeldatafit.predict()
print(modeldata.exog[500:560,indx])
print(indx)
""";

In [None]:
# PIET-1 with the 'brainseg' volume and the 'fs' tool condition: FIQ ~ brainseg + study.
volume_of = 'brainseg'
hyp_name = 'PIET-1_fs'

PIET_1_fs = ['brainseg', 'fiq>0','normDev', 'fs'] # 'bvol','ants','abide']
PIET_1_fs_df = apply_cond(hie, condic, PIET_1_fs, dropnaset=dropnaset, 
                          columns={'volume':volume_of})
print('len(PIET_1_fs_df)',len(PIET_1_fs_df))

# resdic={} starts the results dictionary; the later cells pass resdic=resdic
# to add their entry to it. Among the calls in this notebook section, this is
# the only one that passes correctfor.
resdic = run_hyp(PIET_1_fs_df, hyp_name, PIET_1_fs, ['FIQ', volume_of, 'study'], 
                 resdic={}, correctfor = ['study'], debug=False)
# Raw correlation between FIQ and brainseg, printed next to the stored entry.
print(np.corrcoef(PIET_1_fs_df['FIQ'],PIET_1_fs_df['brainseg'])[0,1])
print(resdic[hyp_name])

In [None]:
# PIET-1 with the 'fsl' tool condition: the brain volume is the sum of the
# 'gray', 'white' and 'csf' volumes of each ID.
volume_of = 'fsl_brainvol'
hyp_name = 'PIET-1_fsl'

PIET_1_fsl = ['fiq>0','normDev', 'fsl']
fsl_gm = PIET_1_fsl + ['gmfsl'] #
fsl_wm = PIET_1_fsl + ['wmfsl'] #
fsl_csf = PIET_1_fsl + ['csffsl'] #

fsl_gm_df = apply_cond(hie, condic, fsl_gm, dropnaset=dropnaset, columns={'volume':'fsl_gm'})
fsl_wm_df = apply_cond(hie, condic, fsl_wm, dropnaset=dropnaset, columns={'volume':'fsl_wm'})
fsl_csf_df = apply_cond(hie, condic, fsl_csf, dropnaset=dropnaset, columns={'volume':'fsl_csf'})

# Default (inner) merges on ID: only IDs present in all three selections are kept.
fsl_total = pd.merge(left=fsl_gm_df, right=fsl_wm_df[['ID','fsl_wm']], left_on='ID', right_on='ID')
fsl_total = pd.merge(left=fsl_total, right=fsl_csf_df[['ID','fsl_csf']], left_on='ID', right_on='ID')
fsl_total['fsl_brainvol'] = fsl_total['fsl_csf'] + fsl_total['fsl_wm'] + fsl_total['fsl_gm']
print('len(fsl_total)',len(fsl_total))


In [None]:
# FIQ ~ fsl_brainvol + study, stored under 'PIET-1_fsl'.
resdic = run_hyp(fsl_total, hyp_name, PIET_1_fsl, ['FIQ', volume_of, 'study'], resdic=resdic, debug=False)
print(resdic[hyp_name])

##### and with ANTS ?


In [None]:
# PIET-1 with the 'ants' tool condition and the 'bvol' rows: FIQ ~ ants_brainvol + study.
volume_of = 'ants_brainvol'
hyp_name = 'PIET-1_ants'
PIET_1_ants = ['bvol', 'fiq>0', 'normDev', 'ants']
PIET_1_ants_df = apply_cond(hie, condic, PIET_1_ants, dropnaset=dropnaset, columns={'volume':'ants_brainvol'})
print('len(PIET_1_ants_df)',len(PIET_1_ants_df))

resdic = run_hyp(PIET_1_ants_df, hyp_name, PIET_1_ants, ['FIQ', volume_of, 'study'], resdic=resdic, debug=False)
print(resdic[hyp_name])

### MAC-1: Left striatum volume (caudate + putamen) will positively correlate with IQ in the total (male + female) child (age < 20) group.


#### Create df and conditions for left striatum, **all age all gender**

In [None]:
# Conditions shared by the MAC hypotheses; no tool, age or gender restriction yet.
cond_mac = ['fiq>0', 'normDev', 'left'] # , 'fs' ,'age<20', 

mac1_caud = cond_mac + ['caudate'] #
mac1_put = cond_mac + ['putamen'] #
# mac1_tiv = cond_mac + ['fiq>0','normDev','fs','TIV'] #

# Left caudate rows; the 'volume' column is renamed 'caudate'.
left_caud = apply_cond(hie, condic, mac1_caud, dropnaset=dropnaset, columns={'volume':'caudate'})
print(len(left_caud), len(left_caud[(left_caud['tool']==tooldic['surfer'])] )) 

# Left putamen rows; the 'volume' column is renamed 'putamen'.
left_put = apply_cond(hie, condic, mac1_put, dropnaset=dropnaset, columns={'volume':'putamen'})
print(len(left_put), len(set(left_put['ID'])))

#left_caud.head(7)

In [None]:
# Pair caudate and putamen of the same ID measured by the same tool, then sum them.
left_stria = pd.merge(left=left_caud, right=left_put[['ID','putamen','tool']], on=['ID','tool'])
left_stria['striatum'] = left_stria['caudate'] + left_stria['putamen']
print(len(left_stria))

left_stria.head(3)

In [None]:
# Row masks for the striatum table (masks are specific to the dataframe they were built from).
left_stria_condic = define_conditions(left_stria, tooldic=tooldic, 
                                      normalDev=normalDev, patient=patient, h2ube=h2ube)

#print((set(left_stria['dx'])))

### MAC-1: Left striatum volume (caudate + putamen) will positively correlate with IQ in the total (male + female) child (age < 20) group.


#### FS

In [None]:
print(list(left_stria))

In [None]:
# MAC-1, 'fs' tool condition: FIQ ~ striatum_fs + study on the age<20 rows.
volume_of = 'striatum_fs'
hyp_name = 'MAC-1-fs'
mac1_child_fs = ['age<20','fs']
mac1_child_df = apply_cond(left_stria, left_stria_condic, mac1_child_fs, columns={'striatum':'striatum_fs'})
print(len(mac1_child_df))
# print(list(mac1_child_df))
resdic = run_hyp(mac1_child_df, hyp_name, cond_mac+mac1_child_fs, 
                     ['FIQ', volume_of, 'study'], resdic=resdic, debug=False)

print(resdic[hyp_name])

#### FSL

In [None]:
# MAC-1, 'fsl' tool condition: FIQ ~ striatum_fsl + study on the age<20 rows.
volume_of = 'striatum_fsl'
hyp_name = 'MAC-1-fsl'
mac1_child_fsl = ['age<20','fsl']
mac1_child_df = apply_cond(left_stria, left_stria_condic, mac1_child_fsl, 
                                                          columns={'striatum':'striatum_fsl'})
print(len(mac1_child_df))

resdic = run_hyp(mac1_child_df, hyp_name, cond_mac+mac1_child_fsl, 
                     ['FIQ', volume_of, 'study'], resdic=resdic, debug=False)

print(resdic[hyp_name])

In [None]:
# MAC-1, 'ants' tool condition: select the age<20 rows; the model is run in the next cell.
volume_of = 'striatum_ants'
hyp_name = 'MAC-1-ants'
mac1_child_ants = ['age<20','ants'] #,'abide']
mac1_child_df = apply_cond(left_stria, left_stria_condic, mac1_child_ants, 
                                                          columns={'striatum':'striatum_ants'})
print(len(mac1_child_df)) # list(mac1_child_df)
# Each ID must appear only once in the selection.
assert(len(mac1_child_df) == len(set(mac1_child_df['ID'])))
# mac1_child_df.head(5)

In [None]:
# FIQ ~ striatum_ants + study, stored under 'MAC-1-ants' (the entry is not printed here).
resdic = run_hyp(mac1_child_df, hyp_name, cond_mac+mac1_child_ants, 
                     ['FIQ', volume_of, 'study'], resdic=resdic, debug=False)

# print(resdic)

### MAC-2: Left striatum volume (caudate + putamen) will positively correlate with IQ in the male children group.


In [None]:
# MAC-2, 'fs' tool condition: FIQ ~ striatum + study on the male, age<20 rows.
sftw = 'fs'
volume_of = 'striatum'
hyp_name = 'MAC-2-' + sftw
mac2_cond = ['age<20','male', sftw]
mac2_df = apply_cond(left_stria, left_stria_condic, mac2_cond)
print(len(mac2_df))

resdic = run_hyp(mac2_df, hyp_name, cond_mac+mac2_cond, 
                     ['FIQ', volume_of, 'study'], resdic=resdic, debug=False)

print(resdic[hyp_name])

#### MAC-2 ANTS

#### MAC-2 FSL

In [None]:
# MAC-2, 'fsl' tool condition: FIQ ~ striatum + study on the male, age<20 rows.
sftw = 'fsl'
volume_of = 'striatum'
hyp_name = 'MAC-2-' + sftw
mac2_cond = ['age<20','male', sftw]
mac2_df = apply_cond(left_stria, left_stria_condic, mac2_cond)
print(len(mac2_df))

resdic = run_hyp(mac2_df, hyp_name, cond_mac+mac2_cond, 
                     ['FIQ', volume_of, 'study'], resdic=resdic, debug=False)

print(resdic[hyp_name])

In [None]:
# MAC-2, 'ants' tool condition: FIQ ~ striatum + study on the male, age<20 rows.
sftw = 'ants'
volume_of = 'striatum'
hyp_name = 'MAC-2-' + sftw
mac2_cond = ['age<20','male', sftw]

mac2_df = apply_cond(left_stria, left_stria_condic, mac2_cond)
print(len(mac2_df))

resdic = run_hyp(mac2_df, hyp_name, cond_mac+mac2_cond, 
                     ['FIQ', volume_of, 'study'], resdic=resdic, debug=False)

print(resdic[hyp_name])

### MAC-3: Left striatum volume (caudate + putamen) will not correlate with IQ in the female children group.


#### MAC-3 FS

In [None]:
# MAC-3, 'fs' tool condition: FIQ ~ striatum + study on the female, age<20 rows.
sftw = 'fs'
volume_of = 'striatum'
hyp_name = 'MAC-3-' + sftw
hyp_cond = ['age<20','female', sftw]
hyp_df = apply_cond(left_stria, left_stria_condic, hyp_cond)
print(len(hyp_df))

resdic = run_hyp(hyp_df, hyp_name, cond_mac+hyp_cond, 
                     ['FIQ', volume_of, 'study'], resdic=resdic, debug=False)

print(resdic[hyp_name])

#### MAC-3 FSL

In [None]:
# MAC-3, 'fsl' tool condition: FIQ ~ striatum + study on the female, age<20 rows.
sftw = 'fsl'
volume_of = 'striatum'
hyp_name = 'MAC-3-' + sftw
hyp_cond = ['age<20','female', sftw]
hyp_df = apply_cond(left_stria, left_stria_condic, hyp_cond)
print(len(hyp_df))

resdic = run_hyp(hyp_df, hyp_name, cond_mac+hyp_cond, 
                     ['FIQ', volume_of, 'study'], resdic=resdic, debug=False)

print(resdic[hyp_name])

#### MAC-3 ANTS

In [None]:
# MAC-3, 'ants' tool condition: FIQ ~ striatum + study on the female, age<20 rows.
sftw = 'ants'
volume_of = 'striatum'
hyp_name = 'MAC-3-' + sftw # + 'abide'
hyp_cond = ['age<20','female', sftw] #, 'abide']
hyp_df = apply_cond(left_stria, left_stria_condic, hyp_cond)
print(len(hyp_df))

resdic = run_hyp(hyp_df, hyp_name, cond_mac+hyp_cond, 
                     ['FIQ', volume_of, 'study'], resdic=resdic, debug=False)

print(resdic[hyp_name])

### compute cc and tbv  for GANJ, no age or gender condition 

In [None]:
# Recompute the row masks for the full table before the GANJ selections.
condic = define_conditions(hie, tooldic=tooldic, normalDev=normalDev, patient=patient, h2ube=h2ube)
# NOTE(review): not the last statement of the cell, so the keys are not displayed.
condic.keys()
print(list(hie))

In [None]:
"""
sftw = 'fs'
hyp_Ganj = ['fiq>0','normDev', sftw] #, 'adhd200']# , 'adhd' ,'abide','age<20'

hyp_ccant = hyp_Ganj + ['ccant'] #
hyp_cccen = hyp_Ganj + ['cccen'] #
hyp_ccpos = hyp_Ganj + ['ccpos'] #

ccant = apply_cond(hie, condic, hyp_ccant, dropnaset=dropnaset, columns={'volume':'ccant'})
cccen = apply_cond(hie, condic, hyp_cccen, dropnaset=dropnaset, columns={'volume':'cccen'})
ccpos = apply_cond(hie, condic, hyp_ccpos, dropnaset=dropnaset, columns={'volume':'ccpos'})

# Note: Only freesurfer measures CC_* therefore no need to merge on tools as well
cc_df = pd.merge(left=ccant, right=cccen[['ID','cccen']], on=['ID']) #,'tools']) # left_on='ID', right_on='ID')
cc_df = pd.merge(left=cc_df, right=ccpos[['ID','ccpos']], on=['ID']) #,'tools']) # left_on='ID', right_on='ID')

cc_df['cc'] = cc_df['ccant']+cc_df['cccen']+cc_df['ccpos']
len(cc_df), len(ccant), len(cccen), len(ccpos)
# (654, 654, 654, 654)
""";

In [None]:
# Total CC value per ID: sum of the rows matched by 'ccfs' (plus the common GANJ conditions).
sftw = 'fs'
hyp_Ganj = ['fiq>0','normDev', sftw] #, 'adhd200']# , 'adhd' ,'abide','age<20'
cond_ccfs = hyp_Ganj+['ccfs']
cond_ccfs_df = apply_cond(hie, condic, cond_ccfs)

# print(set(ccfs['softwareLabel']))
# 
cc_df = split_op_merge(cond_ccfs_df, index='ID', col='softwareLabel', values='volume', newcol='cc', 
                        keepcols=['study', 'Age', 'Gender'], verbose=True) # , 'Age', 'dx', 'Gender', 'FIQ'])

In [None]:
# Difference between a reference count and the number of cc_df rows whose study contains "ADHD".
# NOTE(review): 664 is hard-coded; where it comes from is not shown here.
tmp_df = cc_df[cc_df['study'].str.contains("ADHD")]
print('664 - len(tmp_df): ', 664 - len(tmp_df))
#tmp_df.head(3)
del tmp_df

In [None]:
# Total brain volume (tbv) per ID: gm + wm + csf rows that have no left/right
# laterality ('latNan'), under the common GANJ conditions.
hyp_ganj = 'GANJ-'

hyp_gm = hyp_Ganj + ['gm','latNan'] # + ['adhd200']#
gm = apply_cond(hie, condic, hyp_gm, dropnaset=dropnaset, columns={'volume':'gm'})
print(gm[['ID','gm','laterality']].head(2),len(gm))

hyp_wm = hyp_Ganj + ['wm','latNan'] #
wm = apply_cond(hie, condic, hyp_wm, dropnaset=dropnaset, columns={'volume':'wm'})
print(wm[['ID','wm','laterality']].head(2),len(wm))

hyp_csf = hyp_Ganj + ['csf','latNan'] #
csf = apply_cond(hie, condic, hyp_csf, dropnaset=dropnaset, columns={'volume':'csf'})
print(csf[['ID','csf','laterality']].head(2),len(csf))

# Default (inner) merges on ID: only IDs present in the three selections are kept.
tbv = pd.merge(left=gm, right=wm[['ID','wm']], left_on='ID', right_on='ID')
tbv = pd.merge(left=tbv, right=csf[['ID','csf']], left_on='ID', right_on='ID')
tbv['tbv'] = tbv['wm'] + tbv['gm'] +  tbv['csf']
tbv.drop(['wm','gm','csf'], axis=1, inplace=True)

print(len(gm), len(wm), len(csf), len(tbv))

In [None]:
# One table with both 'tbv' and 'cc' per ID, and the row masks that go with it.
cc_tbv = pd.merge(left=tbv, right=cc_df[['ID','cc']], left_on='ID', right_on='ID')
condic_cc_tbv = define_conditions(cc_tbv, tooldic=tooldic, normalDev=normalDev, patient=patient, h2ube=h2ube)

In [None]:
list(cc_tbv)

### GANJ-0a Corpus Callosum Area vs. Total Brain Volume covary for site
### GANJ-0b Corpus Callosum Area vs. Age, covary for site

In [None]:
# GANJ-0a/0b/0c: pairwise models between cc, tbv and Age, each with 'study' as
# covariate. Every pair is run in both directions (Y ~ X and X ~ Y) on the
# whole cc_tbv table.
volume_of = 'cc'
hyp_df =  cc_tbv

# cc ~ tbv + study
hyp_name = hyp_ganj + '0a_cc_tbv'
#print(len(hyp_df))
resdic = run_hyp(hyp_df, hyp_name, hyp_Ganj, 
                     [ volume_of, 'tbv', 'study'], resdic=resdic, debug=False)
print(resdic[hyp_name])

# tbv ~ cc + study
hyp_name = hyp_ganj + '0a_tbv_cc'
resdic = run_hyp(hyp_df, hyp_name, hyp_Ganj, 
                     ['tbv',  volume_of, 'study'], resdic=resdic, debug=False)
print(resdic[hyp_name])

# cc ~ Age + study
hyp_name = hyp_ganj + '0b_cc_age'
resdic = run_hyp(hyp_df, hyp_name, hyp_Ganj, 
                     [volume_of, 'Age', 'study'], resdic=resdic, debug=False)
print(resdic[hyp_name])

# Age ~ cc + study
hyp_name = hyp_ganj + '0b_age_cc'
resdic = run_hyp(hyp_df, hyp_name, hyp_Ganj, 
                     ['Age', volume_of, 'study'], resdic=resdic, debug=False)
print(resdic[hyp_name])

# Age ~ tbv + study
hyp_name = hyp_ganj + '0c_age_tbv'
resdic = run_hyp(hyp_df, hyp_name, hyp_Ganj, 
                     ['Age', 'tbv', 'study'], resdic=resdic, debug=False)
print(resdic[hyp_name])

# tbv ~ Age + study
hyp_name = hyp_ganj + '0c_tbv_age'
resdic = run_hyp(hyp_df, hyp_name, hyp_Ganj, 
                     ['tbv', 'Age', 'study'], resdic=resdic, debug=False)
print(resdic[hyp_name])

### GANJ-1: Total Corpus Callosum midsagittal area, after correcting for total brain volume, will negatively correlate with IQ.


In [None]:
# GANJ-1: FIQ ~ cc + study + tbv.
# NOTE(review): the GANJ-1 statement has no age restriction, but the rows are
# limited to 'age<20' here - confirm this is intended.
volume_of = 'cc'
hyp_name = hyp_ganj + '1' # + 'abide'
hyp_cond = ['age<20'] #'fiq>0','normDev','fs','age<=20','ccant'] #
hyp_df = apply_cond(cc_tbv, condic_cc_tbv, hyp_cond)
print(len(hyp_df))

resdic = run_hyp(hyp_df, hyp_name, hyp_Ganj+hyp_cond, 
                     ['FIQ', volume_of, 'study + tbv'], resdic=resdic, debug=False)
print(resdic[hyp_name])

### GANJ-2: Total Corpus Callosum midsagittal area, after correcting for total brain volume, will negatively correlate with IQ in the young (age < 12) group.

In [None]:
# GANJ-2: FIQ ~ cc + study + tbv on the age<12 rows.
volume_of = 'cc'
hyp_name = hyp_ganj + '2' # + 'abide'
hyp_cond = ['age<12'] #'fiq>0','normDev','fs','age<=20','ccant'] #
hyp_df = apply_cond(cc_tbv, condic_cc_tbv, hyp_cond)
print(len(hyp_df))

resdic = run_hyp(hyp_df, hyp_name, hyp_Ganj+hyp_cond, 
                     ['FIQ', volume_of, 'study + tbv'], resdic=resdic, debug=False)
print(resdic[hyp_name])

### GANJ-3: Total Corpus Callosum midsagittal area, after correcting for total brain volume, will not significantly correlate with IQ in the adolescent (age > 12) group.


In [None]:
# GANJ-3: FIQ ~ cc + study + tbv on the age>=12 rows.
# NOTE(review): the GANJ-3 statement says "age > 12"; the condition used here
# also includes age 12.
volume_of = 'cc'
hyp_name = hyp_ganj + '3' # + 'abide'
hyp_cond = ['age>=12'] #'fiq>0','normDev','fs','age<=20','ccant'] #
hyp_df = apply_cond(cc_tbv, condic_cc_tbv, hyp_cond)
print(len(hyp_df))

resdic = run_hyp(hyp_df, hyp_name, hyp_Ganj+hyp_cond, 
                     ['FIQ', volume_of, 'study + tbv'], resdic=resdic, debug=False)
print(resdic[hyp_name])

### GANJ-4: Total Corpus Callosum midsagittal area, after correcting for total brain volume, will negatively correlate with IQ in the male (age < 12) group.


In [None]:
# GANJ-4: FIQ ~ cc + study + tbv on the male, age<12 rows.
volume_of = 'cc'
hyp_name = hyp_ganj + '4' # + 'abide'
hyp_cond = ['age<12','male'] #'fiq>0','normDev','fs','age<=20','ccant'] #
hyp_df = apply_cond(cc_tbv, condic_cc_tbv, hyp_cond)
print(len(hyp_df))

resdic = run_hyp(hyp_df, hyp_name, hyp_Ganj+hyp_cond, 
                     ['FIQ', volume_of, 'study + tbv'], resdic=resdic, debug=False)
print(resdic[hyp_name])

### GANJ-5: Total Corpus Callosum midsagittal area, after correcting for total brain volume, will not significantly correlate with IQ in the female (age < 12) group.


In [None]:
# GANJ-5: FIQ ~ cc + study + tbv on the female, age<12 rows.
volume_of = 'cc'
hyp_name = hyp_ganj + '5' 
hyp_cond = ['age<12','female'] # ,'abide', 'fiq>0','normDev','fs','age<=20','ccant'] #
hyp_df = apply_cond(cc_tbv, condic_cc_tbv, hyp_cond)
print(len(hyp_df))

resdic = run_hyp(hyp_df, hyp_name, hyp_Ganj+hyp_cond, 
                     ['FIQ', volume_of, 'study + tbv'], resdic=resdic, debug=False)
print(resdic[hyp_name])

In [None]:
# Debug aid (disabled): uncomment to dump the whole results dictionary.
#print(resdic)

In [None]:
# Gather every hypothesis result stored in resdic into a single table.
lhyp = list(resdic.keys())
hyp = ['hyp'] + lhyp

# The field names are taken from the first hypothesis; every other entry is
# indexed with the same keys, so all entries must provide them.
col_names = list(resdic[lhyp[0]].keys())

# Header record (hypothesis names) followed by one record per result field:
# the field label first, then that field's value for each hypothesis.
datares = [hyp] + [[col] + [resdic[h][col] for h in lhyp] for col in col_names]

# After the transpose the first row holds 'hyp' plus the field names and each
# following row is one hypothesis; column labels remain positional integers.
resdf = pd.DataFrame.from_records(datares).transpose()
resdf.head(50)

In [None]:
# Export the results table to the current working directory. The file is named
# after the input CSV plus a second-resolution UTC timestamp (gmtime).
timestr = strftime("%Y-%m-%d-%H-%M-%S", gmtime())

# splitext removes the extension whatever its length; the previous [:-4] slice
# silently assumed a 4-character suffix such as '.csv'.
output_stem = osp.splitext(csv_filename)[0]
output_filename = output_stem + '-' + timestr + '.csv'

# index=False: the positional row index carries no information worth saving.
resdf.to_csv(path_or_buf=output_filename, index=False)

In [None]:
pwd

### Scrap

In [None]:
# Disabled scratch code, kept as a string literal so it never executes (the trailing
# ';' suppresses notebook output). The snippet sums Left/Right caudate and putamen
# volumes with split_merge_df, adds them into a 'striatum' column and merges in TIV.
"""
hyp2 = ['fiq>0','normDev','fs','age<20']
hyp2_caud = hyp2 + ['caudate'] # ,'abide']
hyp2_put = hyp2 + ['putamen'] # ,'abide']
hyp2_tiv = hyp2 + ['TIV'] # ,'abide']
tmp_caud = apply_cond(hie, condic, hyp2_caud, dropnaset=dropnaset)
caud = split_merge_df(tmp_caud, indx='ID', spliton='laterality', levels=['Left','Right'], 
                       keep_col='volume', op='+',colrename='caudate')
tmp_put = apply_cond(hie, condic, hyp2_put, dropnaset=dropnaset)
put = split_merge_df(tmp_put, indx='ID', spliton='laterality', levels=['Left','Right'], 
                       keep_col='volume', op='+',colrename='putamen')
tmp_tiv = apply_cond(hie, condic, hyp2_tiv, dropnaset=dropnaset)
print(len(caud), len(put), len(tmp_tiv))

stria = pd.merge(left=caud, right=put[['ID','putamen']], left_on='ID', right_on='ID')
stria['striatum'] = stria['caudate']+stria['putamen']
stria = pd.merge(left=stria, right=tmp_tiv[['ID','volume']], left_on='ID', right_on='ID')
stria.rename(columns={'volume':'TIV'},inplace=True)
print(list(stria),len(stria))
""";

# Disabled scratch code (string literal, never executed): builds caudate and putamen
# tables with apply_cond, merges them on 'ID', adds a 'striatum' column and derives
# a condition dictionary for the merged table.
"""
cond_mac = ['fiq>0','normDev', softw,] # ,'age<20']
mac_caud = cond_mac + ['caudate'] #
mac_put = cond_mac + ['putamen'] #
mac_tiv = cond_mac + ['fiq>0','normDev',softw,'TIV'] #

left_caud = apply_cond(hie, condic, mac_caud, dropnaset=dropnaset, columns={'volume':'caudate'})
left_put = apply_cond(hie, condic, mac_put, dropnaset=dropnaset, columns={'volume':'putamen'})

left_stria = pd.merge(left=left_caud, right=left_put[['ID','putamen']], left_on='ID', right_on='ID')
left_stria['striatum'] = left_stria['caudate'] + left_stria['putamen']
left_stria_condic = define_conditions(left_stria, tooldic=tooldic, normalDev=normalDev, adhd=adhd, h2ube=h2ube)
print(len(left_stria))

""";


# Disabled scratch code (string literal, never executed): direct statsmodels OLS fit
# of FIQ on striatum + study for 'MAC-1', with the result stored through md2dic.
""" 
iq = 'FIQ'

# md = smf.ols(iq + " ~ Q('volume') + Gender + Age + study ", data=tmp) #  
# md = smf.ols(iq + " ~ Q('striatum') + study + TIV ", data=stria) #  
md = smf.ols(iq + " ~ Q('striatum') + study ", data=mac1) #  
mdf = md.fit()
print(mdf.summary())

varname = "Q('striatum')"
hyp_name = 'MAC-1'
resdic.update(md2dic(varname, hyp_name, cond_mac+mac1_cond, mdf=mdf))
""";


# Disabled scratch code (string literal whose lines are additionally commented out):
# an earlier hand-written OLS version of the GANJ-1 analysis.
"""
#ganj1 = apply_cond(cc_tbv, condic_cc_tbv, ganj1_cond)
#print(list(ganj1),len(ganj1))

#iq = 'FIQ'
# md = smf.ols(iq + " ~ Q('volume') + Gender + Age + study ", data=tmp) #  
#md = smf.ols(iq + " ~ Q('striatum') + study + TIV ", data=stria) #  
#md = smf.ols(iq + " ~ Q('cc') + study + tbv ", data=ganj1) #  
#mdf = md.fit()
#print(mdf.summary())

#varname, hyp_name = "Q('cc')",'GANJ-1'
#resdic.update(md2dic(varname, hyp_name, hypGanj+ganj1_cond, mdf=mdf))
""";

# Disabled scratch code (string literal, never executed): hand-written OLS version of
# GANJ-2 (FIQ ~ cc + study + tbv on the 'age<12' subset), stored through md2dic.
"""
ganj2_cond = ['age<12'] #'fiq>0','normDev','fs','age<=20','ccant'] #
ganj2 = apply_cond(cc_tbv, condic_cc_tbv, ganj2_cond)
print(list(ganj2),len(ganj2));

iq = 'FIQ'
# md = smf.ols(iq + " ~ Q('volume') + Gender + Age + study ", data=tmp) #  
#md = smf.ols(iq + " ~ Q('striatum') + study + TIV ", data=stria) #  
md = smf.ols(iq + " ~ Q('cc') + study + tbv ", data=ganj2) #  
mdf = md.fit()
print(mdf.summary())

varname, hyp_name = "Q('cc')",'GANJ-2'
resdic.update(md2dic(varname, hyp_name, hypGanj+ganj2_cond, mdf=mdf))
""";

# Disabled scratch code (string literal, never executed): hand-written OLS version of
# GANJ-3 (FIQ ~ cc + study + tbv on the 'age>=12' subset), stored through md2dic.
"""ganj3_cond = ['age>=12'] #
ganj3 = apply_cond(cc_tbv, condic_cc_tbv, ganj3_cond)
print(len(ganj3))

iq = 'FIQ'
# md = smf.ols(iq + " ~ Q('volume') + Gender + Age + study ", data=tmp) #  
#md = smf.ols(iq + " ~ Q('striatum') + study + TIV ", data=stria) #  
md = smf.ols(iq + " ~ Q('cc') + study + tbv ", data=ganj3) #  
mdf = md.fit()
print(mdf.summary())


varname, hyp_name = "Q('cc')",'GANJ-3'
resdic.update(md2dic(varname, hyp_name, hypGanj+ganj3_cond, mdf=mdf))""";

In [None]:

# Disabled copy of the column-name uniqueness check and the
# 'federatedLabel' -> 'structure' rename that are applied right after `hie` is loaded.
# original_col_names = list(hie)
# column names are unique
# assert len(original_col_names) == len(set(original_col_names))
# print(list(hie))
#col_rename = {'federatedLabel':'structure'}
#hie.rename(columns=col_rename, inplace=True)


In [None]:
# Check the subject counts directly, bypassing the convenience functions.
# Disabled (string literal, never executed): reloads the 2020-02-11 CSV and counts
# the unique IDs whose 'study' contains "ADHD" or "ABIDE", plus the overall total.

"""csv_filename = '2020-02-11-simple2_query_output.csv'
relative_dir = './data'
relative_path_filename = osp.join(relative_dir, csv_filename)
assert osp.exists(relative_path_filename)

hie = pd.read_csv(relative_path_filename, na_values='nd') #, low_memory=False)set(hie['study'])

htmp = hie[(hie['study'].str.contains("ADHD"))]
_adhd = len(set(htmp['ID']))
htmp = hie[(hie['study'].str.contains("ABIDE"))]
_abide = len(set(htmp['ID']))

print(len(set(hie['ID'])))
print(_adhd, _abide, _adhd + _abide)
""";

In [None]:
# Disabled scratch code (string literal, never executed): rebuilds the condition
# dictionary on `hie` and counts rows / unique IDs matching the 'adhd200' condition.
"""
condic = define_conditions(hie, tooldic=tooldic, normalDev=normalDev, patient=patient, h2ube=h2ube)
condic.keys()
print(list(hie))
# hyp_cond = ['age<20','female', 'adhd200' ]
hyp_cond = ['adhd200' ]
hyp_df = apply_cond(hie, condic, hyp_cond)
print(len(hyp_df))
len(set(hyp_df['ID']))
""";

In [None]:
# Disabled helper, kept for reference only (string literal, never executed).
# The outer delimiters are ''' because the snippet carries its own triple-double-quoted
# docstring: wrapping it in the same quote style ended the literal at the docstring
# and exposed the docstring text as invalid code, so the cell could not be run.
'''
def split_merge_df(df, indx='ID', spliton='laterality', levels=['Left','Right'], 
                       keep_col='volume', op='+', colrename=None):
    """
    1. split the df according to 2 (n?) levels of "spliton"
    2. merge the 2 (n?) dataframes using indx as index
    3. keep only "keep_col" for the right temporary dataframe
    4. perform operation 'op' on the columns "keep_col" and name it 
       'keep_col'+'_'+ levels[0] + op + levels[1]    
    
    Was developed for adding volumes in right and left structures
    """
    
    dflev1 = df[df[spliton]==levels[0]]
    dflev2 = df[df[spliton]==levels[1]] 

    # check that the new dfs have no duplicates in the indx

    assert set(dflev1[indx]) == set(dflev2[indx])
    assert len(set(dflev1[indx])) == len(dflev1[indx])
    
    # assert len(set(dflev2[indx])) == len(dflev2[indx])
    # suffixes=('_l','_r')
    merged_inner = pd.merge(left=dflev1, right=dflev2[[indx,keep_col]], 
                            left_on=indx, right_on=indx, suffixes=levels, how='inner')
#    merged_inner.rename(columns={cols+'_x': cols+'_'+lev1, cols+'_y': cols+'_'+lev2}, inplace=True)

    # sum keep_col values in a new column
    add_col_name = keep_col + levels[0] + op + levels[1]
    if op == '+':
        merged_inner[add_col_name] = \
                        merged_inner[keep_col+levels[0]] + merged_inner[keep_col+levels[1]]  
    if colrename is not None:
        merged_inner.rename(columns={add_col_name:colrename}, inplace=True)
    return merged_inner
'''

# Leftover fragment of the same helper (also disabled): dropped the duplicated
# '_y' columns after the merge and renamed the '_x' ones back.
"""
    if droplist != []:
        for colname in droplist:
            colname_y = colname + '_y'
            colname_x = colname + '_x'
            merged_inner.drop(colname_y, axis=1, inplace=True)
            merged_inner.rename(columns={colname_x: colname}, inplace=True)
""";

In [None]:
# Disabled helper, kept for reference only (string literal, never executed).
# The outer delimiters are ''' because the snippet carries its own triple-double-quoted
# docstring: wrapping it in the same quote style ended the literal at the docstring
# and exposed the docstring text as invalid code, so the cell could not be run.
'''
def pivot_and_add(df, index, columns, values, add_col, keep_cols):
    """
    index: will be the index of the returned df
    columns: return df will be "wide" based on the values of columns
    values: content of the wide df
    add_col: name of the column where values of columns are added
    keep_cols: list of column names to keep
    
    """
    # index='ID', columns='softwareLabel', values='volume'
    
    add_cols = list(set(df[columns])) # find values in columns
    df.drop_duplicates([index, columns, values], inplace=True)
    newdf = df.pivot(index=index, columns=columns, values=values).dropna()
    newdf.reset_index() # put the index, 'ID' here, back in a column
    newdf[add_col] = newdf.loc[:,add_cols].sum(axis=1)
    newdf.drop(add_cols, axis=1, inplace=True)
    
    assert not ('ID' in keep_cols)
    
    # now, create a df with keep_cols:
    keepdf = df[['ID'] + keep_cols].drop_duplicates()
    # print(len(keepdf),len(newdf))
    newdf = pd.merge(left=newdf, right=keepdf, left_on='ID', right_on='ID').dropna()
    # print(len(newdf),len(newdf.dropna()))
    del keepdf
    
    return newdf
''';