In [1]:
import pandas as pd
import numpy as np
import statsmodels as stt
import scipy.stats as sst
import os.path as osp
from statsmodels import api as sm
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
import json
%matplotlib inline

In [2]:
# Show the resolved working directory so the relative path below can be checked.
print(osp.realpath(osp.curdir))
# Input CSV; the path is relative to the current working directory.
relative_path_filename = './data/2019-12-03-simple2_query_output.csv'
# Stop here if the input file is missing.
assert osp.exists(relative_path_filename)

/home/jb/code/repronim/simple2/simple2_analysis


In [3]:
# Load the input table; the literal string 'nd' is read as a missing value.
hie = pd.read_csv(relative_path_filename, na_values='nd')
original_col_names = hie.columns.tolist()
# Guard: no column name may appear twice.
assert len(set(original_col_names)) == len(original_col_names)
print(original_col_names)
# The rest of the notebook refers to the 'federatedLabel' column as 'structure'.
col_rename = {'federatedLabel': 'structure'}
hie = hie.rename(columns=col_rename)
print(hie.columns.tolist())

['study', 'ID', 'Age', 'dx', 'Gender', 'FIQ', 'PIQ', 'VIQ', 'tool', 'softwareLabel', 'federatedLabel', 'laterality', 'volume']
['study', 'ID', 'Age', 'dx', 'Gender', 'FIQ', 'PIQ', 'VIQ', 'tool', 'softwareLabel', 'structure', 'laterality', 'volume']


  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
# Alternative mapping file, kept for reference:
# mapping_file = '../segstats_jsonld/segstats_jsonld/mapping_data/freesurfermap.json'
mapping_file = '../segstats_jsonld/segstats_jsonld/mapping_data/freesurfer-cdes.json'
# Stop here if the mapping file is missing.
assert osp.exists(mapping_file)
# Parse the JSON mapping; the next cell iterates over it as a dict.
with open(mapping_file, "r") as read_file:
    roi_map = json.load(read_file)

In [5]:
# ube2h     : 'isAbout' value -> label with hemisphere / unit markers removed
# label2ube : full label      -> 'isAbout' value
ube2h = {}
label2ube = {}
countok = 0
has_no_isAbout = []
has_no_label = []

for (k, v) in roi_map.items():
    # The 'count' entry is not a CDE; entries without 'isAbout' are skipped.
    if k == 'count' or 'isAbout' not in v:
        continue
    countok += 1

    if 'label' not in v:
        continue
    label = v['label']
    if label in ('', 'None', 'none'):
        # an empty or placeholder label is recorded, not mapped
        has_no_label.append(k)
        continue

    label2ube[label] = v['isAbout']
    if v['isAbout'] in ube2h:
        # keep the name derived from the first label seen for this 'isAbout'
        continue

    # Strip hemisphere prefixes and measurement suffixes, in this order.
    no_right_or_left = label
    for fragment in ('Right-', 'Right ', 'Left-', 'Left ', ' NVoxels', ' (mm^3)'):
        no_right_or_left = no_right_or_left.replace(fragment, '')
    ube2h[v['isAbout']] = no_right_or_left

# NOTE(review): nothing ever appends to has_no_isAbout, so this assertion
# cannot fail; entries lacking 'isAbout' are silently skipped by the loop.
assert has_no_isAbout == []
# Every entry that has 'isAbout' must have produced a distinct, usable label.
assert countok == len(label2ube)

In [6]:
# These two bare expressions display nothing: only the last expression of a
# cell is echoed, and the last statement here is an assignment.
label2ube;
ube2h
# Invert ube2h ('isAbout' value -> cleaned label) into cleaned label -> 'isAbout' value.
# If two 'isAbout' values share a cleaned label, the one iterated last wins.
h2ube = {v: k for k, v in ube2h.items()}

In [7]:
def split_merge_df(df, indx='ID', spliton='laterality', levels=['Left','Right'], 
                       keep_col='volume', op='+', colrename=None):
    """
    Combine the values of ``keep_col`` found at two levels of ``spliton``.

    1. split ``df`` into the rows where ``spliton`` equals ``levels[0]`` and
       the rows where it equals ``levels[1]`` (further levels are ignored)
    2. inner-merge the two parts on ``indx``, keeping every column of the
       first part and only ``keep_col`` of the second; the two value columns
       are named ``keep_col + levels[0]`` and ``keep_col + levels[1]``
    3. apply ``op`` to these two columns and store the result in a column
       named ``keep_col + levels[0] + op + levels[1]``, or ``colrename`` if given

    example: adding the volumes of the right and left structures

    Parameters
    ----------
    df : pandas.DataFrame
    indx : str
        Column identifying the rows to pair up.
    spliton : str
        Column whose value selects the two parts.
    levels : sequence of two values of ``spliton``
    keep_col : str
        Column holding the values to combine.
    op : {'+', '-', '*', '/'}
        Operator applied as ``levels[0] op levels[1]``.
    colrename : str or None
        Optional name for the result column.

    Returns
    -------
    pandas.DataFrame
        The merged frame with the result column added.

    Raises
    ------
    ValueError
        If ``op`` is not a supported operator (it used to be ignored silently,
        returning a frame without the result column).
    AssertionError
        If the two parts do not cover the same ``indx`` values, or if the
        first part contains an ``indx`` value more than once.
    """
    operators = {
        '+': lambda a, b: a + b,
        '-': lambda a, b: a - b,
        '*': lambda a, b: a * b,
        '/': lambda a, b: a / b,
    }
    if op not in operators:
        raise ValueError("unsupported op {!r}; expected one of {}".format(
            op, sorted(operators)))

    first, second = levels[0], levels[1]
    dflev1 = df[df[spliton] == first]
    dflev2 = df[df[spliton] == second]

    # both parts must describe the same set of indx values ...
    assert set(dflev1[indx]) == set(dflev2[indx])
    # ... and the first part must list each of them once.
    # NOTE(review): uniqueness of the second part is not checked (the check
    # was disabled in the original); duplicates there would duplicate rows.
    assert len(set(dflev1[indx])) == len(dflev1[indx])

    # keep_col exists on both sides, so pandas appends the level names to it
    merged_inner = pd.merge(left=dflev1, right=dflev2[[indx, keep_col]],
                            left_on=indx, right_on=indx,
                            suffixes=(first, second), how='inner')

    add_col_name = keep_col + first + op + second
    merged_inner[add_col_name] = operators[op](merged_inner[keep_col + first],
                                               merged_inner[keep_col + second])
    if colrename is not None:
        merged_inner = merged_inner.rename(columns={add_col_name: colrename})
    return merged_inner

"""
    if droplist != []:
        for colname in droplist:
            colname_y = colname + '_y'
            colname_x = colname + '_x'
            merged_inner.drop(colname_y, axis=1, inplace=True)
            merged_inner.rename(columns={colname_x: colname}, inplace=True)
""";

In [8]:
# Short tool name -> value compared against the 'tool' column by define_conditions.
tooldic = {'surfer':'https://surfer.nmr.mgh.harvard.edu/', 
       'fsl':'http://purl.org/nidash/fsl#',
       'ants':'http://stnava.github.io/ANTs/'}
# 'dx' values that define the 'normDev' condition; the code 2 is listed both
# as a number and as a string.
normalDev = (2, '2', 'Typically Developing Children')
# 'dx' values that define the 'adhd' condition (numeric code, its string form,
# and the textual subtypes).
adhd = (1, '1', 'ADHD-Combined', 'ADHD-Hyperactive/Impulsive', 'ADHD-Inattentive')


def define_conditions(df, tooldic={}, normalDev=(), adhd=(), h2ube={}):
    """
    Build a dictionary of named boolean row masks for ``df``.

    Every mask is computed from ``df`` alone, so it shares ``df``'s index and
    can be combined with ``&`` / ``|`` or used in ``df.loc[...]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns 'laterality', 'Age', 'tool', 'FIQ', 'dx',
        'softwareLabel', 'structure', 'study' and 'Gender'.
    tooldic : dict
        Must contain the keys 'surfer', 'ants' and 'fsl'; the values are
        compared with the 'tool' column.
    normalDev, adhd : iterable
        'dx' values that define the 'normDev' and 'adhd' masks.  An empty
        iterable yields an all-False mask.
    h2ube : dict
        Structure name -> value compared with the 'structure' column.  Must
        contain 'Caudate', 'Putamen', 'Estimated Total Intracranial Volume',
        'hemisphere cerebral white matter volume', 'Total gray matter volume',
        'CSF', 'CC_Anterior', 'CC_Central' and 'CC_Posterior'.

    Returns
    -------
    dict
        condition name -> boolean pandas.Series indexed like ``df``.

    Raises
    ------
    KeyError
        If a required column or dictionary key is missing (this includes
        calling with the empty default ``tooldic`` / ``h2ube``).
    """
    diccond = {}

    #========== laterality
    # Fix: these masks used to read the global ``hie`` instead of ``df``, which
    # gave wrong / misaligned masks for any other frame.
    laterality = df['laterality']
    diccond['left'] = (laterality == 'L') | (laterality == 'Left')
    diccond['right'] = (laterality == 'R') | (laterality == 'Right')
    # neither left nor right, under both codings accepted above
    diccond['latNan'] = ~(diccond['left'] | diccond['right'])

    #========== age
    diccond['age<=20'] = (df['Age'] <= 20)
    diccond['age<20'] = (df['Age'] < 20)
    diccond['age<12'] = (df['Age'] < 12)
    diccond['age>=12'] = (df['Age'] >= 12)

    #========== tool conditions
    diccond['fs'] = (df['tool'] == tooldic['surfer'])
    diccond['ants'] = (df['tool'] == tooldic['ants'])
    diccond['fsl'] = (df['tool'] == tooldic['fsl'])

    #========== IQ conditions
    diccond['fiq>0'] = (df['FIQ'] > 0)

    #========== disease conditions
    # Start from an all-False Series so the result is a mask even when the
    # tuple of values is empty (it used to degrade to the scalar False).
    pop_normDev = pd.Series(False, index=df.index)
    for pop in normalDev:
        pop_normDev = pop_normDev | (df['dx'] == pop)
    diccond['normDev'] = pop_normDev

    pop_adhd = pd.Series(False, index=df.index)
    for pop in adhd:
        pop_adhd = pop_adhd | (df['dx'] == pop)
    diccond['adhd'] = pop_adhd

    #=========== ROIs
    diccond['bvol'] = (df['softwareLabel'] == 'BVOL (mm^3)')
    diccond['brainseg'] = (df['softwareLabel'] == 'Brain Segmentation Volume (mm^3)')
    diccond['caudate'] = (df['structure'] == h2ube['Caudate'])
    diccond['putamen'] = (df['structure'] == h2ube['Putamen'])
    diccond['TIV'] = (df['structure'] == h2ube['Estimated Total Intracranial Volume'])
    diccond['wm'] = (df['structure'] == h2ube['hemisphere cerebral white matter volume'])
    diccond['gm'] = (df['structure'] == h2ube['Total gray matter volume'])
    diccond['csf'] = (df['structure'] == h2ube['CSF'])
    diccond['ccant'] = (df['structure'] == h2ube['CC_Anterior'])
    diccond['cccen'] = (df['structure'] == h2ube['CC_Central'])
    diccond['ccpos'] = (df['structure'] == h2ube['CC_Posterior'])

    #=========== site
    diccond['abide'] = (df['study'].str.contains("ABIDE"))
    diccond['adhd200'] = (df['study'] == 'ADHD200')

    #=========== Gender
    diccond['male'] = (df['Gender'] == 'Male')
    diccond['female'] = (df['Gender'] == 'Female')

    return diccond


In [9]:
# Named boolean masks over the rows of the full table.
condic = define_conditions(hie, tooldic=tooldic, normalDev=normalDev, adhd=adhd, h2ube=h2ube)
# List the available condition names.
condic.keys()

dict_keys(['left', 'right', 'latNan', 'age<=20', 'age<20', 'age<12', 'age>=12', 'fs', 'ants', 'fsl', 'fiq>0', 'normDev', 'adhd', 'bvol', 'brainseg', 'caudate', 'putamen', 'TIV', 'wm', 'gm', 'csf', 'ccant', 'cccen', 'ccpos', 'abide', 'adhd200', 'male', 'female'])

In [10]:
def apply_cond(df, cndc, conditions, dropnaset=[],columns={}):
    """
    Select the rows of ``df`` that satisfy every named condition.

    df         : DataFrame to filter
    cndc       : dict mapping a condition name to a boolean mask over df's rows
    conditions : names (keys of cndc) whose masks are AND-ed together
    dropnaset  : columns in which a missing value discards the row
    columns    : optional {old: new} renaming applied to the selection

    Returns the selection as a new DataFrame.
    """
    # start with every row selected, then narrow down one condition at a time
    keep = np.ones(len(df), dtype=bool)
    for name in conditions:
        keep = keep & cndc[name]

    selected = df.loc[keep].dropna(subset=dropnaset)
    return selected.rename(columns=columns) if columns else selected
    

In [11]:
# Sanity check: apply_cond must select the same number of rows as an
# equivalent hand-written boolean filter.
# Other combinations tried earlier:
# hyp1 = ['bvol','ants','left','age<=20','fiq>0','normDev']
# hyp1 = ['fiq>0','abide','caudate', 'putamen']
hyp1 = ['female', 'caudate','fs','fiq>0'] # ,'abide']
condic = define_conditions(hie, tooldic=tooldic, normalDev=normalDev, adhd=adhd, h2ube=h2ube)
tmp = apply_cond(hie, condic, hyp1)

# The same selection written out explicitly (the ABIDE filter is disabled,
# matching hyp1 above).
manual = hie.loc[(hie['Gender']=='Female') & 
#                 (hie['study'].str.contains("ABIDE")) &
                 (hie['structure'] == h2ube['Caudate']) &
                 (hie['tool'] == tooldic['surfer']) & 
                 (hie['FIQ'] > 0) ] 
manual = manual.dropna(subset=['FIQ', 'volume', 'Gender'])
print('manual: ',len(manual))
# Compare after dropping rows with missing values in the same columns on both sides.
assert len(tmp.dropna(subset=['FIQ', 'volume', 'Gender'])) == len(manual)


manual:  408


In [12]:
# Distinct values of the 'structure' column.
structures = set(hie.structure)


Hypotheses

PIET-1: Total Brain Volume will positively correlate with IQ (in both sexes across the complete age range).

MAC-1: Left striatum volume (caudate + putamen) will positively correlate with IQ in the total (male + female) child (age < 20) group.

MAC-2: Left striatum volume (caudate + putamen) will positively correlate with IQ in the male children group.

MAC-3: Left striatum volume (caudate + putamen) will not correlate with IQ in the female children group.

GANJ-1: Total Corpus Callosum midsagittal area, after correcting for total brain volume, will negatively correlate with IQ.

GANJ-2: Total Corpus Callosum midsagittal area, after correcting for total brain volume, will negatively correlate with IQ in the young (age < 12) group.

GANJ-3: Total Corpus Callosum midsagittal area, after correcting for total brain volume, will not significantly correlate with IQ in the adolescent (age > 12) group.

GANJ-4: Total Corpus Callosum midsagittal area, after correcting for total brain volume, will negatively correlate with IQ in the male (age < 12) group.

GANJ-5: Total Corpus Callosum midsagittal area, after correcting for total brain volume, will not significantly correlate with IQ in the female (age < 12) group.


In [13]:
# Accumulates the summary statistics of every hypothesis tested below.
resdic = {}


def md2dic(varname, hyp_name, mdf=None):
    """
    Extract the headline statistics of one regression term.

    Parameters
    ----------
    varname : str
        Term name exactly as it appears in the first column of the
        coefficient table, e.g. "Q('volume')".
    hyp_name : str
        Key under which the statistics are returned.
    mdf : fitted results object (required)
        Must provide ``summary()``, ``rsquared_adj`` and ``nobs``, as the
        object returned by ``smf.ols(...).fit()`` does.

    Returns
    -------
    dict
        ``{hyp_name: {'P>|t|', 't', 'rsquared_adj', 'nobs'}}``.  'P>|t|' and
        't' are the strings found in the coefficient table; the other two are
        formatted from ``mdf``.

    Raises
    ------
    ValueError
        If ``mdf`` is not supplied.
    IndexError
        If no row of the coefficient table is named ``varname``.
    """
    if mdf is None:
        raise ValueError("md2dic needs the fitted results: pass mdf=<model>.fit()")

    # tables[1] is the coefficient table; its first row holds the column names
    table2 = mdf.summary().tables[1].data
    cols = table2[0]
    matches = [r for r in table2 if r[0] == varname]
    if not matches:
        raise IndexError("no term named {!r} in the coefficient table".format(varname))
    ther = matches[0]

    # Fix: read R^2 and nobs from the results that were passed in; the function
    # used to refit and read whatever model the global ``md`` pointed to.
    return {hyp_name: {'P>|t|': ther[cols.index('P>|t|')],
                       't': ther[cols.index('t')],
                       'rsquared_adj': "{:4.3}".format(mdf.rsquared_adj),
                       'nobs': "{:3d}".format(int(mdf.nobs))
                       }}

### PIET-1: Total Brain Volume will positively correlate with IQ (in both sexes across the complete age range).


In [14]:
# Rebuild the masks over the full table and list their names.
condic = define_conditions(hie, tooldic=tooldic, normalDev=normalDev, adhd=adhd, h2ube=h2ube)
print(condic.keys())
# Rows with a missing value in any of these columns are dropped by the
# apply_cond calls that pass dropnaset=dropnaset.
dropnaset = ['FIQ', 'volume', 'Gender']


dict_keys(['left', 'right', 'latNan', 'age<=20', 'age<20', 'age<12', 'age>=12', 'fs', 'ants', 'fsl', 'fiq>0', 'normDev', 'adhd', 'bvol', 'brainseg', 'caudate', 'putamen', 'TIV', 'wm', 'gm', 'csf', 'ccant', 'cccen', 'ccpos', 'abide', 'adhd200', 'male', 'female'])


In [15]:
# PIET-1 selection: 'bvol' rows with FIQ > 0 in the 'normDev' group
# (no tool or study filter).
hyp1 = ['bvol', 'fiq>0','normDev'] # ,'ants','abide']
tmp = apply_cond(hie, condic, hyp1, dropnaset=dropnaset)
print(len(tmp))

461


In [16]:
print(" Structure = ", 'bvol')

# Dependent variable of the regression.
iq = 'FIQ'

# OLS of FIQ on volume with 'study' as covariate.
# A fuller formula, kept for reference: iq + " ~ Q('volume') + Gender + Age + study "
md = smf.ols(iq + " ~ Q('volume') + study ", data=tmp)
mdf = md.fit()
print(mdf.summary())

# Record the statistics of the volume term under the hypothesis name.
varname = "Q('volume')"
hyp_name = 'PIET-1'
resdic.update(md2dic(varname,hyp_name,mdf=mdf))

 Structure =  bvol
                            OLS Regression Results                            
Dep. Variable:                    FIQ   R-squared:                       0.147
Model:                            OLS   Adj. R-squared:                  0.104
Method:                 Least Squares   F-statistic:                     3.434
Date:                Thu, 05 Dec 2019   Prob (F-statistic):           4.18e-07
Time:                        19:32:12   Log-Likelihood:                -1800.5
No. Observations:                 461   AIC:                             3647.
Df Residuals:                     438   BIC:                             3742.
Df Model:                          22                                         
Covariance Type:            nonrobust                                         
                                    coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------------

#### Create df and conditions for left striatum, all ages and all genders

In [17]:
# Selection shared by the MAC analyses: FIQ > 0, 'normDev' group, FreeSurfer
# output, left hemisphere.  The age filter is applied per hypothesis later on.
cond_mac = ['fiq>0','normDev','fs','left'] # ,'age<20']
mac_caud = cond_mac + ['caudate']
mac_put = cond_mac + ['putamen']
# Fix: this list used to be cond_mac + ['fiq>0','normDev','fs','TIV'], which
# repeated three conditions and kept 'left'; total intracranial volume is a
# whole-head measure, so the 'left' filter would discard every TIV row.
# (mac_tiv is not used by the cells shown in this notebook.)
mac_tiv = ['fiq>0','normDev','fs','TIV']

# One row per matching record; 'volume' is renamed after the structure so the
# two tables can sit side by side after the merge.
left_caud = apply_cond(hie, condic, mac_caud, dropnaset=dropnaset, columns={'volume':'caudate'})
left_put = apply_cond(hie, condic, mac_put, dropnaset=dropnaset, columns={'volume':'putamen'})

# Pair caudate and putamen rows by ID and add them up.
left_stria = pd.merge(left=left_caud, right=left_put[['ID','putamen']], left_on='ID', right_on='ID')
left_stria['striatum'] = left_stria['caudate'] + left_stria['putamen']
# Masks aligned with the rows of left_stria (not with hie).
left_stria_condic = define_conditions(left_stria, tooldic=tooldic, normalDev=normalDev, adhd=adhd, h2ube=h2ube)
print(len(left_stria))



543


### MAC-1: Left striatum volume (caudate + putamen) will positively correlate with IQ in the total (male + female) child (age < 20) group.


In [18]:
# MAC-1: restrict the left-striatum table to Age < 20 (both genders).
mac1_cond = ['age<20']
mac1 = apply_cond(left_stria, left_stria_condic, mac1_cond)
print(len(mac1))


# Dependent variable of the regression.
iq = 'FIQ'

# OLS of FIQ on left striatum volume with 'study' as covariate.
md = smf.ols(iq + " ~ Q('striatum') + study ", data=mac1)
mdf = md.fit()
print(mdf.summary())


# Record the statistics of the striatum term under the hypothesis name.
varname = "Q('striatum')"
hyp_name = 'MAC-1'
resdic.update(md2dic(varname,hyp_name,mdf=mdf))

428
                            OLS Regression Results                            
Dep. Variable:                    FIQ   R-squared:                       0.125
Model:                            OLS   Adj. R-squared:                  0.082
Method:                 Least Squares   F-statistic:                     2.912
Date:                Thu, 05 Dec 2019   Prob (F-statistic):           3.32e-05
Time:                        19:32:13   Log-Likelihood:                -1665.7
No. Observations:                 428   AIC:                             3373.
Df Residuals:                     407   BIC:                             3459.
Df Model:                          20                                         
Covariance Type:            nonrobust                                         
                                    coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------------
Intercept 

### MAC-2: Left striatum volume (caudate + putamen) will positively correlate with IQ in the male children group.


In [19]:
# MAC-2: left-striatum table restricted to males with Age < 20.
mac2_cond = ['age<20','male']
mac2 = apply_cond(left_stria, left_stria_condic, mac2_cond)
print(len(mac2))

# Dependent variable of the regression.
iq = 'FIQ'
# OLS of FIQ on left striatum volume with 'study' as covariate.
md = smf.ols(iq + " ~ Q('striatum') + study ", data=mac2)
mdf = md.fit()
print(mdf.summary())


# Record the statistics of the striatum term under the hypothesis name.
varname = "Q('striatum')"
hyp_name = 'MAC-2'
resdic.update(md2dic(varname,hyp_name,mdf=mdf))

336
                            OLS Regression Results                            
Dep. Variable:                    FIQ   R-squared:                       0.146
Model:                            OLS   Adj. R-squared:                  0.092
Method:                 Least Squares   F-statistic:                     2.693
Date:                Thu, 05 Dec 2019   Prob (F-statistic):           0.000153
Time:                        19:32:13   Log-Likelihood:                -1292.9
No. Observations:                 336   AIC:                             2628.
Df Residuals:                     315   BIC:                             2708.
Df Model:                          20                                         
Covariance Type:            nonrobust                                         
                                    coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------------
Intercept 

### MAC-3: Left striatum volume (caudate + putamen) will not correlate with IQ in the female children group.


In [20]:
# MAC-3: left-striatum table restricted to females with Age < 20.
mac3_cond = ['age<20','female']
mac3 = apply_cond(left_stria, left_stria_condic, mac3_cond)
print(len(mac3))

# Dependent variable of the regression.
iq = 'FIQ'
# OLS of FIQ on left striatum volume with 'study' as covariate.
md = smf.ols(iq + " ~ Q('striatum') + study ", data=mac3)
mdf = md.fit()
print(mdf.summary())


# Record the statistics of the striatum term under the hypothesis name.
varname, hyp_name = "Q('striatum')",'MAC-3'
resdic.update(md2dic(varname,hyp_name,mdf=mdf))

92
                            OLS Regression Results                            
Dep. Variable:                    FIQ   R-squared:                       0.232
Model:                            OLS   Adj. R-squared:                  0.080
Method:                 Least Squares   F-statistic:                     1.528
Date:                Thu, 05 Dec 2019   Prob (F-statistic):              0.117
Time:                        19:32:13   Log-Likelihood:                -361.48
No. Observations:                  92   AIC:                             755.0
Df Residuals:                      76   BIC:                             795.3
Df Model:                          15                                         
Covariance Type:            nonrobust                                         
                                    coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------------
Intercept  

### Compute cc and tbv for GANJ, no age or gender condition

In [21]:
# Rebuild the masks over the full table before selecting the GANJ structures.
condic = define_conditions(hie, tooldic=tooldic, normalDev=normalDev, adhd=adhd, h2ube=h2ube)
condic.keys()

dict_keys(['left', 'right', 'latNan', 'age<=20', 'age<20', 'age<12', 'age>=12', 'fs', 'ants', 'fsl', 'fiq>0', 'normDev', 'adhd', 'bvol', 'brainseg', 'caudate', 'putamen', 'TIV', 'wm', 'gm', 'csf', 'ccant', 'cccen', 'ccpos', 'abide', 'adhd200', 'male', 'female'])

In [22]:
# Selection shared by the GANJ analyses: FIQ > 0, 'normDev' group, FreeSurfer
# output.  Age and gender filters are applied per hypothesis later on.
hypGanj = ['fiq>0','normDev','fs'] # ,'age<20'
hyp_ccant = hypGanj + ['ccant']
hyp_cccen = hypGanj + ['cccen']
hyp_ccpos = hypGanj + ['ccpos']

# One table per corpus-callosum segment, with 'volume' renamed after the segment.
ccant = apply_cond(hie, condic, hyp_ccant, dropnaset=dropnaset, columns={'volume':'ccant'})
cccen = apply_cond(hie, condic, hyp_cccen, dropnaset=dropnaset, columns={'volume':'cccen'})
ccpos = apply_cond(hie, condic, hyp_ccpos, dropnaset=dropnaset, columns={'volume':'ccpos'})

# Bring the three segments side by side, paired by ID.
cc = pd.merge(left=ccant, right=cccen[['ID','cccen']], left_on='ID', right_on='ID')
cc = pd.merge(left=cc, right=ccpos[['ID','ccpos']], left_on='ID', right_on='ID')

# Total corpus callosum = sum of the three segments.
# NOTE(review): only CC_Anterior, CC_Central and CC_Posterior are summed;
# confirm whether the mapping also provides mid-anterior / mid-posterior
# segments that should be included.
cc['cc'] = cc['ccant']+cc['cccen']+cc['ccpos']
# The four lengths should agree if every subject has all three segments.
len(cc), len(ccant), len(cccen), len(ccpos)

(543, 543, 543, 543)

In [23]:
# Gray matter, with 'volume' renamed to 'gm'.
hyp_gm = hypGanj + ['gm']
gm = apply_cond(hie, condic, hyp_gm, dropnaset=dropnaset, columns={'volume':'gm'})

# White matter: 'latNan' keeps only the rows whose laterality is neither
# 'Left' nor 'Right'.
hyp_wm = hypGanj + ['wm','latNan']
wm = apply_cond(hie, condic, hyp_wm, dropnaset=dropnaset, columns={'volume':'wm'})

# CSF, with 'volume' renamed to 'csf'.
hyp_csf = hypGanj + ['csf']
csf = apply_cond(hie, condic, hyp_csf, dropnaset=dropnaset, columns={'volume':'csf'})
# Pair the three tables by ID and add them up into the total brain volume.
tbv = pd.merge(left=gm, right=wm[['ID','wm']], left_on='ID', right_on='ID')
tbv = pd.merge(left=tbv, right=csf[['ID','csf']], left_on='ID', right_on='ID')
tbv['tbv'] = tbv['wm'] + tbv['gm'] +  tbv['csf']
# The four lengths should agree if every subject has all three measures.
print(len(gm), len(wm), len(csf), len(tbv))

543 543 543 543


In [24]:
# Add the corpus-callosum total to the total-brain-volume table, paired by ID.
cc_tbv = pd.merge(left=tbv, right=cc[['ID','cc']], left_on='ID', right_on='ID')
# Masks aligned with the rows of cc_tbv (not with hie).
condic_cc_tbv = define_conditions(cc_tbv, tooldic=tooldic, normalDev=normalDev, adhd=adhd, h2ube=h2ube)


### GANJ-1: Total Corpus Callosum midsagittal area, after correcting for total brain volume, will negatively correlate with IQ.


In [25]:
# NOTE(review): this restricts the sample to Age < 20 although the GANJ-1
# statement names no age range - confirm this is intended.
ganj1_cond = ['age<20']
ganj1 = apply_cond(cc_tbv, condic_cc_tbv, ganj1_cond)
print(list(ganj1),len(ganj1))

# Dependent variable of the regression.
iq = 'FIQ'
# OLS of FIQ on corpus-callosum size with 'study' and total brain volume as covariates.
md = smf.ols(iq + " ~ Q('cc') + study + tbv ", data=ganj1)
mdf = md.fit()
print(mdf.summary())

# Record the statistics of the cc term under the hypothesis name.
varname, hyp_name = "Q('cc')",'GANJ-1'
resdic.update(md2dic(varname,hyp_name,mdf=mdf))

['study', 'ID', 'Age', 'dx', 'Gender', 'FIQ', 'PIQ', 'VIQ', 'tool', 'softwareLabel', 'structure', 'laterality', 'gm', 'wm', 'csf', 'tbv', 'cc'] 428
                            OLS Regression Results                            
Dep. Variable:                    FIQ   R-squared:                       0.136
Model:                            OLS   Adj. R-squared:                  0.092
Method:                 Least Squares   F-statistic:                     3.055
Date:                Thu, 05 Dec 2019   Prob (F-statistic):           9.24e-06
Time:                        19:32:13   Log-Likelihood:                -1663.0
No. Observations:                 428   AIC:                             3370.
Df Residuals:                     406   BIC:                             3459.
Df Model:                          21                                         
Covariance Type:            nonrobust                                         
                                    coef    std err          t

### GANJ-2: Total Corpus Callosum midsagittal area, after correcting for total brain volume, will negatively correlate with IQ in the young (age < 12) group.

In [26]:
# GANJ-2: restrict the cc/tbv table to Age < 12.
ganj2_cond = ['age<12']
ganj2 = apply_cond(cc_tbv, condic_cc_tbv, ganj2_cond)
print(list(ganj2),len(ganj2));

# Dependent variable of the regression.
iq = 'FIQ'
# OLS of FIQ on corpus-callosum size with 'study' and total brain volume as covariates.
md = smf.ols(iq + " ~ Q('cc') + study + tbv ", data=ganj2)
mdf = md.fit()
print(mdf.summary())


# Record the statistics of the cc term under the hypothesis name.
varname, hyp_name = "Q('cc')",'GANJ-2'
resdic.update(md2dic(varname,hyp_name,mdf=mdf))

['study', 'ID', 'Age', 'dx', 'Gender', 'FIQ', 'PIQ', 'VIQ', 'tool', 'softwareLabel', 'structure', 'laterality', 'gm', 'wm', 'csf', 'tbv', 'cc'] 188
                            OLS Regression Results                            
Dep. Variable:                    FIQ   R-squared:                       0.169
Model:                            OLS   Adj. R-squared:                  0.085
Method:                 Least Squares   F-statistic:                     2.027
Date:                Thu, 05 Dec 2019   Prob (F-statistic):             0.0122
Time:                        19:32:13   Log-Likelihood:                -728.56
No. Observations:                 188   AIC:                             1493.
Df Residuals:                     170   BIC:                             1551.
Df Model:                          17                                         
Covariance Type:            nonrobust                                         
                                   coef    std err          t 

### GANJ-3: Total Corpus Callosum midsagittal area, after correcting for total brain volume, will not significantly correlate with IQ in the adolescent (age > 12) group.


In [27]:
# NOTE(review): the selection is Age >= 12, whereas the GANJ-3 statement
# says "age > 12" - confirm which boundary is intended.
ganj3_cond = ['age>=12']
ganj3 = apply_cond(cc_tbv, condic_cc_tbv, ganj3_cond)
print(len(ganj3))

# Dependent variable of the regression.
iq = 'FIQ'
# OLS of FIQ on corpus-callosum size with 'study' and total brain volume as covariates.
md = smf.ols(iq + " ~ Q('cc') + study + tbv ", data=ganj3)
mdf = md.fit()
print(mdf.summary())


# Record the statistics of the cc term under the hypothesis name.
varname, hyp_name = "Q('cc')",'GANJ-3'
resdic.update(md2dic(varname,hyp_name,mdf=mdf))

355
                            OLS Regression Results                            
Dep. Variable:                    FIQ   R-squared:                       0.153
Model:                            OLS   Adj. R-squared:                  0.088
Method:                 Least Squares   F-statistic:                     2.372
Date:                Thu, 05 Dec 2019   Prob (F-statistic):           0.000323
Time:                        19:32:14   Log-Likelihood:                -1354.8
No. Observations:                 355   AIC:                             2762.
Df Residuals:                     329   BIC:                             2862.
Df Model:                          25                                         
Covariance Type:            nonrobust                                         
                                    coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------------
Intercept 

### GANJ-4: Total Corpus Callosum midsagittal area, after correcting for total brain volume, will negatively correlate with IQ in the male (age < 12) group.


In [28]:
# GANJ-4: cc/tbv table restricted to males with Age < 12.
ganj4_cond = ['age<12','male']
ganj4 = apply_cond(cc_tbv,  condic_cc_tbv, ganj4_cond)
print(list(ganj4),len(ganj4),'\n')

# Dependent variable of the regression.
iq = 'FIQ'
# OLS of FIQ on corpus-callosum size with 'study' and total brain volume as covariates.
md = smf.ols(iq + " ~ Q('cc') + study + tbv ", data=ganj4)
mdf = md.fit()
print(mdf.summary())


# Record the statistics of the cc term under the hypothesis name.
varname, hyp_name = "Q('cc')",'GANJ-4'
resdic.update(md2dic(varname,hyp_name,mdf=mdf))

['study', 'ID', 'Age', 'dx', 'Gender', 'FIQ', 'PIQ', 'VIQ', 'tool', 'softwareLabel', 'structure', 'laterality', 'gm', 'wm', 'csf', 'tbv', 'cc'] 145 

                            OLS Regression Results                            
Dep. Variable:                    FIQ   R-squared:                       0.197
Model:                            OLS   Adj. R-squared:                  0.097
Method:                 Least Squares   F-statistic:                     1.967
Date:                Thu, 05 Dec 2019   Prob (F-statistic):             0.0200
Time:                        19:32:14   Log-Likelihood:                -554.52
No. Observations:                 145   AIC:                             1143.
Df Residuals:                     128   BIC:                             1194.
Df Model:                          16                                         
Covariance Type:            nonrobust                                         
                                   coef    std err          

### GANJ-5: Total Corpus Callosum midsagittal area, after correcting for total brain volume, will not significantly correlate with IQ in the female (age < 12) group.


In [29]:
# GANJ-5: cc/tbv table restricted to females with Age < 12.
ganj5_cond = ['age<12','female']
ganj5 = apply_cond(cc_tbv,  condic_cc_tbv, ganj5_cond)
print(list(ganj5),len(ganj5),'\n')

# Dependent variable of the regression.
iq = 'FIQ'
# OLS of FIQ on corpus-callosum size with 'study' and total brain volume as covariates.
md = smf.ols(iq + " ~ Q('cc') + study + tbv ", data=ganj5)
mdf = md.fit()
print(mdf.summary())


# Record the statistics of the cc term under the hypothesis name.
varname, hyp_name = "Q('cc')",'GANJ-5'
resdic.update(md2dic(varname,hyp_name,mdf=mdf))

['study', 'ID', 'Age', 'dx', 'Gender', 'FIQ', 'PIQ', 'VIQ', 'tool', 'softwareLabel', 'structure', 'laterality', 'gm', 'wm', 'csf', 'tbv', 'cc'] 43 

                            OLS Regression Results                            
Dep. Variable:                    FIQ   R-squared:                       0.187
Model:                            OLS   Adj. R-squared:                 -0.102
Method:                 Least Squares   F-statistic:                    0.6473
Date:                Thu, 05 Dec 2019   Prob (F-statistic):              0.775
Time:                        19:32:14   Log-Likelihood:                -169.68
No. Observations:                  43   AIC:                             363.4
Df Residuals:                      31   BIC:                             384.5
Df Model:                          11                                         
Covariance Type:            nonrobust                                         
                                   coef    std err          t

In [30]:
# Summary of every hypothesis: p-value and t statistic of the tested term,
# adjusted R^2 and number of observations.
print(resdic)

{'PIET-1': {'P>|t|': ' 0.001', 't': '    3.442', 'rsquared_adj': '0.104', 'nobs': '461'}, 'MAC-1': {'P>|t|': ' 0.001', 't': '    3.217', 'rsquared_adj': '0.0822', 'nobs': '428'}, 'MAC-2': {'P>|t|': ' 0.001', 't': '    3.341', 'rsquared_adj': '0.0918', 'nobs': '336'}, 'MAC-3': {'P>|t|': ' 0.218', 't': '    1.243', 'rsquared_adj': '0.08', 'nobs': ' 92'}, 'GANJ-1': {'P>|t|': ' 0.257', 't': '   -1.134', 'rsquared_adj': '0.0918', 'nobs': '428'}, 'GANJ-2': {'P>|t|': ' 0.156', 't': '   -1.425', 'rsquared_adj': '0.0854', 'nobs': '188'}, 'GANJ-3': {'P>|t|': ' 0.972', 't': '   -0.035', 'rsquared_adj': '0.0883', 'nobs': '355'}, 'GANJ-4': {'P>|t|': ' 0.467', 't': '   -0.730', 'rsquared_adj': '0.097', 'nobs': '145'}, 'GANJ-5': {'P>|t|': ' 0.520', 't': '   -0.650', 'rsquared_adj': '-0.102', 'nobs': ' 43'}}


### Scrap

In [31]:
"""
hyp2 = ['fiq>0','normDev','fs','age<20']
hyp2_caud = hyp2 + ['caudate'] # ,'abide']
hyp2_put = hyp2 + ['putamen'] # ,'abide']
hyp2_tiv = hyp2 + ['TIV'] # ,'abide']
tmp_caud = apply_cond(hie, condic, hyp2_caud, dropnaset=dropnaset)
caud = split_merge_df(tmp_caud, indx='ID', spliton='laterality', levels=['Left','Right'], 
                       keep_col='volume', op='+',colrename='caudate')
tmp_put = apply_cond(hie, condic, hyp2_put, dropnaset=dropnaset)
put = split_merge_df(tmp_put, indx='ID', spliton='laterality', levels=['Left','Right'], 
                       keep_col='volume', op='+',colrename='putamen')
tmp_tiv = apply_cond(hie, condic, hyp2_tiv, dropnaset=dropnaset)
print(len(caud), len(put), len(tmp_tiv))

stria = pd.merge(left=caud, right=put[['ID','putamen']], left_on='ID', right_on='ID')
stria['striatum'] = stria['caudate']+stria['putamen']
stria = pd.merge(left=stria, right=tmp_tiv[['ID','volume']], left_on='ID', right_on='ID')
stria.rename(columns={'volume':'TIV'},inplace=True)
print(list(stria),len(stria))
""";