# wrangling with the csv

In [1]:
import pandas as pd
import numpy as np
# import statsmodels.discrete.discrete_model as sm
import statsmodels as stt
import scipy.stats as sst
import os.path as osp

In [2]:
from statsmodels import api as sm
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
import json
%matplotlib inline

### read the mapping file to find the correspondance between uberon numbers and labels

In [5]:
# find mapping file
mapping_file = '../segstats_jsonld/segstats_jsonld/mapping_data/freesurfermap.json'
assert osp.exists(mapping_file)
with open(mapping_file, "r") as read_file:
    roi_map = json.load(read_file)

In [10]:
print(list(roi_map['Anatomy'].keys())[:10])
print(len(roi_map['Anatomy'].keys()))

['Left-Lateral-Ventricle', 'Left-Inf-Lat-Vent', 'Left-Cerebellum-White-Matter', 'Left-Cerebellum-Cortex', 'Left-Thalamus-Proper', 'Left-Caudate', 'Left-Putamen', 'Left-Pallidum', '3rd-Ventricle', '4th-Ventricle']
117


In [55]:
ube = {}
has_no_isAbout = []
for (k,v) in roi_map['Anatomy'].items():
    if v['isAbout'] != '' or v['isAbout'] in ('None','none'):
        if v['label'] != '' and v['label'] not in ('None','none'):
            ube['<' + v['isAbout'] + '>'] = v['label']
        else:
            if '<' + v['isAbout'] + '>' not in ube.keys():
                if v['hasLaterality'] in ('Right','Left'):
                    ube['<' + v['isAbout'] + '>'] = 'strip-Right-Left-of-'+k
                else:
                    ube['<' + v['isAbout'] + '>'] = k
    else:
        #print(k, 'has no isAbout')
        has_no_isAbout.append(k)
print(has_no_isAbout)

['Left-VentralDC', 'Right-VentralDC', 'WM-hypointensities', 'Left-WM-hypointensities', 'Right-WM-hypointensities', 'non-WM-hypointensities', 'Left-non-WM-hypointensities', 'Right-non-WM-hypointensities', 'Left-Cerebral-Exterior', 'Left-Cerebellum-Exterior', 'Left-Operculum', 'Right-Cerebral-Exterior', 'Right-Cerebellum-Exterior', 'Right-Operculum']


In [56]:
list(zip(list(ube)[:5],list(ube.values())[:5]))

[('<http://purl.obolibrary.org/obo/UBERON_0002285>', 'lateral ventricle'),
 ('<http://purl.obolibrary.org/obo/UBERON_0006091>',
  'inferior horn of the lateral ventricle'),
 ('<http://purl.obolibrary.org/obo/UBERON_0002037>',
  'cerebellum white matter'),
 ('<http://purl.obolibrary.org/obo/UBERON_0002129>', 'cerebellar cortex'),
 ('<http://purl.obolibrary.org/obo/UBERON_0001897>', 'thalamus')]

### Read CSV file 

In [57]:
print(osp.realpath(osp.curdir))
relative_path_filename = './data/blazegraph-query-all.csv'
assert osp.exists(relative_path_filename)

/home/jb/code/repronim/simple2/simple2_analysis


In [58]:
hie = pd.read_csv(relative_path_filename, na_values='nd')
original_col_names = list(hie)
print(original_col_names)
assert len(original_col_names) == len(set(original_col_names))

['study', 'ID', 'Age', 'Gender', 'dx', 'fiq', 'piq', 'hand', 'latera', 'value', 'tool', 'structure']


In [59]:
structures = set(hie.structure)
print(structures)

{'<http://purl.obolibrary.org/obo/UBERON_0001869>', '<http://purl.obolibrary.org/obo/UBERON_0002037>', '<http://purl.obolibrary.org/obo/UBERON_0006514>', '<http://purl.obolibrary.org/obo/UBERON_0000959>', '<http://purl.obolibrary.org/obo/UBERON_0006091>', '<http://purl.obolibrary.org/obo/UBERON_0001954>', '<http://purl.obolibrary.org/obo/UBERON_0001886>', '<http://purl.obolibrary.org/obo/UBERON_0002129>', '<http://purl.obolibrary.org/obo/UBERON_0002298>', '<http://purl.obolibrary.org/obo/UBERON_0014930>', '<http://purl.obolibrary.org/obo/UBERON_0001876>', '<http://purl.obolibrary.org/obo/UBERON_0002286>', '<http://purl.obolibrary.org/obo/UBERON_0001882>', '<http://purl.obolibrary.org/obo/UBERON_0001874>', '<http://purl.obolibrary.org/obo/UBERON_0002422>', '<http://purl.obolibrary.org/obo/UBERON_0009857>', '<http://purl.obolibrary.org/obo/UBERON_0002285>', '<http://purl.obolibrary.org/obo/UBERON_0001873>', '<http://purl.obolibrary.org/obo/UBERON_0001897>'}


In [61]:
ube2h = {}

for idx, s in enumerate(structures):
    ube2h[s] = ube[s]
    print(idx, s, ube[s])


0 <http://purl.obolibrary.org/obo/UBERON_0001869> cerebrum lesion
1 <http://purl.obolibrary.org/obo/UBERON_0002037> cerebellum white matter
2 <http://purl.obolibrary.org/obo/UBERON_0006514> pallidum
3 <http://purl.obolibrary.org/obo/UBERON_0000959> optic chiasm
4 <http://purl.obolibrary.org/obo/UBERON_0006091> inferior horn of the lateral ventricle
5 <http://purl.obolibrary.org/obo/UBERON_0001954> hippocampus
6 <http://purl.obolibrary.org/obo/UBERON_0001886> choroid plexus
7 <http://purl.obolibrary.org/obo/UBERON_0002129> cerebellar cortex
8 <http://purl.obolibrary.org/obo/UBERON_0002298> brainstem
9 <http://purl.obolibrary.org/obo/UBERON_0014930> perivascular space
10 <http://purl.obolibrary.org/obo/UBERON_0001876> amygdala
11 <http://purl.obolibrary.org/obo/UBERON_0002286> third ventricle
12 <http://purl.obolibrary.org/obo/UBERON_0001882> nucleus accumbens
13 <http://purl.obolibrary.org/obo/UBERON_0001874> putamen
14 <http://purl.obolibrary.org/obo/UBERON_0002422> fourth ventricle
15

In [62]:
"""
looking at the class name from the purl urls I got :

u2human[list(u2human.keys())[0]] = 'cavum septum pellucidum'
u2human[list(u2human.keys())[1]] = 'cerebellum'
u2human[list(u2human.keys())[2]] = 'cerebellar cortex'
u2human[list(u2human.keys())[3]] = 'perivascular space'
u2human[list(u2human.keys())[4]] = "Ammon's horn"
u2human[list(u2human.keys())[5]] = "inferior horn of the lateral ventricle"
u2human[list(u2human.keys())[6]] = "brainstem"
u2human[list(u2human.keys())[7]] = "amygdala"
u2human[list(u2human.keys())[8]] = "optic chiasma"
u2human[list(u2human.keys())[9]] = "dorsal plus ventral thalamus"
u2human[list(u2human.keys())[10]] = "cerebral hemisphere"
u2human[list(u2human.keys())[11]] = "caudate nucleus"
u2human[list(u2human.keys())[12]] = "telencephalic ventricle"
u2human[list(u2human.keys())[13]] = "putamen"
u2human[list(u2human.keys())[14]] = "third ventricle"
u2human[list(u2human.keys())[15]] = "choroid plexus"
u2human[list(u2human.keys())[16]] = "fourth ventricle"
u2human[list(u2human.keys())[17]] = "pallidum"
u2human[list(u2human.keys())[18]] = "nucleus accumbens"
""";

In [63]:
set1 = {'<http://purl.obolibrary.org/obo/UBERON_0000959>', 
'<http://purl.obolibrary.org/obo/UBERON_0002422>', 
'<http://purl.obolibrary.org/obo/UBERON_0009857>', 
'<http://purl.obolibrary.org/obo/UBERON_0001869>', 
'<http://purl.obolibrary.org/obo/UBERON_0002129>', 
'<http://purl.obolibrary.org/obo/UBERON_0001882>', 
'<http://purl.obolibrary.org/obo/UBERON_0001886>', 
'<http://purl.obolibrary.org/obo/UBERON_0006091>', 
'<http://purl.obolibrary.org/obo/UBERON_0002286>', 
'<http://purl.obolibrary.org/obo/UBERON_0002298>', 
'<http://purl.obolibrary.org/obo/UBERON_0001897>', 
'<http://purl.obolibrary.org/obo/UBERON_0001954>', 
'<http://purl.obolibrary.org/obo/UBERON_0001873>', 
'<http://purl.obolibrary.org/obo/UBERON_0014930>', 
'<http://purl.obolibrary.org/obo/UBERON_0001874>', 
'<http://purl.obolibrary.org/obo/UBERON_0001876>', 
'<http://purl.obolibrary.org/obo/UBERON_0006514>', 
'<http://purl.obolibrary.org/obo/UBERON_0002037>', 
'<http://purl.obolibrary.org/obo/UBERON_0002285>'};
set2 = set(ube2h.keys())
set1.difference(set2)

set()

In [64]:
h2ube = {v: k for k, v in ube2h.items()}
#print(h2uberon)

In [65]:
# print(set(hie['tool']))
tooldic = {'surfer':'<https://surfer.nmr.mgh.harvard.edu/>', 
           'fsl':'<http://purl.org/nidash/fsl#>'}
print(tooldic)

{'surfer': '<https://surfer.nmr.mgh.harvard.edu/>', 'fsl': '<http://purl.org/nidash/fsl#>'}


In [66]:
hie.head(3)

Unnamed: 0,study,ID,Age,Gender,dx,fiq,piq,hand,latera,value,tool,structure
0,ABIDE Stanford Site,51169,9.7775,1,1,127.0,117.0,L,Left,8078.44,<http://purl.org/nidash/fsl#>,<http://purl.obolibrary.org/obo/UBERON_0001897>
1,ABIDE Stanford Site,51169,9.7775,1,1,127.0,117.0,L,Left,3602.727,<http://purl.org/nidash/fsl#>,<http://purl.obolibrary.org/obo/UBERON_0001873>
2,ABIDE Stanford Site,51169,9.7775,1,1,127.0,117.0,L,Left,5410.738,<http://purl.org/nidash/fsl#>,<http://purl.obolibrary.org/obo/UBERON_0001874>


## hypotheses

Hypotheses

PIET-1: Total Brain Volume will positively correlate with IQ (in both sexes across the complete age range).

MAC-1: Left striatum volume (caudate + putamen) will positively correlate with IQ in the total (male + female) child (age < 20) group.

MAC-2: Left striatum volume (caudate + putamen) will positively correlate with IQ in the male children group.

MAC-3: Left striatum volume (caudate + putamen) will not correlate with IQ in the female children group.

MAC-3: Left striatum volume (caudate + putamen) will not correlate with IQ in the female children group.

GANJ-1: Total Corpus Callosum midsagittal area, after correcting for total brain volume, will negatively correlate with IQ.

GANJ-2: Total Corpus Callosum midsagittal area, after correcting for total brain volume, will negatively correlate with IQ in the young (age < 12) group.

GANJ-3: Total Corpus Callosum midsagittal area, after correcting for total brain volume, will not significantly correlate with IQ in the adolescent (age > 12) group.

GANJ-4:. Total Corpus Callosum midsagittal area, after correcting for total brain volume, will negatively correlate with IQ in the male (age < 12) group.

GANJ-5: Total Corpus Callosum midsagittal area, after correcting for total brain volume, will not significantly correlate with IQ in the female (age < 12) group.


## Analyses

### sandbox

In [67]:
u_caudate = h2ube['caudate nucleus']
print(u_caudate)

<http://purl.obolibrary.org/obo/UBERON_0001873>


In [68]:
# df.loc[(df['column_name'] >= A) & (df['column_name'] <= B)]

# tmp = hie[hie.structure==u_caudate][hie.Gender=='1']
# tmp = hie.loc[(hie['Gender']=='1')] # & (hie['Gender']=='Male')]

# Male or '1' for caudate
tmp = hie.loc[((hie['Gender']=='1') | (hie['Gender']=='Male')) 
                               & (hie['structure'] == u_caudate ) 
                               & (hie['tool']==tooldic['surfer'])]


In [69]:
print(set(hie['tool']))
#print(tooldic['fsl'])
#print(len(hie),len(tmp))
print(set(hie['hand']))
#print(set(hie['tool']))
#print(set(hie['Gender']))
#print(set(hie['dx']))
print(set(hie['latera']))


{'<https://surfer.nmr.mgh.harvard.edu/>', '<http://purl.org/nidash/fsl#>'}
{nan, '0.2', '0.36', 'L->R', '0.44', '0.22', '1', '0.73', '0.82', '0.91', '0.55', '0.18', '0.5', '0.61', '-9999', 'L', 'Left', '0.15', 'Ambi', '0.53', '0.48', 'Mixed', '0.68', '0.75', '-0.31', '0.81', 'R', '0.69', '0.33', '-0.61', '0.72', 'Right', '0.28'}
{'None', 'Left', 'Right'}


In [70]:
h2ube

{'cerebrum lesion': '<http://purl.obolibrary.org/obo/UBERON_0001869>',
 'cerebellum white matter': '<http://purl.obolibrary.org/obo/UBERON_0002037>',
 'pallidum': '<http://purl.obolibrary.org/obo/UBERON_0006514>',
 'optic chiasm': '<http://purl.obolibrary.org/obo/UBERON_0000959>',
 'inferior horn of the lateral ventricle': '<http://purl.obolibrary.org/obo/UBERON_0006091>',
 'hippocampus': '<http://purl.obolibrary.org/obo/UBERON_0001954>',
 'choroid plexus': '<http://purl.obolibrary.org/obo/UBERON_0001886>',
 'cerebellar cortex': '<http://purl.obolibrary.org/obo/UBERON_0002129>',
 'brainstem': '<http://purl.obolibrary.org/obo/UBERON_0002298>',
 'perivascular space': '<http://purl.obolibrary.org/obo/UBERON_0014930>',
 'amygdala': '<http://purl.obolibrary.org/obo/UBERON_0001876>',
 'third ventricle': '<http://purl.obolibrary.org/obo/UBERON_0002286>',
 'nucleus accumbens': '<http://purl.obolibrary.org/obo/UBERON_0001882>',
 'putamen': '<http://purl.obolibrary.org/obo/UBERON_0001874>',
 'fo

### PIET-1: Total Brain Volume will positively correlate with IQ (in both sexes across the complete age range).


In [71]:
roi = 'cerebellar cortex'
# roi = 'cerebellar cortex'
tool = 'surfer'
#tool = 'fsl'

tmp = hie[['study','ID','Gender','structure','tool','value','piq','fiq']]
# tmp = tmp.loc[((hie['Gender']=='1') | (hie['Gender']=='Male')) 

# hand_cond = ((hie['hand'] == 'L')|(hie['hand'] == 'Left')|(hie['latera'] == 'Left'))
age_cond = (hie['Age'] <= 20)
roi_cond = (hie['structure'] == h2ube[roi])
tool_cond = (hie['tool']==tooldic[tool])

condition = roi_cond & tool_cond # & age_cond 

tmp = tmp.loc[condition].drop_duplicates()

print(len(hie),len(tmp))

58055 2370


In [72]:
list(tmp)

['study', 'ID', 'Gender', 'structure', 'tool', 'value', 'piq', 'fiq']

In [74]:
print(" Structure = ", roi)
assert ube2h[tmp.iloc[0]['structure']] == roi

iq = 'fiq'

md = smf.ols(iq + " ~ Q('value') + Gender + study ", data=tmp) # + study 
mdf = md.fit()
print(mdf.summary())


 Structure =  cerebellar cortex
                            OLS Regression Results                            
Dep. Variable:                    fiq   R-squared:                       0.587
Model:                            OLS   Adj. R-squared:                  0.582
Method:                 Least Squares   F-statistic:                     124.0
Date:                Sun, 29 Sep 2019   Prob (F-statistic):               0.00
Time:                        15:21:58   Log-Likelihood:                -19437.
No. Observations:                2300   AIC:                         3.893e+04
Df Residuals:                    2273   BIC:                         3.908e+04
Df Model:                          26                                         
Covariance Type:            nonrobust                                         
                                   coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------

### conclusion : based on the wrong structure ...
1. no significance for piq for what seems to be the closest region:
Q('value')                      -0.0410      0.032     -1.286      0.199    
1. negatively correlated for fiq for what seems to be the closest region:
Q('value')                      -0.1232      0.032     -3.881      0.000      

### MAC-1: Left striatum volume (caudate + putamen) will positively correlate with IQ in the total (male + female) child (age < 20) group.


In [None]:
roi1 = 'caudate nucleus'
roi2 = 'putamen'
# roi = 'cerebellar cortex'
tool = 'surfer'
# tool = 'fsl'

tmp = hie[['study','ID','Gender','structure','tool','value', 'fiq', 'piq', 'latera']]
# condition1 = ((hie['Gender']=='1') | (hie['Gender']=='Male')) & 

hand_cond = ((hie['hand'] == 'L') | (hie['hand'] == 'Left')) # | (hie['latera'] == 'Left'))
age_cond = (hie['Age'] <= 20)
roi1_cond = (hie['structure'] == h2uberon[roi1])
roi2_cond = (hie['structure'] == h2uberon[roi2])
tool_cond = (hie['tool']==tooldic[tool])

condition1 = roi1_cond & tool_cond & age_cond & hand_cond  
condition2 = roi2_cond & tool_cond & age_cond & hand_cond

tmp1 = tmp.loc[condition1].drop_duplicates()
tmp2 = tmp.loc[condition2].drop_duplicates()

#tmp1 = tmp1.set_index('ID')
#tmp2 = tmp2.set_index('ID')

print(len(hie))
print('caudate: tmp1', len(tmp1), len(tmp1.drop_duplicates()))
print('putamen: tmp2', len(tmp2), len(tmp2.drop_duplicates()))

In [None]:
list(tmp2)

In [None]:
import numbers
import decimal
hand_nb = [v for v in hie['hand'] if isinstance(v, numbers.Number)]
len(hand_nb)

In [None]:
ax = plt.hist(tmp1['value'])

In [None]:
ax = plt.hist(tmp2['value'])

In [None]:
# tmp2.head(2)

In [None]:
tmp1.head(5)

In [None]:
# hie.loc[(hie['ID']==21002) & (hie['structure']==h2uberon[roi1]) & (hie['tool']==tooldic['surfer'])]

#### try to add values of the same index

In [None]:
datadf = tmp1[tmp1['latera'] == 'Left' ]
datadf.set_index('ID', inplace=True)
len(datadf)
datadf.head(3)
datadf_right = tmp1[tmp1['latera'] == 'Right' ]
datadf['value_right'] = datadf_right['value']

In [None]:
tmp_1 = tmp1.groupby(['ID']).mean().dropna()
tmp_2 = tmp2.groupby(['ID']).mean().dropna()
print(len(tmp_2), len(tmp_1))
assert np.all(tmp_1.index == tmp_2.index)

In [None]:
tmp_2.head()

In [None]:
""" print(type(tmp_2))
print(list(tmp_2))
print(tmp_2['value'][:5])
print(tmp_1['value'][:5]) 

list(tmp_2)
len(set(tmp_2.index))


lists_index = [set(tmp1['ID']),  set(tmp2['ID'])]
len( set.intersection(*lists_index))
""";

In [None]:
datadf1 = tmp1.set_index('ID')
datadf2 = tmp2.set_index('ID')
len(datadf1), len(datadf2)

In [None]:
datadf = pd.merge(left=tmp1,right=tmp2, left_on='ID', right_on='ID')

In [None]:
datadf.head()

In [None]:

datadf['cau_put'] = tmp_2.loc[:,'value'].add(tmp_1.loc[:,'value'])
datadf[iq] = tmp_2[iq]
datadf['study'] = tmp_2['study']
datadf['Gender'] = tmp_2['Gender']

print(list(datadf))

In [None]:
#datadf = datadf[datadf['cau_put'] < 15000]
len(datadf)
ax = plt.hist(datadf['cau_put'])

In [None]:
iq = 'piq'

md = smf.ols(iq + " ~ Q('cau_put') + study + Gender", data=datadf)
mdf = md.fit()
print(mdf.summary())


### Conclusion

1. does not seem to replicate with fiq (only 58 values ?)
2. does not seem to replicate with piq


### MAC-2: Left striatum volume (caudate + putamen) will positively correlate with IQ in the male children group.


In [None]:
#merged_inner = pd.merge(left=tmp_1, right=tmp_2) #, left_on='ID', right_on='ID') #how='join', 

In [None]:
roi1 = 'caudate nucleus'
roi2 = 'putamen'
# roi = 'cerebellar cortex'
tool = 'surfer'
# tool = 'fsl'

# tmp = hie[['ID','Gender','structure','tool','value','fiq']]
# condition1 = ((hie['Gender']=='1') | (hie['Gender']=='Male')) & 

condition1 = (hie['structure'] == h2uberon[roi1]) & (hie['tool']==tooldic[tool]) \
             & (hie['Age'] < 20) \
             & ((hie['Gender']=='1') | (hie['Gender']=='Male'))
condition2 = (hie['structure'] == h2uberon[roi2]) & (hie['tool']==tooldic[tool]) \
             & (hie['Age'] < 20) \
             & ((hie['Gender']=='1') | (hie['Gender']=='Male'))

    
tmp1 = tmp.loc[condition1].drop_duplicates()
tmp2 = tmp.loc[condition2].drop_duplicates()
#tmp1 = tmp1.set_index('ID')
#tmp2 = tmp2.set_index('ID')


In [None]:
print(len(hie),len(tmp1),len(tmp2))
print(len(tmp1.drop_duplicates()),len(tmp2.drop_duplicates()))

In [None]:
tmp_1 = tmp1.groupby(['ID']).mean().dropna()
tmp_2 = tmp2.groupby(['ID']).mean().dropna()
print(len(tmp_2), len(tmp_1))
assert np.all(tmp_1.index == tmp_2.index)

In [None]:
datadf = tmp_2
datadf['cau_put'] = tmp_2.loc[:,'value'].add(tmp_1.loc[:,'value'])
datadf['fiq'] = tmp_2['fiq']
print(list(datadf))

In [None]:
print(len(datadf))
# ax = plt.hist(datadf['cau_put'])

In [None]:

md = smf.ols("fiq ~ Q('cau_put')", data=datadf)
mdf = md.fit()
print(mdf.summary())


### 

In [None]:
#tmp = hie.loc[ (hie['structure']==h2uberon[roi]) & (hie['tool']==tooldic['surfer'])]
#tmp.head(3)

In [None]:
#tmp_ = tmp1.groupby(['ID']).min()
#tmp_.head(3)

In [None]:
# horizontal_stack = pd.concat([tmp1, tmp2], axis=1)
merged_inner = pd.merge(left=tmp1, right=tmp2, left_on='ID', right_on='ID') #how='join', 

In [None]:
len(merged_inner.drop_duplicates())

In [None]:
merged_inner.head(4)

In [None]:
md = smf.ols("FIQ ~ Q('striatal volume')", data=data2)
mdf = md.fit()
print(mdf.summary())
print(pearsonr(data2['FIQ'], data2['striatal volume']))

In [None]:
print(tmp.loc[(tmp['ID']==3559087)])

In [None]:
hie.iloc[1578]

In [None]:
hie.iloc[1577]

In [None]:
uberon = "http://purl.obolibrary.org/obo/UBERON_0001897"

cmd = 'wget -O - ' + uberon\
                   + '  | grep -A 1 ' +  "'" + 'Class rdf:about='\
                   + '"' + uberon + '"' +  "'"

In [None]:
print(cmd.split())

In [None]:
import subprocess
MyOut = subprocess.Popen(cmd.split(), 
            stdout=subprocess.PIPE, 
           stderr=subprocess.STDOUT, shell=True)
stdout,stderr = MyOut.communicate()
print(stdout)
#print(stderr)

In [None]:
process = subprocess.run(cmd.split())

In [None]:
list(hie)

In [None]:
uberon= '<http://purl.obolibrary.org/obo/UBERON_0001897>'


In [None]:
len(tmp)

In [None]:
# Load in the dataset, appending 'nd' to the list of possible na_values
hie = pd.read_csv(relative_path_filename, na_values='nd')
original_col_names = list(hie)
print(original_col_names[:10])

# column names are unique
assert len(original_col_names) == len(set(original_col_names))