In [1]:
import os
import numpy as np
import pandas as pd
import utils
from sklearn import linear_model
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score

# BRAIN PREPROCESSING

In [2]:
valbase = '/storage/gablab001/data/genus/current/structured/validate'

# 183_cor_subcor_D_thickness.csv contains the 170 features for the validation set
valdatacob = '183_cor_subcor_D_thickness.csv'
valdatacob = pd.read_csv(os.path.join(valbase, valdatacob))

# the file containing the covariates
valcov = pd.read_csv(os.path.join(valbase, '183_covariates_to_use.csv'))

# this is y in the classification analysis
valres = valcov['diag'].values

txtb = '/storage/gablab001/data/genus/current/structured/genus/text_files_for_indexing'
    
# directory to where the genus brain that is
bd = '/storage/gablab001/data/genus/current/structured/brain/'

# the header text file for the 170 columns, this will be used to subset the 
# entire genus data
thickness = np.genfromtxt(os.path.join(txtb, '170_columns.txt'), dtype=str)

# making equivalent volume headers from the thickness headers
volume = ' '.join(thickness).replace('thickness_D','volume_D').split(' ')

# this variable is for convenience so that i dont hav to change
# where the headers are in multiple places, just here
colheads = thickness

# loading the covariate headers that will be one hot encoded
cvar_encode = np.genfromtxt(os.path.join(txtb, 'covars_ecn.txt'), dtype=str)

# the covariate headers that wont be one hot encoded
cvar = np.genfromtxt(os.path.join(txtb, 'covars_no_ecn.txt'), dtype=str)

# GENUS brain data
brain = pd.read_csv(os.path.join(bd, 'GENUS_FS_ATLAS_D.csv'), low_memory=False)

# GENUS response variable
response = brain[['IID','GROUP']]

# here i combined all the needed data so that I can drop rows all together and 
# make sure all parts of the data, brain regions, covariates, ID, response are
# sorted by the same rows
combined = pd.concat([
    brain[colheads],
    brain[cvar],
    brain[cvar_encode],
    brain[['IID','GROUP']]
], axis=1).dropna().drop_duplicates('IID')

# getting the covariates that wont be one hot encoded
cvar_ne = combined[cvar].reset_index(drop=True)

# and the covariates that will be one hot encoded
cvar_e = combined[cvar_encode].reset_index(drop=True)

# performing one hot encoding
cvar_e = pd.concat([
    pd.DataFrame(utils.encoder(cvar_e[col])) for col in cvar_e.columns
], axis=1, ignore_index=True)

# recombining covariates
cvars = pd.concat([cvar_ne, cvar_e], axis=1)

# COGNITIVE DOMAINS PREPROCESSING

In [3]:
# load and read the data
cog_base = '/storage/gablab001/data/genus/current/structured/cognitive'
cog_file = 'GENUS_neuropsych_data_Domain_Scores.csv'
cog = pd.read_csv(os.path.join(cog_base, cog_file))

# load the fam file to assign diagnosis
fam_file = '/storage/gablab001/data/genus/current/structured/fam/FAM_N5338_EUR.csv'
fam = pd.read_csv(fam_file)

# match them on ids
ids = np.intersect1d(cog.IID.values, fam.IID.values)

# subset each dataset on the matched ids
fam_reduced = fam.set_index('IID').loc[ids].reset_index().drop_duplicates('IID')
cog_reduced = cog.set_index('IID').loc[ids].reset_index().drop_duplicates('IID')

# add sex and diagnosis to the cog data
# 1 is patient 0 is control
cog_reduced['GROUP'] = fam_reduced.affected.values - 1
cog_reduced['SEX'] = fam_reduced.sex.values

# removing these because they have to many missing values
scores_to_remove = 'NVLMdomainAvgZ ATVIdomainAvgZ NVWMdomainAvgZ'.split(' ')
five_scores = np.setdiff1d(utils.domain_scores, scores_to_remove).tolist() 
cog_five_scores = cog_reduced[five_scores + ['GROUP','SEX','IID']].dropna()

# COMBINING COG AND BRAIN DATA

In [4]:
ids = np.intersect1d(cog_five_scores.IID.values, combined.IID.values)
print(ids.shape)

(368,)


In [5]:
cog_data = cog_five_scores.set_index('IID').loc[ids].reset_index()
genus_data = combined.set_index('IID').loc[ids].reset_index()

In [6]:
cog_data.head()

Unnamed: 0,IID,RPSdomainAvgZ,SOPdomainAvgZ,VISPAdomainAvgZ,VLMdomainAvgZ,VWMdomainAvgZ,GROUP,SEX
0,32-1-2-00-009,-0.137259,0.261825,0.576001,-0.633401,0.317417,1,1
1,32-1-2-00-010,-0.596795,-0.244088,0.435217,-0.356669,-0.625201,1,2
2,32-1-2-00-018,0.970423,-0.155064,1.014507,-1.006689,0.371461,1,1
3,32-1-2-00-022,-0.667989,-0.08111,-0.300298,0.048958,0.626954,1,1
4,32-1-2-00-026,-0.315741,-0.523805,0.984547,-1.323432,-1.114922,1,1


In [7]:
genus_data.head()

Unnamed: 0,IID,rh_G_and_S_frontomargin_thickness_D,lh_G_and_S_frontomargin_thickness_D,rh_G_and_S_occipital_inf_thickness_D,lh_G_and_S_occipital_inf_thickness_D,rh_G_and_S_paracentral_thickness_D,lh_G_and_S_paracentral_thickness_D,rh_G_and_S_subcentral_thickness_D,lh_G_and_S_subcentral_thickness_D,rh_G_and_S_transv_frontopol_thickness_D,...,lh_Amygdala,SEX,EstimatedTotalIntraCranialVol,AGE_MRI,HANDED,FS_VERSION,SOFT_VERS,HEAD_COIL_NCHANNELS,MAGN_FIELD_STR,GROUP
0,32-1-2-00-009,2.313,2.29,2.414,2.052,2.097,2.214,2.458,2.589,2.543,...,1315.1,1.0,1590706.0,21.0,Right,FS5.3,GE,8,3T,Schizophrenia
1,32-1-2-00-010,2.506,2.501,2.547,2.193,2.457,2.477,2.606,2.493,2.508,...,1291.2,0.0,1546777.0,20.0,Right,FS5.3,GE,8,3T,Schizophrenia
2,32-1-2-00-018,2.11,1.891,2.531,2.457,2.299,2.342,2.659,2.887,2.331,...,1457.0,1.0,1604038.0,27.0,Left,FS5.3,GE,8,3T,Schizophrenia
3,32-1-2-00-022,2.432,2.25,2.463,2.141,2.448,2.302,2.619,2.876,2.513,...,1278.8,1.0,1439026.0,20.0,Right,FS5.3,GE,8,3T,Schizophrenia
4,32-1-2-00-026,2.295,1.994,2.488,2.624,2.332,2.425,2.81,2.822,2.51,...,1302.3,1.0,1547894.0,31.0,Right,FS5.3,GE,8,3T,Schizophrenia


In [8]:
genus_y = np.array([1 if i == 'Schizophrenia' else 0 for i in genus_data.GROUP.values])

In [9]:
# QA check
(genus_y == cog_data.GROUP).mean()

1.0

In [10]:
brain_cog = pd.concat([
    genus_data,
    cog_data
], axis=1)

In [11]:
brain_cog.head()

Unnamed: 0,IID,rh_G_and_S_frontomargin_thickness_D,lh_G_and_S_frontomargin_thickness_D,rh_G_and_S_occipital_inf_thickness_D,lh_G_and_S_occipital_inf_thickness_D,rh_G_and_S_paracentral_thickness_D,lh_G_and_S_paracentral_thickness_D,rh_G_and_S_subcentral_thickness_D,lh_G_and_S_subcentral_thickness_D,rh_G_and_S_transv_frontopol_thickness_D,...,MAGN_FIELD_STR,GROUP,IID.1,RPSdomainAvgZ,SOPdomainAvgZ,VISPAdomainAvgZ,VLMdomainAvgZ,VWMdomainAvgZ,GROUP.1,SEX
0,32-1-2-00-009,2.313,2.29,2.414,2.052,2.097,2.214,2.458,2.589,2.543,...,3T,Schizophrenia,32-1-2-00-009,-0.137259,0.261825,0.576001,-0.633401,0.317417,1,1
1,32-1-2-00-010,2.506,2.501,2.547,2.193,2.457,2.477,2.606,2.493,2.508,...,3T,Schizophrenia,32-1-2-00-010,-0.596795,-0.244088,0.435217,-0.356669,-0.625201,1,2
2,32-1-2-00-018,2.11,1.891,2.531,2.457,2.299,2.342,2.659,2.887,2.331,...,3T,Schizophrenia,32-1-2-00-018,0.970423,-0.155064,1.014507,-1.006689,0.371461,1,1
3,32-1-2-00-022,2.432,2.25,2.463,2.141,2.448,2.302,2.619,2.876,2.513,...,3T,Schizophrenia,32-1-2-00-022,-0.667989,-0.08111,-0.300298,0.048958,0.626954,1,1
4,32-1-2-00-026,2.295,1.994,2.488,2.624,2.332,2.425,2.81,2.822,2.51,...,3T,Schizophrenia,32-1-2-00-026,-0.315741,-0.523805,0.984547,-1.323432,-1.114922,1,1


In [12]:
cog_data[five_scores]

Unnamed: 0,RPSdomainAvgZ,SOPdomainAvgZ,VISPAdomainAvgZ,VLMdomainAvgZ,VWMdomainAvgZ
0,-0.137259,0.261825,0.576001,-0.633401,0.317417
1,-0.596795,-0.244088,0.435217,-0.356669,-0.625201
2,0.970423,-0.155064,1.014507,-1.006689,0.371461
3,-0.667989,-0.081110,-0.300298,0.048958,0.626954
4,-0.315741,-0.523805,0.984547,-1.323432,-1.114922
5,0.318492,0.053449,1.236995,0.244592,0.687638
6,-0.652069,-0.636629,-0.776559,-0.992831,-0.631376
7,-0.045267,-0.372114,1.696646,-0.632695,-1.226413
8,-0.158079,0.044732,1.148695,-0.886437,0.311585
9,-1.127706,-0.877088,-0.291401,0.778301,-2.388255


In [13]:
genus_data[thickness]

Unnamed: 0,rh_G_and_S_frontomargin_thickness_D,lh_G_and_S_frontomargin_thickness_D,rh_G_and_S_occipital_inf_thickness_D,lh_G_and_S_occipital_inf_thickness_D,rh_G_and_S_paracentral_thickness_D,lh_G_and_S_paracentral_thickness_D,rh_G_and_S_subcentral_thickness_D,lh_G_and_S_subcentral_thickness_D,rh_G_and_S_transv_frontopol_thickness_D,lh_G_and_S_transv_frontopol_thickness_D,...,rh_Caudate,lh_Caudate,rh_Lateral.Ventricle,lh_Lateral.Ventricle,rh_Inf.Lat.Vent,lh_Inf.Lat.Vent,rh_Thalamus.Proper,lh_Thalamus.Proper,rh_Amygdala,lh_Amygdala
0,2.313,2.290,2.414,2.052,2.097,2.214,2.458,2.589,2.543,2.237,...,3580.0,3378.1,13852.1,24116.1,365.9,738.5,7069.7,7220.4,1573.9,1315.1
1,2.506,2.501,2.547,2.193,2.457,2.477,2.606,2.493,2.508,2.611,...,4294.4,4033.4,2046.5,1764.3,164.1,214.9,7798.4,7953.2,1516.2,1291.2
2,2.110,1.891,2.531,2.457,2.299,2.342,2.659,2.887,2.331,2.418,...,3787.4,3733.5,6240.0,6916.2,328.0,274.8,7940.2,8399.2,1533.2,1457.0
3,2.432,2.250,2.463,2.141,2.448,2.302,2.619,2.876,2.513,2.788,...,3946.5,4042.4,5494.2,6652.4,199.1,536.3,7027.7,7178.0,1431.7,1278.8
4,2.295,1.994,2.488,2.624,2.332,2.425,2.810,2.822,2.510,2.689,...,3868.2,3622.5,12707.1,8259.3,364.8,303.6,7646.2,7839.5,1400.9,1302.3
5,2.241,1.618,2.465,2.355,2.382,2.410,2.652,2.544,2.565,2.107,...,3985.8,4119.2,8533.1,6740.8,500.4,397.0,7242.3,7919.0,1535.5,1470.4
6,2.408,2.292,2.579,2.528,2.217,2.351,2.929,2.948,2.450,2.649,...,3455.9,3500.2,8168.4,6232.0,192.7,195.8,6914.6,7179.7,1614.3,1458.1
7,2.111,2.087,2.642,2.517,2.160,2.396,2.441,2.661,2.734,2.339,...,4779.8,4700.5,9862.9,11984.8,365.5,545.6,8678.7,10123.4,1627.7,1738.6
8,2.318,2.264,2.743,2.383,2.300,2.352,2.748,2.581,2.485,2.523,...,4415.9,4173.2,9232.8,7780.3,126.8,193.0,7306.9,8103.6,1412.7,1632.7
9,2.096,2.079,2.491,2.000,2.365,2.191,2.395,2.536,2.323,2.478,...,3548.5,3513.9,12892.4,13076.6,730.0,715.2,7383.6,8181.4,1425.3,1468.6


# Running analysis w/o removing effect of covs

In [14]:
# 36.956521739130432 % of sample are patients
Xdf = pd.concat([genus_data[thickness], cog_data[five_scores]], axis=1)
X = Xdf.values
y = genus_y

In [88]:
res = {'auc':[]}

for train, test in StratifiedShuffleSplit().split(X, y):
    clf = Pipeline([
        ('scale', StandardScaler()),
        ('lg', linear_model.LogisticRegressionCV(
            penalty='l1',
            solver='liblinear',
            cv=StratifiedShuffleSplit(),
            Cs=1000
        ))
    ])
    
    clf.fit(X[train], y[train])
    res['auc'].append(roc_auc_score(y[test], clf.predict(X[test])))

In [89]:
np.mean(res['auc'])

0.77763975155279508

# Removing effect of covariates

In [15]:
cvars = pd.concat([cvars, combined[['IID']].reset_index(drop=True)], axis=1)
cvars_reduced = cvars.set_index('IID').loc[ids].reset_index(drop=True)
non_sing_cvars = utils.make_non_singular(cvars_reduced.values)

In [28]:
# decisions need to be made here
# do we combine the brain and cog data and then remove effects of covars from entire thing?
# that seems wrong, i'd be removing effects of scanner covariate from cognitive score..
# alternatively I can remove effects from brain and cog scores seperately 
brain_CR = utils.proj(genus_data[thickness].values, non_sing_cvars)
cog_CR = utils.proj(cog_data[five_scores].values, cvars_reduced[['SEX','AGE_MRI']].values)

In [31]:
XCR = np.hstack((brain_CR, cog_CR))

In [33]:
res = {'auc':[]}

for train, test in StratifiedShuffleSplit().split(XCR, y):
    clf = Pipeline([
        ('scale', StandardScaler()),
        ('lg', linear_model.LogisticRegressionCV(
            penalty='l1',
            solver='liblinear',
            cv=StratifiedShuffleSplit(),
            Cs=1000
        ))
    ])
    
    clf.fit(XCR[train], y[train])
    res['auc'].append(roc_auc_score(y[test], clf.predict(XCR[test])))

In [41]:
print(np.mean(res['auc']))
res['auc']

0.698602484472


[0.69875776397515521,
 0.66304347826086951,
 0.66304347826086951,
 0.81366459627329202,
 0.7562111801242235,
 0.60559006211180133,
 0.63354037267080732,
 0.69099378881987572,
 0.66304347826086951,
 0.79813664596273293]

In [46]:
Xdf.ix[:, np.nonzero(clf.named_steps['lg'].coef_[0])[0]]

Unnamed: 0,rh_G_and_S_cingul.Mid.Post_thickness_D,lh_S_temporal_inf_thickness_D,SOPdomainAvgZ,VLMdomainAvgZ,VWMdomainAvgZ
0,2.410,2.384,0.261825,-0.633401,0.317417
1,2.584,2.315,-0.244088,-0.356669,-0.625201
2,2.463,2.479,-0.155064,-1.006689,0.371461
3,2.532,2.440,-0.081110,0.048958,0.626954
4,2.504,2.175,-0.523805,-1.323432,-1.114922
5,2.372,2.466,0.053449,0.244592,0.687638
6,2.733,2.587,-0.636629,-0.992831,-0.631376
7,2.461,2.361,-0.372114,-0.632695,-1.226413
8,2.685,2.287,0.044732,-0.886437,0.311585
9,2.594,2.318,-0.877088,0.778301,-2.388255


(368, 175)