In [47]:
import os
import utils
import numpy as np
import pandas as pd
from sklearn import linear_model
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score, roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer

In [4]:
cog_base = '/storage/gablab001/data/genus/current/structured/cognitive'
cog_file = 'GENUS_neuropsych_data_Domain_Scores.csv'

In [21]:
cog = pd.read_csv(os.path.join(cog_base, cog_file))

In [7]:
cog.head()

Unnamed: 0,FID,IID,SOPdomainAvgZ,ATVIdomainAvgZ,VWMdomainAvgZ,NVWMdomainAvgZ,VLMdomainAvgZ,NVLMdomainAvgZ,RPSdomainAvgZ,VISPAdomainAvgZ
0,H001,06H001,-0.994502,,-0.05866,,0.254863,0.733938,0.495773,
1,H002,06H002,-0.647871,,,,,,1.258445,
2,H003,06H003,,,,,,,,
3,H004,06H004,-0.565002,,0.515166,,0.142402,1.153805,0.109642,
4,H005,06H005,-0.564811,,-1.029433,,-1.2419,-1.582192,0.826519,


In [10]:
cog.apply(pd.isnull).sum()

FID                   0
IID                   0
SOPdomainAvgZ       878
ATVIdomainAvgZ     3512
VWMdomainAvgZ      1390
NVWMdomainAvgZ     4844
VLMdomainAvgZ      2159
NVLMdomainAvgZ     5032
RPSdomainAvgZ      2398
VISPAdomainAvgZ    2402
dtype: int64

In [11]:
cog.dropna().shape

(547, 10)

In [16]:
fam_file = '/storage/gablab001/data/genus/current/structured/fam/FAM_N5338_EUR.csv'
fam = pd.read_csv(fam_file)

In [17]:
fam.head()

Unnamed: 0,IID,FID,pid,mid,sex,affected
0,1106.04,1106,0,0,1,1
1,11260-1,11260,0,0,2,2
2,G_93180,GEN_03_1151,0,0,1,1
3,RPG6034,RPG6034,0,0,1,2
4,32-1-2-00-009,CIDAR_02_2009-2009,0,0,1,2


In [18]:
ids = np.intersect1d(cog.IID.values, fam.IID.values)

In [19]:
ids

array(['00225_A', '00226_B', '00228_B', ..., 'U042411', 'U042561',
       'U042562'], dtype=object)

In [29]:
fam_reduced = fam.set_index('IID').loc[ids].reset_index().drop_duplicates('IID')
cog_reduced = cog.set_index('IID').loc[ids].reset_index().drop_duplicates('IID')
print(fam_reduced.shape, cog_reduced.shape)

((4926, 6), (4926, 10))


In [30]:
fam_reduced.head()

Unnamed: 0,IID,FID,pid,mid,sex,affected
0,00225_A,DEC_JWS_00225,0,0,2,2
1,00226_B,DEC_JWS_00226,0,0,1,2
2,00228_B,DEC_JWS_00228,0,0,1,2
3,00233_B,DEC_JWS_00233,0,0,1,2
4,06H001,H001,0,0,2,1


In [31]:
cog_reduced.head()

Unnamed: 0,IID,FID,SOPdomainAvgZ,ATVIdomainAvgZ,VWMdomainAvgZ,NVWMdomainAvgZ,VLMdomainAvgZ,NVLMdomainAvgZ,RPSdomainAvgZ,VISPAdomainAvgZ
0,00225_A,DEC_JWS_00225,-2.397181,,-1.539926,,-3.778566,,0.981764,-0.807805
1,00226_B,DEC_JWS_00226,-1.113352,,-1.579818,,-1.492848,,3.559235,-1.213359
2,00228_B,DEC_JWS_00228,-0.465473,,-0.469545,,-0.126744,,1.579159,1.236407
3,00233_B,DEC_JWS_00233,-1.682203,,-1.914997,,-2.86574,,4.003487,-1.697674
4,06H001,H001,-0.994502,,-0.05866,,0.254863,0.733938,0.495773,


In [33]:
(fam_reduced.IID.values == cog_reduced.IID.values).mean()

1.0

In [42]:
# 1 = patient, 0 = HC
cog_reduced['GROUP'] = fam_reduced.affected.values - 1
cog_reduced['SEX'] = fam_reduced.sex.values

In [43]:
cog_reduced.head()

Unnamed: 0,IID,FID,SOPdomainAvgZ,ATVIdomainAvgZ,VWMdomainAvgZ,NVWMdomainAvgZ,VLMdomainAvgZ,NVLMdomainAvgZ,RPSdomainAvgZ,VISPAdomainAvgZ,GROUP,SEX
0,00225_A,DEC_JWS_00225,-2.397181,,-1.539926,,-3.778566,,0.981764,-0.807805,1,2
1,00226_B,DEC_JWS_00226,-1.113352,,-1.579818,,-1.492848,,3.559235,-1.213359,1,1
2,00228_B,DEC_JWS_00228,-0.465473,,-0.469545,,-0.126744,,1.579159,1.236407,1,1
3,00233_B,DEC_JWS_00233,-1.682203,,-1.914997,,-2.86574,,4.003487,-1.697674,1,1
4,06H001,H001,-0.994502,,-0.05866,,0.254863,0.733938,0.495773,,0,2


In [44]:
cog_reduced.apply(pd.isnull).mean() # to get percentages

IID                0.000000
FID                0.000000
SOPdomainAvgZ      0.131547
ATVIdomainAvgZ     0.485384
VWMdomainAvgZ      0.080187
NVWMdomainAvgZ     0.585668
VLMdomainAvgZ      0.270402
NVLMdomainAvgZ     0.646366
RPSdomainAvgZ      0.359927
VISPAdomainAvgZ    0.276289
GROUP              0.000000
SEX                0.000000
dtype: float64

In [55]:
# use everything while dropping missing values
cog_allscores_reduced = cog_reduced.dropna()
print(cog_allscores_reduced.shape)

(448, 12)


In [80]:
clf = Pipeline([
    ('scale', StandardScaler()),
    ('lg', linear_model.LogisticRegressionCV(
        penalty='l1',
        solver='liblinear',
        cv = StratifiedShuffleSplit(n_splits=4),
        Cs=1000
        ))
    ])

In [81]:
scores = cross_val_score(clf, 
                         X=cog_allscores_reduced[[i for i in utils.domain_scores]].values, 
                         y=cog_allscores_reduced['GROUP'].values,
                         cv=StratifiedShuffleSplit(),
                         scoring=make_scorer(roc_auc_score))

In [None]:
# n_subjects here is 448
scores, np.mean(scores)

In [87]:
# check % of patients and controls
cog_allscores_reduced.GROUP.mean()
# so most ppl are patients

0.7455357142857143

In [57]:
# NVLMdomainAvgZ ATVIdomainAvgZ NVWMdomainAvgZ trying it again removing those scores
scores_to_remove = 'NVLMdomainAvgZ ATVIdomainAvgZ NVWMdomainAvgZ'.split(' ')


In [85]:
cog_five_scores = cog_reduced[np.setdiff1d(utils.domain_scores, scores_to_remove).tolist() + ['GROUP','SEX']].dropna()
print(cog_five_scores.shape)

(1761, 7)


In [77]:
cog_five_scores.head()

Unnamed: 0,RPSdomainAvgZ,SOPdomainAvgZ,VISPAdomainAvgZ,VLMdomainAvgZ,VWMdomainAvgZ,GROUP,SEX
0,0.981764,-2.397181,-0.807805,-3.778566,-1.539926,1,2
1,3.559235,-1.113352,-1.213359,-1.492848,-1.579818,1,1
2,1.579159,-0.465473,1.236407,-0.126744,-0.469545,1,1
3,4.003487,-1.682203,-1.697674,-2.86574,-1.914997,1,1
154,0.469281,-0.068378,-1.747035,-0.308135,-1.329291,0,2


In [83]:
scores = cross_val_score(clf, 
                         X=cog_five_scores[np.setdiff1d(utils.domain_scores, scores_to_remove)].values, 
                         y=cog_five_scores.GROUP.values,
                         cv=StratifiedShuffleSplit(),
                         scoring=make_scorer(roc_auc_score))

In [84]:
# n_subjects is 1761
scores, np.mean(scores)

(array([ 0.75236158,  0.73169772,  0.72913933,  0.79965888,  0.79480451,
         0.74416164,  0.78909735,  0.76672789,  0.72218578,  0.77663343]),
 0.76064681186040417)

In [86]:
# check the % of patients and controls
cog_five_scores.GROUP.mean()
# more even distribution which may account for the reduced accuracy

0.58319136854060194

In [93]:
res = {'auc':[]}
X=cog_five_scores[np.setdiff1d(utils.domain_scores, scores_to_remove)].values
y=cog_five_scores.GROUP.values
for num, (train, test) in enumerate(StratifiedShuffleSplit().split(X,y)):
    clf_ = clf
    clf_.fit(X[train], y[train])
    res['auc'].append(roc_auc_score(y[test], clf_.predict(X[test])))
    res['coef_{}'.format(num)] = clf_.named_steps['lg'].coef_
    res['Cs_{}'.format(num)] = clf_.named_steps['lg'].Cs_
    res['scores_{}'.format(num)] = clf.named_steps['lg'].scores_
    res['coef_paths_{}'.format(num)] = clf.named_steps['lg'].coefs_paths_
    

In [94]:
np.mean(res['auc'])

0.76676725268958279

In [None]:
# should be able to make a regularization by accuracy by n_columns with
# the following things

In [99]:
pd.DataFrame(res['scores_1'][1])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,990,991,992,993,994,995,996,997,998,999
0,0.415094,0.415094,0.415094,0.415094,0.415094,0.415094,0.415094,0.415094,0.415094,0.415094,...,0.773585,0.773585,0.773585,0.773585,0.773585,0.773585,0.773585,0.773585,0.773585,0.773585
1,0.415094,0.415094,0.415094,0.415094,0.415094,0.415094,0.415094,0.415094,0.415094,0.415094,...,0.754717,0.754717,0.754717,0.754717,0.754717,0.754717,0.754717,0.754717,0.754717,0.754717
2,0.415094,0.415094,0.415094,0.415094,0.415094,0.415094,0.415094,0.415094,0.415094,0.415094,...,0.81761,0.81761,0.81761,0.81761,0.81761,0.81761,0.81761,0.81761,0.81761,0.81761
3,0.415094,0.415094,0.415094,0.415094,0.415094,0.415094,0.415094,0.415094,0.415094,0.415094,...,0.72956,0.72956,0.72956,0.72956,0.72956,0.72956,0.72956,0.72956,0.72956,0.72956


In [103]:
pd.DataFrame(res['coef_paths_1'][1][0])

Unnamed: 0,0,1,2,3,4,5
0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
4,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
5,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
6,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
7,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
8,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
9,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [104]:
pd.DataFrame(res['Cs_1'])

Unnamed: 0,0
0,0.000100
1,0.000102
2,0.000104
3,0.000106
4,0.000108
5,0.000110
6,0.000112
7,0.000114
8,0.000116
9,0.000118
