# About the notebook
This script contains the machine analysis from the Griffin model published on GitHub (https://github.com/adoebley/Griffin_analyses). Using their model and their data it was possible to recreate the results they published. However, large differences in the results were discovered when doing binary classification with a balanced dataset.

In [1]:
import argparse
import sys
import os
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np

from sklearn.metrics import roc_curve,auc

# Functions

In [2]:
def import_data_griffin(in_file):
    # in_file is the path to the file with the cancer and healthy features
    import pandas as pd
    import numpy as np
    from sklearn.preprocessing import StandardScaler
    
    data = pd.read_csv(in_file, sep='\t')
    data = data.set_index('sample')

    #get features and exclude all other columns
    features = data.columns[(data.columns.str.startswith('central_cov')) | (data.columns.str.startswith('mean_cov')) | (data.columns.str.startswith('amplitude')) | (data.columns.str.startswith('Ulz'))]
    print('Features',len(features))

    data = data.sort_index()

    print('Total samples:',len(data))

    #scale data
    scaler = StandardScaler()
    scaler.fit(data[features])
    data[features] = scaler.transform(data[features])
    data[features].mean()
    
    #add tumor fraction groups
    data['tfx_group'] = 'none'
    a = 0
    for b in tfx_groups:
        tfx_group_name = str(a)+'-'+str(b)+'TFx'
        data['tfx_group'] = np.where((data['status']==1) & (data['tumor_fraction']>=a) & (data['tumor_fraction']<b),tfx_group_name,data['tfx_group'])
        a=b
    #if group maxes don't go all the way to 1, add a group > max val
    if b<1:
        tfx_group_name = '>'+str(b)+'TFx'
        data['tfx_group'] = np.where((data['status']==1) & (data['status']==1) & (data['tumor_fraction']>=b),tfx_group_name,data['tfx_group'])
    #specify tfx group for healthy donors
    data['tfx_group'] = np.where((data['status']==0),'Healthy',data['tfx_group'])
    
    return(data,features)

In [3]:
def run_bootstrap_with_PCA_griffin(data,iterations,features,report_interval,hyperparameters):
    import time
    import sys
    import pandas as pd
    import numpy as np
    from sklearn.model_selection import StratifiedKFold
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import GridSearchCV
    from matplotlib import pyplot as plt
    from sklearn.decomposition import PCA

    start_time = time.time()

    probabilities = pd.DataFrame(index=data.index)
    c_vals = []
    coefs = pd.DataFrame(index=features)
    num_pcs = []
    train_indexes = []
    
    # Loop for each iteration
    for i in range(iterations):
            
        #bootstrap a training set with replacement
        X_train = data.sample(len(data), replace = True, random_state = i+100)[features]
        y_train = data.sample(len(data), replace = True, random_state = i+100)['status']
        
        #the test set is all samples that aren't seen in the training data
        X_test = data[~(data.index.isin(X_train.index))][features]
        y_test = data[~(data.index.isin(X_train.index))]['status']
        
        #print(len(X_train),len(X_train.index.unique()),len(X_test))
        
        #perform PCA on the training set
        n_components = min(len(features), len(X_train))
        pca = PCA(n_components=n_components, svd_solver='randomized', random_state = 100)
        PCs = pca.fit_transform(X_train[features])
        principal_components = pd.DataFrame(data = PCs, columns = ['PC_'+str(m) for m in np.arange(n_components)], index = X_train.index)
        
        
        #find the principle components that make up 80% of the varience
        for j in range(len(pca.explained_variance_ratio_)):
            current_sum = pca.explained_variance_ratio_[:j].sum()
            if current_sum>=fraction_variance:
                break
        #print('number of components:',j)
        pca_features = ['PC_'+str(m) for m in np.arange(0,j)]
        
        #apply to the test data
        test_PCs = pca.transform(X_test[features])
        test_principal_components = pd.DataFrame(data = test_PCs , columns = ['PC_'+str(m) for m in np.arange(n_components)], index = X_test.index)
        
        X_train = principal_components[pca_features]
        X_test = test_principal_components[pca_features]
        
        #10 fold cross validation on the training set
        cv = StratifiedKFold(n_splits=10, shuffle=True, random_state = i+100) 

        model = LogisticRegression(class_weight='balanced', max_iter=500, solver = 'liblinear')
        search = GridSearchCV(estimator=model, param_grid=hyperparameters, cv=cv, n_jobs = 1)
        search.fit(X_train, y_train)
        best_C = search.best_params_['C']

        ##train a new model on the full training dataset (is this the same as refit...?)
        model = LogisticRegression(class_weight='balanced', max_iter=500, C=best_C, solver = 'liblinear')
        model.fit(X_train, y_train)

        #predict the test data
        pred = model.predict(X_test)
        prob = model.predict_proba(X_test)

        #save results
        probabilities[i] = pd.Series(prob[:,1], index = X_test.index)
        c_vals.append(best_C)
        coefs[i] = pd.Series(model.coef_[0], index = pca_features)
        num_pcs.append(j)
     
        train_indexes.append(list(X_train.index))
        
        if i%report_interval==0:
            print('iteration:',i, ', time (sec):',np.round(time.time()-start_time,2),'num_pcs:',j)
        if i%20==0:
            #prevent dfs from becoming too fragmented
            probabilities = probabilities.copy()
            coefs = coefs.copy()   
            sys.stdout.flush()

    probabilities = probabilities.merge(data[['status']], left_index=True, right_index=True)

    return(probabilities,c_vals,coefs,num_pcs,train_indexes)

In [4]:
def get_AUC_griffin(probabilities,data,iterations):
    #get AUC and accuracy for each bootstrap
    from sklearn.metrics import roc_curve,auc
    import pandas as pd
    import numpy as np

    AUCs = pd.DataFrame()

    probabilities = probabilities.merge(data[['tumor_fraction','sample_type','Stage','tfx_group']], left_index=True, right_index=True)
    
    for i in range(iterations):
        current_dict = {}
        current = probabilities[~(probabilities[i].isnull())][['status','tumor_fraction','sample_type','Stage','tfx_group',i]].copy()

        #overall accuracy and AUC
        group = 'overall'
        fpr,tpr,_ = roc_curve(current['status'],current[i])
        AUC = auc(fpr,tpr)
        current_dict[group] = AUC
        del(AUC,group,fpr,tpr)

        #separate out the healthy samples to be used in every AUC
        healthy_df = current[current['status']==0]
        cancer_df = current[current['status']==1]
        del(current)
        
        for group,df in cancer_df.groupby('sample_type'):
            if group == 'Duodenal_Cancer':
                continue

            df2 = df.append(healthy_df, ignore_index=True)
            fpr,tpr,_ = roc_curve(df2['status'],df2[i])
            AUC = auc(fpr,tpr)
            current_dict[group] = AUC
            del(AUC,group,fpr,tpr)
            
        for group,df in cancer_df.groupby('Stage'):
            if group == '0' or group == 'X':
                continue
            df2 = df.append(healthy_df, ignore_index=True)
            fpr,tpr,_ = roc_curve(df2['status'],df2[i])
            AUC = auc(fpr,tpr)
            current_dict[group] = AUC
            del(AUC,group,fpr,tpr)
            
        for group,df in cancer_df.groupby('tfx_group'):
            df2 = df.append(healthy_df, ignore_index=True)
            fpr,tpr,_ = roc_curve(df2['status'],df2[i])
            AUC = auc(fpr,tpr)
            current_dict[group] = AUC
            del(AUC,group,fpr,tpr)
            
        AUCs = AUCs.append(pd.Series(current_dict), ignore_index=True)
        
    CIs = pd.DataFrame([AUCs.median(), AUCs.quantile(.025), AUCs.quantile(.975)]).T
    CIs = CIs.rename(columns = {'Unnamed 0':'median'})    
    return(AUCs,CIs)

# PCA bootstrapping with Griffin data

In [5]:
# Parameter

in_file_griffin = "/data/gpfs-1/groups/ag_kircher/cfDNA-analysis/lea/cfDNA_classification_analyses/features/30000-sites_reformatted.txt"
iterations = 1000
report_interval = 50
fraction_variance = .8
tfx_groups = [0.03,0.05]

In [6]:
data_griffin,features_griffin  = import_data_griffin(in_file_griffin)
data_griffin.head()

Features 810
Total samples: 423


Unnamed: 0_level_0,tumor_fraction,status,sample_type,Stage,Age at Diagnosis,Gender,site_group,central_coverage_AHR.hg38.30000,central_coverage_AR.hg38.30000,central_coverage_ARNT.hg38.30000,...,mean_coverage_ZNF341.hg38.30000,mean_coverage_ZNF35.hg38.30000,mean_coverage_ZNF384.hg38.30000,mean_coverage_ZNF449.hg38.30000,mean_coverage_ZNF467.hg38.30000,mean_coverage_ZNF554.hg38.30000,mean_coverage_ZNF580.hg38.30000,mean_coverage_ZNF770.hg38.30000,mean_coverage_ZSCAN16.hg38.30000,tfx_group
sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Bile_Duct_Cancer_CGPLPA114,0.02606,1,Bile_Duct_Cancer,II,,F,30000-sites,-0.042226,0.312164,0.322081,...,0.156921,-0.254731,-1.240088,0.487258,0.211782,-0.353443,-0.419691,-0.750493,0.029957,0-0.03TFx
Bile_Duct_Cancer_CGPLPA115,0.05922,1,Bile_Duct_Cancer,IV,,M,30000-sites,0.03297,1.057961,0.686518,...,0.60306,0.743916,-1.167991,0.635084,0.822281,0.441677,1.041876,-1.022608,0.483664,>0.05TFx
Bile_Duct_Cancer_CGPLPA117,0.0,1,Bile_Duct_Cancer,II,,M,30000-sites,-0.585273,-0.947509,-0.616956,...,0.02081,0.283118,0.21267,-0.075061,0.334148,-0.01756,-0.081756,0.683711,-0.592497,0-0.03TFx
Bile_Duct_Cancer_CGPLPA118,0.02789,1,Bile_Duct_Cancer,I,68.0,F,30000-sites,0.347597,0.370014,0.340557,...,-0.334589,-0.535742,-2.281892,0.484359,-0.124724,-0.057688,-1.202572,-0.942574,-0.516083,0-0.03TFx
Bile_Duct_Cancer_CGPLPA122,0.04373,1,Bile_Duct_Cancer,II,62.0,F,30000-sites,-0.428903,-0.568096,0.506378,...,-0.599248,-1.363667,-2.528825,-0.428684,-1.29385,-0.656629,-2.154421,-1.7173,0.447049,0.03-0.05TFx


In [7]:
# calculating probabilities
print('running '+str(iterations)+' logreg bootstrap iterations')
hyperparameters = {'C': [0.0001, 0.001,0.01,0.1,1,10,100,1000]}

probabilities_griffin,c_vals_griffin,coefs_griffin,num_pcs_griffin,train_indexes_griffin = run_bootstrap_with_PCA_griffin(data_griffin,iterations,features_griffin,report_interval,hyperparameters)    


running 1000 logreg bootstrap iterations
iteration: 0 , time (sec): 0.81 num_pcs: 8
iteration: 50 , time (sec): 264.19 num_pcs: 13
iteration: 100 , time (sec): 719.78 num_pcs: 12
iteration: 150 , time (sec): 1230.98 num_pcs: 11
iteration: 200 , time (sec): 1542.02 num_pcs: 9
iteration: 250 , time (sec): 1807.57 num_pcs: 11
iteration: 300 , time (sec): 2064.26 num_pcs: 9
iteration: 350 , time (sec): 2337.84 num_pcs: 12
iteration: 400 , time (sec): 2580.45 num_pcs: 14
iteration: 450 , time (sec): 2861.71 num_pcs: 12
iteration: 500 , time (sec): 3077.55 num_pcs: 11
iteration: 550 , time (sec): 3286.85 num_pcs: 10
iteration: 600 , time (sec): 3506.74 num_pcs: 10
iteration: 650 , time (sec): 3843.29 num_pcs: 14
iteration: 700 , time (sec): 4113.48 num_pcs: 10
iteration: 750 , time (sec): 4337.84 num_pcs: 9
iteration: 800 , time (sec): 4601.43 num_pcs: 9
iteration: 850 , time (sec): 4873.23 num_pcs: 14
iteration: 900 , time (sec): 5125.41 num_pcs: 16
iteration: 950 , time (sec): 5387.36 num_

In [8]:
# calculate AUC with confidence intervals
AUCs_griffin,CIs_griffin = get_AUC_griffin(probabilities_griffin,data_griffin,iterations)
CIs_griffin

Unnamed: 0,median,0.025,0.975
0-0.03TFx,0.921373,0.851839,0.956009
0.03-0.05TFx,0.973136,0.889771,0.996338
>0.05TFx,0.989183,0.917467,1.0
Bile_Duct_Cancer,0.972422,0.911439,0.998073
Breast_Cancer,0.935635,0.863241,0.977639
Colorectal_Cancer,0.976471,0.860797,0.997902
Gastric_cancer,0.931527,0.8043,0.992181
I,0.928594,0.830331,0.984001
II,0.931042,0.868021,0.967661
III,0.95,0.862822,0.988677


# PCA bootstrapping with Griffin data (using only breast cancer and all healthy)

In [9]:
bc = data_griffin[data_griffin['sample_type'] == 'Breast_Cancer']
healthy = data_griffin[data_griffin['sample_type'] == 'Healthy'] # using all healthys gives a unbalanced set
griffin_subsample = pd.concat([bc, healthy], axis=0)

# calculate probabilities
print('running '+str(iterations)+' logreg bootstrap iterations')
hyperparameters = {'C': [0.0001, 0.001,0.01,0.1,1,10,100,1000]}

probabilities_griffin_sub,c_vals_griffin_sub,coefs_griffin_sub,num_pcs_griffin_sub,train_indexes_griffin_sub = run_bootstrap_with_PCA_griffin(griffin_subsample,iterations,features_griffin,report_interval,hyperparameters)    

# calculate AUC with confidence intervals
AUCs_griffin_sub,CIs_griffin_sub = get_AUC_griffin(probabilities_griffin_sub,griffin_subsample,iterations)

CIs_griffin_sub

running 1000 logreg bootstrap iterations
iteration: 0 , time (sec): 1.14 num_pcs: 22
iteration: 50 , time (sec): 199.14 num_pcs: 7
iteration: 100 , time (sec): 410.22 num_pcs: 27
iteration: 150 , time (sec): 486.54 num_pcs: 13
iteration: 200 , time (sec): 521.69 num_pcs: 20
iteration: 250 , time (sec): 556.67 num_pcs: 15
iteration: 300 , time (sec): 591.53 num_pcs: 18
iteration: 350 , time (sec): 627.07 num_pcs: 19
iteration: 400 , time (sec): 662.84 num_pcs: 16
iteration: 450 , time (sec): 698.76 num_pcs: 20
iteration: 500 , time (sec): 733.86 num_pcs: 17
iteration: 550 , time (sec): 769.27 num_pcs: 21
iteration: 600 , time (sec): 805.39 num_pcs: 26
iteration: 650 , time (sec): 841.17 num_pcs: 14
iteration: 700 , time (sec): 876.55 num_pcs: 14
iteration: 750 , time (sec): 911.74 num_pcs: 17
iteration: 800 , time (sec): 947.17 num_pcs: 30
iteration: 850 , time (sec): 981.82 num_pcs: 16
iteration: 900 , time (sec): 1016.53 num_pcs: 16
iteration: 950 , time (sec): 1051.57 num_pcs: 27


Unnamed: 0,median,0.025,0.975
0-0.03TFx,0.933333,0.856781,0.980584
0.03-0.05TFx,0.87356,0.48796,1.0
Breast_Cancer,0.926447,0.850606,0.975046
I,0.936709,0.549668,1.0
II,0.941873,0.856409,0.990519
III,0.909436,0.700593,0.997971
overall,0.926447,0.850606,0.975046
>0.05TFx,1.0,0.992498,1.0


# PCA bootstrapping with Griffin data (using only breast cancer and the same number of healthy)

In [10]:
bc = data_griffin[data_griffin['sample_type'] == 'Breast_Cancer']
healthy = data_griffin[data_griffin['sample_type'] == 'Healthy'].head(54) 
# using a random selection of 54 healthys (same number) gives a balanced set
griffin_subsample = pd.concat([bc, healthy], axis=0)

# calculate probabilities
print('running '+str(iterations)+' logreg bootstrap iterations')
hyperparameters = {'C': [0.0001, 0.001,0.01,0.1,1,10,100,1000]}

probabilities_griffin_sub,c_vals_griffin_sub,coefs_griffin_sub,num_pcs_griffin_sub,train_indexes_griffin_sub = run_bootstrap_with_PCA_griffin(griffin_subsample,iterations,features_griffin,report_interval,hyperparameters)    

# calculate AUC with confidence intervals
AUCs_griffin_sub,CIs_griffin_sub = get_AUC_griffin(probabilities_griffin_sub,griffin_subsample,iterations)

CIs_griffin_sub

running 1000 logreg bootstrap iterations
iteration: 0 , time (sec): 0.57 num_pcs: 4
iteration: 50 , time (sec): 28.15 num_pcs: 4
iteration: 100 , time (sec): 56.01 num_pcs: 13
iteration: 150 , time (sec): 83.68 num_pcs: 6
iteration: 200 , time (sec): 111.33 num_pcs: 4
iteration: 250 , time (sec): 138.95 num_pcs: 5
iteration: 300 , time (sec): 166.44 num_pcs: 10
iteration: 350 , time (sec): 194.42 num_pcs: 15
iteration: 400 , time (sec): 222.21 num_pcs: 13
iteration: 450 , time (sec): 250.17 num_pcs: 4
iteration: 500 , time (sec): 277.93 num_pcs: 6
iteration: 550 , time (sec): 305.63 num_pcs: 7
iteration: 600 , time (sec): 333.38 num_pcs: 6
iteration: 650 , time (sec): 360.93 num_pcs: 4
iteration: 700 , time (sec): 388.5 num_pcs: 6
iteration: 750 , time (sec): 416.38 num_pcs: 5
iteration: 800 , time (sec): 444.33 num_pcs: 5
iteration: 850 , time (sec): 471.88 num_pcs: 5
iteration: 900 , time (sec): 500.19 num_pcs: 6
iteration: 950 , time (sec): 528.34 num_pcs: 7


Unnamed: 0,median,0.025,0.975
0-0.03TFx,0.806239,0.619032,0.949056
0.03-0.05TFx,0.714286,0.086957,1.0
Breast_Cancer,0.802111,0.616541,0.940792
I,0.742424,0.164333,1.0
II,0.846154,0.682684,0.960345
III,0.736842,0.322698,0.979026
overall,0.802111,0.616541,0.940792
>0.05TFx,1.0,0.96808,1.0


# PCA bootstrapping with Griffin data (using the file with 10,000 sites)

In [11]:
# Parameter

in_file_griffin_10000 = "/data/gpfs-1/groups/ag_kircher/cfDNA-analysis/lea/cfDNA_classification_analyses/features/10000-sites_reformatted.txt"
iterations = 1000
report_interval = 50
fraction_variance = .8
tfx_groups = [0.03,0.05]

In [12]:
data_griffin_10000,features_griffin_10000  = import_data_griffin(in_file_griffin_10000)
data_griffin_10000.head()

Features 1131
Total samples: 423


Unnamed: 0_level_0,tumor_fraction,status,sample_type,Stage,Age at Diagnosis,Gender,site_group,central_coverage_AHR.hg38.10000,central_coverage_AR.hg38.10000,central_coverage_ARID3A.hg38.10000,...,mean_coverage_ZNF701.hg38.10000,mean_coverage_ZNF768.hg38.10000,mean_coverage_ZNF770.hg38.10000,mean_coverage_ZNF778.hg38.10000,mean_coverage_ZNF792.hg38.10000,mean_coverage_ZSCAN16.hg38.10000,mean_coverage_ZSCAN22.hg38.10000,mean_coverage_ZSCAN4.hg38.10000,mean_coverage_ZSCAN9.hg38.10000,tfx_group
sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Bile_Duct_Cancer_CGPLPA114,0.02606,1,Bile_Duct_Cancer,II,,F,10000-sites,0.171594,0.33275,0.193461,...,-0.639478,-0.624524,0.260385,-0.345816,-0.362235,-0.08271,-0.007402,0.472299,-0.459406,0-0.03TFx
Bile_Duct_Cancer_CGPLPA115,0.05922,1,Bile_Duct_Cancer,IV,,M,10000-sites,0.196454,1.110641,0.379044,...,0.425257,0.059174,-0.487309,0.823683,0.652251,0.419692,1.027133,0.076323,0.691637,>0.05TFx
Bile_Duct_Cancer_CGPLPA117,0.0,1,Bile_Duct_Cancer,II,,M,10000-sites,-0.492087,-1.063978,-1.162155,...,0.450572,-0.185925,0.790116,0.121752,0.367547,-0.698863,0.05048,-0.596246,0.220266,0-0.03TFx
Bile_Duct_Cancer_CGPLPA118,0.02789,1,Bile_Duct_Cancer,I,68.0,F,10000-sites,0.461076,-0.141737,-1.255983,...,-0.222519,0.439724,-0.785283,0.092746,-0.820825,-0.491899,-0.41993,-0.792461,-0.909229,0-0.03TFx
Bile_Duct_Cancer_CGPLPA122,0.04373,1,Bile_Duct_Cancer,II,62.0,F,10000-sites,-0.493396,-0.444695,-1.954213,...,-1.677408,0.399411,-1.668168,-0.993217,-2.764206,-0.033734,-0.604603,-0.109254,-2.911435,0.03-0.05TFx


In [13]:
# calculate probabilities
print('running '+str(iterations)+' logreg bootstrap iterations')
hyperparameters = {'C': [0.0001, 0.001,0.01,0.1,1,10,100,1000]}

probabilities_griffin_10000,c_vals_griffin_10000,coefs_griffin_10000,num_pcs_griffin_10000,train_indexes_griffin_10000 = run_bootstrap_with_PCA_griffin(data_griffin_10000,iterations,features_griffin_10000,report_interval,hyperparameters)  

running 1000 logreg bootstrap iterations
iteration: 0 , time (sec): 0.94 num_pcs: 17
iteration: 50 , time (sec): 52.82 num_pcs: 30
iteration: 100 , time (sec): 106.29 num_pcs: 29
iteration: 150 , time (sec): 158.22 num_pcs: 27
iteration: 200 , time (sec): 209.29 num_pcs: 20
iteration: 250 , time (sec): 261.29 num_pcs: 28
iteration: 300 , time (sec): 312.51 num_pcs: 19
iteration: 350 , time (sec): 364.77 num_pcs: 28
iteration: 400 , time (sec): 416.8 num_pcs: 33
iteration: 450 , time (sec): 467.28 num_pcs: 30
iteration: 500 , time (sec): 518.85 num_pcs: 28
iteration: 550 , time (sec): 570.0 num_pcs: 26
iteration: 600 , time (sec): 620.97 num_pcs: 24
iteration: 650 , time (sec): 671.42 num_pcs: 32
iteration: 700 , time (sec): 722.25 num_pcs: 22
iteration: 750 , time (sec): 772.72 num_pcs: 23
iteration: 800 , time (sec): 822.67 num_pcs: 23
iteration: 850 , time (sec): 873.12 num_pcs: 34
iteration: 900 , time (sec): 923.25 num_pcs: 36
iteration: 950 , time (sec): 975.01 num_pcs: 23


In [14]:
# calculate AUC with confidence intervals
AUCs_griffin_10000,CIs_griffin_10000 = get_AUC_griffin(probabilities_griffin_10000,data_griffin_10000,iterations)
CIs_griffin_10000

Unnamed: 0,median,0.025,0.975
0-0.03TFx,0.929706,0.887084,0.962568
0.03-0.05TFx,0.975986,0.922618,0.998343
>0.05TFx,0.987059,0.909051,1.0
Bile_Duct_Cancer,0.972973,0.894208,0.998991
Breast_Cancer,0.951067,0.897441,0.984803
Colorectal_Cancer,0.97075,0.898048,1.0
Gastric_cancer,0.942404,0.836976,0.994452
I,0.936029,0.859719,0.98915
II,0.937465,0.884798,0.971873
III,0.959196,0.898748,0.993248


# PCA bootstrapping with Griffin data (only breast cancer and all healthys of 10,000 sites)

In [15]:
bc_10000 = data_griffin_10000[data_griffin_10000['sample_type'] == 'Breast_Cancer']
healthy_10000 = data_griffin_10000[data_griffin_10000['sample_type'] == 'Healthy'] #unbalanced

griffin_subsample_10000 = pd.concat([bc_10000, healthy_10000], axis=0)
griffin_subsample.head()

Unnamed: 0_level_0,tumor_fraction,status,sample_type,Stage,Age at Diagnosis,Gender,site_group,central_coverage_AHR.hg38.30000,central_coverage_AR.hg38.30000,central_coverage_ARNT.hg38.30000,...,mean_coverage_ZNF341.hg38.30000,mean_coverage_ZNF35.hg38.30000,mean_coverage_ZNF384.hg38.30000,mean_coverage_ZNF449.hg38.30000,mean_coverage_ZNF467.hg38.30000,mean_coverage_ZNF554.hg38.30000,mean_coverage_ZNF580.hg38.30000,mean_coverage_ZNF770.hg38.30000,mean_coverage_ZSCAN16.hg38.30000,tfx_group
sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Breast_Cancer_CGPLBR100,0.008334,1,Breast_Cancer,III,44.0,F,30000-sites,0.220487,0.120372,-0.219724,...,0.592663,1.360328,0.436171,0.693055,0.935337,0.663122,1.009021,0.878994,-0.016209,0-0.03TFx
Breast_Cancer_CGPLBR101,0.0,1,Breast_Cancer,II,46.0,F,30000-sites,0.714137,0.753596,1.26158,...,0.078468,0.169807,-0.971526,0.495954,0.147939,0.404522,0.361313,-1.704495,0.523463,0-0.03TFx
Breast_Cancer_CGPLBR102,0.0,1,Breast_Cancer,II,47.0,F,30000-sites,0.409264,1.122586,0.54333,...,0.578485,0.633627,0.374888,0.545229,0.054834,0.82066,0.320949,-0.215868,0.057021,0-0.03TFx
Breast_Cancer_CGPLBR103,0.02131,1,Breast_Cancer,II,48.0,F,30000-sites,0.570667,0.975094,0.343328,...,0.420635,0.86327,0.115338,0.437983,0.923366,0.666095,0.834422,-0.692869,0.197113,0-0.03TFx
Breast_Cancer_CGPLBR104,0.02645,1,Breast_Cancer,II,68.0,F,30000-sites,0.604962,1.810533,0.439865,...,0.04066,0.555065,-1.164386,1.487258,-0.126054,0.658664,0.417636,-1.563636,1.144325,0-0.03TFx


In [16]:
# calculate probabilities
print('running '+str(iterations)+' logreg bootstrap iterations')
hyperparameters = {'C': [0.0001, 0.001,0.01,0.1,1,10,100,1000]}

probabilities_griffin_sub_10000,c_vals_griffin_sub_10000,coefs_griffin_sub_10000,num_pcs_griffin_sub_10000,train_indexes_griffin_sub_10000 = run_bootstrap_with_PCA_griffin(griffin_subsample_10000,iterations,features_griffin_10000,report_interval,hyperparameters)    

# calculate AUC with confidence intervals
AUCs_griffin_sub_10000,CIs_griffin_sub_10000 = get_AUC_griffin(probabilities_griffin_sub_10000,griffin_subsample_10000,iterations)
CIs_griffin_sub_10000

running 1000 logreg bootstrap iterations
iteration: 0 , time (sec): 0.85 num_pcs: 39
iteration: 50 , time (sec): 42.76 num_pcs: 19
iteration: 100 , time (sec): 84.59 num_pcs: 43
iteration: 150 , time (sec): 126.95 num_pcs: 25
iteration: 200 , time (sec): 168.61 num_pcs: 36
iteration: 250 , time (sec): 209.97 num_pcs: 30
iteration: 300 , time (sec): 251.84 num_pcs: 33
iteration: 350 , time (sec): 293.33 num_pcs: 35
iteration: 400 , time (sec): 335.33 num_pcs: 30
iteration: 450 , time (sec): 377.05 num_pcs: 35
iteration: 500 , time (sec): 418.91 num_pcs: 33
iteration: 550 , time (sec): 460.46 num_pcs: 37
iteration: 600 , time (sec): 502.1 num_pcs: 43
iteration: 650 , time (sec): 543.93 num_pcs: 27
iteration: 700 , time (sec): 585.54 num_pcs: 28
iteration: 750 , time (sec): 627.07 num_pcs: 33
iteration: 800 , time (sec): 668.71 num_pcs: 47
iteration: 850 , time (sec): 709.61 num_pcs: 32
iteration: 900 , time (sec): 751.29 num_pcs: 30
iteration: 950 , time (sec): 792.59 num_pcs: 44


Unnamed: 0,median,0.025,0.975
0-0.03TFx,0.93962,0.87207,0.98527
0.03-0.05TFx,0.915109,0.526038,1.0
Breast_Cancer,0.936659,0.864848,0.984268
I,0.962963,0.721738,1.0
II,0.945254,0.871794,0.991556
III,0.921218,0.713116,1.0
overall,0.936659,0.864848,0.984268
>0.05TFx,1.0,0.992644,1.0


# PCA bootstrapping with Griffin data (using only breast cancer and the same number of healthys of 10,000 sites)

In [17]:
bc_10000 = data_griffin_10000[data_griffin_10000['sample_type'] == 'Breast_Cancer']
healthy_10000 = data_griffin_10000[data_griffin_10000['sample_type'] == 'Healthy'].head(54) #balanced

griffin_subsample_10000 = pd.concat([bc_10000, healthy_10000], axis=0)

In [18]:
print('running '+str(iterations)+' logreg bootstrap iterations')
hyperparameters = {'C': [0.0001, 0.001,0.01,0.1,1,10,100,1000]}

probabilities_griffin_sub_10000,c_vals_griffin_sub_10000,coefs_griffin_sub_10000,num_pcs_griffin_sub_10000,train_indexes_griffin_sub_10000 = run_bootstrap_with_PCA_griffin(griffin_subsample_10000,iterations,features_griffin_10000,report_interval,hyperparameters)    

print('Getting AUC')
AUCs_griffin_sub_10000,CIs_griffin_sub_10000 = get_AUC_griffin(probabilities_griffin_sub_10000,griffin_subsample_10000,iterations)
CIs_griffin_sub_10000

running 1000 logreg bootstrap iterations
iteration: 0 , time (sec): 0.56 num_pcs: 6
iteration: 50 , time (sec): 28.97 num_pcs: 6
iteration: 100 , time (sec): 57.51 num_pcs: 20
iteration: 150 , time (sec): 86.04 num_pcs: 10
iteration: 200 , time (sec): 114.42 num_pcs: 6
iteration: 250 , time (sec): 142.91 num_pcs: 9
iteration: 300 , time (sec): 171.35 num_pcs: 15
iteration: 350 , time (sec): 199.9 num_pcs: 22
iteration: 400 , time (sec): 228.33 num_pcs: 19
iteration: 450 , time (sec): 256.88 num_pcs: 6
iteration: 500 , time (sec): 285.23 num_pcs: 10
iteration: 550 , time (sec): 313.74 num_pcs: 12
iteration: 600 , time (sec): 342.25 num_pcs: 10
iteration: 650 , time (sec): 370.72 num_pcs: 5
iteration: 700 , time (sec): 399.16 num_pcs: 10
iteration: 750 , time (sec): 427.63 num_pcs: 9
iteration: 800 , time (sec): 456.09 num_pcs: 9
iteration: 850 , time (sec): 484.47 num_pcs: 8
iteration: 900 , time (sec): 512.86 num_pcs: 11
iteration: 950 , time (sec): 541.3 num_pcs: 12
Getting AUC


Unnamed: 0,median,0.025,0.975
0-0.03TFx,0.848391,0.655975,0.959875
0.03-0.05TFx,0.794553,0.212788,1.0
Breast_Cancer,0.848084,0.663716,0.954904
I,0.847222,0.270537,1.0
II,0.86724,0.707061,0.969242
III,0.81,0.461741,0.984276
overall,0.848084,0.663716,0.954904
>0.05TFx,1.0,0.952381,1.0
