# About this notebook
This script computes random forest and gradient boosting models based on the features generated by the Griffin workflow published by Doebley et al. (Nature Communications, 2022). The feature set used in this script was downloaded from their repository (https://github.com/adoebley/Griffin_analyses/tree/main/delfi_data_cancer_detection/number_of_sites_analysis/merged_data). It was used in both tree-based models with the same different adjustments (e.g. scaled data, Principal Component Analysis for dimensionality reduction, adjusting the hyperparameter, hyperparameter optimization) to compare performances.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from scipy.signal import savgol_filter

from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_curve,auc,accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestRegressor,RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.decomposition import PCA

# Read in the data

In [2]:
# params
in_file_griffin = "/data/gpfs-1/groups/ag_kircher/cfDNA-analysis/lea/cfDNA_classification_analyses/features/30000-sites_reformatted.txt"

data = pd.read_csv(in_file_griffin, sep='\t')
data = data.set_index('sample')

features = data.columns[(data.columns.str.startswith('central_cov')) | (data.columns.str.startswith('mean_cov')) | (data.columns.str.startswith('amplitude')) ]
#features = pd.concat([data['status'], data['sample_type'], data[features]], axis=1)

data.head()

Unnamed: 0_level_0,tumor_fraction,status,sample_type,Stage,Age at Diagnosis,Gender,site_group,central_coverage_AHR.hg38.30000,central_coverage_AR.hg38.30000,central_coverage_ARNT.hg38.30000,...,mean_coverage_ZNF317.hg38.30000,mean_coverage_ZNF341.hg38.30000,mean_coverage_ZNF35.hg38.30000,mean_coverage_ZNF384.hg38.30000,mean_coverage_ZNF449.hg38.30000,mean_coverage_ZNF467.hg38.30000,mean_coverage_ZNF554.hg38.30000,mean_coverage_ZNF580.hg38.30000,mean_coverage_ZNF770.hg38.30000,mean_coverage_ZSCAN16.hg38.30000
sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Bile_Duct_Cancer_CGPLPA114,0.02606,1,Bile_Duct_Cancer,II,,F,30000-sites,0.96701,1.07815,1.00625,...,1.01354,0.94214,0.96241,0.93693,1.00252,0.95153,0.97131,0.91894,0.97111,0.94607
Bile_Duct_Cancer_CGPLPA115,0.05922,1,Bile_Duct_Cancer,IV,,M,30000-sites,0.9694,1.09246,1.01414,...,1.01352,0.94686,0.96902,0.93733,1.00303,0.95612,0.97666,0.93451,0.97026,0.94892
Bile_Duct_Cancer_CGPLPA117,0.0,1,Bile_Duct_Cancer,II,,M,30000-sites,0.94975,1.05398,0.98592,...,1.01075,0.9407,0.96597,0.94499,1.00058,0.95245,0.97357,0.92254,0.97559,0.94216
Bile_Duct_Cancer_CGPLPA118,0.02789,1,Bile_Duct_Cancer,I,68.0,F,30000-sites,0.9794,1.07926,1.00665,...,1.01193,0.93694,0.96055,0.93115,1.00251,0.949,0.9733,0.9106,0.97051,0.94264
Bile_Duct_Cancer_CGPLPA122,0.04373,1,Bile_Duct_Cancer,II,62.0,F,30000-sites,0.95472,1.06126,1.01024,...,1.01149,0.93414,0.95507,0.92978,0.99936,0.94021,0.96927,0.90046,0.96809,0.94869


In [3]:
bc = data[data['sample_type'] == 'Breast_Cancer'] #54
healthy = data[data['sample_type'] == 'Healthy'] #215

breast_cancer = pd.concat([bc, healthy], axis=0) #215 + 54 = 269 samples
breast_cancer_balanced = pd.concat([bc, healthy.head(54)], axis=0) #54 + 54 = 1 samples

In [4]:
train,test = train_test_split(data, test_size = 0.25, random_state = 42)
# 81 training samples
# 27 testing samples

X_train = train[features]
y_train = train.loc[:,"status"]
X_test = test[features]
y_test = test.loc[:,"status"]

In [5]:
# scale data
scaled_data = data
scaler = StandardScaler()
scaler.fit(scaled_data[features])
scaled_data[features] = scaler.transform(scaled_data[features])

In [6]:
train_scaled,test_scaled = train_test_split(scaled_data, test_size = 0.25, random_state = 42)
# 81 training samples
# 27 testing samples

X_train_scaled = train_scaled[features]
y_train_scaled = train_scaled.loc[:,"status"]
X_test_scaled = test_scaled[features]
y_test_scaled = test_scaled.loc[:,"status"]

In [7]:
bc = scaled_data[scaled_data['sample_type'] == 'Breast_Cancer'] #54
healthy = scaled_data[scaled_data['sample_type'] == 'Healthy'] #215

breast_cancer_scaled = pd.concat([bc, healthy], axis=0) #215 + 54 = 269 samples
breast_cancer_balanced_scaled = pd.concat([bc, healthy.head(54)], axis=0) #54 + 54 = 1 samples

# Functions

In [8]:
def get_AUC_griffin(prob,data):
    #get AUC and accuracy for each bootstrap
    probabilities = pd.DataFrame(index=data.index)
    probabilities[0] = pd.Series(prob[:,1], index = X_test.index)
    probabilities = probabilities.merge(data[['status']], left_index=True, right_index=True)

    AUCs = pd.DataFrame()

    probabilities = probabilities.merge(data[['tumor_fraction','sample_type','Stage']], left_index=True, right_index=True)
    
    for i in range(1):
        current_dict = {}
        current = probabilities[~(probabilities[i].isnull())][['status','tumor_fraction','sample_type','Stage',i]].copy()

        #overall accuracy and AUC
        group = 'overall'
        fpr,tpr,_ = roc_curve(current['status'],current[i])
        AUC = auc(fpr,tpr)
        current_dict[group] = AUC
        del(AUC,group,fpr,tpr)

        #separate out the healthy samples to be used in every AUC
        healthy_df = current[current['status']==0]
        cancer_df = current[current['status']==1]
        del(current)
        
        for group,df in cancer_df.groupby('sample_type'):
            if group == 'Duodenal_Cancer':
                continue

            df2 = df.append(healthy_df, ignore_index=True)
            fpr,tpr,_ = roc_curve(df2['status'],df2[i])
            AUC = auc(fpr,tpr)
            current_dict[group] = AUC
            del(AUC,group,fpr,tpr)
            
        AUCs = AUCs.append(pd.Series(current_dict), ignore_index=True)
        
    CIs = pd.DataFrame([AUCs.median(), AUCs.quantile(.025), AUCs.quantile(.975)]).T
    CIs = CIs.rename(columns = {'Unnamed 0':'median'})    
    return(AUCs,CIs)

In [9]:
def calculate_RF(data, PCA_flag, adjustment_flag, iterations):
    fraction_variance = .8
    probabilities = pd.DataFrame(index=data.index)
    coefs = pd.DataFrame(index=features)
    train_indexes = []

    # Loop for each iteration
    for i in range(iterations):

        train_internal,test_internal = train_test_split(data, test_size = 0.25, random_state = i+100) #, random_state = 42
        X_train_internal = train_internal[features]
        y_train_internal = train_internal.loc[:,"status"]
        X_test_internal = test_internal[features]
        y_test_internal = test_internal.loc[:,"status"]
        
        if PCA_flag == True:
            #perform PCA on the training set
            n_components = min(len(features), len(X_train_internal))
            pca = PCA(n_components=n_components, svd_solver='randomized', random_state = 100)
            PCs = pca.fit_transform(X_train_internal[features])
            principal_components = pd.DataFrame(data = PCs, columns = ['PC_'+str(m) for m in np.arange(n_components)], index = X_train_internal.index)

            #find the principle components that make up 80% of the varience
            for j in range(len(pca.explained_variance_ratio_)):
                current_sum = pca.explained_variance_ratio_[:j].sum()
                if current_sum>=fraction_variance:
                    break
            pca_features = ['PC_'+str(m) for m in np.arange(0,j)]

            #apply to the test data
            test_PCs = pca.transform(X_test_internal[features])
            test_principal_components = pd.DataFrame(data = test_PCs , columns = ['PC_'+str(m) for m in np.arange(n_components)], index = X_test_internal.index)

            X_train_internal = principal_components[pca_features]
            X_test_internal = test_principal_components[pca_features]
    
        if adjustment_flag == True:
            model = RandomForestClassifier(max_samples=81, min_samples_leaf=3,n_estimators=300)
        else:
            model = RandomForestClassifier()

        #train a new model 
        model.fit(X_train_internal, y_train_internal)

        #predict the test data
        pred = model.predict(X_test_internal)
        prob = model.predict_proba(X_test_internal)

        #save results
        probabilities[i] = pd.Series(prob[:,1], index = X_test_internal.index)
        train_indexes.append(list(X_train_internal.index))
        
        if i%20==0:
            #prevent dfs from becoming too fragmented
            probabilities = probabilities.copy()
            coefs = coefs.copy()

    probabilities = probabilities.merge(data[['status']], left_index=True, right_index=True)
    acc = accuracy_score(y_test_internal, pred)
    
    AUCs = pd.DataFrame()
    probabilities = probabilities.merge(data[['tumor_fraction','sample_type','Stage']], left_index=True, right_index=True)

    for i in range(iterations):
        current_dict = {}
        current = probabilities[~(probabilities[i].isnull())][['status','tumor_fraction','sample_type','Stage',i]].copy()

        #overall accuracy and AUC
        group = 'overall'
        fpr,tpr,_ = roc_curve(current['status'],current[i])
        AUC = auc(fpr,tpr)
        current_dict[group] = AUC
        del(AUC,group,fpr,tpr)

        #separate out the healthy samples to be used in every AUC
        healthy_df = current[current['status']==0]
        cancer_df = current[current['status']==1]
        del(current)

        for group,df in cancer_df.groupby('sample_type'):
            if group == 'Duodenal_Cancer':
                continue

            df2 = df.append(healthy_df, ignore_index=True)
            fpr,tpr,_ = roc_curve(df2['status'],df2[i])
            AUC = auc(fpr,tpr)
            current_dict[group] = AUC
            del(AUC,group,fpr,tpr)

        AUCs = AUCs.append(pd.Series(current_dict), ignore_index=True)

    CIs = pd.DataFrame([AUCs.median(), AUCs.quantile(.025), AUCs.quantile(.975)]).T
    CIs = CIs.rename(columns = {'Unnamed 0':'median'}) 
    return(acc,CIs)

In [10]:
def calculate_GBC(data, PCA_flag, adjustment_flag, iterations):
    fraction_variance = .8
    probabilities = pd.DataFrame(index=data.index)
    coefs = pd.DataFrame(index=features)
    train_indexes = []

    # Loop for each iteration
    for i in range(iterations):

        train_internal,test_internal = train_test_split(data, test_size = 0.25, random_state = i+100) #, random_state = 42
        X_train_internal = train_internal[features]
        y_train_internal = train_internal.loc[:,"status"]
        X_test_internal = test_internal[features]
        y_test_internal = test_internal.loc[:,"status"]
        
        if PCA_flag == True:
            #perform PCA on the training set
            n_components = min(len(features), len(X_train_internal))
            pca = PCA(n_components=n_components, svd_solver='randomized', random_state = 100)
            PCs = pca.fit_transform(X_train_internal[features])
            principal_components = pd.DataFrame(data = PCs, columns = ['PC_'+str(m) for m in np.arange(n_components)], index = X_train_internal.index)

            #find the principle components that make up 80% of the varience
            for j in range(len(pca.explained_variance_ratio_)):
                current_sum = pca.explained_variance_ratio_[:j].sum()
                if current_sum>=fraction_variance:
                    break
            pca_features = ['PC_'+str(m) for m in np.arange(0,j)]

            #apply to the test data
            test_PCs = pca.transform(X_test_internal[features])
            test_principal_components = pd.DataFrame(data = test_PCs , columns = ['PC_'+str(m) for m in np.arange(n_components)], index = X_test_internal.index)

            X_train_internal = principal_components[pca_features]
            X_test_internal = test_principal_components[pca_features]
    
        if adjustment_flag == True:
            model = GradientBoostingClassifier(n_estimators=300, min_samples_leaf=3)
        else:
            model = GradientBoostingClassifier()

        #train a new model 
        model.fit(X_train_internal, y_train_internal)

        #predict the test data
        pred = model.predict(X_test_internal)
        prob = model.predict_proba(X_test_internal)

        #save results
        probabilities[i] = pd.Series(prob[:,1], index = X_test_internal.index)
        train_indexes.append(list(X_train_internal.index))
        
        if i%20==0:
            #prevent dfs from becoming too fragmented
            probabilities = probabilities.copy()
            coefs = coefs.copy()

    probabilities = probabilities.merge(data[['status']], left_index=True, right_index=True)
    acc = accuracy_score(y_test_internal, pred)
    
    AUCs = pd.DataFrame()
    probabilities = probabilities.merge(data[['tumor_fraction','sample_type','Stage']], left_index=True, right_index=True)

    for i in range(iterations):
        current_dict = {}
        current = probabilities[~(probabilities[i].isnull())][['status','tumor_fraction','sample_type','Stage',i]].copy()

        #overall accuracy and AUC
        group = 'overall'
        fpr,tpr,_ = roc_curve(current['status'],current[i])
        AUC = auc(fpr,tpr)
        current_dict[group] = AUC
        del(AUC,group,fpr,tpr)

        #separate out the healthy samples to be used in every AUC
        healthy_df = current[current['status']==0]
        cancer_df = current[current['status']==1]
        del(current)

        for group,df in cancer_df.groupby('sample_type'):
            if group == 'Duodenal_Cancer':
                continue

            df2 = df.append(healthy_df, ignore_index=True)
            fpr,tpr,_ = roc_curve(df2['status'],df2[i])
            AUC = auc(fpr,tpr)
            current_dict[group] = AUC
            del(AUC,group,fpr,tpr)

        AUCs = AUCs.append(pd.Series(current_dict), ignore_index=True)

    CIs = pd.DataFrame([AUCs.median(), AUCs.quantile(.025), AUCs.quantile(.975)]).T
    CIs = CIs.rename(columns = {'Unnamed 0':'median'}) 
    return(acc,CIs)

In [11]:
def hyperparameter_optimization_RF(data, PCA_flag, HPO_type, iterations):
    probabilities = pd.DataFrame(index=data.index)
    coefs = pd.DataFrame(index=features)
    train_indexes = []
    fraction_variance = .8

    # Loop for each iteration
    for i in range(iterations):

        train_internal,test_internal = train_test_split(data, test_size = 0.25, random_state = i+100) #, random_state = 42

        X_train_internal = train_internal[features]
        y_train_internal = train_internal.loc[:,"status"]
        X_test_internal = test_internal[features]
        y_test_internal = test_internal.loc[:,"status"]

        if PCA_flag == True:
            #perform PCA on the training set
            n_components = min(len(features), len(X_train_internal))
            pca = PCA(n_components=n_components, svd_solver='randomized', random_state = 100)
            PCs = pca.fit_transform(X_train_internal[features])
            principal_components = pd.DataFrame(data = PCs, columns = ['PC_'+str(m) for m in np.arange(n_components)], index = X_train_internal.index)

            #find the principle components that make up 80% of the varience
            for j in range(len(pca.explained_variance_ratio_)):
                current_sum = pca.explained_variance_ratio_[:j].sum()
                if current_sum>=fraction_variance:
                    break
            pca_features = ['PC_'+str(m) for m in np.arange(0,j)]

            #apply to the test data
            test_PCs = pca.transform(X_test_internal[features])
            test_principal_components = pd.DataFrame(data = test_PCs , columns = ['PC_'+str(m) for m in np.arange(n_components)], index = X_test_internal.index)

            X_train_internal = principal_components[pca_features]
            X_test_internal = test_principal_components[pca_features]

        if HPO_type == "random" and i == 0:
            n_estimators = range(100,501,50) #list [start:stop:step], np arange
            #max_features = np.arange(50,251,50)
            min_samples_leaf = range(2,6)
            max_samples = np.arange(61,73,2)
            #max_depth = range(3,16)

            random_grid = {'n_estimators': n_estimators,
                           #'max_features': max_features,
                           'min_samples_leaf': min_samples_leaf, 
                           'max_samples': max_samples,
                           'bootstrap': [True]} 
                           #'max_depth': max_depth}

            cv = StratifiedKFold(n_splits=10, shuffle=True, random_state = i+100) 
            rf = RandomForestClassifier()
            random_search = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 200, cv = cv, verbose=2, random_state=41, n_jobs = -1)
            random_search.fit(X_train_internal, y_train_internal)

            best_n_estimators = random_search.best_params_['n_estimators']
            #best_max_features = random_search.best_params_['max_features']
            best_min_samples_leaf = random_search.best_params_['min_samples_leaf']
            best_max_samples = random_search.best_params_['max_samples']
        
        if HPO_type == "grid" and i == 0:
            n_estimators = range(100,501,50) #list [start:stop:step], np arange
            max_features = np.arange(50,251,50)
            min_samples_leaf = range(2,6)
            max_samples = np.arange(61,73,2)
            #max_depth = range(3,16)

            full_grid = {'n_estimators': n_estimators,
                       'max_features': max_features,
                       'min_samples_leaf': min_samples_leaf, 
                       'max_samples': max_samples,
                       'bootstrap': [True]} 
                       #'max_depth': max_depth}

            cv = StratifiedKFold(n_splits=10, shuffle=True, random_state = i+100) 
            rf = RandomForestClassifier()
            grid_search = GridSearchCV(estimator = rf, param_grid = full_grid,  cv = cv, verbose=2, n_jobs = -1)
            grid_search.fit(X_train_internal, y_train_internal)

            best_n_estimators = grid_search.best_params_['n_estimators']
            #best_max_features = grid_search.best_params_['max_features']
            best_min_samples_leaf = grid_search.best_params_['min_samples_leaf']
            best_max_samples = grid_search.best_params_['max_samples']

        model = RandomForestClassifier(max_samples=best_max_samples, min_samples_leaf=best_min_samples_leaf, n_estimators=best_n_estimators)
        model.fit(X_train_internal, y_train_internal)

        #predict the test data
        pred = model.predict(X_test_internal)
        prob = model.predict_proba(X_test_internal)

        #save results
        probabilities[i] = pd.Series(prob[:,1], index = X_test_internal.index)
        acc = accuracy_score(y_test_internal, pred)
        train_indexes.append(list(X_train_internal.index))

        if i%20==0:
            #prevent dfs from becoming too fragmented
            probabilities = probabilities.copy()
            coefs = coefs.copy()

    probabilities = probabilities.merge(data[['status']], left_index=True, right_index=True)
    AUCs = pd.DataFrame()
    probabilities = probabilities.merge(data[['tumor_fraction','sample_type','Stage']], left_index=True, right_index=True)

    for i in range(iterations):
        current_dict = {}
        current = probabilities[~(probabilities[i].isnull())][['status','tumor_fraction','sample_type','Stage',i]].copy()

        #overall accuracy and AUC
        group = 'overall'
        fpr,tpr,_ = roc_curve(current['status'],current[i])
        AUC = auc(fpr,tpr)
        current_dict[group] = AUC
        del(AUC,group,fpr,tpr)

        #separate out the healthy samples to be used in every AUC
        healthy_df = current[current['status']==0]
        cancer_df = current[current['status']==1]
        del(current)

        for group,df in cancer_df.groupby('sample_type'):
            if group == 'Duodenal_Cancer':
                continue

            df2 = df.append(healthy_df, ignore_index=True)
            fpr,tpr,_ = roc_curve(df2['status'],df2[i])
            AUC = auc(fpr,tpr)
            current_dict[group] = AUC
            del(AUC,group,fpr,tpr)

        AUCs = AUCs.append(pd.Series(current_dict), ignore_index=True)

    CIs = pd.DataFrame([AUCs.median(), AUCs.quantile(.025), AUCs.quantile(.975)]).T
    CIs = CIs.rename(columns = {'Unnamed 0':'median'}) 
    return(acc, CIs)

# Random Forest Classifier

In [12]:
# default model (all cancer types)
acc,CIs = calculate_RF(data, False, False, 1)
print(acc)
print(CIs)

0.8490566037735849
                     median     0.025     0.975
Bile_Duct_Cancer   0.973164  0.973164  0.973164
Breast_Cancer      0.900726  0.900726  0.900726
Colorectal_Cancer  0.932203  0.932203  0.932203
Gastric_cancer     0.937853  0.937853  0.937853
Lung_Cancer        0.995763  0.995763  0.995763
Ovarian_Cancer     0.957627  0.957627  0.957627
Pancreatic_Cancer  0.893597  0.893597  0.893597
overall            0.926253  0.926253  0.926253


In [13]:
# default model (all cancer types) - 1000 iteration
acc,CIs = calculate_RF(data, False, False, 1000)
print(acc)
print(CIs)

0.8490566037735849
                     median     0.025     0.975
Bile_Duct_Cancer   0.972588  0.885686  1.000000
Breast_Cancer      0.926226  0.843730  0.981231
Colorectal_Cancer  0.951097  0.856349  1.000000
Gastric_cancer     0.949120  0.853291  1.000000
Lung_Cancer        1.000000  0.958504  1.000000
Ovarian_Cancer     0.985421  0.933698  1.000000
Pancreatic_Cancer  0.880864  0.731009  0.986691
overall            0.939037  0.894646  0.972860


In [14]:
# default model (only breast cancer, unbalanced)
acc,CIs = calculate_RF(breast_cancer, False, False, 1)
print(acc)
print(CIs)

0.8676470588235294
                 median     0.025     0.975
Breast_Cancer  0.709483  0.709483  0.709483
overall        0.709483  0.709483  0.709483


In [15]:
# default model (only breast cancer, unbalanced) - 1000 iteration
acc,CIs = calculate_RF(breast_cancer, False, False, 1000)
print(acc)
print(CIs)

0.8823529411764706
                 median     0.025     0.975
Breast_Cancer  0.895804  0.780391  0.968972
overall        0.895804  0.780391  0.968972


In [16]:
# default model (only breast cancer, balanced)
acc,CIs = calculate_RF(breast_cancer_balanced, False, False, 1)
print(acc)
print(CIs)

0.8148148148148148
                 median     0.025     0.975
Breast_Cancer  0.830556  0.830556  0.830556
overall        0.830556  0.830556  0.830556


In [17]:
# default model (only breast cancer, balanced) - 1000 iteration
acc,CIs = calculate_RF(breast_cancer_balanced, False, False, 1000)
print(acc)
print(CIs)

0.7037037037037037
                 median     0.025     0.975
Breast_Cancer  0.860795  0.702868  0.965937
overall        0.860795  0.702868  0.965937


The same using scaled data.

In [18]:
# default model with scaled data (all cancer types)
acc,CIs = calculate_RF(scaled_data, False, False, 1)
print(acc)
print(CIs)

0.8113207547169812
                     median     0.025     0.975
Bile_Duct_Cancer   0.967514  0.967514  0.967514
Breast_Cancer      0.884988  0.884988  0.884988
Colorectal_Cancer  0.911017  0.911017  0.911017
Gastric_cancer     0.923729  0.923729  0.923729
Lung_Cancer        0.995763  0.995763  0.995763
Ovarian_Cancer     0.972458  0.972458  0.972458
Pancreatic_Cancer  0.888889  0.888889  0.888889
overall            0.916697  0.916697  0.916697


In [19]:
# default model with scaled data (all cancer types) - 1000 iteration
acc,CIs = calculate_RF(scaled_data, False, False, 1000)
print(acc)
print(CIs)

0.8584905660377359
                     median     0.025     0.975
Bile_Duct_Cancer   0.971154  0.887898  1.000000
Breast_Cancer      0.927015  0.842808  0.982744
Colorectal_Cancer  0.953939  0.859530  1.000000
Gastric_cancer     0.948712  0.856442  1.000000
Lung_Cancer        1.000000  0.953946  1.000000
Ovarian_Cancer     0.984958  0.937462  1.000000
Pancreatic_Cancer  0.882325  0.741462  0.981856
overall            0.939380  0.893071  0.972031


In [20]:
# default model with scaled data (only breast cancer, unbalanced)
acc,CIs = calculate_RF(breast_cancer_scaled, False, False, 1)
print(acc)
print(CIs)

0.8676470588235294
                 median     0.025     0.975
Breast_Cancer  0.740517  0.740517  0.740517
overall        0.740517  0.740517  0.740517


In [21]:
# default model with scaled data (only breast cancer, unbalanced) - 1000 iteration
acc,CIs = calculate_RF(breast_cancer_scaled, False, False, 1000)
print(acc)
print(CIs)

0.8970588235294118
                 median     0.025     0.975
Breast_Cancer  0.894941  0.783306  0.967622
overall        0.894941  0.783306  0.967622


In [22]:
# default model with scaled data (only breast cancer, balanced)
acc,CIs = calculate_RF(breast_cancer_balanced_scaled, False, False, 1)
print(acc)
print(CIs)

0.8518518518518519
                 median     0.025     0.975
Breast_Cancer  0.852778  0.852778  0.852778
overall        0.852778  0.852778  0.852778


In [23]:
# default model with scaled data (only breast cancer, balanced) - 1000 iteration
acc,CIs = calculate_RF(breast_cancer_balanced_scaled, False, False, 1000)
print(acc)
print(CIs)

0.7037037037037037
                 median     0.025     0.975
Breast_Cancer  0.862637  0.702778  0.965928
overall        0.862637  0.702778  0.965928


Addding Principal Component Analysis to reduce the dimensionality. (Still using the scaled data because it is needed for PCA).

In [24]:
# default model with scaled data using PCA (all cancer types)
acc,CIs = calculate_RF(scaled_data, True, False, 1)
print(acc)
print(CIs)

0.839622641509434
                     median     0.025     0.975
Bile_Duct_Cancer   0.973164  0.973164  0.973164
Breast_Cancer      0.908596  0.908596  0.908596
Colorectal_Cancer  0.980226  0.980226  0.980226
Gastric_cancer     0.939266  0.939266  0.939266
Lung_Cancer        0.983051  0.983051  0.983051
Ovarian_Cancer     0.953390  0.953390  0.953390
Pancreatic_Cancer  0.805085  0.805085  0.805085
overall            0.917057  0.917057  0.917057


In [25]:
# default model with scaled data using PCA (all cancer types) - 1000 iteration
acc,CIs = calculate_RF(scaled_data, True, False, 1000)
print(acc)
print(CIs)

0.8867924528301887
                     median     0.025     0.975
Bile_Duct_Cancer   0.949405  0.845527  1.000000
Breast_Cancer      0.901127  0.784097  0.981932
Colorectal_Cancer  0.982143  0.913800  1.000000
Gastric_cancer     0.931579  0.772914  1.000000
Lung_Cancer        0.995968  0.870370  1.000000
Ovarian_Cancer     0.980392  0.915510  1.000000
Pancreatic_Cancer  0.851060  0.663009  0.974527
overall            0.923898  0.870519  0.964217


In [26]:
# default model with scaled data using PCA (only breast cancer, unbalanced)
acc,CIs = calculate_RF(breast_cancer_scaled, True, False, 1)
print(acc)
print(CIs)

0.8970588235294118
                 median     0.025     0.975
Breast_Cancer  0.827586  0.827586  0.827586
overall        0.827586  0.827586  0.827586


In [27]:
# default model with scaled data using PCA (only breast cancer, unbalanced) - 1000 iteration
acc,CIs = calculate_RF(breast_cancer_scaled, True, False, 1000)
print(acc)
print(CIs)

0.8823529411764706
                 median     0.025     0.975
Breast_Cancer  0.892335  0.766397  0.974839
overall        0.892335  0.766397  0.974839


In [28]:
# default model with scaled data using PCA (only breast cancer, balanced)
acc,CIs = calculate_RF(breast_cancer_balanced_scaled, True, False, 1)
print(acc)
print(CIs)

0.6666666666666666
                 median     0.025     0.975
Breast_Cancer  0.802778  0.802778  0.802778
overall        0.802778  0.802778  0.802778


In [29]:
# default model with scaled data using PCA (only breast cancer, balanced) - 1000 iteration
acc,CIs = calculate_RF(breast_cancer_balanced_scaled, True, False, 1000)
print(acc)
print(CIs)

0.8148148148148148
                 median     0.025     0.975
Breast_Cancer  0.818681  0.602216  0.958403
overall        0.818681  0.602216  0.958403


Using PCA does not imprrove the performance and is therefore not used further. Now the adjusted model is used to compare performance.

In [30]:
# adjusted model with scaled data (all cancer types)
acc,CIs = calculate_RF(scaled_data, False, True, 1)
print(acc)
print(CIs)

0.8113207547169812
                     median     0.025     0.975
Bile_Duct_Cancer   0.971751  0.971751  0.971751
Breast_Cancer      0.889831  0.889831  0.889831
Colorectal_Cancer  0.872881  0.872881  0.872881
Gastric_cancer     0.867232  0.867232  0.867232
Lung_Cancer        1.000000  1.000000  1.000000
Ovarian_Cancer     0.949153  0.949153  0.949153
Pancreatic_Cancer  0.871940  0.871940  0.871940
overall            0.901551  0.901551  0.901551


In [39]:
# adjusted model with scaled data (all cancer types) - 1000 iteration
acc,CIs = calculate_RF(scaled_data, False, True, 1000)
print(acc)
print(CIs)

0.8490566037735849
                     median     0.025     0.975
Bile_Duct_Cancer   0.964744  0.874886  1.000000
Breast_Cancer      0.919645  0.832548  0.977283
Colorectal_Cancer  0.911926  0.790882  0.988104
Gastric_cancer     0.925481  0.829646  0.992608
Lung_Cancer        1.000000  0.961720  1.000000
Ovarian_Cancer     0.978892  0.923892  1.000000
Pancreatic_Cancer  0.868609  0.715904  0.979170
overall            0.925362  0.873612  0.964647


In [40]:
# adjusted model with scaled data using PCA (only breast cancer, unbalanced)
acc,CIs = calculate_RF(breast_cancer_scaled, False, True, 1)
print(acc)
print(CIs)

0.8823529411764706
                 median     0.025     0.975
Breast_Cancer  0.741379  0.741379  0.741379
overall        0.741379  0.741379  0.741379


In [41]:
# adjusted model with scaled data using PCA (only breast cancer, unbalanced) - 1000 iteration
acc,CIs = calculate_RF(breast_cancer_scaled, False, True, 1000)
print(acc)
print(CIs)

0.8676470588235294
                 median     0.025     0.975
Breast_Cancer  0.896635  0.791208  0.964286
overall        0.896635  0.791208  0.964286


In [42]:
# adjusted model with scaled data using PCA (only breast cancer, balanced)
acc,CIs = calculate_RF(breast_cancer_balanced_scaled, False, True, 1)
print(acc)
print(CIs)

0.8518518518518519
                 median     0.025     0.975
Breast_Cancer  0.833333  0.833333  0.833333
overall        0.833333  0.833333  0.833333


In [43]:
# adjusted model with scaled data using PCA (only breast cancer, balanced) - 1000 iteration
acc,CIs = calculate_RF(breast_cancer_balanced_scaled, False, True, 1000)
print(acc)
print(CIs)

0.7407407407407407
                 median     0.025     0.975
Breast_Cancer  0.866667  0.715869  0.964736
overall        0.866667  0.715869  0.964736


This feature set has a higher performance on all different model adjustments compared to the script using the features of this thesis. But model performance decreases when having a balanced dataset. The Griffin model uses binary classification while comparing 7 cancer types to healthy. This would ideally require a multi-class model and might indicate some kind of overfitting since model performance decreases when reducing the number of healthy samples.

# Hyperparameter optimization

In [44]:
### RF with all features using random search CV

n_estimators = range(100,501,50) #list [start:stop:step], np arange
max_features = np.arange(50,251,50)
min_samples_leaf = range(2,6)
max_samples = np.arange(61,73,2)
#max_depth = range(3,16)

random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'min_samples_leaf': min_samples_leaf, 
               'max_samples': max_samples,
               'bootstrap': [True]} 
               #'max_depth': max_depth}


rf = RandomForestClassifier()
rf_all_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 500, cv = 10, verbose=2, random_state=41, n_jobs = -1)
rf_all_random.fit(X_train, y_train)
              
predict_train = rf_all_random.predict(X_train)
predict_test = rf_all_random.predict(X_test)
probability_test = rf_all_random.predict_proba(X_test)

accuracy_score(y_test, predict_test)

Fitting 10 folds for each of 500 candidates, totalling 5000 fits


0.7924528301886793

In [45]:
# best_score_ 
print('best score', rf_all_random.best_score_)

# best_estimator_ 
print('best estimator', rf_all_random.best_estimator_)

# best_params_
print('best params', rf_all_random.best_params_)

best score 0.8422379032258064
best estimator RandomForestClassifier(max_features=250, max_samples=71, min_samples_leaf=3,
                       n_estimators=400)
best params {'n_estimators': 400, 'min_samples_leaf': 3, 'max_samples': 71, 'max_features': 250, 'bootstrap': True}


In [47]:
### RF with all features on the parameters of random grid hyperparameter optimization

best_n_estimators = rf_all_random.best_params_['n_estimators']
best_max_features = rf_all_random.best_params_['max_features']
best_min_samples_leaf = rf_all_random.best_params_['min_samples_leaf']
best_max_samples = rf_all_random.best_params_['max_samples']

# train model
rf_all_random_adj = RandomForestClassifier(max_features=best_max_features, max_samples=best_max_samples, min_samples_leaf=best_min_samples_leaf, n_estimators=best_n_estimators, random_state=41)
rf_all_random_adj.fit(X_train, y_train)

# test model

predict_train = rf_all_random_adj.predict(X_train)
predict_test = rf_all_random_adj.predict(X_test)
probability_test = rf_all_random_adj.predict_proba(X_test)

accuracy_score(y_test, predict_test)

0.7924528301886793

In [48]:
### RF with all features using grid search CV

n_estimators = range(100,501,50) #list [start:stop:step], np arange
max_features = np.arange(50,251,50)
min_samples_leaf = range(2,6)
max_samples = np.arange(61,65,2)
#max_depth = range(3,16)

full_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'min_samples_leaf': min_samples_leaf, 
               'max_samples': max_samples,
               'bootstrap': [True]} 
               #'max_depth': max_depth}

rf = RandomForestClassifier()
rf_all_grid = GridSearchCV(estimator = rf, param_grid = full_grid,  cv = 5, verbose=2, n_jobs = -1)
rf_all_grid.fit(X_train, y_train)
              
predict_train = rf_all_grid.predict(X_train)
predict_test = rf_all_grid.predict(X_test)
probability_test = rf_all_grid.predict_proba(X_test)

accuracy_score(y_test, predict_test)

Fitting 5 folds for each of 360 candidates, totalling 1800 fits


0.7924528301886793

In [49]:
# best_score_ 
print('best score', rf_all_grid.best_score_)

# best_estimator_ 
print('best estimator', rf_all_grid.best_estimator_)

# best_params_
print('best params', rf_all_grid.best_params_)

best score 0.8330853174603174
best estimator RandomForestClassifier(max_features=200, max_samples=61, min_samples_leaf=2,
                       n_estimators=500)
best params {'bootstrap': True, 'max_features': 200, 'max_samples': 61, 'min_samples_leaf': 2, 'n_estimators': 500}


In [50]:
### RF with all features on the parameters of random grid hyperparameter optimization

# best params
best_n_estimators = rf_all_grid.best_params_['n_estimators']
best_max_features = rf_all_grid.best_params_['max_features']
best_min_samples_leaf = rf_all_grid.best_params_['min_samples_leaf']
best_max_samples = rf_all_grid.best_params_['max_samples']

# train model
rf_all_grid_adj = RandomForestClassifier(max_features=best_max_features, max_samples=best_max_samples, min_samples_leaf=best_min_samples_leaf, n_estimators=best_n_estimators, random_state=41)
rf_all_grid_adj.fit(X_train, y_train)

# test model

predict_train = rf_all_grid_adj.predict(X_train)
predict_test = rf_all_grid_adj.predict(X_test)
probability_test = rf_all_grid_adj.predict_proba(X_test)

accuracy_score(y_test, predict_test)

0.7641509433962265

Calculate model performance using random and grid search with different number of iterations using PCA to reduce the dimensionality or not.

In [51]:
# Random search without PCA, 1 iteration
acc,CIs = hyperparameter_optimization_RF(data, False, "random", 1)
print(acc)
print(CIs)

Fitting 10 folds for each of 200 candidates, totalling 2000 fits
0.8018867924528302
                     median     0.025     0.975
Bile_Duct_Cancer   0.966102  0.966102  0.966102
Breast_Cancer      0.874092  0.874092  0.874092
Colorectal_Cancer  0.881356  0.881356  0.881356
Gastric_cancer     0.864407  0.864407  0.864407
Lung_Cancer        0.991525  0.991525  0.991525
Ovarian_Cancer     0.936441  0.936441  0.936441
Pancreatic_Cancer  0.875706  0.875706  0.875706
overall            0.896141  0.896141  0.896141


In [52]:
# Random search without PCA, 1000 iterations
acc,CIs = hyperparameter_optimization_RF(data, False, "random", 1000)
print(acc)
print(CIs)

Fitting 10 folds for each of 200 candidates, totalling 2000 fits
0.8584905660377359
                     median     0.025     0.975
Bile_Duct_Cancer   0.961686  0.872949  1.000000
Breast_Cancer      0.915691  0.831844  0.977007
Colorectal_Cancer  0.898571  0.765404  0.987191
Gastric_cancer     0.916824  0.806694  0.988241
Lung_Cancer        1.000000  0.960784  1.000000
Ovarian_Cancer     0.976616  0.914511  1.000000
Pancreatic_Cancer  0.866408  0.708589  0.977041
overall            0.920598  0.868349  0.962766


In [53]:
# Random search with PCA, 1000 iterations
acc,CIs = hyperparameter_optimization_RF(data, True, "random", 1000)
print(acc)
print(CIs)

Fitting 10 folds for each of 200 candidates, totalling 2000 fits
0.8867924528301887
                     median     0.025     0.975
Bile_Duct_Cancer   0.950725  0.840000  1.000000
Breast_Cancer      0.901728  0.789125  0.982000
Colorectal_Cancer  0.982013  0.919583  1.000000
Gastric_cancer     0.925973  0.778287  1.000000
Lung_Cancer        0.997024  0.927273  1.000000
Ovarian_Cancer     0.979167  0.925280  1.000000
Pancreatic_Cancer  0.841488  0.654409  0.972425
overall            0.923242  0.869307  0.964048


In [54]:
# Grid search without PCA, 1 iteration
acc,CIs = hyperparameter_optimization_RF(data, False, "grid", 1)
print(acc)
print(CIs)

Fitting 10 folds for each of 1080 candidates, totalling 10800 fits
0.8018867924528302
                     median     0.025     0.975
Bile_Duct_Cancer   0.968927  0.968927  0.968927
Breast_Cancer      0.888620  0.888620  0.888620
Colorectal_Cancer  0.855932  0.855932  0.855932
Gastric_cancer     0.864407  0.864407  0.864407
Lung_Cancer        1.000000  1.000000  1.000000
Ovarian_Cancer     0.961864  0.961864  0.961864
Pancreatic_Cancer  0.877589  0.877589  0.877589
overall            0.900469  0.900469  0.900469


In [55]:
# Grid search without PCA, 1000 iterations
acc,CIs = hyperparameter_optimization_RF(data, False, "grid", 1000)
print(acc)
print(CIs)

Fitting 10 folds for each of 1080 candidates, totalling 10800 fits
0.8490566037735849
                     median     0.025     0.975
Bile_Duct_Cancer   0.962264  0.873311  1.000000
Breast_Cancer      0.916284  0.826864  0.976042
Colorectal_Cancer  0.907890  0.780326  0.987255
Gastric_cancer     0.922105  0.824070  0.991071
Lung_Cancer        1.000000  0.960196  1.000000
Ovarian_Cancer     0.977589  0.916667  1.000000
Pancreatic_Cancer  0.867761  0.717501  0.976221
overall            0.923146  0.871581  0.962846


In [56]:
# Grid search with PCA, 1000 iterations
acc,CIs = hyperparameter_optimization_RF(data, True, "random", 1000)
print(acc)
print(CIs)

Fitting 10 folds for each of 200 candidates, totalling 2000 fits
0.8962264150943396
                     median     0.025     0.975
Bile_Duct_Cancer   0.950000  0.839011  1.000000
Breast_Cancer      0.899289  0.793770  0.977457
Colorectal_Cancer  0.980449  0.911419  1.000000
Gastric_cancer     0.924000  0.767528  1.000000
Lung_Cancer        0.995763  0.920604  1.000000
Ovarian_Cancer     0.977273  0.915439  1.000000
Pancreatic_Cancer  0.839648  0.642373  0.970372
overall            0.921009  0.865659  0.961028


With this dataset the hyperparameter optimization seems to have the same or improved model performance compared to the script using this thesis features. This is interesting as both datasets have the same dimensionality and this dataset should also suffer from the curse of dimensionality.

# Gradient Boost Classifier

In [57]:
# default model (all cancer types)
acc,CIs = calculate_GBC(data, False, False, 1)
print(acc)
print(CIs)

0.8867924528301887
                     median     0.025     0.975
Bile_Duct_Cancer   0.974576  0.974576  0.974576
Breast_Cancer      0.933414  0.933414  0.933414
Colorectal_Cancer  0.968927  0.968927  0.968927
Gastric_cancer     0.889831  0.889831  0.889831
Lung_Cancer        1.000000  1.000000  1.000000
Ovarian_Cancer     0.978814  0.978814  0.978814
Pancreatic_Cancer  0.930320  0.930320  0.930320
overall            0.943743  0.943743  0.943743


In [58]:
# default model (all cancer types) - 1000 iteration
acc,CIs = calculate_GBC(data, False, False, 1000)
print(acc)
print(CIs)

0.8867924528301887
                     median     0.025     0.975
Bile_Duct_Cancer   0.983660  0.907649  1.000000
Breast_Cancer      0.943396  0.871416  0.992476
Colorectal_Cancer  0.975872  0.911090  1.000000
Gastric_cancer     0.956832  0.853744  1.000000
Lung_Cancer        1.000000  0.962963  1.000000
Ovarian_Cancer     0.989474  0.943163  1.000000
Pancreatic_Cancer  0.897059  0.745425  0.992398
overall            0.951827  0.909847  0.980247


In [59]:
# default model (only breast cancer, unbalanced)
acc,CIs = calculate_GBC(breast_cancer, False, False, 1)
print(acc)
print(CIs)

0.9264705882352942
                 median     0.025     0.975
Breast_Cancer  0.848276  0.848276  0.848276
overall        0.848276  0.848276  0.848276


In [60]:
# default model (only breast cancer, unbalanced) - 1000 iteration
acc,CIs = calculate_GBC(breast_cancer, False, False, 1000)
print(acc)
print(CIs)

0.9117647058823529
                 median     0.025    0.975
Breast_Cancer  0.910053  0.815031  0.97619
overall        0.910053  0.815031  0.97619


In [61]:
# default model (only breast cancer, balanced)
acc,CIs = calculate_GBC(breast_cancer_balanced, False, False, 1)
print(acc)
print(CIs)

0.5185185185185185
               median  0.025  0.975
Breast_Cancer    0.65   0.65   0.65
overall          0.65   0.65   0.65


In [62]:
# default model (only breast cancer, balanced) - 1000 iteration - 1000 iteration
acc,CIs = calculate_GBC(breast_cancer_balanced, False, False, 1000)
print(acc)
print(CIs)

0.6666666666666666
                 median    0.025     0.975
Breast_Cancer  0.805556  0.58639  0.966667
overall        0.805556  0.58639  0.966667


Try with scaling.

In [63]:
# default model (all cancer types)
acc,CIs = calculate_GBC(scaled_data, False, False, 1)
print(acc)
print(CIs)

0.8773584905660378
                     median     0.025     0.975
Bile_Duct_Cancer   0.977401  0.977401  0.977401
Breast_Cancer      0.932203  0.932203  0.932203
Colorectal_Cancer  0.971751  0.971751  0.971751
Gastric_cancer     0.889831  0.889831  0.889831
Lung_Cancer        1.000000  1.000000  1.000000
Ovarian_Cancer     0.978814  0.978814  0.978814
Pancreatic_Cancer  0.922787  0.922787  0.922787
overall            0.942661  0.942661  0.942661


In [64]:
# default model (all cancer types) - 1000 iteration
acc,CIs = calculate_GBC(scaled_data, False, False, 1000)
print(acc)
print(CIs)

0.8867924528301887
                     median     0.025     0.975
Bile_Duct_Cancer   0.983974  0.905615  1.000000
Breast_Cancer      0.943113  0.870920  0.991601
Colorectal_Cancer  0.976190  0.909082  1.000000
Gastric_cancer     0.957447  0.855326  1.000000
Lung_Cancer        1.000000  0.963131  1.000000
Ovarian_Cancer     0.990115  0.945093  1.000000
Pancreatic_Cancer  0.898119  0.743510  0.991533
overall            0.951870  0.909836  0.980057


In [65]:
# default model (only breast cancer, unbalanced)
acc,CIs = calculate_GBC(breast_cancer_scaled, False, False, 1)
print(acc)
print(CIs)

0.9264705882352942
                 median     0.025     0.975
Breast_Cancer  0.827586  0.827586  0.827586
overall        0.827586  0.827586  0.827586


In [88]:
# default model (only breast cancer, balanced)
acc,CIs = calculate_GBC(breast_cancer_balanced_scaled, False, False, 1)
print(acc)
print(CIs)

0.5185185185185185
                 median     0.025     0.975
Breast_Cancer  0.680556  0.680556  0.680556
overall        0.680556  0.680556  0.680556


Scaling does not improve the performance (as expected for a random forest) but is kept for PCA, because it is needed there.

In [89]:
# default model (all cancer types)
acc,CIs = calculate_GBC(scaled_data, True, False, 1)
print(acc)
print(CIs)

0.8584905660377359
                     median     0.025     0.975
Bile_Duct_Cancer   0.898305  0.898305  0.898305
Breast_Cancer      0.900726  0.900726  0.900726
Colorectal_Cancer  0.971751  0.971751  0.971751
Gastric_cancer     0.940678  0.940678  0.940678
Lung_Cancer        0.983051  0.983051  0.983051
Ovarian_Cancer     0.953390  0.953390  0.953390
Pancreatic_Cancer  0.826742  0.826742  0.826742
overall            0.908402  0.908402  0.908402


In [90]:
# default model (all cancer types) - 1000 iteration
acc,CIs = calculate_GBC(scaled_data, True, False, 1000)
print(acc)
print(CIs)

0.8867924528301887
                     median     0.025     0.975
Bile_Duct_Cancer   0.940000  0.814756  1.000000
Breast_Cancer      0.913397  0.818098  0.981826
Colorectal_Cancer  0.978990  0.892437  1.000000
Gastric_cancer     0.923752  0.770051  1.000000
Lung_Cancer        0.993333  0.880061  1.000000
Ovarian_Cancer     0.977635  0.911083  1.000000
Pancreatic_Cancer  0.850000  0.662014  0.977579
overall            0.925426  0.870939  0.966133


In [91]:
# default model (only breast cancer, unbalanced)
acc,CIs = calculate_GBC(breast_cancer_scaled, True, False, 1)
print(acc)
print(CIs)

0.8676470588235294
                 median     0.025     0.975
Breast_Cancer  0.805172  0.805172  0.805172
overall        0.805172  0.805172  0.805172


In [92]:
# default model (only breast cancer, balanced)
acc,CIs = calculate_GBC(breast_cancer_balanced_scaled, True, False, 1)
print(acc)
print(CIs)

0.7407407407407407
                 median     0.025     0.975
Breast_Cancer  0.805556  0.805556  0.805556
overall        0.805556  0.805556  0.805556


PCA does not improve model performance and is therefore not kept. Now the adjusted model is tried.

In [93]:
# default model (all cancer types)
acc,CIs = calculate_GBC(scaled_data, False, True, 1)
print(acc)
print(CIs)

0.8773584905660378
                     median     0.025     0.975
Bile_Duct_Cancer   0.983051  0.983051  0.983051
Breast_Cancer      0.952785  0.952785  0.952785
Colorectal_Cancer  0.985876  0.985876  0.985876
Gastric_cancer     0.906780  0.906780  0.906780
Lung_Cancer        1.000000  1.000000  1.000000
Ovarian_Cancer     0.991525  0.991525  0.991525
Pancreatic_Cancer  0.928437  0.928437  0.928437
overall            0.955644  0.955644  0.955644


In [98]:
# default model (all cancer types) - 1000 iteration
acc,CIs = calculate_GBC(scaled_data, False, True, 1000)
print(acc)
print(CIs)

0.8867924528301887
                     median     0.025     0.975
Bile_Duct_Cancer   0.985423  0.913415  1.000000
Breast_Cancer      0.948718  0.881334  0.992605
Colorectal_Cancer  0.979842  0.919745  1.000000
Gastric_cancer     0.962264  0.859594  1.000000
Lung_Cancer        1.000000  0.968090  1.000000
Ovarian_Cancer     0.991254  0.950813  1.000000
Pancreatic_Cancer  0.904153  0.743282  0.991452
overall            0.956086  0.916211  0.982692


In [100]:
# default model (only breast cancer, unbalanced)
acc,CIs = calculate_GBC(breast_cancer_scaled, False, True, 1)
print(acc)
print(CIs)

0.9411764705882353
                 median     0.025     0.975
Breast_Cancer  0.837931  0.837931  0.837931
overall        0.837931  0.837931  0.837931


In [102]:
# default model (only breast cancer, balanced)
acc,CIs = calculate_GBC(breast_cancer_balanced_scaled, False, True, 1)
print(acc)
print(CIs)

0.8148148148148148
                 median     0.025     0.975
Breast_Cancer  0.955556  0.955556  0.955556
overall        0.955556  0.955556  0.955556


This feature set has a higher performance on all different model adjustments compared to the script using the features of this thesis. But model performance decreases when having a balanced dataset. The Griffin model uses binary classification while comparing 7 cancer types to healthy. This would ideally require a multi-class model and might indicate some kind of overfitting since model performance decreases when reducing the number of healthy samples.

# Feature Selection on GBC

In [104]:
# train model
gbc_all_default = GradientBoostingClassifier(random_state=42)
gbc_all_default.fit(X_train, y_train)

# test model

predict_train = gbc_all_default.predict(X_train)
predict_test = gbc_all_default.predict(X_test)
probability_test = gbc_all_default.predict_proba(X_test)

print('accuracy', accuracy_score(y_test, predict_test))

AUCs,CIs = get_AUC_griffin(probability_test,data)
CIs

accuracy 0.8301886792452831


Unnamed: 0,median,0.025,0.975
Bile_Duct_Cancer,1.0,1.0,1.0
Breast_Cancer,0.962919,0.962919,0.962919
Colorectal_Cancer,0.99026,0.99026,0.99026
Gastric_cancer,0.95,0.95,0.95
Lung_Cancer,1.0,1.0,1.0
Ovarian_Cancer,0.994318,0.994318,0.994318
Pancreatic_Cancer,0.911157,0.911157,0.911157
overall,0.967009,0.967009,0.967009


In [105]:
### GBC with selected features on default parameters
# use the gbc_all_default to select feature importance

feature_imp_gbc = pd.DataFrame(data=gbc_all_default.feature_importances_, columns=["importance"], index=features).sort_values(by="importance", ascending=False)
n_features = 20
feature_imp_gbc.head(n_features)

Unnamed: 0,importance
central_coverage_MYOD1.hg38.30000,0.211704
central_coverage_TBX5.hg38.30000,0.07309
central_coverage_ZNF467.hg38.30000,0.062794
central_coverage_WT1.hg38.30000,0.042148
mean_coverage_LYL1.hg38.30000,0.040074
central_coverage_FOXO1.hg38.30000,0.039694
mean_coverage_FOXH1.hg38.30000,0.02615
central_coverage_EGR3.hg38.30000,0.023989
amplitude_RUNX3.hg38.30000,0.023813
central_coverage_SP2.hg38.30000,0.022598


In [106]:
reduced_gbc = feature_imp_gbc.sort_values(by="importance", ascending=False).head(n_features).index
features_reduced_gbc = data.loc[:,reduced_gbc]
features_reduced_gbc["status"] = data["status"]
features_reduced_gbc

Unnamed: 0_level_0,central_coverage_MYOD1.hg38.30000,central_coverage_TBX5.hg38.30000,central_coverage_ZNF467.hg38.30000,central_coverage_WT1.hg38.30000,mean_coverage_LYL1.hg38.30000,central_coverage_FOXO1.hg38.30000,mean_coverage_FOXH1.hg38.30000,central_coverage_EGR3.hg38.30000,amplitude_RUNX3.hg38.30000,central_coverage_SP2.hg38.30000,...,mean_coverage_ATF2.hg38.30000,central_coverage_GLIS3.hg38.30000,mean_coverage_ZNF146.hg38.30000,central_coverage_SPIB.hg38.30000,central_coverage_HES1.hg38.30000,amplitude_PHOX2B.hg38.30000,mean_coverage_PBX1.hg38.30000,mean_coverage_SPI1.hg38.30000,mean_coverage_SPIB.hg38.30000,status
sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Bile_Duct_Cancer_CGPLPA114,0.406042,0.740012,0.268188,0.166092,-0.233753,0.287461,0.443489,0.686921,0.473528,-0.208997,...,0.324192,-0.810005,0.247476,0.276918,0.169244,-0.224624,-0.174477,-0.094571,-0.030589,1
Bile_Duct_Cancer_CGPLPA115,0.780588,0.587055,-0.055745,-0.009383,0.350823,0.073372,0.683849,-0.560672,-0.296502,0.049094,...,0.985117,-0.644064,0.212329,0.166350,0.264982,0.614474,0.602090,0.500497,0.553506,1
Bile_Duct_Cancer_CGPLPA117,-0.486883,-1.053164,0.272364,-0.050905,-0.182458,-0.396442,0.616248,0.083539,-0.405379,-0.623210,...,-0.280593,1.115215,-0.856782,-0.616086,-0.028298,-0.103907,-0.281738,-0.633749,-0.387166,1
Bile_Duct_Cancer_CGPLPA118,0.583390,0.536198,0.061181,-0.454637,0.719610,0.565334,1.264719,0.547417,-0.952255,-0.259873,...,0.413506,1.236420,-1.101855,0.494950,0.355623,-0.278439,0.562046,0.438853,0.632006,1
Bile_Duct_Cancer_CGPLPA122,1.169453,0.882100,-1.053792,0.936725,0.916827,0.839367,0.588706,-0.097551,-2.145093,-1.122579,...,0.137908,0.408799,-0.278768,0.720881,0.170215,0.767507,0.540594,0.765976,0.717158,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Pancreatic_Cancer_CGPLPA86,0.194525,0.045491,1.113516,-0.117060,1.422693,0.712981,-0.360214,1.113749,-4.028112,0.510253,...,-0.344389,1.602113,-0.582261,1.615295,-0.086420,-0.951551,0.823762,1.601866,1.254685,1
Pancreatic_Cancer_CGPLPA92,-0.568561,-1.024824,0.802111,-0.248666,-0.308925,-0.712703,-0.159914,0.565564,-0.277930,-0.301141,...,-0.742475,0.393713,-0.603634,-0.665165,-0.308351,-1.027845,0.064356,-0.313201,-0.317980,1
Pancreatic_Cancer_CGPLPA93,0.403764,0.463213,-0.396978,1.554874,0.796551,0.567991,0.235678,1.326973,-1.778887,-0.495474,...,-0.089206,0.228293,0.133488,0.615954,-0.169538,-0.898048,0.371837,0.487346,0.506938,1
Pancreatic_Cancer_CGPLPA94,0.003184,0.158462,0.119644,0.542846,0.414499,0.086365,0.072934,0.306972,-0.083488,-0.460756,...,0.303777,0.320366,0.345790,0.573363,-0.800265,-1.486899,0.190209,0.552278,0.550844,1


In [107]:
# make a new model on the reduced features
train_reduced_default,test_reduced_default = train_test_split(features_reduced_gbc, test_size = 0.25, random_state = 42)

# train reduced model
X_train_reduced_default = train_reduced_default.drop(["status"], axis = 1)
y_train_reduced_default = train_reduced_default.loc[:,"status"]
X_test_reduced_default = test_reduced_default.drop(["status"], axis = 1)
y_test_reduced_default = test_reduced_default.loc[:,"status"]

gbc_reduced_default = GradientBoostingClassifier(random_state=42)
gbc_reduced_default.fit(X_train_reduced_default, y_train_reduced_default)

# test model

predict_train = gbc_reduced_default.predict(X_train_reduced_default)
predict_test = gbc_reduced_default.predict(X_test_reduced_default)
probability_test = gbc_reduced_default.predict_proba(X_test_reduced_default)

print(accuracy_score(y_test_reduced_default, predict_test))

AUCs,CIs = get_AUC_griffin(probability_test,data)
CIs

0.8584905660377359


Unnamed: 0,median,0.025,0.975
Bile_Duct_Cancer,1.0,1.0,1.0
Breast_Cancer,0.938995,0.938995,0.938995
Colorectal_Cancer,0.993506,0.993506,0.993506
Gastric_cancer,0.931818,0.931818,0.931818
Lung_Cancer,1.0,1.0,1.0
Ovarian_Cancer,0.991477,0.991477,0.991477
Pancreatic_Cancer,0.880165,0.880165,0.880165
overall,0.952713,0.952713,0.952713


Training a model only with the 20 features of highest feature importance does not improve model perfomance. It seems that the model with all features already ony includes features of high importance and therefore this approach is not needed.