# About the notebook
This script computes gradient boosting models based on the features generated as part of the thesis. Different adjustments of the models are tested (e.g. scaled data, Principal Component Analysis for dimensionality reduction, adjusting the hyperparameter, hyperparameter optimization).

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from scipy.signal import savgol_filter

from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_curve,auc,accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestRegressor,RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.decomposition import PCA
from sklearn.metrics import RocCurveDisplay

# Read in the data

In [2]:
# params
cancer_type = "breast_cancer"
GC = "corrected"
score = "MIDPOINT"
amplitude = "FFT"

In [3]:
# breast cancer features
file = "/data/gpfs-1/groups/ag_kircher/cfDNA-analysis/lea/cfDNA_classification_analyses/features/DELFI_"+cancer_type+"_"+GC+"_"+score+"_"+amplitude+"_features.csv"
c_features = pd.read_csv(file, sep="\t", index_col=0)

file = "/data/gpfs-1/groups/ag_kircher/cfDNA-analysis/lea/cfDNA_classification_analyses/features/DELFI_"+cancer_type+"_metadata.tsv"
c_meta = pd.read_csv(file, sep='\t', index_col='sample_name')

c_features = c_features.reset_index(drop=False)
c_features[['sample','p','score']] = c_features['index'].str.split('_',2, expand=True)
c_features = c_features.set_index('sample')
cancer = pd.concat([c_features, c_meta], axis=1)

In [4]:
# healthy features
file = "/data/gpfs-1/groups/ag_kircher/cfDNA-analysis/lea/cfDNA_classification_analyses/features/DELFI_healthy_"+GC+"_"+score+"_"+amplitude+"_features.csv"
h_features = pd.read_csv(file, sep="\t", index_col=0)

file = "/data/gpfs-1/groups/ag_kircher/cfDNA-analysis/lea/cfDNA_classification_analyses/features/DELFI_healthy_metadata.tsv"
h_meta = pd.read_csv(file, sep='\t', index_col='sample_name')

h_features = h_features.reset_index(drop=False)
h_features[['sample','p','score']] = h_features['index'].str.split('_',2, expand=True)
h_features = h_features.set_index('sample')
healthy = pd.concat([h_features, h_meta], axis=1)

In [5]:
# concat breast cancer and healthy
data = pd.concat([cancer, healthy], axis=0) 

features = data.columns[(data.columns.str.startswith('central_cov')) | (data.columns.str.startswith('mean_cov')) | (data.columns.str.startswith('amplitude')) | (data.columns.str.startswith('nucleosome_spacing_fft'))]
features_phenotype = data.columns[(data.columns.str.startswith('phenotype')) | (data.columns.str.startswith('central_cov')) | (data.columns.str.startswith('mean_cov')) | (data.columns.str.startswith('amplitude')) | (data.columns.str.startswith('nucleosome_spacing_fft'))]

data.head()

Unnamed: 0,index,phenotype,central_coverage_NFKB2,mean_coverage_NFKB2,amplitude190_NFKB2,nucleosome_spacing_fft_NFKB2,central_coverage_TP73,mean_coverage_TP73,amplitude190_TP73,nucleosome_spacing_fft_TP73,...,Gender,Stage,Age,Status,% GC,Length,Median,≥ 1X,≥ 5X,fraction
EGAF00002727253,EGAF00002727253_c_MIDPOINT,1.0,0.921859,0.999899,5.96487,148.0,0.993629,0.999585,15.140059,192.0,...,F,I,54.0,breast_cancer,41%,140 bp,2.0X,88.0%,1.0%,0.06429
EGAF00002727240,EGAF00002727240_c_MIDPOINT,1.0,0.966934,1.0002,19.353707,192.0,1.033113,1.000039,9.433198,240.0,...,F,II,61.0,breast_cancer,42%,143 bp,2.0X,88.0%,3.0%,0.3644
EGAF00002727280,EGAF00002727280_c_MIDPOINT,1.0,1.161236,0.999987,11.505221,213.0,1.115174,0.999805,17.278634,192.0,...,F,II,37.0,breast_cancer,42%,134 bp,2.0X,86.0%,1.0%,0.09767
EGAF00002727290,EGAF00002727290_c_MIDPOINT,1.0,1.027811,0.999776,20.178665,192.0,1.038958,1.000246,3.898227,160.0,...,F,II,48.0,breast_cancer,41%,139 bp,2.0X,89.0%,2.0%,0.06922
EGAF00002727254,EGAF00002727254_c_MIDPOINT,1.0,1.118326,1.000246,10.611337,213.0,1.166457,0.999947,1.834101,213.0,...,F,II,47.0,breast_cancer,41%,134 bp,1.0X,86.0%,1.0%,0.1147


In [6]:
# split into a training (75%) and testing set (25%)
train,test = train_test_split(data, test_size = 0.25, random_state = 42)

X_train = train[features]
y_train = train.loc[:,"phenotype"]
X_test = test[features]
y_test = test.loc[:,"phenotype"]

In [7]:
# scale data
scaled_data = pd.concat([cancer, healthy], axis=0) 
scaler = StandardScaler()
scaler.fit(scaled_data[features])
scaled_data[features] = scaler.transform(scaled_data[features])
scaled_data[features].mean()

scaled_data.head()

Unnamed: 0,index,phenotype,central_coverage_NFKB2,mean_coverage_NFKB2,amplitude190_NFKB2,nucleosome_spacing_fft_NFKB2,central_coverage_TP73,mean_coverage_TP73,amplitude190_TP73,nucleosome_spacing_fft_TP73,...,Gender,Stage,Age,Status,% GC,Length,Median,≥ 1X,≥ 5X,fraction
EGAF00002727253,EGAF00002727253_c_MIDPOINT,1.0,-1.587964,-0.402804,-1.431977,-2.18118,-1.235524,-2.211313,0.879156,-0.40684,...,F,I,54.0,breast_cancer,41%,140 bp,2.0X,88.0%,1.0%,0.06429
EGAF00002727240,EGAF00002727240_c_MIDPOINT,1.0,-0.575681,1.081786,0.380465,-0.469377,-0.233665,0.148223,-0.184884,1.386579,...,F,II,61.0,breast_cancer,42%,143 bp,2.0X,88.0%,3.0%,0.3644
EGAF00002727280,EGAF00002727280_c_MIDPOINT,1.0,3.787805,0.031149,-0.681981,0.34762,1.848499,-1.066968,1.277891,-0.40684,...,F,II,37.0,breast_cancer,42%,134 bp,2.0X,86.0%,1.0%,0.09767
EGAF00002727290,EGAF00002727290_c_MIDPOINT,1.0,0.791448,-1.014266,0.49214,-0.469377,-0.08535,1.227604,-1.216876,-1.602453,...,F,II,48.0,breast_cancer,41%,139 bp,2.0X,89.0%,2.0%,0.06922
EGAF00002727254,EGAF00002727254_c_MIDPOINT,1.0,2.82417,1.308605,-0.802986,0.34762,3.14975,-0.327229,-1.60173,0.37778,...,F,II,47.0,breast_cancer,41%,134 bp,1.0X,86.0%,1.0%,0.1147


In [8]:
# split scaled data into a training (75%) and testing set (25%)
train_scaled,test_scaled = train_test_split(scaled_data, test_size = 0.25, random_state = 42)

X_train_scaled = train_scaled[features]
y_train_scaled = train_scaled.loc[:,"phenotype"]
X_test_scaled = test_scaled[features]
y_test_scaled = test_scaled.loc[:,"phenotype"]

# Functions

In [9]:
def get_AUC_griffin(prob,data):
    #get AUC and accuracy for each bootstrap
    probabilities = pd.DataFrame(index=data.index)
    probabilities[0] = pd.Series(prob[:,1], index = X_test.index)
    probabilities = probabilities.merge(data[['phenotype']], left_index=True, right_index=True)

    AUCs = pd.DataFrame()

    probabilities = probabilities.merge(data[['fraction','Status','Stage','Median']], left_index=True, right_index=True)
    
    for i in range(1):
        current_dict = {}
        current = probabilities[~(probabilities[i].isnull())][['phenotype','fraction','Status','Stage','Median',i]].copy()

        #overall accuracy and AUC
        group = 'overall'
        fpr,tpr,_ = roc_curve(current['phenotype'],current[i])
        AUC = auc(fpr,tpr)
        current_dict[group] = AUC
        mean_fpr = fpr
        mean_tpr = tpr
        del(AUC,group,fpr,tpr)

        #separate out the healthy samples to be used in every AUC
        healthy_df = current[current['phenotype']==0]
        cancer_df = current[current['phenotype']==1]
        del(current)
        
        for group,df in cancer_df.groupby('Status'):
            if group == 'Duodenal_Cancer':
                continue

            df2 = df.append(healthy_df, ignore_index=True)
            fpr,tpr,_ = roc_curve(df2['phenotype'],df2[i])
            AUC = auc(fpr,tpr)
            current_dict[group] = AUC
            del(AUC,group,fpr,tpr)
        
        for group,df in cancer_df.groupby('Median'):
            df2 = df.append(healthy_df, ignore_index=True)
            fpr,tpr,_ = roc_curve(df2['phenotype'],df2[i])
            AUC = auc(fpr,tpr)
            current_dict[group] = AUC
            del(AUC,group,fpr,tpr)
        
        AUCs = AUCs.append(pd.Series(current_dict), ignore_index=True)
        
    CIs = pd.DataFrame([AUCs.median(), AUCs.quantile(.025), AUCs.quantile(.975)]).T
    CIs = CIs.rename(columns = {'Unnamed 0':'median'})    
    return(AUCs,CIs)

In [10]:
def roc_data(prob,data):
    #get AUC and accuracy for each bootstrap
    probabilities = pd.DataFrame(index=data.index)
    probabilities[0] = pd.Series(prob[:,1], index = X_test.index)
    probabilities = probabilities.merge(data[['phenotype']], left_index=True, right_index=True)
    probabilities = probabilities.merge(data[['fraction','Status','Stage','Median']], left_index=True, right_index=True)
    
    AUCs = pd.DataFrame()
    current_dict = {}
    current = probabilities[~(probabilities[0].isnull())][['phenotype','fraction','Status','Stage','Median',0]].copy()

    #overall accuracy and AUC
    group = 'overall'
    fpr,tpr,_ = roc_curve(current['phenotype'],current[0])
    AUC = auc(fpr,tpr)
    current_dict[group] = AUC

    AUCs = AUCs.append(pd.Series(current_dict), ignore_index=True)
        
    CIs = pd.DataFrame([AUCs.median(), AUCs.quantile(.025), AUCs.quantile(.975)]).T
    CIs = CIs.rename(columns = {'Unnamed 0':'median'})    
    return(fpr,tpr,AUC)

In [11]:
def calculate_GBC(data, PCA_flag, adjustment_flag, iterations):
    fraction_variance = .8
    probabilities = pd.DataFrame(index=data.index)
    coefs = pd.DataFrame(index=features)
    train_indexes = []

    # Loop for each iteration
    for i in range(iterations):

        train_internal,test_internal = train_test_split(data, test_size = 0.25, random_state = i+100) #, random_state = 42
        X_train_internal = train_internal[features]
        y_train_internal = train_internal.loc[:,"phenotype"]
        X_test_internal = test_internal[features]
        y_test_internal = test_internal.loc[:,"phenotype"]
        
        if PCA_flag == True:
            #perform PCA on the training set
            n_components = min(len(features), len(X_train_internal))
            pca = PCA(n_components=n_components, svd_solver='randomized', random_state = 100)
            PCs = pca.fit_transform(X_train_internal[features])
            principal_components = pd.DataFrame(data = PCs, columns = ['PC_'+str(m) for m in np.arange(n_components)], index = X_train_internal.index)

            #find the principle components that make up 80% of the varience
            for j in range(len(pca.explained_variance_ratio_)):
                current_sum = pca.explained_variance_ratio_[:j].sum()
                if current_sum>=fraction_variance:
                    break
            pca_features = ['PC_'+str(m) for m in np.arange(0,j)]

            #apply to the test data
            test_PCs = pca.transform(X_test_internal[features])
            test_principal_components = pd.DataFrame(data = test_PCs , columns = ['PC_'+str(m) for m in np.arange(n_components)], index = X_test_internal.index)

            X_train_internal = principal_components[pca_features]
            X_test_internal = test_principal_components[pca_features]
        
        if adjustment_flag == True:
            model = GradientBoostingClassifier(n_estimators=300, min_samples_leaf=3)
        else:
            model = GradientBoostingClassifier()

        #train a new model 
        model.fit(X_train_internal, y_train_internal)

        #predict the test data
        pred = model.predict(X_test_internal)
        prob = model.predict_proba(X_test_internal)

        #save results
        probabilities[i] = pd.Series(prob[:,1], index = X_test_internal.index)
        acc = accuracy_score(y_test_internal, pred)
        train_indexes.append(list(X_train_internal.index))

        if i%20==0:
            #prevent dfs from becoming too fragmented
            probabilities = probabilities.copy()
            coefs = coefs.copy()

    probabilities = probabilities.merge(data[['phenotype']], left_index=True, right_index=True)    
    AUCs = pd.DataFrame()
    probabilities = probabilities.merge(data[['fraction','Status','Stage']], left_index=True, right_index=True)

    for i in range(iterations):
        current_dict = {}
        current = probabilities[~(probabilities[i].isnull())][['phenotype','fraction','Status','Stage',i]].copy()

        #overall accuracy and AUC
        group = 'overall'
        fpr,tpr,_ = roc_curve(current['phenotype'],current[i])
        AUC = auc(fpr,tpr)
        current_dict[group] = AUC
        del(AUC,group,fpr,tpr)

        #separate out the healthy samples to be used in every AUC
        healthy_df = current[current['phenotype']==0]
        cancer_df = current[current['phenotype']==1]
        del(current)

        for group,df in cancer_df.groupby('Status'):
            if group == 'Duodenal_Cancer':
                continue

            df2 = df.append(healthy_df, ignore_index=True)
            fpr,tpr,_ = roc_curve(df2['phenotype'],df2[i])
            AUC = auc(fpr,tpr)
            current_dict[group] = AUC
            del(AUC,group,fpr,tpr)

        AUCs = AUCs.append(pd.Series(current_dict), ignore_index=True)

    CIs = pd.DataFrame([AUCs.median(), AUCs.quantile(.025), AUCs.quantile(.975)]).T
    CIs = CIs.rename(columns = {'Unnamed 0':'median'}) 
    return(acc,CIs)

In [12]:
def hyperparameter_optimization_GBC(data, PCA_flag, HPO_type, iterations):
    fraction_variance = .8
    probabilities = pd.DataFrame(index=data.index)
    c_vals = []
    coefs = pd.DataFrame(index=features)
    train_indexes = []

    # Loop for each iteration
    for i in range(iterations):

        train_internal,test_internal = train_test_split(data, test_size = 0.25, random_state = i+100) #, random_state = 42
        X_train_internal = train_internal[features]
        y_train_internal = train_internal.loc[:,"phenotype"]
        X_test_internal = test_internal[features]
        y_test_internal = test_internal.loc[:,"phenotype"]
        
        if PCA_flag == True:
            #perform PCA on the training set
            n_components = min(len(features), len(X_train_internal))
            pca = PCA(n_components=n_components, svd_solver='randomized', random_state = 100)
            PCs = pca.fit_transform(X_train_internal[features])
            principal_components = pd.DataFrame(data = PCs, columns = ['PC_'+str(m) for m in np.arange(n_components)], index = X_train_internal.index)

            #find the principle components that make up 80% of the varience
            for j in range(len(pca.explained_variance_ratio_)):
                current_sum = pca.explained_variance_ratio_[:j].sum()
                if current_sum>=fraction_variance:
                    break
            pca_features = ['PC_'+str(m) for m in np.arange(0,j)]

            #apply to the test data
            test_PCs = pca.transform(X_test_internal[features])
            test_principal_components = pd.DataFrame(data = test_PCs , columns = ['PC_'+str(m) for m in np.arange(n_components)], index = X_test_internal.index)

            X_train_internal = principal_components[pca_features]
            X_test_internal = test_principal_components[pca_features]
        
        if HPO_type == "random" and i == 0:
            n_estimators = range(100,501,50) 
            max_features = ['sqrt','log2',None] 
            min_samples_leaf = range(2,6)
            max_depth = range(3,16)
            min_samples_split = range(2,5)

            hyperparameters = {'n_estimators': n_estimators,
                            'max_features': max_features,
                            'min_samples_leaf': min_samples_leaf,
                            'max_depth': max_depth,
                            'min_samples_split': min_samples_split}

            cv = StratifiedKFold(n_splits=10, shuffle=True, random_state = i+100) 

            model = GradientBoostingClassifier()
            search = RandomizedSearchCV(estimator = model, param_distributions = hyperparameters, n_iter = 500,  cv = cv, verbose=3, n_jobs = -1, return_train_score=True)        
            search.fit(X_train_scaled, y_train_scaled)

            best_n_estimators = search.best_params_['n_estimators']
            best_max_features = search.best_params_['max_features']
            best_min_samples_leaf = search.best_params_['min_samples_leaf']
            best_max_depth = search.best_params_['max_depth']
            best_min_samples_split = search.best_params_['min_samples_split']
        
        if HPO_type == "grid" and i == 0:
            n_estimators = range(100,501,50) 
            max_features = ['sqrt','log2',None] 
            min_samples_leaf = range(2,6)
            max_depth = range(3,16)
            min_samples_split = range(2,5)

            hyperparameters = {'n_estimators': n_estimators,
                            'max_features': max_features,
                            'min_samples_leaf': min_samples_leaf,
                            'max_depth': max_depth,
                            'min_samples_split': min_samples_split}

            cv = StratifiedKFold(n_splits=10, shuffle=True, random_state = i+100) 
            model = GradientBoostingClassifier()
            search = GridSearchCV(estimator = model, param_grid = hyperparameters,  cv = cv, verbose=2, n_jobs = -1)
            search.fit(X_train_internal, y_train_internal)

            best_n_estimators = search.best_params_['n_estimators']
            best_max_features = search.best_params_['max_features']
            best_min_samples_leaf = search.best_params_['min_samples_leaf']
            best_max_depth = search.best_params_['max_depth']
            best_min_samples_split = search.best_params_['min_samples_split']
        
        model = GradientBoostingClassifier(n_estimators=best_n_estimators, max_features=best_max_features, 
                                           min_samples_leaf=best_min_samples_leaf, 
                                           max_depth=best_max_depth, min_samples_split=best_min_samples_split)
        
        #train a new model 
        model.fit(X_train_internal, y_train_internal)

        #predict the test data
        pred = model.predict(X_test_internal)
        prob = model.predict_proba(X_test_internal)

        #save results
        probabilities[i] = pd.Series(prob[:,1], index = X_test_internal.index)
        acc = accuracy_score(y_test_internal, pred)
        train_indexes.append(list(X_train_internal.index))

        if i%20==0:
            #prevent dfs from becoming too fragmented
            probabilities = probabilities.copy()
            coefs = coefs.copy()   
            #sys.stdout.flush()

    probabilities = probabilities.merge(data[['phenotype']], left_index=True, right_index=True)
    AUCs = pd.DataFrame()
    probabilities = probabilities.merge(data[['fraction','Status','Stage']], left_index=True, right_index=True)

    for i in range(iterations):
        current_dict = {}
        current = probabilities[~(probabilities[i].isnull())][['phenotype','fraction','Status','Stage',i]].copy()

        #overall accuracy and AUC
        group = 'overall'
        fpr,tpr,_ = roc_curve(current['phenotype'],current[i])
        AUC = auc(fpr,tpr)
        current_dict[group] = AUC
        del(AUC,group,fpr,tpr)

        #separate out the healthy samples to be used in every AUC
        healthy_df = current[current['phenotype']==0]
        cancer_df = current[current['phenotype']==1]
        del(current)

        for group,df in cancer_df.groupby('Status'):
            if group == 'Duodenal_Cancer':
                continue

            df2 = df.append(healthy_df, ignore_index=True)
            fpr,tpr,_ = roc_curve(df2['phenotype'],df2[i])
            AUC = auc(fpr,tpr)
            current_dict[group] = AUC
            del(AUC,group,fpr,tpr)

        AUCs = AUCs.append(pd.Series(current_dict), ignore_index=True)

    CIs = pd.DataFrame([AUCs.median(), AUCs.quantile(.025), AUCs.quantile(.975)]).T
    CIs = CIs.rename(columns = {'Unnamed 0':'median'}) 
    return(acc,CIs)

# Gradient Boosting Classifier
First try

In [13]:
# train model
gbc_all_default = GradientBoostingClassifier(random_state=42)
gbc_all_default.fit(X_train, y_train)

# test model

predict_train = gbc_all_default.predict(X_train)
predict_test = gbc_all_default.predict(X_test)
probability_test = gbc_all_default.predict_proba(X_test)

accuracy_score(y_test, predict_test)

0.7037037037037037

In [14]:
AUCs,CIs = get_AUC_griffin(probability_test,data)
CIs

Unnamed: 0,median,0.025,0.975
1.0X,0.961538,0.961538,0.961538
2.0X,0.684615,0.684615,0.684615
3.0X,0.846154,0.846154,0.846154
breast_cancer,0.747253,0.747253,0.747253
overall,0.747253,0.747253,0.747253


# Gradient Boosting Classifier
First train a gradient boosting classifier with all features on default parameters

In [15]:
# default model 
acc,CIs = calculate_GBC(data, False, False, 1)
print(acc)
print(CIs)

0.7777777777777778
                 median     0.025     0.975
breast_cancer  0.861111  0.861111  0.861111
overall        0.861111  0.861111  0.861111


In [16]:
# default model over 1,000 iterations
acc,CIs = calculate_GBC(data, False, False, 1000)
print(acc)
print(CIs)

0.6666666666666666
                 median     0.025     0.975
breast_cancer  0.774725  0.535704  0.961618
overall        0.774725  0.535704  0.961618


In [17]:
# default model using Principal Component Analysis to reduce dimensionality
acc,CIs = calculate_GBC(data, True, False, 1)
print(acc)
print(CIs)

0.7037037037037037
                 median     0.025     0.975
breast_cancer  0.772222  0.772222  0.772222
overall        0.772222  0.772222  0.772222


In [18]:
# default model using Principal Component Analysis to reduce dimensionality over 1,000 iterations
acc,CIs = calculate_GBC(data, True, False, 1000)
print(acc)
print(CIs)

0.5185185185185185
                 median     0.025     0.975
breast_cancer  0.615385  0.405502  0.805647
overall        0.615385  0.405502  0.805647


Rerun the models using scaled data.

In [19]:
# default model with scaled data
acc,CIs = calculate_GBC(scaled_data, False, False, 1)
print(acc)
print(CIs)

0.7037037037037037
                 median     0.025     0.975
breast_cancer  0.811111  0.811111  0.811111
overall        0.811111  0.811111  0.811111


In [20]:
# default model over 1,000 iterations with scaled data
acc,CIs = calculate_GBC(scaled_data, False, False, 1000)
print(acc)
print(CIs)

0.6666666666666666
                 median     0.025     0.975
breast_cancer  0.772222  0.543937  0.961111
overall        0.772222  0.543937  0.961111


In [21]:
# default model using Principal Component Analysis to reduce dimensionality with scaled data
acc,CIs = calculate_GBC(scaled_data, True, False, 1)
print(acc)
print(CIs)

0.5555555555555556
                 median     0.025     0.975
breast_cancer  0.666667  0.666667  0.666667
overall        0.666667  0.666667  0.666667


In [22]:
# default model using Principal Component Analysis to reduce dimensionality over 1,000 iterations with scaled data
acc,CIs = calculate_GBC(scaled_data, True, False, 1000)
print(acc)
print(CIs)

0.6666666666666666
                 median     0.025     0.975
breast_cancer  0.783333  0.598843  0.917582
overall        0.783333  0.598843  0.917582


Now use a gradient boosting model with adjusted hyperparameters.

In [23]:
# adjusted model 
acc,CIs = calculate_GBC(data, False, True, 1)
print(acc)
print(CIs)

0.5925925925925926
                 median     0.025     0.975
breast_cancer  0.794444  0.794444  0.794444
overall        0.794444  0.794444  0.794444


In [24]:
# adjusted model over 1,000 iterations
acc,CIs = calculate_GBC(data, False, True, 1000)
print(acc)
print(CIs)

0.6296296296296297
                 median     0.025     0.975
breast_cancer  0.855556  0.648214  0.969172
overall        0.855556  0.648214  0.969172


In [25]:
# adjusted model using Principal Component Analysis to reduce dimensionality
acc,CIs = calculate_GBC(data, True, True, 1)
print(acc)
print(CIs)

0.8148148148148148
                 median     0.025     0.975
breast_cancer  0.794444  0.794444  0.794444
overall        0.794444  0.794444  0.794444


In [26]:
# adjusted model using Principal Component Analysis to reduce dimensionality over 1,000 iterations
acc,CIs = calculate_GBC(data, True, True, 1000)
print(acc)
print(CIs)

0.48148148148148145
                 median  0.025     0.975
breast_cancer  0.607955    0.4  0.806818
overall        0.607955    0.4  0.806818


And use the adjusted model with scaled data.

In [27]:
# adjusted model with scaled data
acc,CIs = calculate_GBC(scaled_data, False, True, 1)
print(acc)
print(CIs)

0.8148148148148148
                 median     0.025     0.975
breast_cancer  0.894444  0.894444  0.894444
overall        0.894444  0.894444  0.894444


In [28]:
# adjusted model over 1,000 iterations with scaled data
acc,CIs = calculate_GBC(scaled_data, False, True, 1000)
print(acc)
print(CIs)

0.6296296296296297
                 median     0.025     0.975
breast_cancer  0.852062  0.642857  0.969172
overall        0.852062  0.642857  0.969172


In [29]:
# adjusted model using Principal Component Analysis to reduce dimensionality with scaled data
acc,CIs = calculate_GBC(scaled_data, True, True, 1)
print(acc)
print(CIs)

0.5555555555555556
                 median     0.025     0.975
breast_cancer  0.638889  0.638889  0.638889
overall        0.638889  0.638889  0.638889


In [30]:
# adjusted model using Principal Component Analysis to reduce dimensionality over 1,000 iterations with scaled data
acc,CIs = calculate_GBC(scaled_data, True, True, 1000)
print(acc)
print(CIs)

0.6666666666666666
                median     0.025     0.975
breast_cancer  0.78022  0.593407  0.916667
overall        0.78022  0.593407  0.916667


When comparing all these model, it shows that using scaled data does not improve the model. This was expected of a tree-based model. However, the scaled data was still used as it did not reduce model performance and is still needed for PCA, which is a linear approach to reduce dimensionality. It also makes all three models more comparable. 
PCA did not improve but rather reduce model performance. This means that still a model with way many features than samples is used and other ways need to be found to reduce dimensionality. PCA reduces dimensionality unaware of the sample's class abels and is therefore considered an unsupervised approach. This makes PCA a sub-optimal approach for the classification problem of this thesis.

# Hyperparameter Optimization

In [31]:
# Random search without PCA, 1 iteration
acc,CIs = hyperparameter_optimization_GBC(data, False, "random", 1)
print(acc)
print(CIs)

Fitting 10 folds for each of 500 candidates, totalling 5000 fits
0.7777777777777778
                 median     0.025     0.975
breast_cancer  0.855556  0.855556  0.855556
overall        0.855556  0.855556  0.855556


In [32]:
# Random search without PCA, 1000 iterations
acc,CIs = hyperparameter_optimization_GBC(data, False, "random", 1000)
print(acc)
print(CIs)

Fitting 10 folds for each of 500 candidates, totalling 5000 fits
0.6296296296296297
                 median     0.025     0.975
breast_cancer  0.892045  0.738889  0.983516
overall        0.892045  0.738889  0.983516


In [33]:
# Random search with PCA, 1000 iterations
acc,CIs = hyperparameter_optimization_GBC(data, True, "random", 1000)
print(acc)
print(CIs)

Fitting 10 folds for each of 500 candidates, totalling 5000 fits
0.5185185185185185
                 median     0.025     0.975
breast_cancer  0.615385  0.386364  0.805587
overall        0.615385  0.386364  0.805587


In [34]:
# Grid search without PCA, 1 iteration
acc,CIs = hyperparameter_optimization_GBC(data, False, "grid", 1)
print(acc)
print(CIs)

Fitting 10 folds for each of 4212 candidates, totalling 42120 fits
0.7037037037037037
               median  0.025  0.975
breast_cancer     0.8    0.8    0.8
overall           0.8    0.8    0.8


In [35]:
# Grid search without PCA, 1000 iterations
acc,CIs = hyperparameter_optimization_GBC(data, False, "grid", 1000)
print(acc)
print(CIs)

Fitting 10 folds for each of 4212 candidates, totalling 42120 fits
0.8148148148148148
                 median     0.025     0.975
breast_cancer  0.901099  0.761364  0.987669
overall        0.901099  0.761364  0.987669


In [36]:
# Grid search with PCA, 1000 iterations
acc,CIs = hyperparameter_optimization_GBC(data, True, "grid", 1000)
print(acc)
print(CIs)

Fitting 10 folds for each of 4212 candidates, totalling 42120 fits
0.5555555555555556
                 median     0.025     0.975
breast_cancer  0.605556  0.388873  0.802282
overall        0.605556  0.388873  0.802282


Hyperparameter optimization is highly variable between different runs and gives worse results compared to the default model, which should not happen. If the default parameter give a better perfromance these should be used in HPO. (HPO optimizes for accuracy and therefore this should at least be the same.) This indicates that the optimization does not work. In a dataset tht suffers from the curse of dimensionality determining the optimal value for each parameter is challenging. In this scenario it might be better to set the parameter manually according to the requirement of the data.

# Feature Selection

In [37]:
### GBC with selected features on default parameters
# use the gbc_all_default to select feature importance

feature_imp_gbc = pd.DataFrame(data=gbc_all_default.feature_importances_, columns=["importance"], index=features).sort_values(by="importance", ascending=False)
n_features = 20
feature_imp_gbc.head(n_features)

Unnamed: 0,importance
central_coverage_PAX5,0.456451
amplitude190_NRF1,0.160648
amplitude190_TFEB,0.067236
amplitude190_ESR1,0.054497
central_coverage_ZNF444,0.041075
amplitude190_PDX1,0.040014
central_coverage_NEUROG2,0.02233
nucleosome_spacing_fft_PBX1,0.018878
amplitude190_MYRF,0.018604
mean_coverage_MAX,0.016969


In [38]:
# reduce dataframe
reduced_gbc = feature_imp_gbc.sort_values(by="importance", ascending=False).head(n_features).index
features_reduced_gbc = data.loc[:,reduced_gbc]
features_reduced_gbc["phenotype"] = data["phenotype"]
features_reduced_gbc.head()

Unnamed: 0,central_coverage_PAX5,amplitude190_NRF1,amplitude190_TFEB,amplitude190_ESR1,central_coverage_ZNF444,amplitude190_PDX1,central_coverage_NEUROG2,nucleosome_spacing_fft_PBX1,amplitude190_MYRF,mean_coverage_MAX,...,mean_coverage_MIXL1,amplitude190_MYNN,mean_coverage_OTX2,amplitude190_NFIL3,nucleosome_spacing_fft_NR2F6,central_coverage_ZNF770,amplitude190_IRF2,mean_coverage_ZNF770,mean_coverage_YY1,phenotype
EGAF00002727253,0.95198,11.82647,15.301401,7.363483,0.746554,9.973878,1.044305,240.0,55.492314,0.999359,...,0.999239,33.163644,0.999933,30.050948,240.0,0.954014,86.200811,1.00038,0.999592,1.0
EGAF00002727240,1.034202,19.753629,2.408898,12.397321,0.864042,12.034061,1.049872,240.0,56.578862,1.000178,...,0.999842,13.486429,0.999592,38.213567,213.0,0.949956,99.149338,0.999825,0.999738,1.0
EGAF00002727280,1.181263,17.263602,19.997528,6.769311,0.917827,6.398794,1.221325,192.0,54.74213,0.999924,...,0.999406,28.467568,0.99943,34.523266,192.0,0.939283,92.459628,0.999859,1.000133,1.0
EGAF00002727290,0.992708,19.311146,15.781319,7.351755,0.839985,7.492604,1.052477,192.0,62.181978,0.999495,...,0.999601,13.561818,0.999991,38.500391,213.0,0.898705,116.488705,0.99998,0.999969,1.0
EGAF00002727254,1.153631,29.880457,28.730186,5.806152,0.887759,7.786167,1.217408,213.0,57.841763,0.999716,...,0.999722,24.124371,1.000202,34.816523,240.0,0.990337,86.858993,0.999686,0.999853,1.0


In [39]:
# make a new model on the reduced features
train_reduced_default,test_reduced_default = train_test_split(features_reduced_gbc, test_size = 0.25, random_state = 42)

# train reduced model
X_train_reduced_default = train_reduced_default.drop(["phenotype"], axis = 1)
y_train_reduced_default = train_reduced_default.loc[:,"phenotype"]
X_test_reduced_default = test_reduced_default.drop(["phenotype"], axis = 1)
y_test_reduced_default = test_reduced_default.loc[:,"phenotype"]

gbc_reduced_default = GradientBoostingClassifier(random_state=42)
gbc_reduced_default.fit(X_train_reduced_default, y_train_reduced_default)

# test model
predict_train = gbc_reduced_default.predict(X_train_reduced_default)
predict_test = gbc_reduced_default.predict(X_test_reduced_default)
probability_test = gbc_reduced_default.predict_proba(X_test_reduced_default)

accuracy_score(y_test_reduced_default, predict_test)

0.6666666666666666

In [40]:
AUCs,CIs = get_AUC_griffin(probability_test,data)
CIs

Unnamed: 0,median,0.025,0.975
1.0X,0.884615,0.884615,0.884615
2.0X,0.707692,0.707692,0.707692
3.0X,0.923077,0.923077,0.923077
breast_cancer,0.763736,0.763736,0.763736
overall,0.763736,0.763736,0.763736


Training a model only with the 20 features of highest feature importance does not improve model perfomance. It seems that the model with all features already ony includes features of high importance and therefore this approach is not needed.