# About the notebook
This notebook computes logistic regression models based on the features generated as part of the thesis. Different adjustments of the models are tested (e.g. scaled data, Principal Component Analysis for dimensionality reduction, adjusting the hyperparameters, hyperparameter optimization).

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from scipy.signal import savgol_filter

from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_curve,auc
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestRegressor,RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from matplotlib import pyplot as plt
from sklearn.decomposition import PCA
from sklearn.metrics import roc_curve,auc,accuracy_score
from sklearn.preprocessing import StandardScaler

# Read in the data

In [2]:
# params
# Run configuration. The values below are concatenated into the names of the
# feature and metadata files that the following cells read.
cancer_type = "breast_cancer"  # cohort tag used in the cancer feature and metadata file names
GC = "corrected"  # part of the feature file names; presumably the GC-bias correction state -- TODO confirm
score = "MIDPOINT"  # part of the feature file names; presumably the coverage score type -- TODO confirm
amplitude = "FFT"  # part of the feature file names; presumably how the amplitude features were derived -- TODO confirm

In [3]:
# cancer features
# Both files are tab-separated (the feature file too, despite its .csv suffix).
feature_dir = "/data/gpfs-1/groups/ag_kircher/cfDNA-analysis/lea/cfDNA_classification_analyses/features"

file = feature_dir+"/DELFI_"+cancer_type+"_"+GC+"_"+score+"_"+amplitude+"_features.csv"
c_features = pd.read_csv(file, sep="\t", index_col=0)

file = feature_dir+"/DELFI_"+cancer_type+"_metadata.tsv"
c_meta = pd.read_csv(file, sep='\t', index_col='sample_name')

# The feature index looks like "<sample>_<p>_<score>" (e.g. "EGAF00002727253_c_MIDPOINT").
# Split it at the first two underscores and re-index by the sample id so the rows
# line up with the metadata, which is indexed by sample_name.
# `n` must be passed by keyword: pandas >= 2.0 rejects it as a positional argument.
# NOTE(review): assumes sample ids themselves contain no underscore -- confirm.
c_features = c_features.reset_index(drop=False)
c_features[['sample','p','score']] = c_features['index'].str.split('_', n=2, expand=True)
c_features = c_features.set_index('sample')

# column-wise concat aligns features and metadata on the sample id
cancer = pd.concat([c_features, c_meta], axis=1)

In [4]:
# healthy features
# Both files are tab-separated (the feature file too, despite its .csv suffix).
feature_dir = "/data/gpfs-1/groups/ag_kircher/cfDNA-analysis/lea/cfDNA_classification_analyses/features"

file = feature_dir+"/DELFI_healthy_"+GC+"_"+score+"_"+amplitude+"_features.csv"
h_features = pd.read_csv(file, sep="\t", index_col=0)

file = feature_dir+"/DELFI_healthy_metadata.tsv"
h_meta = pd.read_csv(file, sep='\t', index_col='sample_name')

# Split the "<sample>_<p>_<score>" row identifier at the first two underscores and
# re-index by the sample id so the rows line up with the metadata (indexed by sample_name).
# `n` must be passed by keyword: pandas >= 2.0 rejects it as a positional argument.
# NOTE(review): assumes sample ids themselves contain no underscore -- confirm.
h_features = h_features.reset_index(drop=False)
h_features[['sample','p','score']] = h_features['index'].str.split('_', n=2, expand=True)
h_features = h_features.set_index('sample')

# column-wise concat aligns features and metadata on the sample id
healthy = pd.concat([h_features, h_meta], axis=1)

In [5]:
# stack the cancer cohort on top of the healthy cohort
data = pd.concat([cancer, healthy], axis=0)

# Model inputs are the central-coverage, mean-coverage and amplitude columns;
# the nucleosome_spacing_fft columns are deliberately not part of the selection.
all_columns = data.columns
is_feature = (
    all_columns.str.startswith('central_cov')
    | all_columns.str.startswith('mean_cov')
    | all_columns.str.startswith('amplitude')
)
features = all_columns[is_feature]
# same selection plus the phenotype label column
features_phenotype = all_columns[all_columns.str.startswith('phenotype') | is_feature]

data.head()

Unnamed: 0,index,phenotype,central_coverage_NFKB2,mean_coverage_NFKB2,amplitude190_NFKB2,nucleosome_spacing_fft_NFKB2,central_coverage_TP73,mean_coverage_TP73,amplitude190_TP73,nucleosome_spacing_fft_TP73,...,Gender,Stage,Age,Status,% GC,Length,Median,≥ 1X,≥ 5X,fraction
EGAF00002727253,EGAF00002727253_c_MIDPOINT,1.0,0.921859,0.999899,5.96487,148.0,0.993629,0.999585,15.140059,192.0,...,F,I,54.0,breast_cancer,41%,140 bp,2.0X,88.0%,1.0%,0.06429
EGAF00002727240,EGAF00002727240_c_MIDPOINT,1.0,0.966934,1.0002,19.353707,192.0,1.033113,1.000039,9.433198,240.0,...,F,II,61.0,breast_cancer,42%,143 bp,2.0X,88.0%,3.0%,0.3644
EGAF00002727280,EGAF00002727280_c_MIDPOINT,1.0,1.161236,0.999987,11.505221,213.0,1.115174,0.999805,17.278634,192.0,...,F,II,37.0,breast_cancer,42%,134 bp,2.0X,86.0%,1.0%,0.09767
EGAF00002727290,EGAF00002727290_c_MIDPOINT,1.0,1.027811,0.999776,20.178665,192.0,1.038958,1.000246,3.898227,160.0,...,F,II,48.0,breast_cancer,41%,139 bp,2.0X,89.0%,2.0%,0.06922
EGAF00002727254,EGAF00002727254_c_MIDPOINT,1.0,1.118326,1.000246,10.611337,213.0,1.166457,0.999947,1.834101,213.0,...,F,II,47.0,breast_cancer,41%,134 bp,1.0X,86.0%,1.0%,0.1147


In [6]:
# Standardise the feature columns on a fresh copy of the combined cohorts,
# leaving the unscaled `data` frame untouched.
# NOTE(review): the scaler is fitted on all samples before any train/test split,
# so test-set statistics leak into the scaling -- consider fitting per split.
scaled_data = pd.concat([cancer, healthy], axis=0)
scaler = StandardScaler()
scaled_data[features] = scaler.fit_transform(scaled_data[features])

scaled_data.head()

Unnamed: 0,index,phenotype,central_coverage_NFKB2,mean_coverage_NFKB2,amplitude190_NFKB2,nucleosome_spacing_fft_NFKB2,central_coverage_TP73,mean_coverage_TP73,amplitude190_TP73,nucleosome_spacing_fft_TP73,...,Gender,Stage,Age,Status,% GC,Length,Median,≥ 1X,≥ 5X,fraction
EGAF00002727253,EGAF00002727253_c_MIDPOINT,1.0,-1.587964,-0.402804,-1.431977,148.0,-1.235524,-2.211313,0.879156,192.0,...,F,I,54.0,breast_cancer,41%,140 bp,2.0X,88.0%,1.0%,0.06429
EGAF00002727240,EGAF00002727240_c_MIDPOINT,1.0,-0.575681,1.081786,0.380465,192.0,-0.233665,0.148223,-0.184884,240.0,...,F,II,61.0,breast_cancer,42%,143 bp,2.0X,88.0%,3.0%,0.3644
EGAF00002727280,EGAF00002727280_c_MIDPOINT,1.0,3.787805,0.031149,-0.681981,213.0,1.848499,-1.066968,1.277891,192.0,...,F,II,37.0,breast_cancer,42%,134 bp,2.0X,86.0%,1.0%,0.09767
EGAF00002727290,EGAF00002727290_c_MIDPOINT,1.0,0.791448,-1.014266,0.49214,192.0,-0.08535,1.227604,-1.216876,160.0,...,F,II,48.0,breast_cancer,41%,139 bp,2.0X,89.0%,2.0%,0.06922
EGAF00002727254,EGAF00002727254_c_MIDPOINT,1.0,2.82417,1.308605,-0.802986,213.0,3.14975,-0.327229,-1.60173,213.0,...,F,II,47.0,breast_cancer,41%,134 bp,1.0X,86.0%,1.0%,0.1147


In [7]:
# hold out 25% of the unscaled samples for testing; the other 75% form the training set
train, test = train_test_split(data, test_size=0.25, random_state=100)

X_train, y_train = train[features], train["phenotype"]
X_test, y_test = test[features], test["phenotype"]

In [8]:
# hold out 25% of the scaled samples for testing; the other 75% form the training set
# NOTE(review): this uses seed 42 while the unscaled split uses seed 100, so the two
# splits do not contain the same samples.
train_scaled, test_scaled = train_test_split(scaled_data, test_size=0.25, random_state=42)

X_train_scaled, y_train_scaled = train_scaled[features], train_scaled["phenotype"]
X_test_scaled, y_test_scaled = test_scaled[features], test_scaled["phenotype"]

# Functions

In [9]:
def calculate_logReg(data, PCA_flag, adjustment_flag, iterations):
    """Evaluate a class-balanced logistic regression over repeated random splits.

    For every iteration ``i`` the rows of ``data`` are split 75/25 with
    ``random_state = i + 100``; a ``LogisticRegression(class_weight='balanced',
    max_iter=500)`` is fitted on the training part using the module-level
    ``features`` columns, and the predicted probability of class 1 is stored
    for each test sample. ROC AUCs are then computed per iteration, once over
    all test samples ('overall') and once per cancer ``Status`` group against
    all healthy test samples.

    Parameters
    ----------
    data : pandas.DataFrame
        One row per sample. Must contain the ``features`` columns, a
        ``phenotype`` column (0 = healthy, 1 = cancer) and a ``Status`` column.
    PCA_flag : bool
        If true, a PCA is fitted on the training split only and the model is
        trained on the leading components that together explain at least 80%
        of the variance; the test split is projected with the same PCA.
    adjustment_flag : bool
        If true, the model uses ``solver='liblinear'`` instead of the default solver.
    iterations : int
        Number of random train/test splits (at least 1).

    Returns
    -------
    (acc, CIs)
        ``acc`` is the test accuracy of the LAST split only (not an average).
        ``CIs`` is a DataFrame indexed by group with the columns ``median``,
        ``0.025`` and ``0.975`` (AUC quantiles over all iterations).

    Raises
    ------
    ValueError
        If ``iterations`` is smaller than 1.
    """
    if iterations < 1:
        raise ValueError("iterations must be at least 1")

    fraction_variance = .8
    test_probabilities = {}

    # Loop for each iteration
    for i in range(iterations):

        train, test = train_test_split(data, test_size=0.25, random_state=i+100)
        X_train = train[features]
        y_train = train.loc[:, "phenotype"]
        X_test = test[features]
        y_test = test.loc[:, "phenotype"]

        if PCA_flag:
            # perform PCA on the training set only and project the test set with it
            n_components = min(len(features), len(X_train))
            pca = PCA(n_components=n_components, svd_solver='randomized', random_state=100)
            pc_names = ['PC_'+str(m) for m in range(n_components)]
            principal_components = pd.DataFrame(data=pca.fit_transform(X_train), columns=pc_names, index=X_train.index)
            test_principal_components = pd.DataFrame(data=pca.transform(X_test), columns=pc_names, index=X_test.index)

            # Find the smallest number of leading principal components that reaches the
            # variance threshold. If the threshold is never reached, keep all of them
            # (the previous loop silently dropped the last component in that case).
            n_selected = len(pc_names)
            explained = 0.0
            for k, ratio in enumerate(pca.explained_variance_ratio_, start=1):
                explained += ratio
                if explained >= fraction_variance:
                    n_selected = k
                    break
            pca_features = pc_names[:n_selected]

            X_train = principal_components[pca_features]
            X_test = test_principal_components[pca_features]

        if adjustment_flag:
            model = LogisticRegression(class_weight='balanced', max_iter=500, solver='liblinear')
        else:
            model = LogisticRegression(class_weight='balanced', max_iter=500)
        model.fit(X_train, y_train)

        # predict the test data
        pred = model.predict(X_test)
        prob = model.predict_proba(X_test)

        # save results; acc is overwritten each iteration, so only the last split survives
        test_probabilities[i] = pd.Series(prob[:, 1], index=X_test.index)
        acc = accuracy_score(y_test, pred)

    # One column per iteration, NaN where a sample was not in that iteration's test set.
    # Building the frame once avoids the fragmentation caused by inserting columns one by one.
    probabilities = pd.DataFrame(test_probabilities, index=data.index)
    probabilities = probabilities.merge(data[['phenotype', 'Status']], left_index=True, right_index=True)

    auc_records = []
    for i in range(iterations):
        current = probabilities.loc[probabilities[i].notnull(), ['phenotype', 'Status', i]]

        # overall AUC
        fpr, tpr, _ = roc_curve(current['phenotype'], current[i])
        current_dict = {'overall': auc(fpr, tpr)}

        # separate out the healthy samples to be used in every AUC
        healthy_df = current[current['phenotype'] == 0]
        cancer_df = current[current['phenotype'] == 1]

        for group, df in cancer_df.groupby('Status'):
            if group == 'Duodenal_Cancer':
                continue

            # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement
            df2 = pd.concat([df, healthy_df], ignore_index=True)
            fpr, tpr, _ = roc_curve(df2['phenotype'], df2[i])
            current_dict[group] = auc(fpr, tpr)

        auc_records.append(current_dict)

    # alphabetical group order, matching the row order of the previously recorded outputs
    AUCs = pd.DataFrame(auc_records).sort_index(axis=1)

    CIs = pd.DataFrame([AUCs.median(), AUCs.quantile(.025), AUCs.quantile(.975)]).T
    # the unnamed median Series is labelled 'Unnamed 0' by the DataFrame constructor
    CIs = CIs.rename(columns={'Unnamed 0': 'median'})
    return (acc, CIs)

In [10]:
def hyperparameter_optimization_logReg(data, PCA_flag, iterations):
    """Tune C by grid search, then evaluate logistic regression over repeated splits.

    For every iteration ``i`` the rows of ``data`` are split 75/25 with
    ``random_state = i + 100``. On the first split only, the regularisation
    strength ``C`` is selected from a fixed grid by a 10-fold stratified
    ``GridSearchCV`` (default scoring) on the training part. That ``C`` is then
    reused for a ``LogisticRegression(class_weight='balanced', max_iter=500,
    solver='liblinear')`` fitted on the training part of every split. ROC AUCs
    are computed per iteration, once over all test samples ('overall') and once
    per cancer ``Status`` group against all healthy test samples.

    NOTE(review): because C is tuned on split 0 only, it is not re-optimised for
    the other splits, and samples used for tuning can appear in later test sets.

    Parameters
    ----------
    data : pandas.DataFrame
        One row per sample. Must contain the module-level ``features`` columns,
        a ``phenotype`` column (0 = healthy, 1 = cancer) and a ``Status`` column.
    PCA_flag : bool
        If true, a PCA is fitted on the training split only and the model is
        trained on the leading components that together explain at least 80%
        of the variance; the test split is projected with the same PCA.
    iterations : int
        Number of random train/test splits (at least 1).

    Returns
    -------
    (acc, CIs)
        ``acc`` is the test accuracy of the LAST split only (not an average).
        ``CIs`` is a DataFrame indexed by group with the columns ``median``,
        ``0.025`` and ``0.975`` (AUC quantiles over all iterations).

    Raises
    ------
    ValueError
        If ``iterations`` is smaller than 1.
    """
    if iterations < 1:
        raise ValueError("iterations must be at least 1")

    fraction_variance = .8
    test_probabilities = {}

    # Loop for each iteration
    for i in range(iterations):

        train, test = train_test_split(data, test_size=0.25, random_state=i+100)
        X_train = train[features]
        y_train = train.loc[:, "phenotype"]
        X_test = test[features]
        y_test = test.loc[:, "phenotype"]

        if PCA_flag:
            # perform PCA on the training set only and project the test set with it
            n_components = min(len(features), len(X_train))
            pca = PCA(n_components=n_components, svd_solver='randomized', random_state=100)
            pc_names = ['PC_'+str(m) for m in range(n_components)]
            principal_components = pd.DataFrame(data=pca.fit_transform(X_train), columns=pc_names, index=X_train.index)
            test_principal_components = pd.DataFrame(data=pca.transform(X_test), columns=pc_names, index=X_test.index)

            # Find the smallest number of leading principal components that reaches the
            # variance threshold. If the threshold is never reached, keep all of them
            # (the previous loop silently dropped the last component in that case).
            n_selected = len(pc_names)
            explained = 0.0
            for k, ratio in enumerate(pca.explained_variance_ratio_, start=1):
                explained += ratio
                if explained >= fraction_variance:
                    n_selected = k
                    break
            pca_features = pc_names[:n_selected]

            X_train = principal_components[pca_features]
            X_test = test_principal_components[pca_features]

        if i == 0:
            # tune C once, on the training part of the first split
            hyperparameters = {'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000]}
            cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=i+100)
            model = LogisticRegression(class_weight='balanced', max_iter=500, solver='liblinear')
            search = GridSearchCV(estimator=model, param_grid=hyperparameters, cv=cv, n_jobs=1)
            search.fit(X_train, y_train)
            best_C = search.best_params_['C']

        # train a new model on the full training dataset with the selected C
        model = LogisticRegression(class_weight='balanced', max_iter=500, C=best_C, solver='liblinear')
        model.fit(X_train, y_train)

        # predict the test data
        pred = model.predict(X_test)
        prob = model.predict_proba(X_test)

        # save results; acc is overwritten each iteration, so only the last split survives
        test_probabilities[i] = pd.Series(prob[:, 1], index=X_test.index)
        acc = accuracy_score(y_test, pred)

    # One column per iteration, NaN where a sample was not in that iteration's test set.
    # Building the frame once avoids the fragmentation caused by inserting columns one by one.
    probabilities = pd.DataFrame(test_probabilities, index=data.index)
    probabilities = probabilities.merge(data[['phenotype', 'Status']], left_index=True, right_index=True)

    auc_records = []
    for i in range(iterations):
        current = probabilities.loc[probabilities[i].notnull(), ['phenotype', 'Status', i]]

        # overall AUC
        fpr, tpr, _ = roc_curve(current['phenotype'], current[i])
        current_dict = {'overall': auc(fpr, tpr)}

        # separate out the healthy samples to be used in every AUC
        healthy_df = current[current['phenotype'] == 0]
        cancer_df = current[current['phenotype'] == 1]

        for group, df in cancer_df.groupby('Status'):
            if group == 'Duodenal_Cancer':
                continue

            # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement
            df2 = pd.concat([df, healthy_df], ignore_index=True)
            fpr, tpr, _ = roc_curve(df2['phenotype'], df2[i])
            current_dict[group] = auc(fpr, tpr)

        auc_records.append(current_dict)

    # alphabetical group order, matching the row order of the previously recorded outputs
    AUCs = pd.DataFrame(auc_records).sort_index(axis=1)

    CIs = pd.DataFrame([AUCs.median(), AUCs.quantile(.025), AUCs.quantile(.975)]).T
    # the unnamed median Series is labelled 'Unnamed 0' by the DataFrame constructor
    CIs = CIs.rename(columns={'Unnamed 0': 'median'})
    return (acc, CIs)

# Logistic Regression
First train a logistic regression model with all features on default parameters

In [11]:
# default solver, all features, unscaled data, a single train/test split
acc, CIs = calculate_logReg(data, PCA_flag=False, adjustment_flag=False, iterations=1)
print(acc)
print(CIs)

0.5185185185185185
                 median     0.025     0.975
breast_cancer  0.611111  0.611111  0.611111
overall        0.611111  0.611111  0.611111


In [12]:
# default solver, all features, unscaled data, 1,000 random train/test splits
acc, CIs = calculate_logReg(data, PCA_flag=False, adjustment_flag=False, iterations=1000)
print(acc)
print(CIs)

0.6666666666666666
                 median     0.025     0.975
breast_cancer  0.758242  0.593344  0.905582
overall        0.758242  0.593344  0.905582


In [13]:
# default solver with PCA-reduced features, unscaled data, a single train/test split
acc, CIs = calculate_logReg(data, PCA_flag=True, adjustment_flag=False, iterations=1)
print(acc)
print(CIs)

0.5185185185185185
                 median     0.025     0.975
breast_cancer  0.438889  0.438889  0.438889
overall        0.438889  0.438889  0.438889


In [14]:
# default solver with PCA-reduced features, unscaled data, 1,000 random train/test splits
acc, CIs = calculate_logReg(data, PCA_flag=True, adjustment_flag=False, iterations=1000)
print(acc)
print(CIs)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

0.6296296296296297
                 median     0.025     0.975
breast_cancer  0.681319  0.499863  0.840997
overall        0.681319  0.499863  0.840997


Rerun the models using scaled data.

In [15]:
# default solver, all features, standardised data, a single train/test split
acc, CIs = calculate_logReg(scaled_data, PCA_flag=False, adjustment_flag=False, iterations=1)
print(acc)
print(CIs)

0.6296296296296297
                 median     0.025     0.975
breast_cancer  0.711111  0.711111  0.711111
overall        0.711111  0.711111  0.711111


In [16]:
# default solver, all features, standardised data, 1,000 random train/test splits
acc, CIs = calculate_logReg(scaled_data, PCA_flag=False, adjustment_flag=False, iterations=1000)
print(acc)
print(CIs)

0.6296296296296297
                 median  0.025     0.975
breast_cancer  0.852273    0.7  0.966667
overall        0.852273    0.7  0.966667


In [17]:
# default solver with PCA-reduced features, standardised data, a single train/test split
acc, CIs = calculate_logReg(scaled_data, PCA_flag=True, adjustment_flag=False, iterations=1)
print(acc)
print(CIs)

0.6296296296296297
                 median     0.025     0.975
breast_cancer  0.616667  0.616667  0.616667
overall        0.616667  0.616667  0.616667


In [18]:
# default solver with PCA-reduced features, standardised data, 1,000 random train/test splits
acc, CIs = calculate_logReg(scaled_data, PCA_flag=True, adjustment_flag=False, iterations=1000)
print(acc)
print(CIs)

0.6296296296296297
                 median     0.025     0.975
breast_cancer  0.811438  0.633267  0.939651
overall        0.811438  0.633267  0.939651


Now use a logistic regression model with adjusted hyperparameters (the `liblinear` solver instead of the default solver).

In [19]:
# liblinear solver, all features, unscaled data, a single train/test split
acc, CIs = calculate_logReg(data, PCA_flag=False, adjustment_flag=True, iterations=1)
print(acc)
print(CIs)

0.5185185185185185
                 median     0.025     0.975
breast_cancer  0.611111  0.611111  0.611111
overall        0.611111  0.611111  0.611111


In [20]:
# liblinear solver, all features, unscaled data, 1,000 random train/test splits
acc, CIs = calculate_logReg(data, PCA_flag=False, adjustment_flag=True, iterations=1000)
print(acc)
print(CIs)

0.6666666666666666
                 median     0.025     0.975
breast_cancer  0.758533  0.590842  0.905556
overall        0.758533  0.590842  0.905556


In [21]:
# liblinear solver with PCA-reduced features, unscaled data, a single train/test split
acc, CIs = calculate_logReg(data, PCA_flag=True, adjustment_flag=True, iterations=1)
print(acc)
print(CIs)

0.5185185185185185
                 median     0.025     0.975
breast_cancer  0.427778  0.427778  0.427778
overall        0.427778  0.427778  0.427778


In [22]:
# liblinear solver with PCA-reduced features, unscaled data, 1,000 random train/test splits
acc, CIs = calculate_logReg(data, PCA_flag=True, adjustment_flag=True, iterations=1000)
print(acc)
print(CIs)

0.6296296296296297
                 median     0.025     0.975
breast_cancer  0.681319  0.494504  0.841258
overall        0.681319  0.494504  0.841258


And use the adjusted model with scaled data.

In [23]:
# liblinear solver, all features, standardised data, a single train/test split
acc, CIs = calculate_logReg(scaled_data, PCA_flag=False, adjustment_flag=True, iterations=1)
print(acc)
print(CIs)

0.7037037037037037
                 median     0.025     0.975
breast_cancer  0.716667  0.716667  0.716667
overall        0.716667  0.716667  0.716667


In [24]:
# liblinear solver, all features, standardised data, 1,000 random train/test splits
acc, CIs = calculate_logReg(scaled_data, PCA_flag=False, adjustment_flag=True, iterations=1000)
print(acc)
print(CIs)

0.5925925925925926
               median     0.025     0.975
breast_cancer    0.85  0.694436  0.966667
overall          0.85  0.694436  0.966667


In [25]:
# liblinear solver with PCA-reduced features, standardised data, a single train/test split
acc, CIs = calculate_logReg(scaled_data, PCA_flag=True, adjustment_flag=True, iterations=1)
print(acc)
print(CIs)

0.5925925925925926
               median  0.025  0.975
breast_cancer     0.6    0.6    0.6
overall           0.6    0.6    0.6


In [26]:
# liblinear solver with PCA-reduced features, standardised data, 1,000 random train/test splits
acc, CIs = calculate_logReg(scaled_data, PCA_flag=True, adjustment_flag=True, iterations=1000)
print(acc)
print(CIs)

0.6666666666666666
                 median     0.025   0.975
breast_cancer  0.807692  0.631696  0.9375
overall        0.807692  0.631696  0.9375


When comparing all these models, it shows that using scaled data improves the model. This was expected of a linear model. PCA did not improve but rather reduced model performance. This means that a model with far more features than samples is still being used, and other ways need to be found to reduce dimensionality. PCA reduces dimensionality unaware of the samples' class labels and is therefore considered an unsupervised approach. This makes PCA a sub-optimal approach for the classification problem of this thesis.

# Hyperparameter Optimization

In [27]:
# grid-searched C, all features, standardised data, a single train/test split
acc, CIs = hyperparameter_optimization_logReg(scaled_data, PCA_flag=False, iterations=1)
print(acc)
print(CIs)

0.5185185185185185
               median  0.025  0.975
breast_cancer    0.55   0.55   0.55
overall          0.55   0.55   0.55


In [28]:
# grid-searched C, all features, standardised data, 1,000 random train/test splits
acc, CIs = hyperparameter_optimization_logReg(scaled_data, PCA_flag=False, iterations=1000)
print(acc)
print(CIs)

0.6666666666666666
                 median     0.025     0.975
breast_cancer  0.747253  0.538799  0.916667
overall        0.747253  0.538799  0.916667


In [29]:
# grid-searched C with PCA-reduced features, standardised data, a single train/test split
acc, CIs = hyperparameter_optimization_logReg(scaled_data, PCA_flag=True, iterations=1)
print(acc)
print(CIs)

0.5925925925925926
                 median     0.025     0.975
breast_cancer  0.666667  0.666667  0.666667
overall        0.666667  0.666667  0.666667


In [30]:
# grid-searched C with PCA-reduced features, standardised data, 1,000 random train/test splits
acc, CIs = hyperparameter_optimization_logReg(scaled_data, PCA_flag=True, iterations=1000)
print(acc)
print(CIs)

0.7037037037037037
                 median     0.025     0.975
breast_cancer  0.817914  0.642045  0.944444
overall        0.817914  0.642045  0.944444


Hyperparameter optimization is highly variable between different runs and gives worse results compared to the default model, which should not happen. If the default parameters give a better performance, these should be selected by the HPO. (HPO optimizes for accuracy and therefore the result should be at least the same.) This indicates that the optimization does not work. In a dataset that suffers from the curse of dimensionality, determining the optimal value for each parameter is challenging. In this scenario it might be better to set the parameters manually according to the requirements of the data.