In [1]:
import numpy as np
import pandas as pd
import os

# Feature Importance Analysis

In [2]:
def k_fold_feature_importance_metrics(cph_file_name, rsf_file_name, sorted_feature_names): 
    regression_coefficients = []
    p_values = []
    rsf_weights = []
    for i in range(5):
        ### CPH feature importance
        fold_cph_file = "split" + str(i) + "/results/" + cph_file_name
        cph_table = pd.read_html(fold_cph_file)
        assert len(cph_table) == 1
        fold_cph_df = cph_table[0]
        fold_cph_df.rename(columns={"Unnamed: 0":"Feature"}, inplace=True)

        # Check that the feature order is the same for all folds
        fold_cph_df.sort_values(by=["Feature"], inplace = True)
        assert list(fold_cph_df["Feature"]) == sorted_feature_names

        regression_coefficients.append(fold_cph_df["coef"].to_numpy())
        p_values.append(fold_cph_df["p"].to_numpy())

        ### RSF feature importance
        fold_rsf_file = "split" + str(i) + "/results/" + rsf_file_name
        rsf_table = pd.read_html(fold_rsf_file)
        assert len(rsf_table) == 1
        fold_rsf_df = rsf_table[0]

        # Check that the feature order is the same for all folds
        fold_rsf_df.sort_values(by=["Feature"], inplace = True)
        assert list(fold_rsf_df["Feature"]) == sorted_feature_names

        fold_rsf_df[["Weight Avg", "Weight Std"]] = fold_rsf_df["Weight"].str.split("±", expand=True).astype("float")
        rsf_weights.append(fold_rsf_df["Weight Avg"])

    regression_coefficients = np.stack(regression_coefficients, axis=0)
    avg_coef = np.mean(regression_coefficients, axis=0)
    std_coef = np.std(regression_coefficients, axis=0)
    str_regression_coefficients = ["{:.3f} ± {:.3f}".format(mean, std) for (mean, std) in zip(avg_coef, std_coef)]

    p_values = np.stack(p_values, axis=0)
    avg_p = np.mean(p_values, axis = 0)
    std_p = np.mean(p_values, axis = 0)
    str_p_values = ["{:.3f} ± {:.3f}".format(mean, std) for (mean, std) in zip(avg_p, std_p)]

    rsf_weights = np.stack(rsf_weights, axis=0)
    weight_avg_across_folds = np.mean(rsf_weights, axis=0)
    weight_std_across_folds = np.std(rsf_weights, axis=0)
    str_rsf_weights = ["{:.3f} ± {:.3f}".format(mean, std) for (mean, std) in zip(weight_avg_across_folds, weight_std_across_folds)]

    k_fold_df = pd.DataFrame(data={"Feature": sorted_feature_names, "CPH coefficient": str_regression_coefficients, "CPH p-value": str_p_values, "RSF Importance Weight": str_rsf_weights})
    
    return k_fold_df

## Clinical

In [3]:
clin_feature_names = [
        'Age',
        'BMI',
        'Pack-year',
        'Smoking-start-age',
        'Cigarettes-per-day',
        'Number-of-smoking-years',
        'High-school-graduate',
        'Post-HS-training',
        'Associate-degree',
        'Bachelors-degree',
        'Graduate-school',
        'Female',
        'Non-white',
        'Smoking-at-the-start-of-trial',
        'Lived-with-smoker',
        'Worked-with-smoker',
        'Cancer-prior-to-trial',
        'Family-member-had-cancer']

clin_feature_names.sort()

In [4]:
cph_table = pd.read_html("split4/results/cph_clinical_all.html")

In [5]:
cph_file_name = "cph_clinical_all.html"
rsf_file_name = "rsf_clinical_all.html"

clinical_df = k_fold_feature_importance_metrics(cph_file_name, rsf_file_name, clin_feature_names)
display(clinical_df)

Unnamed: 0,Feature,CPH coefficient,CPH p-value,RSF Importance Weight
0,Age,-0.028 ± 0.037,0.853 ± 0.853,0.061 ± 0.017
1,Associate-degree,0.096 ± 0.141,0.794 ± 0.794,0.013 ± 0.006
2,BMI,0.028 ± 0.034,0.833 ± 0.833,0.094 ± 0.013
3,Bachelors-degree,-0.082 ± 0.161,0.868 ± 0.868,0.008 ± 0.006
4,Cancer-prior-to-trial,0.436 ± 0.283,0.350 ± 0.350,0.017 ± 0.010
5,Cigarettes-per-day,-0.001 ± 0.006,0.989 ± 0.989,0.035 ± 0.011
6,Family-member-had-cancer,0.108 ± 0.140,0.733 ± 0.733,0.016 ± 0.011
7,Female,-0.259 ± 0.229,0.446 ± 0.446,0.020 ± 0.004
8,Graduate-school,-0.100 ± 0.170,0.844 ± 0.844,0.003 ± 0.002
9,High-school-graduate,0.047 ± 0.082,0.882 ± 0.882,0.012 ± 0.005


## Radiomics 1

In [6]:
radiomics1_feature_names = ['original_shape_Elongation', 'original_shape_Flatness', 'original_shape_LeastAxisLength', 'original_shape_MajorAxisLength', 'original_shape_Maximum2DDiameterColumn', 'original_shape_Maximum2DDiameterRow', 'original_shape_Maximum2DDiameterSlice', 'original_shape_Maximum3DDiameter', 'original_shape_MinorAxisLength', 'original_shape_Sphericity', 'original_shape_SurfaceArea', 'original_shape_SurfaceVolumeRatio', 'original_shape_VoxelVolume']
radiomics1_feature_names.sort()

In [7]:
cph_file_name = "cph_radiomics1_all.html"
rsf_file_name = "rsf_radiomics1_all.html"

radiomics1_df = k_fold_feature_importance_metrics(cph_file_name, rsf_file_name, radiomics1_feature_names)
display(radiomics1_df)

Unnamed: 0,Feature,CPH coefficient,CPH p-value,RSF Importance Weight
0,original_shape_Elongation,-0.162 ± 0.102,0.434 ± 0.434,0.017 ± 0.006
1,original_shape_Flatness,0.100 ± 0.109,0.668 ± 0.668,0.024 ± 0.015
2,original_shape_LeastAxisLength,0.000 ± 0.106,0.766 ± 0.766,0.019 ± 0.013
3,original_shape_MajorAxisLength,0.034 ± 0.049,0.872 ± 0.872,0.029 ± 0.015
4,original_shape_Maximum2DDiameterColumn,0.280 ± 0.153,0.322 ± 0.322,0.021 ± 0.020
5,original_shape_Maximum2DDiameterRow,0.144 ± 0.097,0.610 ± 0.610,0.021 ± 0.025
6,original_shape_Maximum2DDiameterSlice,-0.098 ± 0.091,0.755 ± 0.755,0.016 ± 0.025
7,original_shape_Maximum3DDiameter,0.116 ± 0.112,0.678 ± 0.678,0.018 ± 0.030
8,original_shape_MinorAxisLength,0.029 ± 0.056,0.932 ± 0.932,0.019 ± 0.014
9,original_shape_Sphericity,0.140 ± 0.106,0.451 ± 0.451,0.048 ± 0.025


## Radiomics 2

In [8]:
radiomics2_feature_names = ['original_firstorder_10Percentile', 'original_firstorder_90Percentile', 'original_firstorder_Energy', 'original_firstorder_Entropy', 'original_firstorder_InterquartileRange', 'original_firstorder_Kurtosis', 'original_firstorder_Maximum', 'original_firstorder_Median', 'original_firstorder_Minimum', 'original_firstorder_RootMeanSquared', 'original_firstorder_Skewness', 'original_firstorder_TotalEnergy', 'original_firstorder_Uniformity', 'original_firstorder_Variance']
radiomics2_feature_names.sort()

In [9]:
cph_file_name = "cph_radiomics2_all.html"
rsf_file_name = "rsf_radiomics2_all.html"

radiomics2_df = k_fold_feature_importance_metrics(cph_file_name, rsf_file_name, radiomics2_feature_names)
display(radiomics2_df)

Unnamed: 0,Feature,CPH coefficient,CPH p-value,RSF Importance Weight
0,original_firstorder_10Percentile,0.004 ± 0.005,0.928 ± 0.928,0.018 ± 0.010
1,original_firstorder_90Percentile,0.016 ± 0.011,0.832 ± 0.832,0.009 ± 0.008
2,original_firstorder_Energy,0.110 ± 0.077,0.360 ± 0.360,0.018 ± 0.004
3,original_firstorder_Entropy,-0.002 ± 0.004,0.970 ± 0.970,0.011 ± 0.006
4,original_firstorder_InterquartileRange,0.001 ± 0.001,0.989 ± 0.989,0.018 ± 0.023
5,original_firstorder_Kurtosis,-0.018 ± 0.027,0.841 ± 0.841,0.018 ± 0.004
6,original_firstorder_Maximum,0.005 ± 0.008,0.916 ± 0.916,0.027 ± 0.007
7,original_firstorder_Median,0.099 ± 0.071,0.526 ± 0.526,0.023 ± 0.009
8,original_firstorder_Minimum,0.010 ± 0.013,0.907 ± 0.907,0.014 ± 0.016
9,original_firstorder_RootMeanSquared,-0.055 ± 0.030,0.682 ± 0.682,0.017 ± 0.019


# TD-AUC Curve

In [10]:
cpc_auc_file = "clinical_all_cpc_auc.npy"
rsf_auc_file = "clinical_all_rsf_auc.npy"
va_times = "clinical_all_va_times.npy"

cpc_auc_by_split = []
rsf_auc_by_split = []
for i in range(2):
    cpc_auc_by_split.append(np.load("split" + str(i) + "/" + cpc_auc_file))
    rsf_auc_by_split.append(np.load("split" + str(i) + "/" + rsf_auc_file))
    
cpc_auc_matrix = np.stack(cpc_auc_by_split, axis=1)
rsf_auc_matrix = np.stack(rsf_auc_by_split, axis=1)

ValueError: all input arrays must have the same shape