In [86]:
library(glmnet)
library(randomForest)
library(dplyr)
library(BBmisc)
library(caret)
library(tictoc)
library(pROC)
library(ROCR)
library(randomForestSRC)

In [87]:
data = read.csv('../../AnnotateData/recgli_annotation10_CELorNELorUNK.csv')
dim(data)

In [88]:
data = data[data$outcome == "rHGG" | data$outcome == "txe", ]
dim(data)

In [89]:
data = data[data$CELorNEL == "CEL" | data$CELorNEL == "NEL",]
dim(data)

In [90]:
data = droplevels.data.frame(data)

In [91]:
id = "t_number_y"
features = c('nadc.1', 'nfa.1',
             'cbvn_nlin', 'recov_npar', 
             'ccri', 'cni', 'ncre', 'ncho', 'nnaa', 
             'in_CEL')
label = 'outcome'

In [92]:
data_xy = data[,c(id, features, label)]
head(data_xy)

Unnamed: 0,t_number_y,nadc.1,nfa.1,cbvn_nlin,recov_npar,ccri,cni,ncre,ncho,nnaa,in_CEL,outcome
1,10366,0.75,0.83,1.08,98.89,1.88,2.6,0.47,1.03,0.46,0,txe
3,10448,1.62,0.5,2.43,100.0,,,,,,1,rHGG
4,10594,2.62,0.81,1.64,78.53,,,,,,0,txe
5,10594,2.12,0.42,0.95,100.0,1.03,3.27,0.79,1.04,0.36,0,rHGG
6,10594,1.6,0.73,0.84,100.0,2.33,7.57,1.07,1.61,0.13,0,rHGG
7,10594,3.03,0.42,0.42,100.0,2.13,4.37,1.21,1.89,0.74,0,rHGG


In [93]:
data_xy = data.frame(data_xy, 
                    hasDiffu = ifelse(is.na(data_xy$nadc.1), 0, 1), 
                    hasPerf  = ifelse(is.na(data_xy$cbvn_nlin), 0, 1), 
                    hasSpec  = ifelse(is.na(data_xy$cni), 0, 1))

## train test split

In [94]:
unique_tnums = unique(data_xy$t_number_y)
# 12
set.seed(12)
train_tnums = unique_tnums[sample(c(1:length(unique_tnums)), size = round(length(unique_tnums)*.75))]
test_tnums = unique_tnums[! unique_tnums %in% train_tnums]
test_tnums

In [95]:
table(data_xy$outcome)/sum(table(data_xy$outcome)['rHGG'], table(data_xy$outcome)['txe'])


     rHGG       txe 
0.7628866 0.2371134 

In [96]:
data_train = data_xy[data_xy$t_number_y %in% train_tnums, ]
dim(data_train)
table(data_train$outcome)/sum(table(data_train$outcome)['rHGG'], table(data_train$outcome)['txe'])


    rHGG      txe 
0.760181 0.239819 

In [97]:
data_test = data_xy[data_xy$t_number_y %in% test_tnums, ]
dim(data_test)
table(data_test$outcome)/sum(table(data_test$outcome)['rHGG'], table(data_test$outcome)['txe'])


     rHGG       txe 
0.7714286 0.2285714 

## filling in missing values:


In [98]:
# data_med = median imputation
# data_999 = 999 coded for missingness
# data_SRC = randomForestSRC imputated

In [109]:
for (col in c('nadc.1', 'nfa.1', 'cbvn_nlin', 'recov_npar', 'ccri', 'cni', 'ncre', 'ncho', 'nnaa')){
    
    ## normalize the training data & then the testing data w/ training params: 
    mean_col = mean(data_train[,col], na.rm = T)
    sd_col   = sd(data_train[,col], na.rm = T)    
    data_train[,col] = (data_train[,col]-mean_col)/sd_col
    data_test[,col]  = (data_test[,col]-mean_col)/sd_col 
}

data_999_train = data_train
data_999_test = data_test
data_med_train = data_train
data_med_test  = data_test

## fill in the data_999 training and testing w/ 999 for missing data: 
for (col in c('nadc.1', 'nfa.1', 'cbvn_nlin', 'recov_npar', 'ccri', 'cni', 'ncre', 'ncho', 'nnaa')){
    data_999_train[,col] = ifelse(is.na(data_999_train[,col]), 999, data_999_train[,col])
    data_999_test[,col] = ifelse(is.na(data_999_test[,col]), 999, data_999_test[,col])
    
    ## fill in the data_med training & testing w/ median of column from training: 
    data_med_train[,col] = ifelse(is.na(data_med_train[,col]), 
                                  median(data_med_train[,col], na.rm = T), 
                                  data_med_train[,col])
    data_med_test[,col] = ifelse(is.na(data_med_test[,col]), 
                                  median(data_med_train[,col], na.rm = T), 
                                  data_med_test[,col])
}
    

In [112]:
indicators = c('hasDiffu', 'hasPerf', 'hasSpec')

In [113]:
data_src_train = randomForestSRC::impute(data = data_train)
head(data_src_train)

Unnamed: 0,t_number_y,nadc.1,nfa.1,cbvn_nlin,recov_npar,ccri,cni,ncre,ncho,nnaa,in_CEL,outcome,hasDiffu,hasPerf,hasSpec
3,10448,-0.1572098,-0.97165715,0.79157206,0.8160546,-0.06392343,-0.10554231,0.0794532,-0.03314834,0.099396717,1,rHGG,1,1,0
4,10594,1.8648729,-0.02242617,0.06733624,-0.7763788,0.043825918,-0.09990289,-0.0543178,-0.08372352,-0.006595746,0,txe,1,1,0
5,10594,0.8538315,-1.21661999,-0.56522416,0.8160546,-0.435252492,-0.19577871,0.17133743,-0.32372189,-0.091011352,0,rHGG,1,1,1
6,10594,-0.1976515,-0.26738901,-0.66606712,0.8160546,-0.004547998,0.79558551,0.54667122,0.25046092,-0.826662218,0,rHGG,1,1,1
7,10594,2.6939268,-1.21661999,-1.05110389,0.8160546,-0.070810228,0.05782609,0.73433811,0.53251564,1.124411817,0,rHGG,1,1,1
10,10708,-1.1075887,1.07990658,0.14984412,0.8160546,1.439968611,1.13449374,0.09090877,1.1973589,0.292806491,1,rHGG,1,1,1


## using RFs: 

In [114]:
outcome = "outcome"
features = c('in_CEL', 'cbvn_nlin', 'recov_npar',
             'ccri', 'cni', 'ncho', 'ncre', 'nnaa', 'nfa.1', 'nadc.1',
            'hasPerf', 'hasSpec', 'hasDiffu')
features.addsign <- paste(features, collapse = "+")
rf_formula = as.formula(paste(outcome, features.addsign, sep = "~"))

In [122]:
rf_999 = randomForest(formula = rf_formula, 
                   data = data.frame(data_999_train))
rf_med = randomForest(formula = rf_formula, 
                   data = data.frame(data_med_train))
rf_src = randomForest(formula = rf_formula, 
                   data = data.frame(data_src_train))

In [128]:
data.frame(importance(rf_999), importance(rf_med), importance(rf_src))

Unnamed: 0,MeanDecreaseGini,MeanDecreaseGini.1,MeanDecreaseGini.2
in_CEL,1.7045021,1.6974648,1.1609921
cbvn_nlin,8.4518094,9.6020829,9.7081295
recov_npar,11.3424045,12.2522216,12.0236457
ccri,3.8581932,4.3651741,7.3087713
cni,4.0518004,4.4137988,7.887951
ncho,4.0590189,4.4382967,8.0868214
ncre,3.6567512,3.8277764,8.4296409
nnaa,4.2205653,4.6867316,8.7385343
nfa.1,10.1393563,9.8532974,6.9285991
nadc.1,9.881255,11.1514257,7.2559096


In [138]:
importance(rf_999)['cbvn_nlin',]

## how important are the divisions for RF varImp scores? 

In [152]:
## let's extract confusion matrix and put that in a data frame: 
outcome_exp_template = data.frame(expnum = NA,
tr_txe_pct = NA, 
cv_txe_pct = NA, 
tr_outtxe_predtxe = NA, 
tr_outtxe_predrHGG = NA, 
tr_outrHGG_predtxe = NA, 
tr_outrHGG_predrHGG = NA, 
cv_outtxe_predtxe = NA, 
cv_outtxe_predrHGG = NA, 
cv_outrHGG_predtxe = NA, 
cv_outrHGG_predrHGG = NA, 
varImp_in_CEL = NA, 
varImp_cbvn_nlin = NA, 
varImp_recov_npar = NA, 
varImp_ccri = NA, 
varImp_cni = NA, 
varImp_ncho = NA, 
varImp_ncre = NA, 
varImp_nnaa = NA, 
varImp_nfa.1 = NA, 
varImp_nadc.1 = NA, 
varImp_hasDiffu = NA, 
varImp_hasPerf = NA, 
varImp_hasSpec = NA)

In [153]:
varImp_col = paste('varImp_', col, sep = "")
varImp_col

In [154]:
outcome_999_experiment = outcome_exp_template

In [155]:
## let's begin by running experiments for anatomic features only: 
## for each experiment, we'll run the random forest with a variety of parameters 
expnum = 1
tic()
for (i in 1:25){
    set.seed(i)
    
    ## define features and outcome: 
    outcome = "outcome"
    features = c('in_CEL', 'cbvn_nlin', 'recov_npar',
                 'ccri', 'cni', 'ncho', 'ncre', 'nnaa', 'nfa.1', 'nadc.1',
                'hasPerf', 'hasSpec', 'hasDiffu')
    features.addsign <- paste(features, collapse = "+")
    rf_formula = as.formula(paste(outcome, features.addsign, sep = "~"))
    
    ## define training and testing (from the training) basically for x-val: 
    unique_tnums = unique(data_999_train$t_number_y)
    train_tnums = unique_tnums[sample(c(1:length(unique_tnums)), size = round(length(unique_tnums)*.75))]
    test_tnums = unique_tnums[! unique_tnums %in% train_tnums]
    
    data_exp_999_train = data_999_train[data_999_train$t_number_y %in% train_tnums, ]
    data_exp_999_cv    = data_999_train[! data_999_train$t_number_y %in% train_tnums, ]    
    
    rf_999 = randomForest(formula = rf_formula, 
             data = data_exp_999_train)
    
    importance_rf_999 = importance(rf_999)
    
    cv_999_preds = predict(rf_999, data_exp_999_cv)

    cv_999_conf_mat = confusionMatrix(cv_999_preds, data_exp_999_cv$outcome)
    
    tr_txe_pct = table(data_exp_999_train$outcome)['txe']/sum(table(data_exp_999_train$outcome)['txe'], table(data_exp_999_train$outcome)['rHGG'])
    cv_txe_pct = table(data_exp_999_cv$outcome)['txe']/sum(table(data_exp_999_cv$outcome)['txe'], table(data_exp_999_cv$outcome)['rHGG'])

    outcome_999_experiment[expnum,'expnum'] = expnum
    outcome_999_experiment[expnum,'tr_txe_pct'] = tr_txe_pct
    outcome_999_experiment[expnum,'cv_txe_pct'] = cv_txe_pct
    
    outcome_999_experiment[expnum,'tr_outtxe_predtxe'] = rf_999$confusion['txe', 'txe']
    outcome_999_experiment[expnum,'tr_outtxe_predrHGG'] = rf_999$confusion['txe', 'rHGG']
    outcome_999_experiment[expnum,'tr_outrHGG_predtxe'] = rf_999$confusion['rHGG', 'txe']
    outcome_999_experiment[expnum,'tr_outrHGG_predrHGG'] = rf_999$confusion['rHGG', 'rHGG']
    outcome_999_experiment[expnum,'cv_outtxe_predtxe'] = cv_999_conf_mat$table['txe', 'txe']
    outcome_999_experiment[expnum,'cv_outtxe_predrHGG'] = cv_999_conf_mat$table['txe', 'rHGG']
    outcome_999_experiment[expnum,'cv_outrHGG_predtxe'] = cv_999_conf_mat$table['rHGG', 'txe']
    outcome_999_experiment[expnum,'cv_outrHGG_predrHGG'] = cv_999_conf_mat$table['rHGG', 'rHGG']
    
    for (col in features){
        varImp_col = paste('varImp_', col, sep = "")
        outcome_999_experiment[expnum, varImp_col] = importance_rf_999[col,]
    }
      
    expnum = expnum + 1

}
toc()

3.51 sec elapsed


In [156]:
outcome_med_experiment = outcome_exp_template

In [157]:
## let's begin by running experiments for anatomic features only: 
## for each experiment, we'll run the random forest with a variety of parameters 
expnum = 1
tic()
for (i in 1:25){
    set.seed(i)
    
    ## define features and outcome: 
    outcome = "outcome"
    features = c('in_CEL', 'cbvn_nlin', 'recov_npar',
                 'ccri', 'cni', 'ncho', 'ncre', 'nnaa', 'nfa.1', 'nadc.1',
                'hasPerf', 'hasSpec', 'hasDiffu')
    features.addsign <- paste(features, collapse = "+")
    rf_formula = as.formula(paste(outcome, features.addsign, sep = "~"))
    
    ## define training and testing (from the training) basically for x-val: 
    unique_tnums = unique(data_med_train$t_number_y)
    train_tnums = unique_tnums[sample(c(1:length(unique_tnums)), size = round(length(unique_tnums)*.75))]
    test_tnums = unique_tnums[! unique_tnums %in% train_tnums]
    
    data_exp_med_train = data_med_train[data_med_train$t_number_y %in% train_tnums, ]
    data_exp_med_cv    = data_med_train[! data_med_train$t_number_y %in% train_tnums, ]    
    
    rf_med = randomForest(formula = rf_formula, 
             data = data_exp_med_train)
    
    importance_rf_med = importance(rf_med)
    
    cv_med_preds = predict(rf_med, data_exp_med_cv)

    cv_med_conf_mat = confusionMatrix(cv_med_preds, data_exp_med_cv$outcome)
    
    tr_txe_pct = table(data_exp_med_train$outcome)['txe']/sum(table(data_exp_med_train$outcome)['txe'], table(data_exp_med_train$outcome)['rHGG'])
    cv_txe_pct = table(data_exp_med_cv$outcome)['txe']/sum(table(data_exp_med_cv$outcome)['txe'], table(data_exp_med_cv$outcome)['rHGG'])

    outcome_med_experiment[expnum,'expnum'] = expnum
    outcome_med_experiment[expnum,'tr_txe_pct'] = tr_txe_pct
    outcome_med_experiment[expnum,'cv_txe_pct'] = cv_txe_pct
    
    outcome_med_experiment[expnum,'tr_outtxe_predtxe'] = rf_med$confusion['txe', 'txe']
    outcome_med_experiment[expnum,'tr_outtxe_predrHGG'] = rf_med$confusion['txe', 'rHGG']
    outcome_med_experiment[expnum,'tr_outrHGG_predtxe'] = rf_med$confusion['rHGG', 'txe']
    outcome_med_experiment[expnum,'tr_outrHGG_predrHGG'] = rf_med$confusion['rHGG', 'rHGG']
    outcome_med_experiment[expnum,'cv_outtxe_predtxe'] = cv_med_conf_mat$table['txe', 'txe']
    outcome_med_experiment[expnum,'cv_outtxe_predrHGG'] = cv_med_conf_mat$table['txe', 'rHGG']
    outcome_med_experiment[expnum,'cv_outrHGG_predtxe'] = cv_med_conf_mat$table['rHGG', 'txe']
    outcome_med_experiment[expnum,'cv_outrHGG_predrHGG'] = cv_med_conf_mat$table['rHGG', 'rHGG']
    
    for (col in features){
        varImp_col = paste('varImp_', col, sep = "")
        outcome_med_experiment[expnum, varImp_col] = importance_rf_med[col,]
    }
      
    expnum = expnum + 1

}
toc()

3.467 sec elapsed


In [158]:
outcome_src_experiment = outcome_exp_template

In [159]:
## let's begin by running experiments for anatomic features only: 
## for each experiment, we'll run the random forest with a variety of parameters 
expnum = 1
tic()
for (i in 1:25){
    set.seed(i)
    
    ## define features and outcome: 
    outcome = "outcome"
    features = c('in_CEL', 'cbvn_nlin', 'recov_npar',
                 'ccri', 'cni', 'ncho', 'ncre', 'nnaa', 'nfa.1', 'nadc.1',
                'hasPerf', 'hasSpec', 'hasDiffu')
    features.addsign <- paste(features, collapse = "+")
    rf_formula = as.formula(paste(outcome, features.addsign, sep = "~"))
    
    ## define training and testing (from the training) basically for x-val: 
    unique_tnums = unique(data_src_train$t_number_y)
    train_tnums = unique_tnums[sample(c(1:length(unique_tnums)), size = round(length(unique_tnums)*.75))]
    test_tnums = unique_tnums[! unique_tnums %in% train_tnums]
    
    data_exp_src_train = data_src_train[data_src_train$t_number_y %in% train_tnums, ]
    data_exp_src_cv    = data_src_train[! data_src_train$t_number_y %in% train_tnums, ]    
    
    rf_src = randomForest(formula = rf_formula, 
             data = data_exp_src_train)
    
    importance_rf_src = importance(rf_src)
    
    cv_src_preds = predict(rf_src, data_exp_src_cv)

    cv_src_conf_mat = confusionMatrix(cv_src_preds, data_exp_src_cv$outcome)
    
    tr_txe_pct = table(data_exp_src_train$outcome)['txe']/sum(table(data_exp_src_train$outcome)['txe'], table(data_exp_src_train$outcome)['rHGG'])
    cv_txe_pct = table(data_exp_src_cv$outcome)['txe']/sum(table(data_exp_src_cv$outcome)['txe'], table(data_exp_src_cv$outcome)['rHGG'])

    outcome_src_experiment[expnum,'expnum'] = expnum
    outcome_src_experiment[expnum,'tr_txe_pct'] = tr_txe_pct
    outcome_src_experiment[expnum,'cv_txe_pct'] = cv_txe_pct
    
    outcome_src_experiment[expnum,'tr_outtxe_predtxe'] = rf_src$confusion['txe', 'txe']
    outcome_src_experiment[expnum,'tr_outtxe_predrHGG'] = rf_src$confusion['txe', 'rHGG']
    outcome_src_experiment[expnum,'tr_outrHGG_predtxe'] = rf_src$confusion['rHGG', 'txe']
    outcome_src_experiment[expnum,'tr_outrHGG_predrHGG'] = rf_src$confusion['rHGG', 'rHGG']
    outcome_src_experiment[expnum,'cv_outtxe_predtxe'] = cv_src_conf_mat$table['txe', 'txe']
    outcome_src_experiment[expnum,'cv_outtxe_predrHGG'] = cv_src_conf_mat$table['txe', 'rHGG']
    outcome_src_experiment[expnum,'cv_outrHGG_predtxe'] = cv_src_conf_mat$table['rHGG', 'txe']
    outcome_src_experiment[expnum,'cv_outrHGG_predrHGG'] = cv_src_conf_mat$table['rHGG', 'rHGG']
    
    for (col in features){
        varImp_col = paste('varImp_', col, sep = "")
        outcome_src_experiment[expnum, varImp_col] = importance_rf_src[col,]
    }
      
    expnum = expnum + 1

}
toc()

3.393 sec elapsed


In [166]:
varImp_999_outcome = outcome_999_experiment[,grep('varImp', colnames(outcome_999_experiment))]


In [175]:
# 999 imputed variable importances: 
data.frame(med = apply(varImp_999_outcome, 2, median), sd=apply(varImp_999_outcome, 2, sd))

Unnamed: 0,med,sd
varImp_in_CEL,1.3852004,0.14851455
varImp_cbvn_nlin,6.6744839,0.8570848
varImp_recov_npar,8.1704338,1.36238741
varImp_ccri,2.9745988,0.37858557
varImp_cni,3.0610369,0.3821647
varImp_ncho,3.2469352,0.48403766
varImp_ncre,2.5895273,0.4121201
varImp_nnaa,3.3451635,0.61094626
varImp_nfa.1,7.4478816,0.75095766
varImp_nadc.1,7.8070658,0.68124399


In [179]:
# med imputed variable importances: 

data.frame(med = apply(outcome_med_experiment[,grep('varImp', colnames(outcome_med_experiment))], 2, median), 
           sd = apply(outcome_med_experiment[,grep('varImp', colnames(outcome_med_experiment))], 2, sd))

Unnamed: 0,med,sd
varImp_in_CEL,1.3127261,0.1260503
varImp_cbvn_nlin,7.0452218,0.7921352
varImp_recov_npar,9.1980841,1.2584866
varImp_ccri,3.121402,0.3687919
varImp_cni,3.3705955,0.4694607
varImp_ncho,3.605308,0.5623417
varImp_ncre,2.9545222,0.4108042
varImp_nnaa,3.4146618,0.603385
varImp_nfa.1,7.6703425,0.6562647
varImp_nadc.1,8.453325,0.694333


In [182]:
# src imputed variable importances: 

data.frame(med = apply(outcome_src_experiment[,grep('varImp', colnames(outcome_src_experiment))], 2, median), 
           sd = apply(outcome_src_experiment[,grep('varImp', colnames(outcome_src_experiment))], 2, sd))

Unnamed: 0,med,sd
varImp_in_CEL,0.921615,0.1155651
varImp_cbvn_nlin,6.9329724,0.7918387
varImp_recov_npar,8.7461374,1.2451968
varImp_ccri,5.5034513,0.4413178
varImp_cni,5.8003091,0.4979162
varImp_ncho,5.8264464,0.5418548
varImp_ncre,6.3240912,0.8025848
varImp_nnaa,6.6407445,0.7900533
varImp_nfa.1,5.396126,0.4905397
varImp_nadc.1,5.6120268,0.5129651
