In [1]:
#install.packages('randomForestSRC')

In [1]:
library(randomForestSRC)
source('AUC_workhorse.R')


 randomForestSRC 2.7.0 
 
 Type rfsrc.news() to see new features, changes, and bug fixes. 
 



In [2]:
options(repr.matrix.max.rows=600, repr.matrix.max.cols=200)

In [3]:
set.seed(102)
i = sample(1000, 1)

In [4]:
train_path = paste('../../AllBiopsies/Data/7thParse_TrainTestSplit/Training_withNec/train_data_',as.character(i),'.csv', sep = "")
test_path = paste('../../AllBiopsies/Data/7thParse_TrainTestSplit/Testing_withNec/test_data_',as.character(i),'.csv', sep = "")

In [5]:
train_data = read.csv(train_path)
test_data = read.csv(test_path)

In [6]:
dim(train_data)
dim(test_data)

In [7]:
data.frame(index = 1:length(colnames(train_data)), colnames = colnames(train_data))

index,colnames
1,b_number
2,t_number
3,sf_number
4,current_scan_date
5,current_surgery_date
6,current_hist_grade
7,current_hist_type
8,roi.label
9,tumor_cell_evaluation
10,necrosis


## Exp 0: Optimizing some parameters

In [9]:
auc_param_comparison = data.frame('train_test_number'=1:500, 'nodesize.1.mtry.1'=NA, 
                                 'nodesize.1.mtry.5'=NA, 
                                 'nodesize.1.mtry.10'=NA, 
                                 'nodesize.3.mtry.1'=NA, 
                                 'nodesize.3.mtry.5'=NA, 
                                 'nodesize.3.mtry.10'=NA,
                                 'nodesize.5.mtry.1'=NA,
                                 'nodesize.5.mtry.5'=NA, 
                                 'nodesize.5.mtry.10'=NA,
                                 'nodesize.8.mtry.1'=NA,
                                 'nodesize.8.mtry.5'=NA,
                                 'nodesize.8.mtry.10'=NA,
                                 'nodesize.10.mtry.1'=NA,
                                 'nodesize.10.mtry.5'=NA,
                                 'nodesize.10.mtry.10'=NA)

In [14]:
for (i in c(1:500)){
    print(i)
    ## define train and test paths
    train_path = paste('../../AllBiopsies/Data/7thParse_TrainTestSplit/Training_withNec/train_data_',as.character(i),'.csv', sep = "")
    test_path = paste('../../AllBiopsies/Data/7thParse_TrainTestSplit/Testing_withNec/test_data_',as.character(i),'.csv', sep = "")
    
    ## read in data
    train_data = read.csv(train_path)
    test_data = read.csv(test_path)

    ## remove "ind" pathology and empty string
    train_data = train_data[which(train_data$tumor_cell_evaluation != "Ind"),]
    train_data = train_data[which(train_data$tumor_cell_evaluation != ""),]
    test_data = test_data[which(test_data$tumor_cell_evaluation != "Ind"),]
    test_data = test_data[which(test_data$tumor_cell_evaluation != ""),]

    ## droplevels: 
    train_data$tumor_cell_evaluation = droplevels(as.factor(train_data$tumor_cell_evaluation))
    test_data$tumor_cell_evaluation = droplevels(as.factor(test_data$tumor_cell_evaluation))
    
    ## define features: 
    features_for_rf = colnames(train_data)[c(16:19, 21:22, 33, 34, 37:43, 46,66, 68,69)]
    outcome = colnames(train_data)[9]
    
    ## create formula 
    features.addsign <- paste(features_for_rf, collapse = "+")
    rf.formula <- as.formula(paste(outcome,features.addsign, sep = "~"))
    
    
    for (nodesize in c(1, 3, 5, 8, 10)){
        for (mtry in c(1, 5, 10)){
            ## run random forest experiment
            rf = rfsrc(formula = rf.formula,
            data = train_data, 
            ntree = 500, 
            mtry = mtry,
            na.action = "na.impute", 
            nodesize = nodesize)

            ## predict on the test set 
            preds = predict(rf, newdata=test_data)
            
            ## find the AUC of the classification: 
            x = preds
            aucS <- auc(x$yvar, if(!is.null(x$predicted.oob) && !all(is.na(x$predicted.oob))) x$predicted.oob else x$predicted)
            
            ## add the results to a data_frame 
            colname = paste('nodesize.', nodesize,'.mtry.',mtry, sep = '')
            auc_param_comparison[i, colname]=aucS

        }
    }

}

In [16]:
summary(auc_param_comparison)

 train_test_number nodesize.1.mtry.1 nodesize.1.mtry.5 nodesize.1.mtry.10
 Min.   :  1.0     Min.   :0.4047    Min.   :0.4120    Min.   :0.3889    
 1st Qu.:125.8     1st Qu.:0.5310    1st Qu.:0.5462    1st Qu.:0.5422    
 Median :250.5     Median :0.5820    Median :0.6100    Median :0.5891    
 Mean   :250.5     Mean   :0.5753    Mean   :0.5983    Mean   :0.5789    
 3rd Qu.:375.2     3rd Qu.:0.6144    3rd Qu.:0.6432    3rd Qu.:0.6213    
 Max.   :500.0     Max.   :0.7302    Max.   :0.7762    Max.   :0.7847    
                   NA's   :429       NA's   :429       NA's   :429       
 nodesize.3.mtry.1 nodesize.3.mtry.5 nodesize.3.mtry.10 nodesize.5.mtry.1
 Min.   :0.4153    Min.   :0.4120    Min.   :0.3843     Min.   :0.4320   
 1st Qu.:0.5326    1st Qu.:0.5373    1st Qu.:0.5327     1st Qu.:0.5319   
 Median :0.5813    Median :0.5855    Median :0.5843     Median :0.5928   
 Mean   :0.5783    Mean   :0.5866    Mean   :0.5809     Mean   :0.5821   
 3rd Qu.:0.6235    3rd Qu.:0.6405    3

## Exp. 1: Using subset of features, using na.impute 

In [18]:
auc_results_exp1 = data.frame('train_test_number'=1:1000, 'AUC'=NA)

In [19]:
for (i in c(1:500)){
    print(i)
    ## define train and test paths
    train_path = paste('../../AllBiopsies/Data/7thParse_TrainTestSplit/Training_withNec/train_data_',as.character(i),'.csv', sep = "")
    test_path = paste('../../AllBiopsies/Data/7thParse_TrainTestSplit/Testing_withNec/test_data_',as.character(i),'.csv', sep = "")
    
    ## read in data
    train_data = read.csv(train_path)
    test_data = read.csv(test_path)

    ## remove "ind" pathology and empty string
    train_data = train_data[which(train_data$tumor_cell_evaluation != "Ind"),]
    train_data = train_data[which(train_data$tumor_cell_evaluation != ""),]
    test_data = test_data[which(test_data$tumor_cell_evaluation != "Ind"),]
    test_data = test_data[which(test_data$tumor_cell_evaluation != ""),]

    ## droplevels: 
    train_data$tumor_cell_evaluation = droplevels(as.factor(train_data$tumor_cell_evaluation))
    test_data$tumor_cell_evaluation = droplevels(as.factor(test_data$tumor_cell_evaluation))
    
    ## define features: 
    features_for_rf = colnames(train_data)[c(16:19, 21:22, 33, 34, 37:43, 46,66, 68,69)]
    outcome = colnames(train_data)[9]
    
    ## create formula 
    features.addsign <- paste(features_for_rf, collapse = "+")
    rf.formula <- as.formula(paste(outcome,features.addsign, sep = "~"))
    
    ## run random forest experiment
    rf = rfsrc(formula = rf.formula,
     data = train_data, 
     ntree = 1000, 
     mtry = 5,
     na.action = "na.impute", 
     nodesize = 8)
    
    ## predict on the test set 
    preds = predict(rf, newdata=test_data)
    
    ## find the AUC of the classification: 
    x = preds
    aucS <- auc(x$yvar, if(!is.null(x$predicted.oob) && !all(is.na(x$predicted.oob))) x$predicted.oob else x$predicted)
    
    ## add the results to a data_frame 
    auc_results_exp1[i, 2]=aucS
}

[1] 1
[1] 2
[1] 3
[1] 4
[1] 5
[1] 6
[1] 7
[1] 8
[1] 9
[1] 10
[1] 11
[1] 12
[1] 13
[1] 14
[1] 15
[1] 16
[1] 17
[1] 18
[1] 19
[1] 20
[1] 21
[1] 22
[1] 23
[1] 24
[1] 25
[1] 26
[1] 27
[1] 28
[1] 29
[1] 30
[1] 31
[1] 32
[1] 33
[1] 34
[1] 35
[1] 36
[1] 37
[1] 38
[1] 39
[1] 40
[1] 41
[1] 42
[1] 43
[1] 44
[1] 45
[1] 46
[1] 47
[1] 48
[1] 49
[1] 50
[1] 51
[1] 52
[1] 53
[1] 54
[1] 55
[1] 56
[1] 57
[1] 58
[1] 59
[1] 60
[1] 61
[1] 62
[1] 63
[1] 64
[1] 65
[1] 66
[1] 67
[1] 68
[1] 69
[1] 70
[1] 71
[1] 72
[1] 73
[1] 74
[1] 75
[1] 76
[1] 77
[1] 78
[1] 79
[1] 80
[1] 81
[1] 82
[1] 83
[1] 84
[1] 85
[1] 86
[1] 87
[1] 88
[1] 89
[1] 90
[1] 91
[1] 92
[1] 93
[1] 94
[1] 95
[1] 96
[1] 97
[1] 98
[1] 99
[1] 100
[1] 101
[1] 102
[1] 103
[1] 104
[1] 105
[1] 106
[1] 107
[1] 108
[1] 109
[1] 110
[1] 111
[1] 112
[1] 113
[1] 114
[1] 115
[1] 116
[1] 117
[1] 118
[1] 119
[1] 120
[1] 121
[1] 122
[1] 123
[1] 124
[1] 125
[1] 126
[1] 127
[1] 128
[1] 129
[1] 130
[1] 131
[1] 132
[1] 133
[1] 134
[1] 135
[1] 136
[1] 137
[1] 138
[1] 

In [20]:
summary(auc_results_exp1[,2])

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
 0.3462  0.5425  0.6045  0.5989  0.6558  0.8213     500 

## Expt 2: use more features

In [21]:
data.frame(index = 1:length(colnames(train_data)), colnames = colnames(train_data))

index,colnames
1,b_number
2,t_number
3,sf_number
4,current_scan_date
5,current_surgery_date
6,current_hist_grade
7,current_hist_type
8,roi.label
9,tumor_cell_evaluation
10,necrosis


In [22]:
auc_results_exp2 = data.frame('train_test_number'=1:1000, 'AUC'=NA)

In [23]:
for (i in c(1:500)){
    print(i)
    ## define train and test paths
    train_path = paste('../../AllBiopsies/Data/7thParse_TrainTestSplit/Training_withNec/train_data_',as.character(i),'.csv', sep = "")
    test_path = paste('../../AllBiopsies/Data/7thParse_TrainTestSplit/Testing_withNec/test_data_',as.character(i),'.csv', sep = "")
    
    ## read in data
    train_data = read.csv(train_path)
    test_data = read.csv(test_path)

    ## remove "ind" pathology and empty string
    train_data = train_data[which(train_data$tumor_cell_evaluation != "Ind"),]
    train_data = train_data[which(train_data$tumor_cell_evaluation != ""),]
    test_data = test_data[which(test_data$tumor_cell_evaluation != "Ind"),]
    test_data = test_data[which(test_data$tumor_cell_evaluation != ""),]

    ## droplevels: 
    train_data$tumor_cell_evaluation = droplevels(as.factor(train_data$tumor_cell_evaluation))
    test_data$tumor_cell_evaluation = droplevels(as.factor(test_data$tumor_cell_evaluation))
    
    ## define features: 
    features_for_rf = colnames(train_data)[c(16:20, 21:32, 33:36, 37:43, 46,66, 68,69)]
    outcome = colnames(train_data)[9]
    
    ## create formula 
    features.addsign <- paste(features_for_rf, collapse = "+")
    rf.formula <- as.formula(paste(outcome,features.addsign, sep = "~"))
    
    ## run random forest experiment
    rf = rfsrc(formula = rf.formula,
     data = train_data, 
     ntree = 1000, 
     mtry = 5, 
     na.action = 'na.impute',
     nodesize = 5)
    
    ## predict on the test set 
    preds = predict(rf, newdata=test_data)
    
    ## find the AUC of the classification: 
    x = preds
    aucS <- auc(x$yvar, if(!is.null(x$predicted.oob) && !all(is.na(x$predicted.oob))) x$predicted.oob else x$predicted)
    
    ## add the results to a data_frame 
    auc_results_exp2[i, 2]=aucS
}

[1] 1
[1] 2
[1] 3
[1] 4
[1] 5
[1] 6
[1] 7
[1] 8
[1] 9
[1] 10
[1] 11
[1] 12
[1] 13
[1] 14
[1] 15
[1] 16
[1] 17
[1] 18
[1] 19
[1] 20
[1] 21
[1] 22
[1] 23
[1] 24
[1] 25
[1] 26
[1] 27
[1] 28
[1] 29
[1] 30
[1] 31
[1] 32
[1] 33
[1] 34
[1] 35
[1] 36
[1] 37
[1] 38
[1] 39
[1] 40
[1] 41
[1] 42
[1] 43
[1] 44
[1] 45
[1] 46
[1] 47
[1] 48
[1] 49
[1] 50
[1] 51
[1] 52
[1] 53
[1] 54
[1] 55
[1] 56
[1] 57
[1] 58
[1] 59
[1] 60
[1] 61
[1] 62
[1] 63
[1] 64
[1] 65
[1] 66
[1] 67
[1] 68
[1] 69
[1] 70
[1] 71
[1] 72
[1] 73
[1] 74
[1] 75
[1] 76
[1] 77
[1] 78
[1] 79
[1] 80
[1] 81
[1] 82
[1] 83
[1] 84
[1] 85
[1] 86
[1] 87
[1] 88
[1] 89
[1] 90
[1] 91
[1] 92
[1] 93
[1] 94
[1] 95
[1] 96
[1] 97
[1] 98
[1] 99
[1] 100
[1] 101
[1] 102
[1] 103
[1] 104
[1] 105
[1] 106
[1] 107
[1] 108
[1] 109
[1] 110
[1] 111
[1] 112
[1] 113
[1] 114
[1] 115
[1] 116
[1] 117
[1] 118
[1] 119
[1] 120
[1] 121
[1] 122
[1] 123
[1] 124
[1] 125
[1] 126
[1] 127
[1] 128
[1] 129
[1] 130
[1] 131
[1] 132
[1] 133
[1] 134
[1] 135
[1] 136
[1] 137
[1] 138
[1] 

In [15]:
summary(auc_results_exp2[,2])

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
 0.0833  0.5000  0.5750  0.5789  0.6632  0.9500     507 

## Expt 3: Use same features from expt 1, use proximity imputation: 

In [10]:
auc_results_exp3 = data.frame('train_test_number'=1:250, 'AUC'=NA)

In [11]:
for (i in c(1:250)){
    print(i)
    ## define train and test paths
    train_path = paste('../../AllBiopsies/Data/7thParse_TrainTestSplit/Training_withNec/train_data_',as.character(i),'.csv', sep = "")
    test_path = paste('../../AllBiopsies/Data/7thParse_TrainTestSplit/Testing_withNec/test_data_',as.character(i),'.csv', sep = "")

    ## read in data
    train_data = read.csv(train_path)
    test_data = read.csv(test_path)

    ## remove "ind" pathology and empty string
    train_data = train_data[which(train_data$tumor_cell_evaluation != "Ind"),]
    train_data = train_data[which(train_data$tumor_cell_evaluation != ""),]
    test_data = test_data[which(test_data$tumor_cell_evaluation != "Ind"),]
    test_data = test_data[which(test_data$tumor_cell_evaluation != ""),]

    ## droplevels: 
    train_data$tumor_cell_evaluation = droplevels(as.factor(train_data$tumor_cell_evaluation))
    test_data$tumor_cell_evaluation = droplevels(as.factor(test_data$tumor_cell_evaluation))
    
    ## define features: 
    features_for_rf = colnames(train_data)[c(16:19, 21:22, 33, 34, 37:43, 46,66, 68,69)]
    outcome = colnames(train_data)[9]
    
    ## create formula 
    features.addsign <- paste(features_for_rf, collapse = "+")
    rf.formula <- as.formula(paste(outcome,features.addsign, sep = "~"))
    
    ## run random forest experiment
    rf = rfsrc(formula = rf.formula,
     data = train_data, 
     ntree = 1000, 
     mtry = 5, 
     nodesize = 8, 
     na.action = 'na.impute', 
     nimpute = 5, 
     proximity = TRUE)
    
    ## predict on the test set 
    preds = predict(rf, newdata=test_data)
    
    ## find the AUC of the classification: 
    x = preds
    aucS <- auc(x$yvar, if(!is.null(x$predicted.oob) && !all(is.na(x$predicted.oob))) x$predicted.oob else x$predicted)
    
    ## add the results to a data_frame 
    auc_results_exp3[i, 2]=aucS
}

[1] 1
[1] 2
[1] 3
[1] 4
[1] 5
[1] 6
[1] 7
[1] 8
[1] 9
[1] 10
[1] 11
[1] 12
[1] 13
[1] 14
[1] 15
[1] 16
[1] 17
[1] 18
[1] 19
[1] 20
[1] 21
[1] 22
[1] 23
[1] 24
[1] 25
[1] 26
[1] 27
[1] 28
[1] 29
[1] 30
[1] 31
[1] 32
[1] 33
[1] 34
[1] 35
[1] 36
[1] 37
[1] 38
[1] 39
[1] 40
[1] 41
[1] 42
[1] 43
[1] 44
[1] 45
[1] 46
[1] 47
[1] 48
[1] 49
[1] 50
[1] 51
[1] 52
[1] 53
[1] 54
[1] 55
[1] 56
[1] 57
[1] 58
[1] 59
[1] 60
[1] 61
[1] 62
[1] 63
[1] 64
[1] 65
[1] 66
[1] 67
[1] 68
[1] 69
[1] 70
[1] 71
[1] 72
[1] 73
[1] 74
[1] 75
[1] 76
[1] 77
[1] 78
[1] 79
[1] 80
[1] 81
[1] 82
[1] 83
[1] 84
[1] 85
[1] 86
[1] 87
[1] 88
[1] 89
[1] 90
[1] 91
[1] 92
[1] 93
[1] 94
[1] 95
[1] 96
[1] 97
[1] 98
[1] 99
[1] 100
[1] 101
[1] 102
[1] 103
[1] 104
[1] 105
[1] 106
[1] 107
[1] 108
[1] 109
[1] 110
[1] 111
[1] 112
[1] 113
[1] 114
[1] 115
[1] 116
[1] 117
[1] 118
[1] 119
[1] 120
[1] 121
[1] 122
[1] 123
[1] 124
[1] 125
[1] 126
[1] 127
[1] 128
[1] 129
[1] 130
[1] 131
[1] 132
[1] 133
[1] 134
[1] 135
[1] 136
[1] 137
[1] 138
[1] 

In [12]:
summary(auc_results_exp3[,2])

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
 0.3147  0.5313  0.5817  0.5792  0.6345  0.7986 

## Expt 4: MICE

In [34]:
library(mice)

“package ‘mice’ was built under R version 3.4.1”Loading required package: lattice

Attaching package: ‘mice’

The following objects are masked from ‘package:base’:

    cbind, rbind



In [31]:
getwd()

In [32]:
train_data = read.csv(paste('../Data/7thParse_TrainTestSplit/Training_withNec/train_data_', i, '.csv', sep="" ))

In [37]:
data.frame(index = 1:length(colnames(train_data)), colnames = colnames(train_data))


index,colnames
1,b_number
2,t_number
3,sf_number
4,current_scan_date
5,current_surgery_date
6,current_hist_grade
7,current_hist_type
8,roi.label
9,tumor_cell_evaluation
10,necrosis


In [35]:
for (i in c(200:500)){
    ## define the data path that you'd like to impute 
    train_data = read.csv(paste('../Data/7thParse_TrainTestSplit/Training_withNec/train_data_', i, '.csv', sep="" ))
    test_data = read.csv(paste('../Data/7thParse_TrainTestSplit/Testing_withNec/test_data_', i, ".csv", sep = ""))
    
    features_to_impute = colnames(train_data)[c(1,2,9,72, 75, 16:19, 21:22, 33, 34, 37:43, 46,66, 68,69)]
    
    ## impute the data
    train_imputed = mice(train_data[,features_to_impute], method = 'pmm', 
                    m = 5, maxit = 5, print = FALSE, seed = 1)
    train_imputed = complete(train_imputed)
    test_imputed = mice(train_data[,features_to_impute], method = 'pmm', 
                    m = 5, maxit = 5, print = FALSE, seed = 1)
    test_imputed = complete(test_imputed)
    
    ## write the files:
    write.csv(train_imputed, file =  paste("/home/sf673542/DataWrangling/AnalyzeData/AllBiopsies/Data/7thParse_TrainTestSplit/Training_PMMImpute_withNec/train_data_", i, ".csv", sep = ""), row.names = F)
    write.csv(train_imputed, file =  paste("/home/sf673542/DataWrangling/AnalyzeData/AllBiopsies/Data/7thParse_TrainTestSplit/Testing_PMMImpute_withNec/test_data_", i, ".csv", sep = ""), row.names = F)
}

“Number of logged events: 410”

In [36]:
auc_results_mice_expt = data.frame('train_test_number'=1:250, 'AUC'=NA)

In [19]:
for (i in 1:250){
    print(i)
    ## define train and test paths
    train_path = paste("/home/sf673542/DataWrangling/AnalyzeData/AllBiopsies/Data/7thParse_TrainTestSplit/Training_PMMImpute_withNec/train_data_", i, ".csv", sep = "")
    test_path = paste("/home/sf673542/DataWrangling/AnalyzeData/AllBiopsies/Data/7thParse_TrainTestSplit/Testing_PMMImpute_withNec/test_data_", i, ".csv", sep = "")

    ## read in data
    train_data = read.csv(train_path)
    test_data = read.csv(test_path)

    ## remove "ind" pathology and empty string
    train_data = train_data[which(train_data$tumor_cell_evaluation != "Ind"),]
    train_data = train_data[which(train_data$tumor_cell_evaluation != ""),]
    test_data = test_data[which(test_data$tumor_cell_evaluation != "Ind"),]
    test_data = test_data[which(test_data$tumor_cell_evaluation != ""),]

    ## droplevels: 
    train_data$tumor_cell_evaluation = droplevels(as.factor(train_data$tumor_cell_evaluation))
    test_data$tumor_cell_evaluation = droplevels(as.factor(test_data$tumor_cell_evaluation))

    ## define features: 
    features_for_rf = colnames(train_data)[6:length(colnames(train_data))]
    outcome = colnames(train_data)[3]

    ## create formula 
    features.addsign <- paste(features_for_rf, collapse = "+")
    rf.formula <- as.formula(paste(outcome,features.addsign, sep = "~"))

    ## run random forest experiment
    rf = rfsrc(formula = rf.formula,
     data = train_data, 
     ntree = 1000, 
     mtry = 5, 
     nodesize = 8, 
     na.action = 'na.omit', 
     nimpute = 5)

    ## predict on the test set 
    preds = predict(rf, newdata=test_data)

    ## find the AUC of the classification: 
    x = preds
    aucS <- auc(x$yvar, if(!is.null(x$predicted.oob) && !all(is.na(x$predicted.oob))) x$predicted.oob else x$predicted)

    ## add the results to a data_frame 
    auc_results_exp3[i, 2]=aucS
}

[1] 1


ERROR: Error in finalizeFormula(formulaPrelim, data): formula is misspecified, object current_hist_gradecurrent_hist_typeroi.labelnecrosisf8_delicatef8_simplef8_complexmib_1bx_pure_treatment_effectnt1dnev1.1nev2.1nev3.1nevrad.1nadc.2nfa.2nev1.2nev2.2nev3.2nevrad.2phn_nparrecovn_nparnlipnlacnewdatacommentsnotes.imaging_codeperf_quantcbv_nlinspec_quantinclude_anatinclude_diffu1000include_diffu2000include_diffu_allinclude_perfinclude_specinclude_one_advanced_modinclude_all_advanced_modsX.CELX.NELX.NECsumin_T2allin_ROIdesired_histno_ffpewaiting_on_path not found


In [43]:
summary(auc_results_exp3[133:dim(auc_results_exp3)[1],2])

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
 0.3323  0.5252  0.5803  0.5749  0.6218  0.7986 

## Expt 4: missForest: 

In [4]:
auc_results_missForest = data.frame('train_test_number'=1:250, 'AUC'=NA)

In [22]:
for (i in 1:250){
    ## define train and test paths
    train_path = paste("/home/sf673542/DataWrangling/AnalyzeData/AllBiopsies/Data/7thParse_TrainTestSplit/Training_withNec/train_data_", i, ".csv", sep = "")
    test_path = paste("/home/sf673542/DataWrangling/AnalyzeData/AllBiopsies/Data/7thParse_TrainTestSplit/Testing_withNec/test_data_", i, ".csv", sep = "")

    ## read in data
    train_data = read.csv(train_path)
    test_data = read.csv(test_path)

    ## remove "ind" pathology and empty string
    train_data = train_data[which(train_data$tumor_cell_evaluation != "Ind"),]
    train_data = train_data[which(train_data$tumor_cell_evaluation != ""),]
    test_data = test_data[which(test_data$tumor_cell_evaluation != "Ind"),]
    test_data = test_data[which(test_data$tumor_cell_evaluation != ""),]

    ## droplevels: 
    train_data$tumor_cell_evaluation = droplevels(as.factor(train_data$tumor_cell_evaluation))
    test_data$tumor_cell_evaluation = droplevels(as.factor(test_data$tumor_cell_evaluation))

    ## define features: 
    features_to_impute = colnames(train_data)[c(1,2,9,72, 75, 16:19, 21:22, 33, 34, 37:43, 46,66, 68,69)]
    outcome = "tumor_cell_evaluation"

    ## create formula 
    features.addsign <- paste(features_to_impute, collapse = "+")
    rf.formula <- as.formula(paste(outcome,features.addsign, sep = "~"))

    ## run random forest experiment
    impute.train = impute(rf.formula,
                   data = train_data, 
                   ntree = 500,
                   nodesize = 8,
                   mtry = 5, 
                   mf.q = 0.1, 
                   splitrule = "random")

    impute.test = impute(rf.formula,
                   data = test_data, 
                   ntree = 500,
                   nodesize = 8,
                   mtry = 5, 
                   mf.q = 0.1, 
                   splitrule = "random")
    
    rf = rfsrc(formula = rf.formula, 
      data = impute.train, 
      ntree =500, 
      nodesize = 8, 
      mtry = 5)

    preds = predict(rf, newdata=impute.test)

    x = preds
    aucS <- auc(x$yvar, if(!is.null(x$predicted.oob) && !all(is.na(x$predicted.oob))) x$predicted.oob else x$predicted)

    ## add the results to a data_frame 
    auc_results_missForest[i, 2]=aucS

}


	 iteration 1 
         >>  err = 0.438 ,  drop = Inf 
	 iteration 2 
         >>  err = 0.352 ,  drop = 0.0855 
	 iteration 3 
         >>  err = 0.382 ,  drop = -0.03 
	 iteration 1 
         >>  err = 0.474 ,  drop = Inf 
	 iteration 2 
         >>  err = 0.404 ,  drop = 0.0702 
	 iteration 3 
         >>  err = 0.476 ,  drop = -0.0717 


In [23]:
features_to_impute

In [24]:
rf = rfsrc(formula = rf.formula, 
      data = impute.train, 
      ntree =500, 
      nodesize = 8, 
      mtry = 5
)

preds = predict(rf, newdata=impute.test)

x = preds
aucS <- auc(x$yvar, if(!is.null(x$predicted.oob) && !all(is.na(x$predicted.oob))) x$predicted.oob else x$predicted)

## add the results to a data_frame 
auc_results_missForest[i, 2]=aucS

In [25]:
auc_results_missForest

train_test_number,AUC
1,0.9291668
2,
3,
4,
5,
6,
7,
8,
9,
10,
