## Introduction: 

This notebook will have all samples, including necrosis. We will use all of the samples with complete data: 

In [7]:
library(randomForestSRC)
options(repr.matrix.max.rows=600, repr.matrix.max.cols=200)
library(randomForest)
#install.packages("Metrics")
library(Metrics)
source('AUC_workhorse.R')

In [8]:
require(randomForestSRC)

In [9]:
set.seed(101)
i = sample(1000, 1)

In [10]:
train_path = paste('../../AllBiopsies/Training_withNec/train_data_',as.character(i),'.csv', sep = "")
test_path = paste('../../AllBiopsies/Testing_withNec/test_data_',as.character(i),'.csv', sep = "")

In [11]:
train_data = read.csv(train_path)
test_data = read.csv(test_path)

In [12]:
dim(train_data)
dim(test_data)

In [13]:
data.frame(index = 1:length(colnames(train_data)), colnames = colnames(train_data))

index,colnames
1,b_number
2,t_number
3,sf_number
4,current_scan_date
5,current_surgery_date
6,current_hist_grade
7,current_hist_type
8,roi.label
9,tumor_cell_evaluation
10,necrosis


In [14]:
features_for_rf = colnames(train_data)[c(16:19, 21:22, 33, 34, 37:43, 46,66, 68,69)]
length(features_for_rf)
outcome = colnames(train_data)[9]

In [15]:
train_data = train_data[,c(outcome, features_for_rf)]
test_data = test_data[,c(outcome, features_for_rf)]

In [16]:
train_data = train_data[which(train_data$tumor_cell_evaluation != "Ind"),]
dim(train_data)
test_data = test_data[which(test_data$tumor_cell_evaluation != "Ind"),]
dim(test_data)

In [17]:
train_data = droplevels.data.frame(train_data)
test_data = droplevels.data.frame(test_data)

In [18]:
features.addsign <- paste(features_for_rf, collapse = "+")
rf.formula <- as.formula(paste(outcome,features.addsign, sep = "~"))

In [19]:
rf = rfsrc(formula = rf.formula,
     data = train_data, 
     ntree = 1000, 
     mtry = 5, 
     nodesize = 10)

In [20]:
print.rfsrc(rf)

                         Sample size: 76
           Frequency of class labels: NA, 14, 13, 20, 29
                     Number of trees: 1000
           Forest terminal node size: 10
       Average no. of terminal nodes: 7.094
No. of variables tried at each split: 5
              Total no. of variables: 19
       Resampling used to grow trees: swr
    Resample size used to grow trees: 76
                            Analysis: RF-C
                              Family: class
                      Splitting rule: gini *random*
       Number of random split points: 10
              Normalized brier score: 88.31 
                                 AUC: 56.81 
                          Error rate: 0.66, NA, 0.64, 1, 1, 0.28

Confusion matrix:

          predicted
  observed   0 1 2  3 class.error
           0 0 0 0  0         NaN
         0 0 5 0 5  4      0.6429
         1 0 2 0 2  9      1.0000
         2 0 3 1 0 16      1.0000
         3 0 3 0 5 21      0.2759

	Overall error rate: 65.79% 


In [21]:
preds = predict(rf, newdata=test_data)

In [22]:
x = preds
aucS <- auc(x$yvar, if(!is.null(x$predicted.oob) && !all(is.na(x$predicted.oob))) x$predicted.oob else x$predicted)
aucS

In [39]:
auc_results = data.frame('train_test_number'=1:1000, 'AUC'=NA)

In [41]:
for (i in c(1:11, 13:19, 21:1000)){
    print(i)
    ## define train and test paths
    train_path = paste('../../AllBiopsies/Training_withNec/train_data_',as.character(i),'.csv', sep = "")
    test_path = paste('../../AllBiopsies/Testing_withNec/test_data_',as.character(i),'.csv', sep = "")
    
    ## read in data
    train_data = read.csv(train_path)
    test_data = read.csv(test_path)
    
    ## remove "ind" pathology and empty string
    train_data = train_data[which(train_data$tumor_cell_evaluation != "Ind"),]
    train_data = train_data[which(train_data$tumor_cell_evaluation != ""),]
    test_data = test_data[which(test_data$tumor_cell_evaluation != "Ind"),]
    test_data = test_data[which(test_data$tumor_cell_evaluation != ""),]
    
    ## droplevels: 
    train_data$tumor_cell_evaluation = droplevels(train_data$tumor_cell_evaluation)
    test_data$tumor_cell_evaluation = droplevels(test_data$tumor_cell_evaluation)
    
    ## define features: 
    features_for_rf = colnames(train_data)[c(16:19, 21:22, 33, 34, 37:43, 46,66, 68,69)]
    outcome = colnames(train_data)[9]
    
    ## create formula 
    features.addsign <- paste(features_for_rf, collapse = "+")
    rf.formula <- as.formula(paste(outcome,features.addsign, sep = "~"))
    
    ## run random forest experiment
    rf = rfsrc(formula = rf.formula,
     data = train_data, 
     ntree = 1000, 
     mtry = 5, 
     nodesize = 10)
    
    ## predict on the test set 
    preds = predict(rf, newdata=test_data)
    
    ## find the AUC of the classification: 
    x = preds
    aucS <- auc(x$yvar, if(!is.null(x$predicted.oob) && !all(is.na(x$predicted.oob))) x$predicted.oob else x$predicted)
    
    ## add the results to a data_frame 
    auc_results[i, 2]=aucS
}

[1] 1
[1] 2
[1] 3
[1] 4
[1] 5
[1] 6
[1] 7
[1] 8
[1] 9
[1] 10
[1] 11
[1] 13
[1] 14
[1] 15
[1] 16
[1] 17
[1] 18
[1] 19
[1] 21
[1] 22
[1] 23
[1] 24


ERROR: Error in UseMethod("droplevels"): no applicable method for 'droplevels' applied to an object of class "c('integer', 'numeric')"


In [23]:
i = 24

In [27]:
train_path = paste('../../AllBiopsies/Training_withNec/train_data_',as.character(i),'.csv', sep = "")
test_path = paste('../../AllBiopsies/Testing_withNec/test_data_',as.character(i),'.csv', sep = "")
train_data = read.csv(train_path)
test_data = read.csv(test_path)

In [42]:
table(train_data$tumor_cell_evaluation)


    0  1  2  3 
 3 78 46 91 95 

In [43]:
table(test_data$tumor_cell_evaluation)


 0  1  2  3 
26 12 28 44 