# Extended figure 2d part 2

In [None]:
#Install required libraries and load data
library(rpart) # version 4.1-15
library(rattle) # version 5.4.0
library(rpart.plot) # version 3.0.9
library(RColorBrewer) # version 1.1-2
library(caret) # version 6.0-86
library(rlang) # version 0.4.10
library(reshape2) # version 1.4.3 
library(scales) # version 1.1.0 
library(dplyr)
library(repr)
library(data.table)

In [None]:
# Read the antibody (Ab) matrix for the FBM MNC CITE-seq data into `train`.
# Author's note: this is DSB normalised data, with each cell type subset to k=10 (to account for class imbalance).
train <- read.csv(
  file = '/home/jovyan/YS_project/YS_CiteSeq_final_script_templates/Decision_tree/protein_train_test_new_anno_20220331.csv',
  header = TRUE,
  sep = ",",
  row.names = "X"
)

In [None]:
# Display the training data frame for inspection
train

In [None]:
# Grow a deliberately unrestricted classification tree that predicts cell.labels
# from every other column of `train`; it is pruned in later cells.
mytree <- rpart(
  cell.labels ~ ., 
  data = train, 
  method = "class", 
  cp = -1, # negative complexity ensures tree growth isn't terminated in favour of reduced complexity. Can lead to overfitting, but the tree can be pruned later
  minsplit = 2, # minimum n(obs) in a node before a split is attempted (this is pre-pruning). Set to 2 so that tree growth is unrestricted
  xval = 100 # number of cross-validation folds. NOTE(review): the previous comment described this as 10-fold (the rpart.control default), but the value passed is 100 - confirm which is intended
)

In [None]:
# Display the fitted (unpruned) rpart object
mytree

In [None]:
# Turn the tree's complexity-parameter table into a data frame with fixed column names,
# then add the sum CP + xerror, which is used later to rank pruning levels.
df <- setNames(
  as.data.frame(mytree$cptable),
  c("CP", "nsplit", "relError", "xerror", "xstd")
)
df$CP_plus_xerror <- df$CP + df$xerror
df

In [None]:
# Plot the cross-validation results of the tree against the complexity parameter.
# Note: the tree building has some stochasticity, so this plot can change between runs.
plotcp(mytree)

# Pruning - off visual 
- inputted 23 celltypes

In [None]:
# tune std

# determine best and then take step up if not suitable

In [None]:
# Keep a copy of the current cp table so it can be restored after df is filtered
df_backup <- data.frame(df)

In [None]:
# Display the backup copy of the cp table
df_backup

In [None]:
# Restore df from the backup so the pruning steps start from the same table each time
df <- data.frame(df_backup)

# Attempt auto pruning

In [None]:
# Step one: collect the CP values whose cross-validated error is below the
# smallest (xerror + xstd) found in the table, then keep only those rows.
# NOTE(review): the threshold is min(xerror + xstd) over all rows, which is not
# the same as min(xerror) plus the xstd of that row - confirm this is intended.
pruneLevel <- df$CP[df$xerror < min(df$xerror + df$xstd)]
df <- df[df$CP %in% pruneLevel, ]
df

In [None]:
# Step two: keep the (up to) 3 pruning levels with the smallest (CP + xerror).
# The sort key is extracted with [[ ]] so order() receives a plain numeric vector;
# passing a one-column data frame makes order() call xtfrm() on a data frame.
# head() replaces the 0-based index 0:3 (R is 1-indexed, the 0 was silently dropped)
# and does not pad the result with NA rows when fewer than 3 levels remain.
df <- head(df[order(df[["CP_plus_xerror"]]), ], 3)
df

# nsplit = 44: the CP makes no sense; nsplit = 19 gives fewer cell types than were inputted, so going for nsplit = 27 as the next suitable nsplit

In [None]:
# Step three: select the pruning level with the lowest nsplit at which all classes are
# present as terminal nodes in the decision tree.
# The nsplit value below is chosen by hand after inspecting the candidate trees.
# NOTE(review): an earlier version of this comment cited nsplit=32 and nsplit=48, which
# does not match the value used here - confirm 27 is the intended level.
df <- as.data.frame(mytree$cptable)
colnames(df) <- c("CP", "nsplit", "relError", "xerror", "xstd")
auto_cp <- df$CP[df$nsplit == 27]
# Fail here, with a clear message, rather than passing an empty or ambiguous cp to prune()
if (length(auto_cp) != 1L) {
  stop(
    "Expected exactly one cptable row with nsplit == 27, found ", length(auto_cp),
    call. = FALSE
  )
}
auto_cp

In [None]:
# Prune the tree at the selected complexity and print the cp table of the result.
# Note: `mytree` is overwritten with the pruned tree from here on.
mytree <- prune(tree = mytree, cp = auto_cp)
printcp(mytree)

In [None]:
# Show the complexity-parameter table of the current (pruned) tree with fixed column names
df <- setNames(
  as.data.frame(mytree$cptable),
  c("CP", "nsplit", "relError", "xerror", "xstd")
)
df

In [None]:
# Visually inspect the feature (Ab) importance in the DT model at the selected complexity.
# Build a data frame with explicitly named columns so that aes() refers to columns of
# `df` instead of reaching back into the global `mytree` object with `$`.
# The default axis titles now come from these column names.
df <- data.frame(
  ab = names(mytree$variable.importance),
  importance = unname(mytree$variable.importance)
)

# Plot the Abs (ordered by importance) as a horizontal barplot
options(repr.plot.width = 100, repr.plot.height = 100)
ggplot(df, aes(x = reorder(ab, importance), y = importance)) +
  geom_bar(stat = "identity", width = 0.5) +
  theme(
    axis.text.y = element_text(hjust = 1, size = 70),
    axis.title.y = element_text(size = 70, face = "bold"),
    axis.title.x = element_text(size = 70, face = "bold")
  ) +
  coord_flip()

In [None]:
# Evaluate how generalisable the DT model is (incl. purity and recall) using a test dataset.

# Read the test data
test_data <- read.csv(
  file = '/home/jovyan/YS_project/YS_CiteSeq_final_script_templates/Decision_tree/protein_test_test_new_anno_20220331.csv',
  header = TRUE,
  sep = ",",
  row.names = "X"
)
# Show the first five columns
test_data[seq_len(5)]

In [None]:
# Predict a class for every row of the test data, then re-express the predictions as a
# factor whose levels are the labels observed in test_data$cell.labels.
preds <- predict(mytree, newdata = test_data, type = "class")
preds <- factor(preds, levels = levels(factor(test_data$cell.labels)))

In [None]:
#table_mat <- table(test_data$cell.labels, predict_unseen)
#table_mat

In [None]:
# Display the predicted labels
preds

In [None]:
# Cell types in the order they should appear on the confusion matrix axes
celltype_list <- c(
  "HSPC1", "HSPC2", "CMP", "MEMP",
  "lymphoid progenitor", "lymphoid- NK/ILC", "lymphoid- B lin", "Lymphoid",
  "pDC precursor",
  "Monocyte_0", "Monocyte_1", "Macrophage", "Microglia", "Mast_cell",
  "early MK", "MK", "early erythroid", "Erythroid",
  "Endothelium", "Fibroblast", "Smooth_Muscle", "Mesothelium", "Endoderm"
)

In [None]:
# Convert the true test labels to a factor (no levels argument is given, so factor()
# derives the levels from the labels themselves), then display it
anno <- factor(test_data$cell.labels)
anno

In [None]:
# Number of predicted labels (should equal the number of true labels)
length(preds)

In [None]:
# Number of true labels (should equal the number of predicted labels)
length(anno)

In [None]:
# Reorder (not rename) the factor levels so the confusion matrix follows celltype_list.
# Assigning with `levels(x) <- celltype_list` relabels the existing levels position by
# position, which silently moves cells to a different cell type whenever the current
# level order differs from celltype_list. Rebuilding the factor with
# factor(..., levels = ) keeps every cell's label and changes only the level order.
unknown_labels <- setdiff(c(levels(preds), levels(anno)), celltype_list)
if (length(unknown_labels) > 0) {
  # Stop instead of letting unmatched labels turn into NA without notice
  stop(
    "Labels not found in celltype_list: ", paste(unknown_labels, collapse = ", "),
    call. = FALSE
  )
}
preds <- factor(as.character(preds), levels = celltype_list)
anno <- factor(as.character(anno), levels = celltype_list)

In [None]:
# To evaluate both true positives (sensitivity) and true negatives (specificity), generate a confusion matrix.
# (Author's note: default beta=1.)
c_mat <- confusionMatrix(data = preds, reference = anno)

# Convert the contingency table to long format for plotting.
# reshape2::melt is called explicitly: data.table is attached after reshape2, so a bare
# melt() resolves to data.table::melt, which only redirects data frames to reshape2
# with a warning. The id columns are named explicitly rather than guessed by melt().
confusion_matrix_df <- reshape2::melt(
  as.data.frame(c_mat$table),
  id.vars = c("Prediction", "Reference")
)
head(confusion_matrix_df)

In [None]:
# Draw the confusion matrix as a heatmap (save as pdf in R)
options(repr.plot.width = 15, repr.plot.height = 15)
ggplot(confusion_matrix_df, aes(x = Prediction, y = Reference)) +
  # tile fill colour is mapped from the value column
  geom_tile(aes(fill = value)) +
  scale_fill_gradient2(low = "white", high = muted("midnightblue"), midpoint = 0) +
  labs(
    title = "Confusion matrix for CITE-seq decision tree",
    x = "Test data predicted label based on DT",
    y = "Test data actual label",
    fill = ""
  ) +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1, vjust = 1, size = 15, face = "bold"),
    plot.title = element_text(size = 30, face = "bold"),
    axis.text.y = element_text(size = 20, face = "bold"),
    legend.title = element_text(face = "bold", size = 8)
  )


# Plot tree

In [None]:
# Draw the pruned tree with branch = 0
rpart.plot(
  mytree,
  type = 3, extra = FALSE, under = FALSE,
  clip.right.labs = FALSE, box.palette = 0, cex = NULL,
  branch = 0, compress = TRUE, ycompress = TRUE, Margin = 0
)

In [None]:
# Draw the pruned tree with branch = 1
rpart.plot(
  mytree,
  type = 3, extra = FALSE, under = FALSE,
  clip.right.labs = FALSE, box.palette = 0, cex = NULL,
  branch = 1, compress = TRUE, ycompress = TRUE, Margin = 0
)

In [None]:
# Draw the pruned tree with branch = 1 and add.labs = FALSE
rpart.plot(
  mytree,
  type = 3, extra = FALSE, under = FALSE,
  clip.right.labs = FALSE, box.palette = 0, cex = NULL,
  branch = 1, compress = TRUE, ycompress = TRUE, Margin = 0,
  add.labs = FALSE
)

In [None]:
#jpeg('rplot_decision_tree_test.jpg')
#rpart.plot(mytree, type = 3, clip.right.labs = FALSE, 
#          branch = 1, under = FALSE, extra=FALSE, box.palette=0, cex=NULL, compress = TRUE, ycompress = TRUE, Margin=0,)
#dev.off()

In [None]:
# Write the tree plot (with labels) to a PDF file in the working directory
pdf(file = 'rplot_decision_tree_with_labels_20220401.pdf')
rpart.plot(
  mytree,
  type = 3, extra = FALSE, under = FALSE,
  clip.right.labs = FALSE, box.palette = 0, cex = NULL,
  branch = 1, compress = TRUE, ycompress = TRUE, Margin = 0
)
dev.off()

In [None]:
# Write the tree plot (add.labs = FALSE) to a PDF file in the working directory
pdf(file = 'rplot_decision_tree_without_labels_20220401.pdf')
rpart.plot(
  mytree,
  type = 3, extra = FALSE, under = FALSE,
  clip.right.labs = FALSE, box.palette = 0, cex = NULL,
  branch = 1, compress = TRUE, ycompress = TRUE, Margin = 0,
  add.labs = FALSE
)
dev.off()