-
Notifications
You must be signed in to change notification settings - Fork 0
/
RF_feature_importance_rank.R
83 lines (70 loc) · 3.97 KB
/
RF_feature_importance_rank.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
require(ranger)
require(dynamicTreeCut)
setwd("~/Box/SILKY/Keting/Projects/Fuyuan")
data <- read.csv("pairwise comparison of TE sequences encoded with labels.csv", row.names=1)
#substrate <- read.csv("TE substrate specificities.csv", row.names=1)
fa <- read.csv("TE fatty acid profiles.csv", row.names=1)
d <- as.matrix(dist(scale(fa)))
hc <- hclust(d=dist(scale(fa)), method="ward.D"); den.h <- as.dendrogram(hc)
cls.h <- cutreeDynamic(hc, method="hybrid",distM=d, cutHeight=50, minClusterSize=10)
names(cls.h) <- hc$labels
cls.h[cls.h==cls.h["rTE28"]] <- 33
cls.h[cls.h==cls.h["rTE40"]] <- 22
cls.h[cls.h==cls.h["rTE48"]] <- 11
cls.h[cls.h==33] <- 3
cls.h[cls.h==22] <- 2
cls.h[cls.h==11] <- 1
### redefine the response for each instance
### each instance is a pairwise comparison of two TEs
### the features are the sequence variation between two TEs at each amino acid position (0, same amino acid; 1, different amino acids)
### the response is defined based on the clustering analysis of the fatty acid profile (1, two TEs belonging to the same cluster; 0, different clusters)
label = rep(1, nrow(data)); names(label)=rownames(data)
for(i in 1:length(label)){
nms <- strsplit(names(label)[i], "->")[[1]]
c <- cls.h[nms]
if(c[1]!=c[2]) label[i]=0
}
### random forest with new data label
data1 <- data; data1$label = label
##### prepare the output matrix
##### the random forest classifier are implemented 10 times to account for the randomness in the classifier construction
##### imp.corr.matrix records the importance score of all features for each classifier
##### pvalue.matrix records the corresponding pvalues
##### pred.err compares the predictive performance using the old datasets (with old response labels) and the new datasets (with the newly defined response labels)
##### the results of pred.err were not used in the manuscripts; it is for other in-house training purposes, and can be ignore for this study
#imp.matrix <- matrix(0, 10, 351, dimnames=list(paste0("RF",1:10), colnames(data)[1:351]))
imp.corr.matrix <- matrix(0, 10, 351, dimnames=list(paste0("RF",1:10), colnames(data)[1:351]))
pvalue.matrix <- matrix(0, 10, 351, dimnames=list(paste0("RF",1:10), colnames(data)[1:351]))
pred.err <- matrix(0, 2, 10, dimnames=list(c("Original", "Transformed"), paste0("Run",1:10)))
pb <- txtProgressBar(min=1, max=10, style=3)
for(i in 1:10){
#### here two classifiers are established, one using the old response labels (rf0), and the other one uses the new response labels (rf)
#### only the results from rf are used in the publication
rf0 <- ranger(label~., data=data, importance="impurity_corrected", classification=T)
rf <- ranger(label~., data=data1, importance="impurity_corrected", classification=T)
imp.corr.matrix[i,] <- importance(rf)
pvalue.matrix[i,] <- importance_pvalues(rf)[,2]
pred.err[1,i] <- rf0$prediction.error
pred.err[2,i] <- rf$prediction.error
setTxtProgressBar(pb, i)
}
##### set the zero pvalue to a minimum value for pvalue correction
pvalue.matrix[pvalue.matrix==0] <- 1e-167
pvalue.corr <- apply(pvalue.matrix, 1, function(x)p.adjust(x, method="BH"))
sig.p <- apply(pvalue.corr, 1, function(x)sum(x<0.001))
pvalue.corr <- cbind(pvalue.corr, sig.p)
##### for each feature, select the maximum pvalue out of ten classifiers as the final pvalue
fin.p <- apply(pvalue.corr, 1, function(x){p=x[1:10]; max(p)})
pvalue.corr <- cbind(pvalue.corr, fin.p)
imp.corr.matrix <- t(imp.corr.matrix)
write.table(pvalue.corr,"pvalues_10Runs_RF.txt")
write.table(imp.corr.matrix, "importance_score_10Runs_RF.txt")
##### for each feature, the final importance is the average among importance scores derived from 10 classifiers
##### the final pvalue is the maximum pvalue out of ten classifiers
imp <- apply(imp.corr.matrix, 1, mean)
pvalues <- fin.p
##### an importance rank was made based on the final importance score
pos <- data.frame(imp, pvalues)
pos <- pos[order(pos$imp, decreasing=T),]
rank <- rownames(pos)
rownames(imp.corr.matrix) <- rownames(pvalue.corr) <- gsub("X","", rownames(imp.corr.matrix))