#' Naive Bayes classifier for texts
#'
#' Fits a Naive Bayes model to a dfm of word counts and a vector of training
#' class labels.
#'
#' This naive Bayes model works on word counts, with smoothing.
#' @param x the dfm on which the model will be fit. Does not need to contain
#' only the training documents.
#' @param y vector of training labels associated with each document in
#' \code{x}; documents whose label is \code{NA} are not used for training.
#' (These will be converted to factors if not already factors.)
#' @param smooth smoothing parameter for feature counts by class
#' @param prior prior distribution on texts, see details
#' @param distribution count model for text features, can be \code{multinomial}
#' or \code{Bernoulli}
#' @param ... more arguments passed through
#' @return A list of return values, consisting of:
#' @return \item{call}{original function call}
#' @return \item{PwGc}{probability of the word given the class (empirical
#' likelihood)}
#' @return \item{Pc}{class prior probability}
#' @return \item{PcGw}{posterior class probability given the word}
#' @return \item{Pw}{baseline probability of the word}
#' @return \item{data}{list consisting of the input dfm \code{x} and the class
#' label vector \code{y}}
#' @return \item{distribution}{the distribution argument}
#' @return \item{prior}{argument passed as a prior}
#' @return \item{smooth}{smoothing parameter}
#' @section Predict Methods: A \code{predict} method is also available for a
#' fitted Naive Bayes object, see \code{\link{predict.textmodel_NB_fitted}}.
#' @author Kenneth Benoit
#' @examples
#' ## Example from 13.1 of _An Introduction to Information Retrieval_
#' trainingset <- as.dfm(matrix(c(1, 2, 0, 0, 0, 0,
#' 0, 2, 0, 0, 1, 0,
#' 0, 1, 0, 1, 0, 0,
#' 0, 1, 1, 0, 0, 1,
#' 0, 3, 1, 0, 0, 1),
#' ncol=6, nrow=5, byrow=TRUE,
#' dimnames = list(docs = paste("d", 1:5, sep = ""),
#' features = c("Beijing", "Chinese", "Japan", "Macao",
#' "Shanghai", "Tokyo"))))
#' trainingclass <- factor(c("Y", "Y", "Y", "N", NA), ordered = TRUE)
#' ## replicate IIR p261 prediction for test set (document 5)
#' (nb.p261 <- textmodel_NB(trainingset, trainingclass))
#' predict(nb.p261, newdata = trainingset[5, ])
#'
#' # contrast with other priors
#' predict(textmodel_NB(trainingset, trainingclass, prior = "docfreq"))
#' predict(textmodel_NB(trainingset, trainingclass, prior = "termfreq"))
#'
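#' # Bernoulli count model (illustrative addition, not part of the IIR example)
#' predict(textmodel_NB(trainingset, trainingclass, distribution = "Bernoulli"))
#'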
#' @export
textmodel_NB <- function(x, y, smooth = 1, prior = c("uniform", "docfreq", "termfreq"),
distribution = c("multinomial", "Bernoulli"), ...) {
call <- match.call()
prior <- match.arg(prior)
distribution <- match.arg(distribution)
y <- factor(y) # no effect if already a factor
## distribution: convert counts to Boolean (0/1) for the Bernoulli model
## (done before subsetting so that the training set is converted as well)
if (distribution == "Bernoulli")
x <- tf(x, "boolean")
else
if (distribution != "multinomial")
stop("Distribution can only be multinomial or Bernoulli.")
x.trset <- x[which(!is.na(y)), ]
y.trclass <- y[!is.na(y)]
types <- colnames(x)
docs <- rownames(x)
levs <- levels(y.trclass)
## prior
if (prior=="uniform") {
Pc <- rep(1/length(levs), length(levs))
names(Pc) <- levs
} else if (prior=="docfreq") {
Pc <- prop.table(table(y.trclass))
Pc_names <- names(Pc)
attributes(Pc) <- NULL
names(Pc) <- Pc_names
} else if (prior=="termfreq") {
# weighted means the priors are by total words in each class
# (the probability that any given word is in a particular class)
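# i.e. Pc[k] = total token count of training documents in class k /
# total token count of all training documents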
temp <- x.trset
rownames(temp) <- y.trclass
colnames(temp) <- rep("all_same", nfeature(temp))
temp <- dfm_compress(temp)
Pc <- prop.table(as.matrix(temp))
Pc_names <- rownames(Pc)
attributes(Pc) <- NULL
names(Pc) <- Pc_names
} else stop("Prior must be either uniform (default), docfreq, or termfreq")
## multinomial likelihood: class x words, rows sum to 1
# d <- t(sapply(split(as.data.frame(x.trset), y.trclass), colSums))
# combine all of the class counts
rownames(x.trset) <- y.trclass
d <- dfm_compress(x.trset, margin = "both")
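# add "smooth" to every count (Laplace smoothing when smooth = 1), then
# normalize rows to get the class-conditional word probabilities P(w|c)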
PwGc <- rowNorm(d + smooth)
# order Pc so that these are the same order as rows of PwGc
Pc <- Pc[rownames(PwGc)]
## posterior: class x words, cols sum to 1
PcGw <- colNorm(PwGc * base::outer(Pc, rep(1, ncol(PwGc))))
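# Bayes' rule: P(c|w) is proportional to P(w|c) * P(c), normalized over classes
# for each word so that columns sum to 1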
## P(w)
Pw <- t(PwGc) %*% as.numeric(Pc)
ll <- list(call=call, PwGc=PwGc, Pc=Pc, PcGw = PcGw, Pw = Pw,
data = list(x=x, y=y),
distribution = distribution, prior = prior, smooth = smooth)
class(ll) <- c("textmodel_NB_fitted", class(ll))
return(ll)
}
#' prediction method for Naive Bayes classifier objects
#'
#' Implements class predictions from a trained Naive Bayes text model.
#' @param object a fitted Naive Bayes textmodel
#' @param newdata dfm on which prediction should be made
#' @param ... not used
#' @return A list of predicted quantities, consisting of:
#' @return \item{log.posterior.lik}{matrix of log posterior likelihoods,
#' documents by class}
#' @return \item{posterior.prob}{matrix of posterior class probabilities,
#' documents by class}
#' @return \item{nb.predicted}{character vector of predicted (maximum a
#' posteriori) classes}
#' @return \item{Pc}{class prior probabilities from the fitted model}
#' @return \item{classlabels}{the class labels}
#' @return \item{call}{the matched call}
#' @author Kenneth Benoit
#' @rdname predict.textmodel
#' @examples
#' (nbfit <- textmodel_NB(data_dfm_LBGexample, c("A", "A", "B", "C", "C", NA)))
#' (nbpred <- predict(nbfit))
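#' # posterior class probabilities for each document (illustrative)
#' round(nbpred$posterior.prob, 4)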
#' @keywords internal textmodel
#' @export
predict.textmodel_NB_fitted <- function(object, newdata = NULL, ...) {
call <- match.call()
if (is.null(newdata)) newdata <- object$data$x
# remove any words for which zero probabilities exist in training set --
# would happen if smooth=0
# the condition assigns the index of zero occurring words to vector "notinref" and only
# trims the objects if this index has length>0
if (length(notinref <- which(colSums(object$PwGc) == 0))) {
object$PwGc <- object$PwGc[, -notinref, drop = FALSE]
object$PcGw <- object$PcGw[, -notinref, drop = FALSE]
object$Pw <- object$Pw[-notinref, , drop = FALSE]
object$data$x <- object$data$x[, -notinref]
newdata <- newdata[, -notinref]
}
# make sure the feature set is ordered the same in the test and training sets (#490)
if (!identical(colnames(object$PcGw), colnames(newdata))) {
if (setequal(colnames(object$PcGw), colnames(newdata))) {
# same feature names but in a different order: reorder newdata to match
newdata <- newdata[, colnames(object$PcGw)]
} else {
stop("feature set in newdata different from that in training set")
}
}
# log P(d|c) class conditional document likelihoods
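# (for the multinomial model this is sum over words of count * log P(w|c),
# i.e. the log likelihood up to the multinomial coefficient, which does not
# affect the class comparison)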
log.lik <- newdata %*% t(log(object$PwGc))
# weight by class priors
log.posterior.lik <- t(apply(log.lik, 1, "+", log(object$Pc)))
# predict MAP class
nb.predicted <- colnames(log.posterior.lik)[apply(log.posterior.lik, 1, which.max)]
## now compute class posterior probabilities
# initialize posterior probabilities matrix
posterior.prob <- matrix(NA, ncol = ncol(log.posterior.lik),
nrow = nrow(log.posterior.lik),
dimnames = dimnames(log.posterior.lik))
# compute posterior probabilities
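# for each class j, P(c_j | d) = exp(lpl_j) / sum_k exp(lpl_k), computed as
# 1 / (1 + sum_{k != j} exp(lpl_k - lpl_j)) to avoid overflowing exp()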
for (j in seq_len(ncol(log.posterior.lik))) {
base.lpl <- log.posterior.lik[, j]
posterior.prob[, j] <- 1 / (1 + rowSums(exp(log.posterior.lik[, -j, drop = FALSE] - base.lpl)))
}
result <- list(log.posterior.lik = log.posterior.lik,
posterior.prob = posterior.prob,
nb.predicted = nb.predicted,
Pc = object$Pc,
classlabels = names(object$Pc),
call = call)
class(result) <- c("textmodel_NB_predicted", class(result))
result
}
# not used any more
logsumexp <- function(x) {
xmax <- which.max(x)
log1p(sum(exp(x[-xmax] - x[xmax]))) + x[xmax]
}
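# e.g. logsumexp(c(-1000, -1001)) returns approximately -999.69, whereas the
# naive log(sum(exp(c(-1000, -1001)))) underflows to -Inf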
# @rdname print.textmodel
#' @export
#' @method print textmodel_NB_fitted
print.textmodel_NB_fitted <- function(x, n=30L, ...) {
cat("Fitted Naive Bayes model:\n")
cat("Call:\n\t")
print(x$call)
cat("\n")
cat("\nTraining classes and priors:\n")
print(x$Pc)
cat("\n\t\t Likelihoods:\t\tClass Posteriors:\n")
print(head(t(rbind(x$PwGc, x$PcGw)), n))
cat("\n")
}
# @param x for print method, the object to be printed
# @param n max rows of dfm to print
# @param digits number of decimal places to print for print methods
# @param ... not used in \code{print.textmodel_wordscores_fitted}
#' @export
#' @method print textmodel_NB_predicted
print.textmodel_NB_predicted <- function(x, n = 30L, digits = 4, ...) {
cat("Predicted textmodel of type: Naive Bayes\n")
# cat("Call:\n\t")
# print(x$call)
if (nrow(x$log.posterior.lik) > n)
cat("(showing", n, "of", nrow(x$log.posterior.lik), "documents)\n")
cat("\n")
docsDf <- data.frame(x$log.posterior.lik, x$posterior.prob, x$nb.predicted)
names(docsDf) <- c(paste0("lp(", x$classlabels, ")"),
paste0("Pr(", x$classlabels, ")"),
"Predicted")
k <- length(x$classlabels)
docsDf[, 1:k] <- format(docsDf[, 1:k], nsmall = digits)
docsDf[, (k+1):(2*k)] <- round(docsDf[, (k+1):(2*k)], digits)
docsDf[, (k+1):(2*k)] <- format(docsDf[, (k+1):(2*k)], nsmall = digits)
# add a whitespace column for visual padding
docsDf <- cbind(docsDf[, 1:k], " " = rep(" ", nrow(docsDf)), docsDf[, (k+1):(2*k+1)])
print(docsDf[1:(min(n, nrow(docsDf))), ], digits = digits)
cat("\n")
}
## some helper functions
## make rows add up to one
rowNorm <- function(x) {
x / outer(rowSums(x), rep(1, ncol(x)))
}
## make cols add up to one
colNorm <- function(x) {
x / outer(rep(1, nrow(x)), colSums(x))
}
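## e.g. (illustrative) rowSums(rowNorm(matrix(1:6, nrow = 2))) is c(1, 1),
## and colSums(colNorm(matrix(1:6, nrow = 2))) is c(1, 1, 1)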