/
topic_models.R
36 lines (22 loc) · 962 Bytes
/
topic_models.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
#!/usr/bin/Rscript
library('ProjectTemplate')
load.project()
library('tm')
library('topicmodels')
documents <- transform(documents, Text = as.character(Text))
documents <- subset(documents, nchar(Text) > 5000)
docs <- data.frame(docs = documents$Text,
row.names = documents$Package)
corpus <- Corpus(DataframeSource(docs))
corpus <- tm_map(corpus, tolower)
corpus <- tm_map(corpus, removeWords, stopwords('english'))
document.term.matrix <- DocumentTermMatrix(corpus)
#dtm <- removeSparseTerms(document.term.matrix, 0.4)
topic.model <- LDA(document.term.matrix, 25)
terms(topic.model, 50)
topic.assignments <- topics(topic.model)
write.csv(data.frame(Package = names(topic.assignments), Topic = as.numeric(topic.assignments)),
'datatopics.csv',
row.names = FALSE)
# In the future, it would be interesting to iterate over a small number
# of possible topics to find the number that maximizes predictive power.