# LDA Demo

针对200篇名家散文抽取主题

## 载入相关扩展包

In [1]:
library(tidyverse)
library(jsonlite)
library(jiebaR)
library(lda)
library(LDAvis)

## 读入散文数据

In [2]:
# Load the prose corpus from JSON. simplifyVector = TRUE asks jsonlite to
# collapse JSON arrays into R vectors / data frames where it can.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
prose <- read_json('../Data/prose.json', simplifyVector = TRUE)

## 对散文进行中文分词

In [3]:
# Build a jiebaR segmentation worker configured with the stop-word file.
segmentor <- worker(stop_word = '../Data/stop-words-common.txt')

# Segment every document separately. lapply() always yields one list element
# per document; sapply() could simplify the result to a vector or matrix when
# the token counts happen to coincide, and it would also name every element
# with the full text of the document it came from.
content_seg <- lapply(prose$content, segment, segmentor)

## 统计词频，通过词的长度和词频进行过滤

In [27]:
# Count how often each token occurs across all segmented documents and order
# the counts from most to least frequent.
words_table <- table(unlist(content_seg))
words_table_sroted <- sort(words_table, decreasing = TRUE)

# A token is discarded when it occurs fewer than 5 times, is shorter than two
# characters, or contains any ASCII digit or letter.
candidate_words <- names(words_table_sroted)
is_rare <- words_table_sroted < 5
is_short <- str_length(candidate_words) < 2
has_ascii_alnum <- str_detect(candidate_words, regex('[0-9a-zA-Z]+'))
need_remove <- is_rare | is_short | has_ascii_alnum

# The surviving tokens, still in descending frequency order, form the vocabulary.
words_table_sroted <- words_table_sroted[!need_remove]
vocab <- names(words_table_sroted)

## 构造用于LDA的文档

In [5]:
# Encode one tokenised document as a 2-row integer matrix.
#
# x:     character vector of tokens for a single document.
# vocab: character vector of vocabulary terms.
#
# Returns an integer matrix with one column per token of `x` that is found in
# `vocab`: row 1 is the zero-based position of the token in `vocab`, row 2 is
# a count of 1. Tokens missing from `vocab` are dropped; if none match, the
# result has two rows and zero columns.
get_wrods <- function(x, vocab) {
    known_tokens <- x[x %in% vocab]
    word_ids <- match(known_tokens, vocab) - 1L
    # deparse.level = 0 keeps rbind() from turning variable names into row names.
    rbind(word_ids, rep.int(1L, length(word_ids)), deparse.level = 0)
}

# Encode every segmented document against the shared vocabulary.
documents <- lapply(content_seg, function(tokens) get_wrods(tokens, vocab))

## 构建LDA模型

In [6]:
# Seed the RNG before sampling so repeated runs start from the same state.
set.seed(112358)

# Sampler settings: number of topics, number of Gibbs iterations, and the two
# scalar hyperparameters (alpha, eta) handed to the sampler.
eta <- 0.02
alpha <- 0.1
max_iter <- 5000
k <- 10

# Fit the model with the collapsed Gibbs sampler from the lda package.
lda_model <- lda.collapsed.gibbs.sampler(
    documents = documents,
    vocab = vocab,
    K = k,
    alpha = alpha,
    eta = eta,
    num.iterations = max_iter,
    burnin = 0,
    initial = NULL,
    compute.log.likelihood = TRUE
)

## 生成用于LDAvis可视化的JSON格式数据

In [7]:
# Document-topic proportions (documents x topics): add alpha to the per-document
# topic counts and normalise every document (row) to sum to 1.
# prop.table() keeps the matrix shape even with a single topic, where
# t(apply(...)) would collapse to a vector and come out transposed.
vis_theta <- prop.table(t(lda_model$document_sums) + alpha, margin = 1)

# Topic-word proportions (topics x words): add eta to the per-topic word counts
# and normalise every topic (row) to sum to 1.
vis_phi <- prop.table(lda_model$topics + eta, margin = 1)

# Corpus-wide frequency of each vocabulary term, in vocabulary order.
vis_frequency <- as.integer(words_table_sroted)

# Number of retained tokens per document (sum of the count row). vapply() pins
# the result to an integer vector even when `documents` is empty.
vis_doc_length <- vapply(documents, function(doc) sum(doc[2, ]), integer(1))

vis_json <- createJSON(
    phi = vis_phi,
    theta = vis_theta,
    doc.length = vis_doc_length,
    vocab = vocab,
    term.frequency = vis_frequency
)

## 生成LDAvis的可视化HTML页面

In [8]:
# Write the LDAvis page (HTML/JS plus lda.json) into the LDAVis directory
# without launching a browser.
serVis(vis_json, out.dir = 'LDAVis', open.browser = FALSE)

# Re-encode lda.json to UTF-8 so the browser can read the Chinese terms.
# NOTE(review): this presumes serVis() wrote the file in the session's native
# encoding and that a non-UTF-8 session is GBK -- confirm on the target machine.
# In a UTF-8 session the file is presumably already UTF-8, and decoding it as
# GBK would corrupt it, so the conversion is skipped there.
json_path <- file.path('LDAVis', 'lda.json')
if (!isTRUE(l10n_info()[['UTF-8']])) {
    json_lines <- iconv(readLines(json_path), from = 'GBK', to = 'UTF-8')
    json_con <- file(json_path, encoding = 'UTF-8')
    # Always release the connection, even if writing fails.
    tryCatch(
        writeLines(json_lines, json_con),
        finally = close(json_con)
    )
}

## 查看不同主题的关键词

In [31]:
# top.topic.words() returns a num.words x K matrix with one column per topic.
# Transpose it so every row is a topic and every column a word rank; the
# original labelled the topic columns as 'word-N'. Labels are derived from the
# actual dimensions, so they stay valid if the number of topics or words changes.
topic_words <- t(top.topic.words(lda_model$topics, num.words = 10))
rownames(topic_words) <- seq_len(nrow(topic_words))
colnames(topic_words) <- paste('word', seq_len(ncol(topic_words)), sep = '-')

topic_words

## 查看隶属于不同主题的文档

In [32]:
# NOTE(review): plyr is no longer used by this cell. It stays attached only in
# case later cells rely on it; attaching it after dplyr masks several dplyr verbs.
library(plyr)

# top.topic.documents() returns a num.documents x K matrix of document indices
# with one column per topic.
topic_documents <- top.topic.documents(lda_model$document_sums, num.documents = 10)

# Build an "author - title" label for every document once, then look the labels
# up with the index matrix instead of slicing the data frame cell by cell.
prose_labels <- paste(prose$author, prose$title, sep = ' - ')
topic_documents_detail <- matrix(
    prose_labels[topic_documents],
    nrow = nrow(topic_documents),
    ncol = ncol(topic_documents)
)

# Transpose so every row is a topic and every column a document rank; the
# original labelled the topic columns as 'prose-N'.
topic_documents_detail <- t(topic_documents_detail)
rownames(topic_documents_detail) <- seq_len(nrow(topic_documents_detail))
colnames(topic_documents_detail) <- paste(
    'prose', seq_len(ncol(topic_documents_detail)), sep = '-'
)

topic_documents_detail