
Commit

- added 'toc' to the vignette
- added an example of extractNoun() vector input
gogamza committed Dec 19, 2016
1 parent 052e452 commit 0316d6d
Showing 4 changed files with 382 additions and 119 deletions.
317 changes: 215 additions & 102 deletions etcs/KoNLP-API.html

Large diffs are not rendered by default.

15 changes: 9 additions & 6 deletions etcs/KoNLP-API.md
@@ -14,8 +14,6 @@ and only select Nouns from set.
 
 library(KoNLP)
 
-## Checking user defined dictionary!
-
 useSejongDic()
 
 ## Backup was just finished!
@@ -25,13 +23,18 @@ and only select Nouns from set.
 
 ## [1] "롯데마트" "판매" "흑마늘" "양념" "치킨" "논란"
 
-sapply(c("R은 free 소프트웨어이고, [완전하게 무보증]입니다.", "일정한 조건에 따르면, 자유롭게 이것을 재배포할수가 있습니다."), extractNoun)
+# sapply() is no longer needed for multiple sentences; extractNoun() now accepts a character vector
+#sapply(c("R은 free 소프트웨어이고, [완전하게 무보증]입니다.", "일정한 조건에 따르면, 자유롭게 이것을 재배포할수가 있습니다."), extractNoun)
+
+extractNoun(c("R은 free 소프트웨어이고, [완전하게 무보증]입니다.",
+              "일정한 조건에 따르면, 자유롭게 이것을 재배포할수가 있습니다.")
+)
 
-## $`R은 free 소프트웨어이고, [완전하게 무보증]입니다.`
+## [[1]]
 ## [1] "R" "free" "소프트웨어" "완전" "하게"
 ## [6] "무보" "증"
 ##
-## $`일정한 조건에 따르면, 자유롭게 이것을 재배포할수가 있습니다.`
+## [[2]]
 ## [1] "일정" "한" "조건" "자유"
 ## [5] "이것" "재배포할수가"
 
@@ -383,7 +386,7 @@ packages.
 
 buildDictionary(ext_dic = c('sejong', 'woorimalsam'), user_dic = data.frame(term="전작권", tag='ncn'), category_dic_nms=c('political'))
 
-## 718106 words dictionary was built.
+## 718105 words dictionary was built.
 
 extractNoun(txt)
 
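In practice the diff above amounts to this usage change: extractNoun() is now vectorized, taking a character vector and returning a list with one element of nouns per input sentence. A minimal sketch (assuming a KoNLP build that includes this commit; the setNames() step is only there to recover the old sapply()-style named output):

```r
library(KoNLP)
useSejongDic()

sents <- c("R은 free 소프트웨어이고, [완전하게 무보증]입니다.",
           "일정한 조건에 따르면, 자유롭게 이것을 재배포할수가 있습니다.")

# one call, one list element per sentence -- no sapply() needed
nouns <- extractNoun(sents)

# optional: restore the old named-list shape that sapply() used to produce
nouns_named <- setNames(nouns, sents)
```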
12 changes: 9 additions & 3 deletions etcs/KoNLP-API_RAW.Rmd
@@ -1,8 +1,10 @@
 ---
 title: "Introduction to KoNLP API"
 author: "Heewon Jeon"
-date: "2016-12-14"
-output: rmarkdown::html_vignette
+date: "2016-12-19"
+output:
+  rmarkdown::html_vignette:
+    toc: true
 vignette: >
   %\VignetteIndexEntry{Introduction to KoNLP API}
   %\VignetteEngine{knitr::rmarkdown}
@@ -27,8 +29,12 @@ useSejongDic()
 extractNoun("롯데마트가 판매하고 있는 흑마늘 양념 치킨이 논란이 되고 있다.")
-sapply(c("R은 free 소프트웨어이고, [완전하게 무보증]입니다.", "일정한 조건에 따르면, 자유롭게 이것을 재배포할수가 있습니다."), extractNoun)
+# sapply() is no longer needed for multiple sentences; extractNoun() now accepts a character vector
+#sapply(c("R은 free 소프트웨어이고, [완전하게 무보증]입니다.", "일정한 조건에 따르면, 자유롭게 이것을 재배포할수가 있습니다."), extractNoun)
+extractNoun(c("R은 free 소프트웨어이고, [완전하게 무보증]입니다.",
+              "일정한 조건에 따르면, 자유롭게 이것을 재배포할수가 있습니다.")
+)
 ```

157 changes: 149 additions & 8 deletions vignettes/KoNLP-API.Rmd
@@ -1,8 +1,10 @@
 ---
 title: "Introduction to KoNLP API"
 author: "Heewon Jeon"
-date: "2016-12-14"
-output: rmarkdown::html_vignette
+date: "2016-12-19"
+output:
+  rmarkdown::html_vignette:
+    toc: true
 vignette: >
   %\VignetteIndexEntry{Introduction to KoNLP API}
   %\VignetteEngine{knitr::rmarkdown}
@@ -25,8 +27,6 @@ and only select Nouns from set.
 
 library(KoNLP)
 
-## Checking user defined dictionary!
-
 useSejongDic()
 
 ## Backup was just finished!
@@ -36,13 +36,18 @@ and only select Nouns from set.
 
 ## [1] "롯데마트" "판매" "흑마늘" "양념" "치킨" "논란"
 
-sapply(c("R은 free 소프트웨어이고, [완전하게 무보증]입니다.", "일정한 조건에 따르면, 자유롭게 이것을 재배포할수가 있습니다."), extractNoun)
+# sapply() is no longer needed for multiple sentences; extractNoun() now accepts a character vector
+#sapply(c("R은 free 소프트웨어이고, [완전하게 무보증]입니다.", "일정한 조건에 따르면, 자유롭게 이것을 재배포할수가 있습니다."), extractNoun)
+
+extractNoun(c("R은 free 소프트웨어이고, [완전하게 무보증]입니다.",
+              "일정한 조건에 따르면, 자유롭게 이것을 재배포할수가 있습니다.")
+)
 
-## $`R은 free 소프트웨어이고, [완전하게 무보증]입니다.`
+## [[1]]
 ## [1] "R" "free" "소프트웨어" "완전" "하게"
 ## [6] "무보" "증"
 ##
-## $`일정한 조건에 따르면, 자유롭게 이것을 재배포할수가 있습니다.`
+## [[2]]
 ## [1] "일정" "한" "조건" "자유"
 ## [5] "이것" "재배포할수가"
 
@@ -394,7 +399,7 @@ packages.
 
 buildDictionary(ext_dic = c('sejong', 'woorimalsam'), user_dic = data.frame(term="전작권", tag='ncn'), category_dic_nms=c('political'))
 
-## 718106 words dictionary was built.
+## 718105 words dictionary was built.
 
 extractNoun(txt)
 
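For context, the buildDictionary() call above merges a one-row user dictionary into the bundled 'sejong' and 'woorimalsam' dictionaries under a category name. A minimal sketch of the same call with several terms at once (the second term and the reuse of the 'ncn' tag, the Sejong-tagset code for a non-predicative common noun, are illustrative assumptions, not part of this commit):

```r
library(KoNLP)

# user_dic takes a data.frame of term/tag pairs; tags follow the Sejong tag set
user_terms <- data.frame(term = c("전작권", "전시작전통제권"),  # second term is illustrative
                         tag  = c("ncn", "ncn"))

buildDictionary(ext_dic = c("sejong", "woorimalsam"),
                user_dic = user_terms,
                category_dic_nms = c("political"))
```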
@@ -661,3 +666,139 @@ Korean Twitter Analysis
![dendrogram](figures/dendgram.png)


Korean Twitter Analysis
-----------------------

#referenced from http://www.rdatamining.com/

## @knitr init
library(twitteR)

# a live search needs Twitter API authentication, so the tweets were
# collected once with the following and saved to an .RData file:
# n <- 200
# keyword <- "삼성전자"
# keyword <- enc2utf8(keyword)
# rdmTweets <- searchTwitter(keyword, n)

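# load the previously collected rdmTweets from the saved workspace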
load(url("http://dl.dropbox.com/u/8686172/twitter.RData"))

nDocs <- length(rdmTweets)



## @knitr preprocess
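# clean the raw tweets and reduce each one to its nouns before building a tm corpus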
library(KoNLP)
library(tm)


df <- do.call("rbind", lapply(rdmTweets, as.data.frame))

removeTwit <- function(x) { gsub("@[[:graph:]]*", "", x) }  # strip @mentions

df$ptext <- sapply(df$text, removeTwit)

removeURL <- function(x) { gsub("http://[[:graph:]]*", "", x) }  # strip URLs

df$ptext <- sapply(df$ptext, removeURL)
useSejongDic()
df$ptext <- sapply(df$ptext, function(x) {paste(extractNoun(x), collapse=" ")})

#build corpus
myCorpus_ <- Corpus(VectorSource(df$ptext))
myCorpus_ <- tm_map(myCorpus_, removePunctuation)
myCorpus_ <- tm_map(myCorpus_, removeNumbers)
myCorpus_ <- tm_map(myCorpus_, content_transformer(tolower))  # tm >= 0.6 needs content_transformer() for base functions
myStopwords <- c(stopwords('english'), "rt")
myCorpus_ <- tm_map(myCorpus_, removeWords, myStopwords)



## @knitr eda

myTdm <- TermDocumentMatrix(myCorpus_, control=list(wordLengths=c(2,Inf)))  # use myCorpus_, the preprocessed corpus

#inspect frequent term
findFreqTerms(myTdm, lowfreq=10)

#inspect associations
findAssocs(myTdm,'lg',0.25)


## @knitr barplot
library(ggplot2)

termFrequency <- rowSums(as.matrix(myTdm))
termFrequency <- subset(termFrequency,termFrequency>=10)

ggplot(data.frame(term = names(termFrequency), freq = termFrequency), aes(term, freq)) +
  geom_bar(stat = "identity") +  # freq is precomputed, so plot it as-is instead of counting rows
  coord_flip()


## @knitr wordcloud
#Word Cloud

library(wordcloud)

m <- as.matrix(myTdm)

wordFreq <- sort(rowSums(m),decreasing=TRUE)

set.seed(375)

pal <- brewer.pal(8,"Dark2")

wordcloud(words=names(wordFreq), freq=wordFreq, min.freq=10, random.order=FALSE, rot.per=.1, colors=pal)



## @knitr hclust
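# drop very sparse terms, then cluster terms by their scaled document-frequency profiles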
myTdm2 <- removeSparseTerms(myTdm, sparse=0.95)
m2 <- as.matrix(myTdm2)

distMatrix <- dist(scale(m2))

fit <- hclust(distMatrix, method="ward.D")  # "ward" was renamed to "ward.D" in R >= 3.1.0

plot(fit)

rect.hclust(fit,k=10)

#(groups<-cutree(fit,k=10))



## @knitr kmeans
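# k-means over documents: transpose so rows are tweets and columns are terms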
m3 <- t(m2)
k <- 4
kmres <- kmeans(m3, k)

round(kmres$centers, digits=3)


for (i in 1:k) {
  cat(paste("cluster ", i, " : ", sep=""))
  s <- sort(kmres$centers[i, ], decreasing=TRUE)
  cat(names(s)[1:3], "\n")
  #print(head(rdmTweets[which(kmres$cluster == i)], n=3))
}


## @knitr kmedoid
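# pamk() (partitioning around medoids) chooses the number of clusters automatically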
library(fpc)
pamResult <- pamk(m3, metric="manhattan")
(k <- pamResult$nc)

pamResult <- pamResult$pamobject
#print cluster medoids

for (i in 1:k) {
  cat(paste("cluster", i, ":"))
  cat(colnames(pamResult$medoids)[which(pamResult$medoids[i, ] == 1)], "\n")
  # print tweets in cluster i
  #print(rdmTweets[pamResult$clustering == i])
}

![dendrogram](figures/dendgram.png)

