
Commit

- added 'toc' to the vignette
- added an example of extractNoun() vector input
gogamza committed Dec 19, 2016
1 parent 052e452 commit 0316d6d
Showing 4 changed files with 382 additions and 119 deletions.
317 changes: 215 additions & 102 deletions etcs/KoNLP-API.html

Large diffs are not rendered by default.

15 changes: 9 additions & 6 deletions etcs/KoNLP-API.md
@@ -14,8 +14,6 @@ and only select Nouns from set.
 
 library(KoNLP)
 
-## Checking user defined dictionary!
-
 useSejongDic()
 
 ## Backup was just finished!
@@ -25,13 +23,18 @@ and only select Nouns from set.
 
 ## [1] "롯데마트" "판매" "흑마늘" "양념" "치킨" "논란"
 
-sapply(c("R은 free 소프트웨어이고, [완전하게 무보증]입니다.", "일정한 조건에 따르면, 자유롭게 이것을 재배포할수가 있습니다."), extractNoun)
+# sapply() is no longer needed for multiple sentences; extractNoun() now accepts a character vector
+#sapply(c("R은 free 소프트웨어이고, [완전하게 무보증]입니다.", "일정한 조건에 따르면, 자유롭게 이것을 재배포할수가 있습니다."), extractNoun)
+
+extractNoun(c("R은 free 소프트웨어이고, [완전하게 무보증]입니다.",
+              "일정한 조건에 따르면, 자유롭게 이것을 재배포할수가 있습니다.")
+)
 
-## $`R은 free 소프트웨어이고, [완전하게 무보증]입니다.`
+## [[1]]
 ## [1] "R" "free" "소프트웨어" "완전" "하게"
 ## [6] "무보" "증"
 ##
-## $`일정한 조건에 따르면, 자유롭게 이것을 재배포할수가 있습니다.`
+## [[2]]
 ## [1] "일정" "한" "조건" "자유"
 ## [5] "이것" "재배포할수가"
 
@@ -383,7 +386,7 @@ packages.
 
 buildDictionary(ext_dic = c('sejong', 'woorimalsam'), user_dic = data.frame(term="전작권", tag='ncn'), category_dic_nms=c('political'))
 
-## 718106 words dictionary was built.
+## 718105 words dictionary was built.
 
 extractNoun(txt)
 
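In practice the diff above amounts to this usage change: extractNoun() is now vectorized, taking a character vector and returning a list with one element of nouns per input sentence. A minimal sketch (assuming a KoNLP build that includes this commit; the setNames() step is only there to recover the old sapply()-style named output):

```r
library(KoNLP)
useSejongDic()

sents <- c("R은 free 소프트웨어이고, [완전하게 무보증]입니다.",
           "일정한 조건에 따르면, 자유롭게 이것을 재배포할수가 있습니다.")

# one call, one list element per sentence -- no sapply() needed
nouns <- extractNoun(sents)

# optional: restore the old named-list shape that sapply() used to produce
nouns_named <- setNames(nouns, sents)
```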
12 changes: 9 additions & 3 deletions etcs/KoNLP-API_RAW.Rmd
@@ -1,8 +1,10 @@
 ---
 title: "Introduction to KoNLP API"
 author: "Heewon Jeon"
-date: "2016-12-14"
-output: rmarkdown::html_vignette
+date: "2016-12-19"
+output:
+  rmarkdown::html_vignette:
+    toc: true
 vignette: >
   %\VignetteIndexEntry{Introduction to KoNLP API}
   %\VignetteEngine{knitr::rmarkdown}
@@ -27,8 +29,12 @@ useSejongDic()
 extractNoun("롯데마트가 판매하고 있는 흑마늘 양념 치킨이 논란이 되고 있다.")
-sapply(c("R은 free 소프트웨어이고, [완전하게 무보증]입니다.", "일정한 조건에 따르면, 자유롭게 이것을 재배포할수가 있습니다."), extractNoun)
+# sapply() is no longer needed for multiple sentences; extractNoun() now accepts a character vector
+#sapply(c("R은 free 소프트웨어이고, [완전하게 무보증]입니다.", "일정한 조건에 따르면, 자유롭게 이것을 재배포할수가 있습니다."), extractNoun)
+extractNoun(c("R은 free 소프트웨어이고, [완전하게 무보증]입니다.",
+              "일정한 조건에 따르면, 자유롭게 이것을 재배포할수가 있습니다.")
+)
 ```

157 changes: 149 additions & 8 deletions vignettes/KoNLP-API.Rmd
@@ -1,8 +1,10 @@
 ---
 title: "Introduction to KoNLP API"
 author: "Heewon Jeon"
-date: "2016-12-14"
-output: rmarkdown::html_vignette
+date: "2016-12-19"
+output:
+  rmarkdown::html_vignette:
+    toc: true
 vignette: >
   %\VignetteIndexEntry{Introduction to KoNLP API}
   %\VignetteEngine{knitr::rmarkdown}
@@ -25,8 +27,6 @@ and only select Nouns from set.
 
 library(KoNLP)
 
-## Checking user defined dictionary!
-
 useSejongDic()
 
 ## Backup was just finished!
@@ -36,13 +36,18 @@ and only select Nouns from set.
 
 ## [1] "롯데마트" "판매" "흑마늘" "양념" "치킨" "논란"
 
-sapply(c("R은 free 소프트웨어이고, [완전하게 무보증]입니다.", "일정한 조건에 따르면, 자유롭게 이것을 재배포할수가 있습니다."), extractNoun)
+# sapply() is no longer needed for multiple sentences; extractNoun() now accepts a character vector
+#sapply(c("R은 free 소프트웨어이고, [완전하게 무보증]입니다.", "일정한 조건에 따르면, 자유롭게 이것을 재배포할수가 있습니다."), extractNoun)
+
+extractNoun(c("R은 free 소프트웨어이고, [완전하게 무보증]입니다.",
+              "일정한 조건에 따르면, 자유롭게 이것을 재배포할수가 있습니다.")
+)
 
-## $`R은 free 소프트웨어이고, [완전하게 무보증]입니다.`
+## [[1]]
 ## [1] "R" "free" "소프트웨어" "완전" "하게"
 ## [6] "무보" "증"
 ##
-## $`일정한 조건에 따르면, 자유롭게 이것을 재배포할수가 있습니다.`
+## [[2]]
 ## [1] "일정" "한" "조건" "자유"
 ## [5] "이것" "재배포할수가"
 
@@ -394,7 +399,7 @@ packages.
 
 buildDictionary(ext_dic = c('sejong', 'woorimalsam'), user_dic = data.frame(term="전작권", tag='ncn'), category_dic_nms=c('political'))
 
-## 718106 words dictionary was built.
+## 718105 words dictionary was built.
 
 extractNoun(txt)
 
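For context, the buildDictionary() call above merges a one-row user dictionary into the bundled 'sejong' and 'woorimalsam' dictionaries under a category name. A minimal sketch of the same call with several terms at once (the second term and the reuse of the 'ncn' tag, the Sejong-tagset code for a non-predicative common noun, are illustrative assumptions, not part of this commit):

```r
library(KoNLP)

# user_dic takes a data.frame of term/tag pairs; tags follow the Sejong tag set
user_terms <- data.frame(term = c("전작권", "전시작전통제권"),  # second term is illustrative
                         tag  = c("ncn", "ncn"))

buildDictionary(ext_dic = c("sejong", "woorimalsam"),
                user_dic = user_terms,
                category_dic_nms = c("political"))
```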
@@ -661,3 +666,139 @@ Korean Twitter Analysis
![dendrogram](figures/dendgram.png)


Korean Twitter Analysis
-----------------------

#referenced from http://www.rdatamining.com/

## @knitr init
library(twitteR)

# a live search needs Twitter API authentication, so the tweets were
# collected once with the following and saved to an .RData file:
# n <- 200
# keyword <- "삼성전자"
# keyword <- enc2utf8(keyword)
# rdmTweets <- searchTwitter(keyword, n)

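# load the previously collected rdmTweets from the saved workspace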
load(url("http://dl.dropbox.com/u/8686172/twitter.RData"))

nDocs <- length(rdmTweets)



## @knitr preprocess
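# clean the raw tweets and reduce each one to its nouns before building a tm corpus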
library(KoNLP)
library(tm)


df <- do.call("rbind", lapply(rdmTweets, as.data.frame))

removeTwit <- function(x) { gsub("@[[:graph:]]*", "", x) }  # strip @mentions

df$ptext <- sapply(df$text, removeTwit)

removeURL <- function(x) { gsub("http://[[:graph:]]*", "", x) }  # strip URLs

df$ptext <- sapply(df$ptext, removeURL)
useSejongDic()
df$ptext <- sapply(df$ptext, function(x) {paste(extractNoun(x), collapse=" ")})

#build corpus
myCorpus_ <- Corpus(VectorSource(df$ptext))
myCorpus_ <- tm_map(myCorpus_, removePunctuation)
myCorpus_ <- tm_map(myCorpus_, removeNumbers)
myCorpus_ <- tm_map(myCorpus_, content_transformer(tolower))  # tm >= 0.6 needs content_transformer() for base functions
myStopwords <- c(stopwords('english'), "rt")
myCorpus_ <- tm_map(myCorpus_, removeWords, myStopwords)



## @knitr eda

myTdm <- TermDocumentMatrix(myCorpus_, control=list(wordLengths=c(2,Inf)))  # use myCorpus_, the preprocessed corpus

#inspect frequent term
findFreqTerms(myTdm, lowfreq=10)

#inspect associations
findAssocs(myTdm,'lg',0.25)


## @knitr barplot
library(ggplot2)

termFrequency <- rowSums(as.matrix(myTdm))
termFrequency <- subset(termFrequency,termFrequency>=10)

ggplot(data.frame(term = names(termFrequency), freq = termFrequency), aes(term, freq)) +
  geom_bar(stat = "identity") +  # freq is precomputed, so plot it as-is instead of counting rows
  coord_flip()


## @knitr wordcloud
#Word Cloud

library(wordcloud)

m <- as.matrix(myTdm)

wordFreq <- sort(rowSums(m),decreasing=TRUE)

set.seed(375)

pal <- brewer.pal(8,"Dark2")

wordcloud(words=names(wordFreq), freq=wordFreq, min.freq=10, random.order=FALSE, rot.per=.1, colors=pal)



## @knitr hclust
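# drop very sparse terms, then cluster terms by their scaled document-frequency profiles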
myTdm2 <- removeSparseTerms(myTdm, sparse=0.95)
m2 <- as.matrix(myTdm2)

distMatrix <- dist(scale(m2))

fit <- hclust(distMatrix, method="ward.D")  # "ward" was renamed to "ward.D" in R >= 3.1.0

plot(fit)

rect.hclust(fit,k=10)

#(groups<-cutree(fit,k=10))



## @knitr kmeans
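# k-means over documents: transpose so rows are tweets and columns are terms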
m3 <- t(m2)
k <- 4
kmres <- kmeans(m3, k)

round(kmres$centers, digits=3)


for (i in 1:k) {
  cat(paste("cluster ", i, " : ", sep=""))
  s <- sort(kmres$centers[i, ], decreasing=TRUE)
  cat(names(s)[1:3], "\n")
  #print(head(rdmTweets[which(kmres$cluster == i)], n=3))
}


## @knitr kmedoid
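# pamk() (partitioning around medoids) chooses the number of clusters automatically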
library(fpc)
pamResult <- pamk(m3, metric="manhattan")
(k <- pamResult$nc)

pamResult <- pamResult$pamobject
#print cluster medoids

for (i in 1:k) {
  cat(paste("cluster", i, ":"))
  cat(colnames(pamResult$medoids)[which(pamResult$medoids[i, ] == 1)], "\n")
  # print tweets in cluster i
  #print(rdmTweets[pamResult$clustering == i])
}

![dendrogram](figures/dendgram.png)

