Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
  • Loading branch information
gogamza committed May 13, 2013
1 parent 1c009ec commit 0fc6687
Show file tree
Hide file tree
Showing 6 changed files with 156 additions and 0 deletions.
4 changes: 4 additions & 0 deletions DESCRIPTION
Expand Up @@ -18,10 +18,14 @@ Depends:
rJava (>= 0.9-0),
testthat (>= 0.5),
utils (>= 2.14.0),
stringr (>= 0.6.2),
hash (>= 2.2.6),
tau (>= 0.0-15),
Sejong (>= 0.01)
Collate:
'onLoad.R'
'manageDic.R'
'hangulUtils.R'
'koAnalyzerRun.R'
'tagdata.R'
'Concordances.R'
6 changes: 6 additions & 0 deletions NAMESPACE
@@ -1,4 +1,6 @@
export(backupUsrDic)
export(concordance_file)
export(concordance_str)
export(convertHangulStringToJamos)
export(convertHangulStringToKeyStrokes)
export(extractNoun)
Expand All @@ -10,11 +12,15 @@ export(is.jamo)
export(is.moeum)
export(mergeUserDic)
export(MorphAnalyzer)
export(mutualinformation)
export(reloadAllDic)
export(restoreUsrDic)
export(SimplePos09)
export(SimplePos22)
export(statDic)
export(useSejongDic)
export(useSystemDic)
import(hash)
import("rJava")
import(stringr)
import(tau)
81 changes: 81 additions & 0 deletions R/Concordances.R
@@ -0,0 +1,81 @@
#' Concordance for an input text vector
#'
#' For every element of \code{string}, returns the first occurrence of
#' \code{pattern} together with up to \code{span} characters of context on
#' each side.
#'
#' @author Heewon Jeon
#' @param string input text as a character vector or a single string
#' @param pattern regular expression describing the central word(s)
#' @param span maximum number of characters kept on each side of the match
#' @return the value of \code{str_match()}: a character matrix with one row
#'   per element of \code{string}; the first column holds the concordance
#'   (\code{NA} where \code{pattern} does not occur) and any further columns
#'   hold the capture groups contained in \code{pattern}.
#' @import stringr
#' @export
concordance_str <- function(string, pattern, span = 5) {
  # Wrap the pattern in a non-capturing group: without it an alternation
  # such as "a|b" would expand to ".{0,n}a|b.{0,n}", i.e. each alternative
  # would only receive context on one side. A non-capturing group does not
  # add a column to the str_match() result.
  context_regex <- sprintf(".{0,%d}(?:%s).{0,%d}", span, pattern, span)
  str_match(string, context_regex)
}


#' Concordance for an input text file
#'
#' Reads \code{filename} line by line and returns the concordance of every
#' line in which \code{pattern} occurs.
#'
#' @author Heewon Jeon
#' @param filename file name
#' @param pattern regular expression describing the central word(s)
#' @param span maximum number of characters kept on each side of the match
#' @param encoding encoding of the file \code{filename}
#' @return a character matrix with one row per matching line, laid out like
#'   the result of \code{concordance_str()}. When no line matches, a matrix
#'   with zero rows is returned.
#' @export
concordance_file <- function(filename, pattern,
                             encoding = getOption("encoding"), span = 5) {
  f <- file(filename, "r", encoding = encoding)
  on.exit(close(f), add = TRUE)

  lines <- readLines(f, warn = FALSE)
  if (length(lines) == 0) {
    # Empty file: nothing to search, return an empty result explicitly.
    return(matrix(character(0), nrow = 0, ncol = 1))
  }

  # concordance_str() is vectorised over its input, so all lines are handled
  # in a single call instead of growing a result with rbind() in a loop.
  hits <- concordance_str(lines, pattern, span)

  # Column 1 is NA for lines without a match. drop = FALSE keeps the matrix
  # shape even when exactly one (or no) line matched.
  hits[!is.na(hits[, 1]), , drop = FALSE]
}



#' Mutual information for input text
#'
#' Returns the mutual information or the t-score of the word bigrams counted
#' in \code{text}.
#'
#' With \code{f(x)} the count of a unigram or bigram and \code{N} the sum of
#' all unigram counts, the scores are computed as
#' \code{log(f(w1 w2) * N / (f(w1) * f(w2)))} (natural logarithm) for
#' \code{"mutual"} and
#' \code{(f(w1 w2) - f(w1) * f(w2) / N) / sqrt(f(w1 w2))} for
#' \code{"tscores"}.
#'
#' @author Heewon Jeon
#' @param text input character vector
#' @param query a single word; only bigrams containing exactly this word are
#'   scored. The default \code{""} scores every bigram. The word is compared
#'   verbatim with the tokens produced by \code{textcnt()}, so it presumably
#'   has to follow the same normalisation (e.g. case folding) -- confirm
#'   against the \code{textcnt()} defaults.
#' @param method score to calculate (\code{"mutual"} or \code{"tscores"})
#' @return a named numeric vector with one score per selected bigram; the
#'   names are the bigrams.
#' @import tau
#' @import hash
#' @export
mutualinformation <- function(text, query = "",
                              method = c("mutual", "tscores")) {
  method <- match.arg(method)
  stopifnot(is.character(query), length(query) == 1L, !is.na(query))

  unigram <- hash(textcnt(text, method = "string", n = 1))
  bigram <- hash(textcnt(text, method = "string", n = 2))
  # as.numeric(): do the arithmetic below in double precision so that
  # products of counts cannot overflow should the counts be integers.
  num_of_words <- as.numeric(sum(values(unigram)))

  bigram_names <- names(bigram)
  if (query != "") {
    bigram_names <- Filter(
      function(x) query %in% strsplit(x, split = " ")[[1]],
      bigram_names
    )
  }

  score <- function(x) {
    bi <- strsplit(x, split = " ")[[1]]
    n_bigram <- as.numeric(bigram[[x]])
    n_first <- as.numeric(unigram[[bi[1]]])
    n_second <- as.numeric(unigram[[bi[2]]])
    if (method == "mutual") {
      log((n_bigram * num_of_words) / (n_first * n_second))
    } else {
      (n_bigram - 1 / num_of_words * n_first * n_second) / sqrt(n_bigram)
    }
  }

  # vapply() keeps the result a named numeric vector even when no bigram is
  # selected (sapply() would return an empty list in that case).
  vapply(bigram_names, score, numeric(1), USE.NAMES = TRUE)
}


24 changes: 24 additions & 0 deletions man/concordance_file.Rd
@@ -0,0 +1,24 @@
\name{concordance_file}
\alias{concordance_file}
\title{concordance for input text file}
\usage{
concordance_file(filename, pattern,
encoding = getOption("encoding"), span = 5)
}
\arguments{
\item{filename}{file name}

\item{pattern}{patterns of central words}

\item{span}{how many characters will be produced around
input pattern}

\item{encoding}{filename's encoding}
}
\description{
returns concordance text for input file.
}
\author{
Heewon Jeon
}
22 changes: 22 additions & 0 deletions man/concordance_str.Rd
@@ -0,0 +1,22 @@
\name{concordance_str}
\alias{concordance_str}
\title{concordance for input text vector}
\usage{
concordance_str(string, pattern, span = 5)
}
\arguments{
\item{string}{input text as character vector or single
character}

\item{pattern}{patterns of central words}

\item{span}{how many characters will be produced around
input pattern}
}
\description{
returns concordance text for input pattern and span.
}
\author{
Heewon Jeon
}

19 changes: 19 additions & 0 deletions man/mutualinformation.Rd
@@ -0,0 +1,19 @@
\name{mutualinformation}
\alias{mutualinformation}
\title{mutual information for input text}
\usage{
mutualinformation(text, query = "",
method = c("mutual", "tscores"))
}
\arguments{
\item{text}{input character vector}

\item{method}{calculation method (`mutual' or `tscores')}
}
\description{
returns mutual information or t-scores for input text
}
\author{
Heewon Jeon
}

0 comments on commit 0fc6687

Please sign in to comment.