Merge branch 'release/1.3.0'

k3jph · Mar 1, 2019 · 702c69c · 702c69c
2 parents ef5779b + 6d4d2d0
commit 702c69c
Show file tree

Hide file tree

Showing 69 changed files with 1,740 additions and 592 deletions.
diff --git a/.Rbuildignore b/.Rbuildignore
@@ -4,3 +4,4 @@
 paper.md
 paper.bib
 COPYRIGHT.md
+^.github
diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md
@@ -0,0 +1,38 @@
+---
+name: Bug report
+about: Create a report to help us improve
+title: ''
+labels: ''
+assignees: ''
+
+---
+
+**Describe the bug**
+A clear and concise description of what the bug is.
+
+**To Reproduce**
+Steps to reproduce the behavior:
+1. Go to '...'
+2. Click on '....'
+3. Scroll down to '....'
+4. See error
+
+**Expected behavior**
+A clear and concise description of what you expected to happen.
+
+**Screenshots**
+If applicable, add screenshots to help explain your problem.
+
+**Desktop (please complete the following information):**
+ - OS: [e.g. iOS]
+ - Browser [e.g. chrome, safari]
+ - Version [e.g. 22]
+
+**Smartphone (please complete the following information):**
+ - Device: [e.g. iPhone6]
+ - OS: [e.g. iOS8.1]
+ - Browser [e.g. stock browser, safari]
+ - Version [e.g. 22]
+
+**Additional context**
+Add any other context about the problem here.
diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md
@@ -0,0 +1,20 @@
+---
+name: Feature request
+about: Suggest an idea for this project
+title: ''
+labels: ''
+assignees: ''
+
+---
+
+**Is your feature request related to a problem? Please describe.**
+A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]
+
+**Describe the solution you'd like**
+A clear and concise description of what you want to happen.
+
+**Describe alternatives you've considered**
+A clear and concise description of any alternative solutions or features you've considered.
+
+**Additional context**
+Add any other context or screenshots about the feature request here.
diff --git a/.travis.yml b/.travis.yml
@@ -1,14 +1,15 @@
 language: r
-sudo: required
 
-# System dependencies for building
-r_binary_packages:
+r:
+  - oldrel
+  - release
+  - devel
+
+r_packages:
  - BH
  - Rcpp
  - testthat
-
-r_github_packages:
- - jimhester/covr
+ - covr
 
 after_success:
  - Rscript -e 'covr::coveralls()'
diff --git a/COPYRIGHT.md b/COPYRIGHT.md
@@ -1,7 +1,7 @@
 Simplified BSD License
 ======================
 
-_Copyright © 2015-2018, James P. Howard, II <<jh@jameshoward.us>>_  
+_Copyright © 2015-2019, James P. Howard, II <<jh@jameshoward.us>>_  
 _All rights reserved._
 
 Redistribution and use in source and binary forms, with or without

diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,8 +1,8 @@
 Package: phonics
 Type: Package
 Title: Phonetic Spelling Algorithms
-Version: 1.2.3
-Date: 2019-01-09
+Version: 1.3.0
+Date: 2019-03-01
 Encoding: UTF-8
 Authors@R: c(person(given = "James P.", family = "Howard, II",
                     email = "jh@jameshoward.us", role = c("aut", "cre")),
@@ -15,7 +15,8 @@ Description: Provides a collection of phonetic algorithms including
 License: BSD_2_clause + file LICENSE
 LazyData: TRUE
 Imports:
-    Rcpp (>= 0.12.1)
+    Rcpp (>= 0.12.1),
+	data.table
 Suggests:
     testthat,
     knitr

diff --git a/NAMESPACE b/NAMESPACE
@@ -9,10 +9,13 @@ export(mra_encode)
 export(nysiis)
 export(onca)
 export(phonex)
+export(phonics)
 export(refinedSoundex)
 export(rogerroot)
 export(soundex)
 export(statcan)
 importFrom(Rcpp,evalCpp)
+importFrom(data.table,":=")
+importFrom(data.table,data.table)
 importFrom(utils,read.csv)
 useDynLib(phonics)
diff --git a/R/RcppExports.R b/R/RcppExports.R
@@ -1,103 +1,19 @@
 # Generated by using Rcpp::compileAttributes() -> do not edit by hand
 # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393
 
-#' @rdname metaphone
-#' @name metaphone
-#' @title Generate phonetic versions of strings with Metaphone
-#'
-#' @description
-#' The function \code{metaphone} phonentically encodes the
-#' given string using the metaphone algorithm.
-#'
-#' @param word string or vector of strings to encode
-#' @param maxCodeLen  maximum length of the resulting encodings, in characters
-#'
-#' @details There is some discrepency
-#' with respect to how the metaphone algorithm actually works. For
-#' instance, there is a version in the Java Apache Commons library.
-#' There is a version provided within PHP. These do not provide the same
-#' results.  On the questionable theory that the implementation in PHP
-#' is probably more well known, this code should match it in output.
-#'
-#' This implementation is based on a Javascript implementation which is
-#' itself based on the PHP internal implementation.
-#'
-#' The variable \code{maxCodeLen} is the limit on how long the returned
-#' metaphone should be.
-#'
-#' @return a character vector containing the metaphones of \code{word},
-#' or an NA if the \code{word} value is NA
-#'
-#' @section Caveats:
-#' The \code{metaphone} algorithm is only
-#' defined for inputs over the standard English alphabet, \emph{i.e.},
-#' "A-Z." For inputs outside this range, the output is undefined.
-#'
-#' @family phonics
-#'
-#' @examples
-#' metaphone("wheel")
-#' metaphone(c("school", "benji"))
-#'
-#' @useDynLib phonics
-#' @importFrom Rcpp evalCpp
-#' @export
-metaphone <- function(word, maxCodeLen = 10L) {
-    .Call('_phonics_metaphone', PACKAGE = 'phonics', word, maxCodeLen)
+metaphone_internal <- function(word, maxCodeLen = 10L) {
+    .Call('_phonics_metaphone_internal', PACKAGE = 'phonics', word, maxCodeLen)
 }
 
-#' @rdname soundex
-#' @name soundex
-#' @title Soundex
-#'
-#' @description
-#' The Soundex phonetic algorithms
-#'
-#' @param word string or vector of strings to encode
-#' @param maxCodeLen  maximum length of the resulting encodings, in characters
-#'
-#' @details The function \code{soundex} phonentically encodes the given
-#' string using the soundex algorithm.  The function \code{refinedSoundex}
-#' uses Apache's refined soundex algorithm.  Both implementations are loosely
-#' based on the Apache Commons Java editons.
-#'
-#' The variable \code{maxCodeLen} is the limit on how long the returned
-#' soundex should be.
-#'
-#' @return soundex encoded character vector
-#'
-#' @section Caveats:
-#' The \code{soundex} and \code{refinedSoundex} algorithms are only
-#' defined for inputs over the standard English alphabet, \emph{i.e.},
-#' "A-Z." For inputs outside this range, the output is undefined.
-#'
-#' @references
-#' Charles P. Bourne and Donald F. Ford, "A study of methods for
-#' systematically abbreviating English words and names," \emph{Journal
-#' of the ACM}, vol. 8, no. 4 (1961), p. 538-552.
-#'
-#' Howard B. Newcombe, James M. Kennedy, "Record linkage: making
-#' maximum use of the discriminating power of identifying information,"
-#' \emph{Communications of the ACM}, vol. 5, no. 11 (1962), p. 563-566.
-#'
-#' @family phonics
-#'
-#' @examples
-#' soundex("wheel")
-#' soundex(c("school", "benji"))
-#'
 #' @useDynLib phonics
 #' @importFrom Rcpp evalCpp
-#' @export
-soundex <- function(word, maxCodeLen = 4L) {
-    .Call('_phonics_soundex', PACKAGE = 'phonics', word, maxCodeLen)
+soundex_internal <- function(word, maxCodeLen = 4L) {
+    .Call('_phonics_soundex_internal', PACKAGE = 'phonics', word, maxCodeLen)
 }
 
-#' @rdname soundex
 #' @useDynLib phonics
 #' @importFrom Rcpp evalCpp
-#' @export
-refinedSoundex <- function(word, maxCodeLen = 10L) {
-    .Call('_phonics_refinedSoundex', PACKAGE = 'phonics', word, maxCodeLen)
+refinedSoundex_internal <- function(word, maxCodeLen = 10L) {
+    .Call('_phonics_refinedSoundex_internal', PACKAGE = 'phonics', word, maxCodeLen)
 }
 
diff --git a/R/caverphone.R b/R/caverphone.R
@@ -1,4 +1,4 @@
-## Copyright (c) 2015, James P. Howard, II <jh@jameshoward.us>
+## Copyright (c) 2015-2019, James P. Howard, II <jh@jameshoward.us>
 ##
 ## Redistribution and use in source and binary forms, with or without
 ## modification, are permitted provided that the following conditions are
@@ -32,6 +32,7 @@
 #' @param word string or vector of strings to encode
 #' @param maxCodeLen   maximum length of the resulting encodings, in characters
 #' @param modified     if \code{TRUE}, use the Caverphone 2 algorithm
+#' @param clean if \code{TRUE}, return \code{NA} for unknown alphabetical characters
 #'
 #' @details
 #'
@@ -42,12 +43,17 @@
 #' The variable \code{modified} directs \code{caverphone} to use the
 #' Caverphone2 method, instead of the original.
 #'
-#' @return the Caverphone encoded character vector
+#' The \code{caverphone} algorithm is only defined for inputs over the
+#' standard English alphabet, \emph{i.e.}, "A-Z." Non-alphabetical
+#' characters are removed from the string in a locale-dependent fashion.
+#' This strips spaces, hyphens, and numbers.  Other letters, such as
+#' "Ü," may be permissible in the current locale but are unknown to
+#' \code{caverphone}.  For inputs outside of its known range, the output is
+#' undefined and \code{NA} is returned and a \code{warning} is thrown.
+#' If \code{clean} is \code{FALSE}, \code{caverphone} attempts to process the
+#' strings.  The default is \code{TRUE}.
 #'
-#' @section Caveats:
-#' The \code{caverphone} algorithm is only
-#' defined for inputs over the standard English alphabet, \emph{i.e.},
-#' "A-Z." For inputs outside this range, the output is undefined.
+#' @return the Caverphone encoded character vector
 #'
 #' @references
 #'
@@ -65,7 +71,7 @@
 #' caverphone("Stevenson", maxCodeLen = 4)
 #'
 #' @export
-caverphone <- function(word, maxCodeLen = NULL, modified = FALSE) {
+caverphone <- function(word, maxCodeLen = NULL, modified = FALSE, clean = TRUE) {
     ## From here on, this is a line-for-line translation of the Apache
     ## Commons Caverphone and Caverphone2 implementations, which both
     ## used regular expressions for substantially all of the work.
@@ -77,18 +83,31 @@ caverphone <- function(word, maxCodeLen = NULL, modified = FALSE) {
         else
             maxCodeLen <- 6
 
-    ## First, remove any nonalphabetical characters and lowercase it
-    word <- gsub("[^[:alpha:]]*", "", word, perl = TRUE)
+    ## First, lowercase it and test for unprocessable characters
     word <- tolower(word)
+    listNulls <- is.null(word)
+    listNAs <- is.na(word)
+    if(any(nonalpha <- grepl("[^a-z]", word, perl = TRUE)) && clean)
+        warning("unknown characters found, results may not be consistent")
+    word <- gsub("[^a-z]*", "", word, perl = TRUE)
 
     if(modified == TRUE)
         word <- caverphone_modified(word)
     else
         word <- caverphone_original(word)
 
     ## Pad the wording with maxCodeLen 1s and truncate
-	word <- gsub("$", paste(rep(1, maxCodeLen), collapse = ""), word, perl = TRUE)
+    ones <- paste(rep(1, maxCodeLen), sep = "", collapse = "")
+    word <- gsub("$", ones, word, perl = TRUE)
     word <- substr(word, 1, maxCodeLen)
+    word <- gsub(ones, "", word, perl = TRUE)
+
+    ## Yeah, we already processed them, but now get rid of them
+    word[listNulls] <- NA
+    word[listNAs] <- NA
+    if(clean)
+        word[nonalpha] <- NA
+
     return(word)
 }
 
@@ -213,7 +232,7 @@ caverphone_modified <- function(word) {
     word <- gsub("l", "2", word, perl = TRUE)
     word <- gsub("2", "", word, perl = TRUE)
     word <- gsub("3$", "A", word, perl = TRUE)
-	word <- gsub("3", "", word, perl = TRUE)
+    word <- gsub("3", "", word, perl = TRUE)
 
     return(word)
 }