Skip to content

Commit

Permalink
pre-cran flight check
Browse files Browse the repository at this point in the history
  • Loading branch information
hrbrmstr committed Sep 16, 2018
1 parent d6ee3f0 commit 58b3f8f
Show file tree
Hide file tree
Showing 105 changed files with 2,647 additions and 1,020 deletions.
44 changes: 14 additions & 30 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,42 +1,26 @@
Package: wand
Type: Package
Title: Retrieve 'Magic' Attributes from Files and Directories
Version: 0.2.1
Date: 2017-10-20
Version: 0.3.0
Date: 2018-09-16
Authors@R: c(
person("Bob", "Rudis", email = "bob@rud.is", role = c("aut", "cre"),
comment = c(ORCID = "0000-0001-5670-2640")),
person("Christos", "Zoulas", role=("ctb"), comment="libmagic"),
person("Mans", "Rullgard", role=("ctb"), comment="file"),
person("Jonathan", "Ong", role=("ctb"), comment="mime-db")
comment = c(ORCID = "0000-0001-5670-2640"))
)
Maintainer: Bob Rudis <bob@rud.is>
Description: The 'libmagic' library provides functions to determine
'MIME' type and other metadata from files through their "magic"
attributes. This is useful when you do not wish to rely solely on
the honesty of a user or the extension on a file name. It also
incorporates other metadata from the 'mime-db' database
<https://github.com/jshttp/mime-db>.
URL: http://github.com/hrbrmstr/wand
BugReports: https://github.com/hrbrmstr/wand/issues
NeedsCompilation: yes
LazyData: true
SystemRequirements: libmagic (>= 5.14) for Unix/Linux/macOS; Rtools 3.3+ for Windows
Description: 'MIME' types are shorthand descriptors for file contents and can be
determined from "magic" bytes in file headers, file contents or intuited from
file extensions. Tools are provided to perform curated "magic" tests as well
as mapping 'MIME' types from a database of over 1,500 extension mappings.
URL: http://gitlab.com/hrbrmstr/wand
BugReports: https://gitlab.com/hrbrmstr/wand/issues
Encoding: UTF-8
License: AGPL
Suggests:
testthat
testthat,
covr
Imports:
tools
Depends:
R (>= 3.2.0)
Imports:
dplyr,
purrr,
rappdirs,
stats,
stringi,
tibble,
tidyr,
utils,
Rcpp
Encoding: UTF-8
LinkingTo: Rcpp
RoxygenNote: 6.0.1.9000
13 changes: 0 additions & 13 deletions INSTALL

This file was deleted.

18 changes: 4 additions & 14 deletions NAMESPACE
Original file line number Diff line number Diff line change
@@ -1,16 +1,6 @@
# Generated by roxygen2: do not edit by hand

export(incant)
export(magic_wand_file)
import(purrr)
import(stats)
import(stringi)
import(tibble)
import(tidyr)
importFrom(Rcpp,sourceCpp)
importFrom(dplyr,left_join)
importFrom(dplyr,mutate)
importFrom(dplyr,mutate_all)
importFrom(rappdirs,user_cache_dir)
importFrom(utils,unzip)
useDynLib(wand, .registration=TRUE)
export(get_content_type)
export(guess_content_type)
export(simplemagic_mime_db)
importFrom(tools,file_ext)
11 changes: 0 additions & 11 deletions R/RcppExports.R

This file was deleted.

821 changes: 820 additions & 1 deletion R/aaa.r

Large diffs are not rendered by default.

32 changes: 32 additions & 0 deletions R/check-office.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
check_office <- function(hdr, path) {

  # Identify an Office Open XML document (Word / PowerPoint / Excel).
  #
  # hdr:  raw vector holding the leading bytes of the file
  # path: path to the same file; it is re-read here when the header looks
  #       like an Office container
  #
  # Returns the matching MIME type string, or NULL when the file is not
  # recognised as one of the three Office document kinds.

  # Names expected at bytes 31 onward of the header.
  content_types_sig <- utf8ToInt("[Content_Types].xml") # 19 bytes
  rels_sig <- utf8ToInt("_rels/.rels")                  # 11 bytes

  looks_like_office <-
    all(content_types_sig == hdr[31:49]) || all(rels_sig == hdr[31:41])

  if (!looks_like_office) {
    return(NULL)
  }

  # Read a larger slice of the file and look for the directory prefix that
  # distinguishes each document kind. Order matters: word, then ppt, then xl.
  hdr <- readBin(path, "raw", n = 4096)

  if (length(seq_in(hdr, utf8ToInt("word/"))) > 0) {
    return("application/vnd.openxmlformats-officedocument.wordprocessingml.document")
  }

  if (length(seq_in(hdr, utf8ToInt("ppt/"))) > 0) {
    return("application/vnd.openxmlformats-officedocument.presentationml.presentation")
  }

  if (length(seq_in(hdr, utf8ToInt("xl/"))) > 0) {
    return("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet")
  }

  return(NULL)

}
31 changes: 0 additions & 31 deletions R/datasets.r

This file was deleted.

116 changes: 116 additions & 0 deletions R/get-content-type.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
#' Discover MIME type of a file based on contents
#'
#' A limited number of header "magic" byte signatures are checked directly by
#' this function, but they cover quite a bit of ground. If none of them match,
#' [guess_content_type()] is called, which uses file extension-to-MIME mappings.
#' File an issue or PR if more magic-byte-level comparisons are
#' required/desired. If no match is found, `???` is returned.
#'
#' @details
#' Initial in-R header mapping logic borrowed from `MimeTypes.java` from
#' [`servoy-client`](https://github.com/Servoy/servoy-client)
#'
#' @md
#' @param path path to a file
#' @return character vector
#' @export
#' @examples
#' get_content_type(system.file("extdat", "test.pdf", package="wand"))
get_content_type <- function(path) {

  # Determine a file's MIME type from its leading bytes, falling back to the
  # extension-based guess_content_type() when no signature matches.
  #
  # path: path to an existing file (an error is raised when it does not exist)
  #
  # Returns a character vector of MIME type(s); whatever guess_content_type()
  # returns is passed through when the fallback is used.
  #
  # The checks run in a fixed order and the first match wins.

  path <- path.expand(path)
  if (!file.exists(path)) stop("File not found.", call.=FALSE)

  hdr <- readBin(path, "raw", n=1024)

  # TRUE when the bytes of `sig` equal the header bytes starting at the
  # 1-based position `at`.
  has_magic <- function(sig, at = 1L) {
    all(sig == hdr[seq.int(at, length.out = length(sig))])
  }

  # Extension-based type, but only when it is a single, known answer;
  # NULL otherwise so the caller can apply its own default.
  single_guess <- function() {
    guessed_name <- guess_content_type(path)
    if ((length(guessed_name) == 1) && (guessed_name != "???")) guessed_name else NULL
  }

  # Java class file
  if (has_magic(c(0xCA,0xFE,0xBA,0xBE))) return("application/java-vm")

  # Generic container signature: let the extension refine it, otherwise
  # default to Word.
  if (has_magic(c(0xD0,0xCF,0x11,0xE0,0xA1,0xB1,0x1A,0xE1))) {
    guessed_name <- single_guess()
    if (!is.null(guessed_name)) return(guessed_name)
    return("application/msword")
  }

  if (has_magic(c(0x25,0x50,0x44,0x46,0x2d,0x31,0x2e))) return("application/pdf") # "%PDF-1."
  if (has_magic(c(0x25,0x50,0x44,0x46))) return("application/x-pdf") # "%PDF"

  if (has_magic(c(0x38,0x42,0x50,0x53,0x00,0x01))) return("image/photoshop") # "8BPS" 0x00 0x01

  if (has_magic(c(0x25,0x21,0x50,0x53))) return("application/postscript") # "%!PS"

  if (has_magic(c(0xff,0xfb,0x30))) return("audio/mp3")
  if (has_magic(c(0xff,0xfb,0xd0))) return("audio/mp3")
  if (has_magic(c(0xff,0xfb,0x90))) return("audio/mp3")
  if (has_magic(c(0x49,0x44,0x33))) return("audio/mp3") # "ID3"
  if (has_magic(c(0xAC,0xED))) return("application/x-java-serialized-object")

  if (hdr[1] == 0x3c) { # "<"
    if (has_magic(c(0x68,0x74,0x6d,0x6c), 2L)) return("text/html") # "html"
    if (has_magic(c(0x48,0x54,0x4d,0x4c), 2L)) return("text/html") # "HTML"
    if (has_magic(c(0x48,0x45,0x41,0x44), 2L)) return("text/html") # "HEAD"
    if (has_magic(c(0x68,0x65,0x61,0x64), 2L)) return("text/html") # "head"
    if (has_magic(c(0x3f,0x78,0x6d,0x6c,0x20), 2L)) return("application/xml") # "?xml "
  }

  # 0xfe 0xff followed by "<?x" encoded as two bytes per character
  if (has_magic(c(0xfe,0xff))) {
    if (has_magic(c(0x00,0x3c,0x00,0x3f,0x00,0x78), 3L)) return("application/xml")
  }

  if (has_magic(c(0x42,0x4d))) return("image/bmp") # "BM"
  if (has_magic(c(0x49,0x49,0x2a,0x00))) return("image/tiff") # "II*" 0x00
  if (has_magic(c(0x4D,0x4D,0x00,0x2a))) return("image/tiff") # "MM" 0x00 "*"
  if (has_magic(c(0x47,0x49,0x46,0x38))) return("image/gif") # "GIF8"
  if (has_magic(c(0x23,0x64,0x65,0x66))) return("image/x-bitmap") # "#def"
  if (has_magic(c(0x21,0x20,0x58,0x50,0x4d,0x32))) return("image/x-pixmap") # "! XPM2"
  if (has_magic(c(137,80,78,71,13,10,26,10))) return("image/png")

  # Node shebang lines. The "...nodejs" variants need no separate test: each
  # begins with one of these prefixes and maps to the same type.
  for (shebang in c("#!/bin/node", "#!/usr/bin/node", "#!/usr/bin/env node")) {
    if (has_magic(charToRaw(shebang))) return("application/javascript")
  }

  if (has_magic(c(0xFF,0xD8,0xFF))) {
    if (0xE0 == hdr[4]) return("image/jpeg")
    if (0xE1 == hdr[4]) {
      if (has_magic(c(0x45,0x78,0x69,0x66,0x00), 7L)) return("image/jpeg") # "Exif" 0x00
    }
    if (0xEE == hdr[4]) return("image/jpg")
  }

  # "AC" plus five zero bytes at positions 7-11
  if (has_magic(c(0x41,0x43)) && has_magic(c(0x00,0x00,0x00,0x00,0x00), 7L))
    return("application/acad")

  if (has_magic(c(0x2E,0x73,0x6E,0x64))) return("audio/basic") # ".snd"
  if (has_magic(c(0x64,0x6E,0x73,0x2E))) return("audio/basic") # "dns."
  if (has_magic(c(0x52,0x49,0x46,0x46))) return("audio/x-wav") # "RIFF"

  if (has_magic(c(0x50,0x4b))) { # "PK"

    office_type <- check_office(hdr, path)
    if (length(office_type) > 0) return(office_type)

    guessed_name <- single_guess()
    if (!is.null(guessed_name)) return(guessed_name)

    return("application/zip")

  }

  # DOS/Windows executables begin with "MZ" (0x4d 0x5a). The original check
  # only looked for the reversed "ZM" byte order, so ordinary executables were
  # never matched; both orders are accepted now.
  if (has_magic(c(0x4d,0x5a)) || has_magic(c(0x5a,0x4d))) return("x-system/exe")

  if (has_magic(c(0x75,0x73,0x74,0x61,0x72), 258L)) return("application/pax") # "ustar"

  if (has_magic(c(0x00,0x00,0x01,0xBA))) return("video/mpeg")
  if (has_magic(c(0x00,0x00,0x01,0xB3))) return("video/mpeg")

  return(guess_content_type(path))

}
32 changes: 32 additions & 0 deletions R/guess-content-type.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
#' Guess MIME type from filename (extension)
#'
#' Uses an internal database of over 1,500 file extension-to-MIME mappings to
#' return one or more associated types for a given input path. If no match is
#' found, `???` is returned.
#'
#' @details
#' Incorporates standard IANA MIME extension mappings and those from
#' [`servoy-client`](https://github.com/Servoy/servoy-client) and
#' [stevenwdv](https://github.com/stevenwdv)'s
#' [`allMimeTypes.json`](https://s-randomfiles.s3.amazonaws.com/mime/allMimeTypes.json).
#'
#' @md
#' @param path path to file
#' @return character vector
#' @export
#' @examples
#' guess_content_type(system.file("extdat", "test.pdf", package="wand"))
guess_content_type <- function(path) {

  # Look up MIME type(s) for a file purely from its (lower-cased, trimmed)
  # extension using the `simplemagic_mime_db` table. The file must exist.
  # Returns the distinct matching types, or "???" when there is none.

  path <- path.expand(path)
  if (!file.exists(path)) {
    stop("File not found.", call.=FALSE)
  }

  ext <- trimws(tolower(tools::file_ext(path)))

  is_match <- simplemagic_mime_db$extension == ext
  matched_types <- simplemagic_mime_db$mime_type[is_match]

  if (length(matched_types) == 0) {
    return("???")
  }

  unique(matched_types)

}
15 changes: 15 additions & 0 deletions R/util.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
seq_in <- function(source_vector, pattern_vector) {

  # Find every position at which `pattern_vector` occurs as a contiguous run
  # inside `source_vector`.
  #
  # source_vector:  vector to search (e.g. raw bytes read from a file)
  # pattern_vector: vector of values to look for, compared with `==`
  #
  # Returns an integer vector of 1-based start positions; integer(0) when
  # there is no occurrence, when the pattern is empty, or when the source is
  # shorter than the pattern.

  n_source <- length(source_vector)
  n_pattern <- length(pattern_vector)

  # Without this guard a too-short source produced a descending index
  # sequence (mixing positive, zero and negative subscripts) and failed.
  if (n_pattern == 0L || n_source < n_pattern) {
    return(integer(0))
  }

  # Number of candidate start positions.
  n_starts <- n_source - n_pattern + 1L

  # For the k-th pattern element, compare it against the source shifted by
  # k - 1; a start position survives only if every element matches.
  is_hit <- rep(TRUE, n_starts)
  for (k in seq_len(n_pattern)) {
    shifted <- source_vector[seq.int(k, length.out = n_starts)]
    is_hit <- is_hit & (shifted == pattern_vector[[k]])
  }

  which(is_hit)

}
22 changes: 10 additions & 12 deletions R/wand-package.R
Original file line number Diff line number Diff line change
Expand Up @@ -6,19 +6,17 @@
#' file name. It also incorporates other metadata from the mime-db database
#' <https://github.com/jshttp/mime-db>
#'
#' Based on \code{file} / \code{libmagic} - \url{https://github.com/file/file}
#' @section Some important details:
#'
#' The header checking is minimal (i.e. nowhere near as comprehensive as `libmagic`) but
#' covers quite a bit of ground. If there are content-check types from
#' [`magic sources`](https://github.com/threatstack/libmagic/tree/master/magic/)
#' that you would like coded into the package, please file an issue and
#' _include the full line(s)_ from that linked `magic.tab` that you would like mapped.
#'
#' @md
#' @name wand
#' @docType package
#' @author Bob Rudis (@@hrbrmstr)
#' @import purrr
#' @import tibble
#' @import tidyr
#' @import stringi
#' @importFrom rappdirs user_cache_dir
#' @useDynLib wand, .registration=TRUE
#' @importFrom Rcpp sourceCpp
#' @importFrom utils unzip
#' @importFrom dplyr mutate left_join mutate_all
#' @import stats
#' @author Bob Rudis (bob@@rud.is)
#' @importFrom tools file_ext
NULL
Loading

0 comments on commit 58b3f8f

Please sign in to comment.