-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
105 changed files
with
2,647 additions
and
1,020 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,42 +1,26 @@ | ||
Package: wand | ||
Type: Package | ||
Title: Retrieve 'Magic' Attributes from Files and Directories | ||
Version: 0.2.1 | ||
Date: 2017-10-20 | ||
Version: 0.3.0 | ||
Date: 2018-09-16 | ||
Authors@R: c( | ||
person("Bob", "Rudis", email = "bob@rud.is", role = c("aut", "cre"), | ||
comment = c(ORCID = "0000-0001-5670-2640")), | ||
person("Christos", "Zoulas", role=("ctb"), comment="libmagic"), | ||
person("Mans", "Rullgard", role=("ctb"), comment="file"), | ||
person("Jonathan", "Ong", role=("ctb"), comment="mime-db") | ||
comment = c(ORCID = "0000-0001-5670-2640")) | ||
) | ||
Maintainer: Bob Rudis <bob@rud.is> | ||
Description: The 'libmagic' library provides functions to determine | ||
'MIME' type and other metadata from files through their "magic" | ||
attributes. This is useful when you do not wish to rely solely on | ||
the honesty of a user or the extension on a file name. It also | ||
incorporates other metadata from the 'mime-db' database | ||
<https://github.com/jshttp/mime-db>. | ||
URL: http://github.com/hrbrmstr/wand | ||
BugReports: https://github.com/hrbrmstr/wand/issues | ||
NeedsCompilation: yes | ||
LazyData: true | ||
SystemRequirements: libmagic (>= 5.14) for Unix/Linux/macOS; Rtools 3.3+ for Windows | ||
Description: 'MIME' types are shorthand descriptors for file contents and can be | ||
determined from "magic" bytes in file headers, file contents or intuited from | ||
file extensions. Tools are provided to perform curated "magic" tests as well | ||
as mapping 'MIME' types from a database of over 1,500 extension mappings. | ||
URL: http://gitlab.com/hrbrmstr/wand | ||
BugReports: https://gitlab.com/hrbrmstr/wand/issues | ||
Encoding: UTF-8 | ||
License: AGPL | ||
Suggests: | ||
testthat | ||
testthat, | ||
covr | ||
Imports: | ||
tools | ||
Depends: | ||
R (>= 3.2.0) | ||
Imports: | ||
dplyr, | ||
purrr, | ||
rappdirs, | ||
stats, | ||
stringi, | ||
tibble, | ||
tidyr, | ||
utils, | ||
Rcpp | ||
Encoding: UTF-8 | ||
LinkingTo: Rcpp | ||
RoxygenNote: 6.0.1.9000 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,16 +1,6 @@ | ||
# Generated by roxygen2: do not edit by hand | ||
|
||
export(incant) | ||
export(magic_wand_file) | ||
import(purrr) | ||
import(stats) | ||
import(stringi) | ||
import(tibble) | ||
import(tidyr) | ||
importFrom(Rcpp,sourceCpp) | ||
importFrom(dplyr,left_join) | ||
importFrom(dplyr,mutate) | ||
importFrom(dplyr,mutate_all) | ||
importFrom(rappdirs,user_cache_dir) | ||
importFrom(utils,unzip) | ||
useDynLib(wand, .registration=TRUE) | ||
export(get_content_type) | ||
export(guess_content_type) | ||
export(simplemagic_mime_db) | ||
importFrom(tools,file_ext) |
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,32 @@ | ||
check_office <- function(hdr, path) { | ||
|
||
# [Content_Types.xml] || length 19 | ||
c( | ||
0x5b,0x43,0x6f,0x6e,0x74,0x65,0x6e,0x74,0x5f,0x54, | ||
0x79,0x70,0x65,0x73,0x5d,0x2e,0x78,0x6d,0x6c | ||
) -> pat_content_types | ||
|
||
# _rels/.rels || length 11 | ||
pat_rels <- c(0x5f,0x72,0x65,0x6c,0x73,0x2f,0x2e,0x72,0x65,0x6c,0x73) | ||
|
||
if ((all(pat_content_types == hdr[31:49])) || (all(pat_rels == hdr[31:41]))) { | ||
|
||
hdr <- readBin(path, "raw", n=4096) | ||
|
||
pat_word <- c(0x77,0x6f,0x72,0x64,0x2f) | ||
if (length(seq_in(hdr, pat_word)) > 0) | ||
return("application/vnd.openxmlformats-officedocument.wordprocessingml.document") | ||
|
||
pat_ppt <- c(0x70,0x70,0x74,0x2f) | ||
if (length(seq_in(hdr, pat_ppt)) > 0) | ||
return("application/vnd.openxmlformats-officedocument.presentationml.presentation") | ||
|
||
pat_xl <- c(0x78,0x6c,0x2f) | ||
if (length(seq_in(hdr, pat_xl)) > 0) | ||
return("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet") | ||
|
||
} | ||
|
||
return(NULL) | ||
|
||
} |
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,116 @@ | ||
#' Discover MIME type of a file based on contents | ||
#' | ||
#' There are a limited number of header "magic" bytes checked directly by | ||
#' this function but cover quite a bit of ground. After that, [guess_content_type()] is called which uses | ||
#' file extension-to-MIME mappings. File an issue or PR if more magic-byte-level | ||
#' comparisons are required/desired. If no match is found, `???` is returned. | ||
#' | ||
#' @details | ||
#' Initial in-R header mapping logic borrowed from `MimeTypes.java` from | ||
#' [`servoy-client`](https://github.com/Servoy/servoy-client) | ||
#' | ||
#' @md | ||
#' @param path path to a file | ||
#' @return character vector | ||
#' @export | ||
#' @examples | ||
#' get_content_type(system.file("extdat", "test.pdf", package="simplemagic")) | ||
get_content_type <- function(path) { | ||
|
||
path <- path.expand(path) | ||
if (!file.exists(path)) stop("File not found.", call.=FALSE) | ||
|
||
hdr <- readBin(path, "raw", n=1024) | ||
|
||
if (all(c(0xCA,0xFE,0xBA,0xBE) == hdr[1:4])) return("application/java-vm") | ||
|
||
if (all(c(0xD0,0xCF,0x11,0xE0,0xA1,0xB1,0x1A,0xE1) == hdr[1:8])) { | ||
guessed_name <- guess_content_type(path) | ||
if ((length(guessed_name) == 1) && (guessed_name != "???")) return(guessed_name) | ||
return("application/msword") | ||
} | ||
|
||
if (all(c(0x25,0x50,0x44,0x46,0x2d,0x31,0x2e) == hdr[1:7])) return("application/pdf") | ||
if (all(c(0x25,0x50,0x44,0x46) == hdr[1:4])) return("application/x-pdf") | ||
|
||
if (all(c(0x38,0x42,0x50,0x53,0x00,0x01) == hdr[1:6])) return("image/photoshop") | ||
|
||
if (all(c(0x25,0x21,0x50,0x53) == hdr[1:4])) return("application/postscript") | ||
|
||
if (all(c(0xff,0xfb,0x30) == hdr[1:3])) return("audio/mp3") | ||
if (all(c(0xff,0xfb,0xd0) == hdr[1:3])) return("audio/mp3") | ||
if (all(c(0xff,0xfb,0x90) == hdr[1:3])) return("audio/mp3") | ||
if (all(c(0x49,0x44,0x33) == hdr[1:3])) return("audio/mp3") | ||
if (all(c(0xAC,0xED) == hdr[1:2])) return("application/x-java-serialized-object") | ||
|
||
if (hdr[1] == 0x3c) { # "<" | ||
if (all(c(0x68,0x74,0x6d,0x6c) == hdr[2:5])) return("text/html") # "html" | ||
if (all(c(0x48,0x54,0x4d,0x4c) == hdr[2:5])) return("text/html") # "HTML" | ||
if (all(c(0x48,0x45,0x41,0x44) == hdr[2:5])) return("text/html") # "HEAD" | ||
if (all(c(0x68,0x65,0x61,0x64) == hdr[2:5])) return("text/html") # "head" | ||
if (all(c(0x3f,0x78,0x6d,0x6c,0x20) == hdr[2:6])) return("application/xml") | ||
} | ||
|
||
if (all(c(0xfe,0xff) == hdr[1:2])) { | ||
if (all(c(0x00,0x3c,0x00,0x3f,0x00,0x78) == hdr[3:8])) return("application/xml") | ||
} | ||
|
||
if (all(c(0x42,0x4d) == hdr[1:2])) return("image/bmp") | ||
if (all(c(0x49,0x49,0x2a,0x00) == hdr[1:4])) return("image/tiff") | ||
if (all(c(0x4D,0x4D,0x00,0x2a) == hdr[1:4])) return("image/tiff") | ||
if (all(c(0x47,0x49,0x46,0x38) == hdr[1:4])) return("image/gif") | ||
if (all(c(0x23,0x64,0x65,0x66) == hdr[1:4])) return("image/x-bitmap") | ||
if (all(c(0x21,0x20,0x58,0x50,0x4d,0x32) == hdr[1:6])) return("image/x-pixmap") | ||
if (all(c(137,80,78,71,13,10,26,10) == hdr[1:8])) return("image/png") | ||
|
||
if (all(c(0x23,0x21,0x2f,0x62,0x69,0x6e,0x2f,0x6e,0x6f,0x64,0x65) == hdr[1:11])) | ||
return("application/javascript") | ||
if (all(c(0x23,0x21,0x2f,0x62,0x69,0x6e,0x2f,0x6e,0x6f,0x64,0x65,0x6a,0x73) == hdr[1:13])) | ||
return("application/javascript") | ||
if (all(c(0x23,0x21,0x2f,0x75,0x73,0x72,0x2f,0x62,0x69,0x6e,0x2f,0x6e,0x6f,0x64,0x65) == hdr[1:15])) | ||
return("application/javascript") | ||
if (all(c(0x23,0x21,0x2f,0x75,0x73,0x72,0x2f,0x62,0x69,0x6e,0x2f,0x6e,0x6f,0x64,0x65,0x6a,0x73) == hdr[1:17])) | ||
return("application/javascript") | ||
if (all(c(0x23,0x21,0x2f,0x75,0x73,0x72,0x2f,0x62,0x69,0x6e,0x2f,0x65,0x6e,0x76,0x20,0x6e,0x6f,0x64,0x65) == hdr[1:19])) | ||
return("application/javascript") | ||
if (all(c(0x23,0x21,0x2f,0x75,0x73,0x72,0x2f,0x62,0x69,0x6e,0x2f,0x65,0x6e,0x76,0x20,0x6e,0x6f,0x64,0x65,0x6a,0x73) == hdr[1:21])) | ||
return("application/javascript") | ||
|
||
if (all(c(0xFF,0xD8,0xFF) == hdr[1:3])) { | ||
if (0xE0 == hdr[4]) return("image/jpeg") | ||
if (0xE1 == hdr[4]) { | ||
if (all(c(0x45,0x78,0x69,0x66,0x00) == hdr[7:11])) return("image/jpeg") # Exif | ||
} | ||
if (0xEE == hdr[4]) return("image/jpg") | ||
} | ||
|
||
if (all(c(0x41,0x43) == hdr[1:2]) && all(c(0x00,0x00,0x00,0x00,0x00) == hdr[7:11])) | ||
return("application/acad") | ||
|
||
if (all(c(0x2E,0x73,0x6E,0x64) == hdr[1:4])) return("audio/basic") | ||
if (all(c(0x64,0x6E,0x73,0x2E) == hdr[1:4])) return("audio/basic") | ||
if (all(c(0x52,0x49,0x46,0x46) == hdr[1:4])) return("audio/x-wav") # "RIFF" | ||
|
||
if (all(c(0x50, 0x4b) == hdr[1:2])) { # "PK" | ||
|
||
office_type <- check_office(hdr, path) | ||
if (length(office_type) > 0) return(office_type) | ||
|
||
guessed_name <- guess_content_type(path) | ||
if ((length(guessed_name) == 1) && (guessed_name != "???")) return(guessed_name) | ||
|
||
return("application/zip") | ||
|
||
} | ||
|
||
if (all(c(0x5a,0x4d) == hdr[1:2])) return("x-system/exe") | ||
|
||
if (all(c(0x75,0x73,0x74,0x61,0x72) == hdr[258:262])) return("application/pax") | ||
|
||
if (all(c(0x00,0x00,0x01,0xBA) == hdr[1:4])) return("video/mpeg") | ||
if (all(c(0x00,0x00,0x01,0xB3) == hdr[1:4])) return("video/mpeg") | ||
|
||
|
||
return(guess_content_type(path)) | ||
|
||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,32 @@ | ||
#' Guess MIME type from filename (extension) | ||
#' | ||
#' Uses an internal database of over 1,500 file extension-to-MIME mappings to | ||
#' return one or more associated types for a given input path. If no match is | ||
#' found, `???` is returned. | ||
#' | ||
#' @details | ||
#' Incorporates standard IANA MIME extension mappings and those from | ||
#' [`servoy-client`](https://github.com/Servoy/servoy-client) and | ||
#' [stevenwdv](https://github.com/stevenwdv)'s | ||
#' [`allMimeTypes.json`](https://s-randomfiles.s3.amazonaws.com/mime/allMimeTypes.json). | ||
#' | ||
#' @md | ||
#' @param path path to file | ||
#' @return character vector | ||
#' @export | ||
#' @examples | ||
#' guess_content_type(system.file("extdat", "test.pdf", package="simplemagic")) | ||
guess_content_type <- function(path) { | ||
|
||
path <- path.expand(path) | ||
if (!file.exists(path)) stop("File not found.", call.=FALSE) | ||
|
||
extension <- trimws(tolower(tools::file_ext(path))) | ||
|
||
res <- simplemagic_mime_db[(simplemagic_mime_db$extension == extension),]$mime_type | ||
|
||
if (length(res) == 0) return("???") | ||
|
||
return(unique(res)) | ||
|
||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
seq_in <- function(source_vector, pattern_vector) { | ||
|
||
which( | ||
Reduce( | ||
'+', | ||
lapply( | ||
seq_along(y <- lapply(pattern_vector, '==', source_vector)), | ||
function(x) { | ||
y[[x]][x:(length(source_vector) - length(pattern_vector) + x)] | ||
} | ||
) | ||
) == length(pattern_vector) | ||
) | ||
|
||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.