/
wikisource_page.R
125 lines (120 loc) · 4.34 KB
/
wikisource_page.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
#' Download a page from Wikisource
#'
#' Download the text of a Wikisource page into a data frame using its url.
#'
#' @param wikiurl The url of a Wikisource page that will
#' be downloaded.
#'
#' @param page A string naming the Wikisource page downloaded.
#'
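#' @param cleaned A logical value. If \code{TRUE} (the default), nodes
#' matching a fixed set of id or class selectors (such as
#' \code{.ws-noexport}, \code{.noprint}, \code{.catlinks}, header
#' containers) and all \code{table} nodes are removed from the page
#' before its text is extracted.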
#'
#' @return A tbl_df (a type of data frame; see tibble or dplyr packages)
#' with one row for each line of the text or texts, and four columns:
#'
#' \describe{
#' \item{text}{A character column}
#' \item{page}{A column naming the page downloaded}
#' \item{language}{A character column with a two-letter string referring
#' to the language of the text}
#' \item{url}{A character column with the url of the Wikisource page
#' of the text}
#' }
#'
#' @examples
#'
#' \dontrun{
#' # download Sonnet 18 of Shakespeare
#' wikisource_page("https://en.wikisource.org/wiki/Shakespeare%27s_Sonnets/Sonnet_18", "Sonnet 18")
#'
#' # download Sonnets 116, 73 and 130 of Shakespeare
#' library(purrr)
#'
#' urls <- paste0("https://en.wikisource.org/wiki/Shakespeare%27s_Sonnets/Sonnet_", c(116, 73, 130))
#' sonnets <- map2_df(urls, paste0("Sonnet ", c(116, 73, 130)), wikisource_page)
#' }
#'
#' @importFrom rvest html_node
#' @importFrom rvest html_nodes
#' @importFrom rvest html_text
#' @importFrom magrittr "%>%"
#' @importFrom tibble tibble
#' @importFrom xml2 read_html
#' @importFrom xml2 xml_remove
#' @importFrom urltools domain
#' @importFrom urltools suffix_extract
#'
#' @export
wikisource_page <- function(wikiurl, page = NA, cleaned = TRUE) {
  # Fail gracefully if there is an issue with the Internet resource:
  # when gracefully_fail() returns NULL, stop here and return NULL invisibly.
  if (is.null(gracefully_fail(wikiurl))) {
    return(invisible(NULL))
  }

  # READING WIKIPAGE FROM WIKISOURCE
  # Download and parse the page once; both selectors below query this single
  # parsed document instead of fetching the url a second time.
  wikidoc <- xml2::read_html(wikiurl)
  wikipage <- rvest::html_node(wikidoc, ".mw-parser-output")
  ## reading other wiki page structures: fall back to the generic content
  ## container when the first selector matched nothing (length 0)
  if (length(wikipage) == 0) {
    wikipage <- rvest::html_node(wikidoc, "#mw-content-text")
  }

  # IMPROVING NODE SELECTION
  ## if nodes not empty, remove selected id or class nodes
  ## Note: other id or class nodes could be added in future releases of the
  ## package by extending this vector
  if (cleaned) {
    selectors_to_remove <- c(
      ".ws-noexport",
      ".noprint",
      "#headerContainer",
      "#headertemplate",
      "#subheader",
      ".subheadertemplate",
      ".catlinks",
      "table"
    )
    for (selector in selectors_to_remove) {
      wiki_nodes_remove <- rvest::html_nodes(wikipage, selector)
      if (length(wiki_nodes_remove) != 0) {
        # xml_remove() modifies the parsed document in place, so `wikipage`
        # no longer contains these nodes afterwards
        xml2::xml_remove(wiki_nodes_remove)
      }
    }
  }

  # EXTRACTING LANGUAGE OF THE WIKIPAGE
  # The language is taken from the subdomain of the url; any error while
  # extracting it yields NA instead of aborting the download.
  language <- tryCatch(
    urltools::suffix_extract(urltools::domain(wikiurl))$subdomain,
    error = function(e) NA
  )

  # CREATING WIKIPAGE DATAFRAME
  # One row per "\n"-separated line of the page text; blank lines are kept.
  text_lines <- unlist(strsplit(rvest::html_text(wikipage), "\n"))
  tibble::tibble(
    text = text_lines,
    page = page,
    language = language,
    url = wikiurl
  )
}