## code to prepare `scrape_parallel` dataset goes here
# ====step 0: prepare====
source("data-raw/load-pkg.R", encoding = "UTF-8")
data("zone_city")
tbl_city <- zone_city
# ====step 2: get districts with the parallel method====
require(parallel)
detectCores()          # how many cores are available on this machine
cl <- makeCluster(4)   # start a 4-worker socket cluster
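# a hedged alternative (not in the original script): size the cluster from
# the machine instead of hard-coding 4 workers
# cl <- makeCluster(max(1, detectCores() - 1))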
# load the packages every worker needs
clusterEvalQ(cl, {
  require(magrittr)
  require(tidyverse)
  require(rvest)
  require(httr)
  require(glue)
  require(stringr)
})
# ship the city table and the scraping helper to the workers
clusterExport(cl, "tbl_city")
clusterExport(cl, "get.tbl")
# scrape one block of K city rows: page i covers rows (i-1)*K + 1 to i*K,
# clipped at nrow(dt) so the last page does not index past the table
get_par <- function(i, dt = tbl_city){
  K <- 10
  rows <- ((i - 1) * K + 1):min(i * K, nrow(dt))
  cat(glue::glue("fetch page {i} with min {min(rows)}, and max {max(rows)}"), sep = "\n")
  tbl_out <- dt %>%
    .[rows, ] %>%
    mutate(
      url = str_replace(
        url, pattern = "\\.html",
        replacement = str_c("/", cid, ".html"))
    ) %>%
    mutate(dt = map(.x = url, .f = get.tbl, len = 6)) %>%
    unnest(dt)
  return(tbl_out)
}
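# (optional) sanity check, a minimal sketch: scrape one block sequentially
# in the current session before starting the parallel run, to confirm that
# get.tbl and the URLs behave as expected
# check_one <- get_par(1)
# head(check_one)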
K <- 10
tot <- nrow(tbl_city)
page <- ceiling(tot/K)
# now run the parallel scraping
## note: this step may fail if your internet
## connection is slow or unstable.
s <- parLapply(cl, 1:page, get_par)
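# If any single page errors out (e.g. a dropped connection), the whole
# parLapply() call fails. A hedged, more defensive sketch (the wrapper
# get_par_safe is hypothetical, not part of the original script):
# clusterExport(cl, "get_par")
# get_par_safe <- function(i) tryCatch(get_par(i), error = function(e) NULL)
# s <- parLapply(cl, 1:page, get_par_safe)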
check <- s[[1]]   # inspect the first block of results
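# Combine the per-page results and release the workers. bind_rows() and the
# final use_data() call are a hedged sketch of how the dataset might be
# assembled; the object name scrape_parallel is assumed from the file name.
scrape_parallel <- dplyr::bind_rows(s)
stopCluster(cl)
# usethis::use_data(scrape_parallel, overwrite = TRUE)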