# PC Session 2

**Author:**
[Helge Liebert](https://hliebert.github.io/)

#  Web Scraping: Wikipedia examples


## Requirements

In [None]:
## install and load required library
## install.packages("rvest", dependencies = TRUE)
library(rvest)

## Scraping a wiki table

In [None]:
## Scrape one table from the Wikipedia article on infant mortality.
## 1) download the article and parse it into an html document
page <- "https://en.wikipedia.org/wiki/Infant_mortality" %>% read_html()
## 2) pick out the node matched by the css selector
##    NOTE(review): the selector is positional (nth-child), so it is tied to
##    the current page layout -- confirm it still hits the intended table
table <- page %>% html_node(css = "table.wikitable:nth-child(133)")
## 3) turn that node into a data frame and display it
mrates <- table %>% html_table()
mrates

## Investigating page elements and navigation

In [None]:
## A node can be addressed through a css selector or an xpath expression;
## the two variants below are meant to select the same table.
## xpath variant (disabled):
##table <- html_node(page, xpath = "//*[@id='mw-content-text']/div/table[3]")
## css variant:
table <- page %>% html_node(css = "table.wikitable:nth-child(133)")
mrates <- table %>% html_table()
mrates

In [None]:
## show every <table> element found in the parsed page
page %>% html_nodes("table")

In [None]:
## Inspect the links contained in the selected table.
## the anchor nodes themselves
table %>% html_nodes("a")
## their href attributes, written as a nested call ...
html_attr(html_nodes(table, "a"), "href")
## ... which is the same as the piping syntax:
## html_nodes(table, "a") %>% html_attr("href")
## keep the hrefs for the filtering steps that follow
tablelinks <- table %>% html_nodes("a") %>% html_attr("href")

In [None]:
## keep only the table links whose href contains "Somalia"
link <- tablelinks[grepl("Somalia", tablelinks)]
link

In [None]:
## Looking at html elements and their attributes.
## further things to try (disabled):
# html_nodes(page, "link")
# html_nodes(page, "a") %>% html_attr("href")
## every anchor element on the whole page
page %>% html_nodes("a")
## href attribute of the anchors inside the selected table only
table %>% html_nodes("a") %>% html_attr("href")

In [None]:
## following a link to another page, fetching another table
## NOTE(review): recent rvest releases deprecate html_session()/follow_link()
## in favour of session()/session_follow_link() -- confirm against the
## installed version before switching.
## open a browsing session on the article ...
session <- html_session("https://en.wikipedia.org/wiki/Infant_mortality")
## ... then navigate through the link selected by "Somalia"
session <- session %>% follow_link("Somalia")
session

In [None]:
## parse the page the session currently points at
page <- session %>% read_html()
## display the <title> element(s) to check where the session landed
page %>% html_nodes("title")
## NOTE(review): positional selector, tied to the page layout -- confirm it
## still hits the intended table
table <- page %>% html_node(css = "table.wikitable:nth-child(128)")
## convert the selected node to a data frame and display it
regions <- table %>% html_table()
regions

## Regex filtering

In [None]:
## Filtering links: start from the href of every anchor on the article page.
page <- "https://en.wikipedia.org/wiki/Infant_mortality" %>% read_html()
wikilinks <- page %>% html_nodes("a") %>% html_attr("href")
wikilinks

In [None]:
## regex examples
#grep("^/wiki", wikilinks, value = TRUE)
#grep("^/wiki.*[0-9][0-9]$", wikilinks, value = TRUE)
#grep("^/wiki.*File:.*", wikilinks, value = TRUE)
#grep("^(?!.*:)/wiki/.*Mortality", wikilinks, value = TRUE, perl = TRUE)
#grep("^(?!.*:)/wiki/.*[Mm]ortality", wikilinks, value = TRUE, perl = TRUE)

In [None]:
# compound expression: the negative lookahead rejects any link containing ":",
# the remainder requires a leading "/wiki/" followed later by "Mortality"
# (the lookahead syntax is why perl = TRUE is set)
links <- wikilinks %>%
  grep(pattern = "^(?!.*:)(/wiki/.*Mortality)", value = TRUE, perl = TRUE)
links

In [None]:
# the same kind of filter split into several simple steps, which is often
# easier to read than one compound pattern
links <- wikilinks %>%
  # 1) keep hrefs that start with "/wiki/"
  grep(pattern = "^/wiki/", value = TRUE) %>%
  # 2) of those, keep the ones mentioning Mortality or Somalia
  grep(pattern = "Mortality|Somalia", value = TRUE) %>%
  # 3) drop everything that contains a ":"
  grep(pattern = ":", value = TRUE, invert = TRUE)
links

In [None]:
# Select only internal article links that mention mortality or somalia
# (case-insensitive), excluding files, category pages and any other href that
# contains ":".
# The alternation is grouped *after* the anchor and the lookahead. Written as
# "^(?!.*:)(...Mortality)|(...Somalia)", the "|" splits the whole pattern, so
# the Somalia branch was neither anchored at the start nor subject to the ":"
# exclusion and let through hrefs such as "/wiki/File:..." or external URLs.
links <- grep("^(?!.*:)/wiki/.*(Mortality|Somalia)", wikilinks,
              ignore.case = TRUE, value = TRUE, perl = TRUE)
# drop repeated hrefs, keeping the first occurrence of each
links <- unique(links)
links

In [None]:
# move the existing session on to the first of the filtered links
session <- session %>% jump_to(links[1])
# parse the page now held by the session and show its <title> as a check
page <- session %>% read_html()
page %>% html_nodes("title")

In [None]:
# move the existing session on to the fifth of the filtered links
session <- session %>% jump_to(links[5])
# parse the page now held by the session and show its <title> as a check
page <- session %>% read_html()
page %>% html_nodes("title")