# PC Session 1

**Author:**
[Helge Liebert](https://hliebert.github.io/)

# Basic R regular expression functions

In [None]:
## Libraries
library("gutenbergr")
library("stringr")
library("data.table")

## Testing, indexing and substitution

In [None]:
## Vectors of European capitals and countries, positions match:
## capitals[i] is the capital listed for countries[i], so both vectors must
## keep the same length and order.
## Note: the first entry ("The Hague") is replaced by "Amsterdam" in a later
## cell.
capitals <- c(
  "The Hague",
  "Andorra la Vella",
  "Athens",
  "Belgrade",
  "Berlin",
  "Bern",
  "Bratislava",
  "Brussels",
  "Bucharest",
  "Budapest",
  "Chisinau",
  "Copenhagen",
  "Dublin",
  "Helsinki",
  "Kiev",
  "Lisbon",
  "Ljubljana",
  "London",
  "Luxembourg",
  "Madrid",
  "Minsk",
  "Monaco",
  "Moscow",
  "Nicosia",
  "Oslo",
  "Paris",
  "Podgorica",
  "Prague",
  "Reykjavik",
  "Riga",
  "Rome",
  "San Marino",
  "Sarajevo",
  "Skopje",
  "Sofia",
  "Stockholm",
  "Tallinn",
  "Tirana",
  "Vaduz",
  "Valletta",
  "Vatican City",
  "Vienna",
  "Vilnius",
  "Warsaw",
  "Zagreb"
)

countries <- c(
  "Netherlands",
  "Andorra",
  "Greece",
  "Serbia",
  "Germany",
  "Switzerland",
  "Slovakia",
  "Belgium",
  "Romania",
  "Hungary",
  "Moldova",
  "Denmark",
  "Ireland",
  "Finland",
  "Ukraine",
  "Portugal",
  "Slovenia",
  "United Kingdom",
  "Luxembourg",
  "Spain",
  "Belarus",
  "Monaco",
  "Russia",
  "Cyprus",
  "Norway",
  "France",
  "Montenegro",
  "Czech Republic",
  "Iceland",
  "Latvia",
  "Italy",
  "San Marino",
  "Bosnia & Herzegovina",
  "North Macedonia",
  "Bulgaria",
  "Sweden",
  "Estonia",
  "Albania",
  "Liechtenstein",
  "Malta",
  "Holy See",
  "Austria",
  "Lithuania",
  "Poland",
  "Croatia"
)

## Guard the positional pairing: later cells index one vector with a mask
## computed from the other, which silently misaligns if the lengths differ.
stopifnot(length(capitals) == length(countries))

In [None]:
## Positions match: element 31 of both vectors belongs to the same pair
## (Rome and Italy)
capitals[31]
countries[31]

In [None]:
## testing and indexing
## grep() returns the integer positions of the elements matching the pattern
grep("Rome", capitals)

In [None]:
## using boolean indexing
## grepl() returns one TRUE/FALSE per element instead of positions
grepl("Rome", capitals)

In [None]:
## variations
## "^" anchors the match at the start: capitals beginning with "R"
grep("^R", capitals)

In [None]:
## value = TRUE returns the matching strings themselves rather than positions
grep("^R", capitals, value = TRUE)

In [None]:
## "^R.*a$": starts with "R", ends with "a", anything in between
grep("^R.*a$", capitals, value = TRUE)

In [None]:
## invert = TRUE returns the elements that do NOT match the pattern
grep("^R", capitals, value = TRUE, invert = TRUE)

In [None]:
## the logical vector returned by grepl() can be negated with "!"
grepl("^R", capitals)
!grepl("^R", capitals)

In [None]:
## index one vector with a mask built from the other:
## countries whose capital does not begin with "R"
countries[!grepl("^R", capitals)]

In [None]:
## data.table %like% operator is just like grepl()
## (vector on the left-hand side, pattern on the right-hand side)
capitals %like% "^R"
!(capitals %like% "^R")

In [None]:
## other binary operators are also helpful
## exact (non-regex) comparison; "==" and %in% give the same result here
capitals == "Riga"
capitals %in% "Riga"

In [None]:
## matching several values: chained "==" joined with "|" vs. a single %in%
capitals == "Riga" | capitals == "Madrid"
capitals %in% c("Riga", "Madrid")

In [None]:
## substitution
## gsub(pattern, replacement, x) replaces every match of pattern in x
gsub("some", "another ", "something")

In [None]:
## substitution on a stored string
## gsub() returns the modified copy; `input` itself is not changed
input <- "something"
input
gsub("something", "something else", input)

In [None]:
## functions are vectorized, no need for looping or functional apply:
## the replacement is applied to every element of the vector
input <- c("something", "another thing here", "something again")
input
gsub("something", "something else", input)

In [None]:
## backreferences are possible
## "\\2" in the replacement refers to the second parenthesised group ("else")
gsub("(something).*(else)", "\\2", "something else")

In [None]:
## also other transformations
## with perl = TRUE, "\\U" upper-cases the replacement text that follows it
gsub("(something).*(else)", "\\U\\2", "something else", perl = TRUE)

In [None]:
## fixing the capital of the Netherlands
## the first gsub() call only prints its result; `capitals` stays unchanged
## until the result is assigned back to it
gsub("The Hague", "Amsterdam", capitals)
capitals
capitals <- gsub("The Hague", "Amsterdam", capitals)
capitals

In [None]:
## the stringr library has more dedicated string functions, though many are
## duplicates or can easily be derived from base functions.
## str_detect(string, pattern) takes the string first and the pattern second
str_detect(countries, "tia")

In [None]:
## these are the same
## (grepl() and str_detect() yield the same logical mask; only the argument
## order differs)
countries[grepl("land", countries)]
countries[str_detect(countries, "land")]

In [None]:
## check that the base R subset and the stringr subset agree
all.equal(
  countries[grepl("land", countries)],
  countries[str_detect(countries, "land")]
)

In [None]:
## counting occurrences
## str_count() returns the number of pattern matches in each element
str_count(countries, "land")

## Exercises

### Extracting information from the country and capital lists

In [None]:
## TASK: Find the capitals of all countries whose name begins with a vowel and does not end with "land".
## ...

### Extracting information from classic literature

In [None]:
## Project Gutenberg offers lots of free classic literature
## list the English-language works available through gutenbergr
gutenberg_works(languages = "en")

### Robinson Crusoe


In [None]:
## Robinson Crusoe: look the book up by author and title
gutenberg_works(
  author == "Defoe, Daniel",
  title == "The Life and Adventures of Robinson Crusoe"
)

In [None]:
## Download it
## 521 is the Gutenberg ID passed to gutenberg_download(); the text is
## fetched from the given mirror
robinson <- gutenberg_download(521, mirror = "http://mirrors.xmission.com/gutenberg/")

In [None]:
## Vector, each element is a line from the book.
## The guard keeps the cell safe to re-run: once `robinson` is already the
## character vector, `robinson$text` would fail because `$` is not valid on
## an atomic vector.
if (is.data.frame(robinson)) {
  robinson <- robinson$text
}

In [None]:
## Show the whole book, one line of text per output line.
## writeLines() writes each element exactly once. Using sapply(..., print)
## for its side effect would print every line and then auto-print the
## returned named vector as well.
writeLines(robinson)

In [None]:
## TASK: How many lines in the book mention Friday?
## ...

In [None]:
## TASK: How many lines in the book mention Friday or goats?
## ...

In [None]:
## TASK: On which line does he first mention finding another man's footprint on the beach?
##       What does the paragraph say?
## ...

### Faust


In [None]:
## Faust, English
## look up the work, download it by its Gutenberg ID, then keep only the
## text column as a character vector
gutenberg_works(author == "Goethe, Johann Wolfgang von", title == "Faust")
faust <- gutenberg_download(3023, mirror = "http://mirrors.xmission.com/gutenberg/")
faust <- faust$text

In [None]:
## Faust, German
## look up the German edition, download it by its Gutenberg ID, then keep
## only the text column as a character vector
gutenberg_works(
  author == "Goethe, Johann Wolfgang von",
  title == "Faust: Der Tragödie erster Teil",
  languages = "de"
)
faust.de <- gutenberg_download(
  2229,
  mirror = "http://mirrors.xmission.com/gutenberg/"
)
faust.de <- faust.de$text

In [None]:
## TASK: Find the paragraph with the famous quotation in which Mephistopheles introduces himself to Faust in his study.
## ...

In [None]:
## Hint: Places and actors are in full caps.
## Hint: Faust asks the devil who he is.