# Basic R regular expression functions

In [None]:
## Libraries
library("gutenbergr")
library("stringr")
library("data.table")

## Testing, indexing and substitution

In [None]:
## European capitals; the element at position i belongs to countries[i].
## "The Hague" is kept for the Netherlands on purpose: a later substitution
## example replaces it with "Amsterdam".
capitals <- c(
  "The Hague", "Andorra la Vella", "Athens", "Belgrade", "Berlin",
  "Bern", "Bratislava", "Brussels", "Bucharest", "Budapest",
  "Chisinau", "Copenhagen", "Dublin", "Helsinki", "Kiev",
  "Lisbon", "Ljubljana", "London", "Luxembourg", "Madrid",
  "Minsk", "Monaco", "Moscow", "Nicosia", "Oslo",
  "Paris", "Podgorica", "Prague", "Reykjavik", "Riga",
  "Rome", "San Marino", "Sarajevo", "Skopje", "Sofia",
  "Stockholm", "Tallinn", "Tirana", "Vaduz", "Valletta",
  "Vatican City", "Vienna", "Vilnius", "Warsaw", "Zagreb"
)

## European countries, listed in the same order as their capitals
countries <- c(
  "Netherlands", "Andorra", "Greece", "Serbia", "Germany",
  "Switzerland", "Slovakia", "Belgium", "Romania", "Hungary",
  "Moldova", "Denmark", "Ireland", "Finland", "Ukraine",
  "Portugal", "Slovenia", "United Kingdom", "Luxembourg", "Spain",
  "Belarus", "Monaco", "Russia", "Cyprus", "Norway",
  "France", "Montenegro", "Czech Republic", "Iceland", "Latvia",
  "Italy", "San Marino", "Bosnia & Herzegovina", "North Macedonia", "Bulgaria",
  "Sweden", "Estonia", "Albania", "Liechtenstein", "Malta",
  "Holy See", "Austria", "Lithuania", "Poland", "Croatia"
)

In [None]:
## grep() returns the positions of the elements that match the pattern
grep(pattern = "Rome", x = capitals)

In [None]:
## grepl() returns one TRUE/FALSE per element instead (a logical mask)
grepl(pattern = "Rome", x = capitals)

In [None]:
## variations: "^R" matches elements that start with a capital R
grep(pattern = "^R", x = capitals)

In [None]:
## value = TRUE returns the matching strings rather than their positions
grep(pattern = "^R", x = capitals, value = TRUE)

In [None]:
## starts with R and ends with a
grep(pattern = "^R.*a$", x = capitals, value = TRUE)

In [None]:
## invert = TRUE returns everything that does NOT match
grep(pattern = "^R", x = capitals, invert = TRUE, value = TRUE)

In [None]:
## a logical mask and its negation
grepl(pattern = "^R", x = capitals)
!grepl(pattern = "^R", x = capitals)

In [None]:
## the mask computed on one vector can index the parallel vector
countries[!grepl(pattern = "^R", x = capitals)]

In [None]:
## the data.table %like% operator behaves like grepl()
capitals %like% "^R"
!(capitals %like% "^R")

In [None]:
## exact comparisons do not need a regular expression
capitals == "Riga"
capitals %in% "Riga"

In [None]:
## several alternatives: chained comparisons versus a single %in%
capitals == "Riga" | capitals == "Madrid"
capitals %in% c("Riga", "Madrid")

In [None]:
## substitution: gsub() replaces every match of the pattern
gsub("some", "another ", "something")

In [None]:
## substitution on a variable; gsub() returns a new string and leaves
## `input` itself unchanged
input <- "something"
input
gsub("something", "something else", input)

In [None]:
## functions are vectorized, no need for looping or functional apply
input <- c("something", "another thing here", "something again")
input
## keep the substituted vector: the backreference examples below only do
## something on strings that actually contain "else"
input_else <- gsub("something", "something else", input)
input_else

In [None]:
## backreferences are possible: "\\2" stands for the text captured by the
## second parenthesised group
gsub("(something).*(else)", "\\2", input_else, perl = TRUE)

In [None]:
## also other transformations: with perl = TRUE, "\\U" upper-cases the
## replacement text that follows it
gsub("(something).*(else)", "\\U\\2", input_else, perl = TRUE)

In [None]:
## correcting the capital of the Netherlands; the corrected vector is only
## displayed here, `capitals` itself is not reassigned
gsub(pattern = "The Hague", replacement = "Amsterdam", x = capitals)

In [None]:
## stringr offers more dedicated string helpers; many of them duplicate base
## functions or can easily be derived from them.
str_detect(string = countries, pattern = "tia")

In [None]:
## base and stringr versions of the same filter
countries[grepl(pattern = "land", x = countries)]
countries[str_detect(string = countries, pattern = "land")]

In [None]:
all.equal(
  countries[grepl("land", countries)],
  countries[str_detect(countries, "land")]
)

In [None]:
## counting occurrences of the pattern within each element
str_count(string = countries, pattern = "land")

## Exercises

### Extracting information from the country and capital lists

In [None]:
## TASK: Find all capitals in countries beginning with a vowel and not ending with "land".
## variant 1: two separate tests combined with a logical AND
capitals[grepl("^[AEIOU]", countries) & !grepl("land$", countries)]
## variant 2: one pattern with a negative lookahead. The lookahead has to
## contain "$"; without it, "land" anywhere in the name would be rejected,
## not only at the end.
capitals[grepl("^(?!.*land$)[AEIOU]", countries, perl = TRUE)]

In [None]:
## TASK: Find all countries that contain exactly two words in the title and swap them.
## "^" and "$" make the two words span the whole name, so longer names and
## partial matches are excluded and left untouched by gsub()
grep("^([A-Z][a-z]+) ([A-Z][a-z]+)$", countries, value = TRUE)
gsub("^([A-Z][a-z]+) ([A-Z][a-z]+)$", "\\2 \\1", countries)

In [None]:
## accounting for Bosnia & Herzegovina: the separator is either a space or
## " & ". The same pattern is used for finding and for swapping; group 2 holds
## the separator so it is put back between the swapped words.
grep("^([A-Z][a-z]+)( | & )([A-Z][a-z]+)$", countries, value = TRUE)
gsub("^([A-Z][a-z]+)( | & )([A-Z][a-z]+)$", "\\3\\2\\1", countries)

### Extracting information from classic literature

In [None]:
## Project Gutenberg offers lots of free classic literature
## Browse the catalogue, restricted to works whose language is "en".
gutenberg_works(languages = "en")

In [None]:
## Look up single works by filtering on the author and title columns.
## The comparisons use `==`, so both values have to match exactly.
## NOTE(review): Project Gutenberg appears to list this book as
## "The Call of the Wild"; if so, the exact match on the shorter title
## returns no rows -- confirm against the catalogue.
gutenberg_works(author == "London, Jack", title == "Call of the Wild")
## Further lookups, kept commented out:
## gutenberg_works(author == "Twain, Mark", title == "Adventures of Huckleberry Finn")
## gutenberg_works(author == "Goethe, Johann Wolfgang von", title == "Faust")
## gutenberg_works(author == "Dostoyevsky, Fyodor", title == "Crime and Punishment")
## gutenberg_works(author == "Conrad, Joseph", title == "Heart of Darkness")
## gutenberg_works(author == "Defoe, Daniel", title == "The Life and Adventures of Robinson Crusoe")

### Robinson Crusoe


In [None]:
## Robinson Crusoe: look up the catalogue entry
gutenberg_works(
  author == "Defoe, Daniel",
  title == "The Life and Adventures of Robinson Crusoe"
)

In [None]:
## Download it (work id 521) from the given mirror
robinson <- gutenberg_download(
  521,
  mirror = "http://mirrors.xmission.com/gutenberg/"
)

In [None]:
## Vector, each element is a line from the book
robinson <- robinson$text

In [None]:
## Show the text, one book line per output line. sapply(robinson, print)
## would print every line and then additionally display the named vector it
## returns, so the whole book would show up several times.
writeLines(robinson)

In [None]:
## TASK: How many lines in the book mention Friday?
## ...

In [None]:
## grepl() flags every matching line; summing the logicals counts them
sum(grepl("Friday", robinson))

In [None]:
## TASK: How many lines in the book mention Friday or goats?
## ...

In [None]:
## "|" is a logical OR inside the pattern. "goat" already matches "goats",
## so a separate "goats" alternative is redundant.
sum(grepl("Friday|goat", robinson))

In [None]:
## TASK: On which line does he first mention finding another man's footprint on the beach?
##       What does the paragraph say?
## ...

In [None]:
## positions of the lines containing "foot"
grep("foot", robinson)

In [None]:
## positions of the lines containing "print"
grep("print", robinson)

In [None]:
## lines which mention foot and print (logical mask, one value per line)
grepl("foot", robinson) & grepl("print", robinson)

In [None]:
## the mask selects the matching lines themselves
robinson[grepl("foot", robinson) & grepl("print", robinson)]

In [None]:
## Alternative way of doing this? A single pattern fixes the word order ...
grep("foot.*print", robinson)

In [None]:
## ... so the opposite order has to be searched as well
grep("print.*foot", robinson)

In [None]:
## groups and logical OR cover both orders in one pattern
grep("(print.*foot)|(foot.*print)", robinson)

In [None]:
robinson[grep("(print.*foot)|(foot.*print)", robinson)]

In [None]:
## Positive lookahead assertions, non-consuming: both must succeed from the
## start of the line, in any order. Anchoring with "^" avoids retrying the
## assertions at every character position.
grep("^(?=.*print)(?=.*foot)", robinson, perl = TRUE)

In [None]:
## these are all the same
grep("(print.*foot)|(foot.*print)", robinson, perl = TRUE)
grep("^(?=.*print)(?=.*foot)", robinson, perl = TRUE)
## union() keeps the order in which the positions are encountered, so sort
## the result to get the same ascending positions as above
sort(union(grep(".*print.*foot", robinson), grep(".*foot.*print", robinson)))

## or just use logicals here, often more readable (which() turns the mask
## into positions)
## which(grepl("print.*foot", robinson) | grepl("foot.*print", robinson))

In [None]:
robinson[grep("^(?=.*print)(?=.*foot)", robinson, perl = TRUE)]

In [None]:
## "." matches any single character, so the apostrophe in "man's" is matched
## whichever character the text uses for it
robinson[grep("man.s.*foot", robinson)]

In [None]:
## "\\S" is stricter: any single non-whitespace character
robinson[grep("man\\Ss.*foot", robinson)]

In [None]:
## keep only the first matching line so that `line` is a single position
## that can safely be used in the range below
line <- grep("print of a man.s naked foot", robinson)[1]
line

In [None]:
## print the passage: one line before the match up to 19 lines after it,
## clamped to the bounds of the text
print(robinson[max(1, line - 1):min(length(robinson), line + 19)])

### Faust


In [None]:
## Faust, English: catalogue lookup, download of work 3023, then keep only
## the text column as a character vector
gutenberg_works(
  author == "Goethe, Johann Wolfgang von",
  title == "Faust"
)
faust <- gutenberg_download(
  3023,
  mirror = "http://mirrors.xmission.com/gutenberg/"
)
faust <- faust[["text"]]

In [None]:
## Faust, German: same steps for work 2229; the lookup asks for language "de"
gutenberg_works(
  author == "Goethe, Johann Wolfgang von",
  title == "Faust: Der Tragödie erster Teil",
  languages = "de"
)
faust_de <- gutenberg_download(
  2229,
  mirror = "http://mirrors.xmission.com/gutenberg/"
)
faust_de <- faust_de[["text"]]

In [None]:
## TASK: Find the paragraph with the famous citation where Mephistopheles
##       introduces himself to Faust in his study.
## ...

In [None]:
## Hint: Places and actors are in full caps.
## Hint: Faust asks the devil who he is.

In [None]:
## Hint: The devil speaks in riddles.

In [None]:
## Hint: "part of power", "spirit which denies"
## Hint: "böses will und gutes schafft", "geist der verneint"

In [None]:
## value = TRUE returns the matching lines directly
grep("will.*evil", faust, ignore.case = TRUE, value = TRUE)

In [None]:
grep("work.*good", faust, ignore.case = TRUE, value = TRUE)

In [None]:
## browse a fixed window of lines by eye
print(faust[1000:1200])

In [None]:
## none of these are informative
## faust[grep("will", faust, ignore.case = TRUE)]
## faust[grep("good", faust, ignore.case = TRUE)]
## faust[grep("eternal", faust, ignore.case = TRUE)]
## faust[grep("power", faust, ignore.case = TRUE)]

In [None]:
## Faust's question in modern wording
grep("who.*you", faust, ignore.case = TRUE, value = TRUE)

In [None]:
## if you remember the wording of the question correctly, this is it
grep("who.*thou", faust, ignore.case = TRUE, value = TRUE)

In [None]:
## if you remember the wording of the answer correctly, this is it
grep("spirit.*denies", faust, ignore.case = TRUE, value = TRUE)

In [None]:
## Alternatively, identify the chapter in the study and the speakers (Faust and Mephistopheles)
grep("MEPHISTOPHELES", faust)

In [None]:
grep("STUDY", faust)

In [None]:
## first line mentioning STUDY, and the lines on which each speaker is named
begin_chapter <- grep("STUDY", faust)[1]
devil_speaks <- grep("MEPHISTOPHELES", faust)
faust_speaks <- grep("FAUST", faust)
## any line with two or more consecutive capitals (not used in the cells below)
other_speaks <- grep("[A-Z][A-Z]+", faust)

In [None]:
## keep only the speaker lines after the start of the chapter
devil_speaks_instudy <- devil_speaks[devil_speaks > begin_chapter]
faust_speaks_instudy <- faust_speaks[faust_speaks > begin_chapter]

## drop devil lines up to Faust's first line, then Faust lines up to the
## devil's first remaining line, so every devil line is paired with a later
## Faust line
devil_speaks_instudy <- devil_speaks_instudy[devil_speaks_instudy > min(faust_speaks_instudy)]
faust_speaks_instudy <- faust_speaks_instudy[faust_speaks_instudy > min(devil_speaks_instudy)]

## keep at most the first 10 pairs; indexing with 1:10 would pad with NA when
## fewer lines are available, and the ranges built below would then fail
n_turns <- min(10, length(devil_speaks_instudy), length(faust_speaks_instudy))
devil_speaks_instudy <- devil_speaks_instudy[seq_len(n_turns)]
faust_speaks_instudy <- faust_speaks_instudy[seq_len(n_turns)]

In [None]:
devil_speaks_instudy
faust_speaks_instudy

In [None]:
## for each pair: all lines from the devil's line up to the line before the
## paired Faust line
devil.talks <- unlist(Map(`:`, devil_speaks_instudy, faust_speaks_instudy - 1))

In [None]:
print(faust[devil.talks])

In [None]:
## hard-coded line range; presumably the passage located above, and it
## depends on the downloaded edition -- verify against the output
cat(faust[1996:2021], sep = "\n")

In [None]:
## German (much easier if you remember the citation)
grep("geist.*verneint", faust_de, value = TRUE, ignore.case = TRUE)

In [None]:
## keep only the first matching line so that `line` is a single position,
## then print 10 lines before to 7 lines after it, clamped to the bounds of
## the text
line <- grep("geist.*verneint", faust_de, ignore.case = TRUE)[1]
print(faust_de[max(1, line - 10):min(length(faust_de), line + 7)])