# Methods for getting data from internet

## Using Excel 
Using the 'Data' tab in Excel's main menu, we can import tables directly from websites. This is especially useful when we want to extract a table from a site such as Wikipedia.

## Web scraping 
The R package 'rvest' provides a simple way to do web scraping. Suppose we want to extract the university rankings from this [web page](http://www.shanghairanking.com/ARWU2020.html). We first open the page in a browser.

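- On the page, right-click and choose 'Inspect' (the entry that opens the browser's developer tools; its exact name depends on the browser and its language).
- Press 'Ctrl+Shift+C', then move the cursor over the page to see the HTML tag of the content you want to extract. It turns out that each university name in the ranking list sits inside an 'a' tag.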
![webscraping](webscraping.png)
- Now we can start to do webscraping. 

In [86]:
library(rvest)
library(dplyr)
# Scrape the ARWU 2020 page and keep the link texts that name a university.
url <- "http://www.shanghairanking.com/ARWU2020.html"

# Text of every <a> node found on the downloaded page.
anchor_text <- html_text(html_nodes(read_html(url), "a"))

# One-column data frame, restricted to the rows whose text contains the
# literal substring "University" (fixed = TRUE switches regex matching off).
university_ranking <- filter(
  data.frame(unit = anchor_text),
  grepl("University", unit, fixed = TRUE)
)

head(university_ranking)

unit
Harvard University
Stanford University
University of Cambridge
"University of California, Berkeley"
Princeton University
Columbia University


# Data Cleaning using Regular Expression 
I only list some of the most frequently used regular expressions. Getting comfortable with the techniques below should be enough for most everyday tasks.

In [15]:
library(htmlwidgets)
library(stringr)

"package 'htmlwidgets' was built under R version 3.6.3"

## Indicating start/end

In [87]:
# Anchors: "^" pins the match to the start of the string, "$" pins it to
# the end, and "." stands for any single character.

a <- c("xerotic", "funny", "you", "understand", "york", "believe", "exile")
grepl(pattern = "^y", x = a) # strings that start with "y"
grepl(pattern = "x$", x = a) # strings that end with "x"
grepl(pattern = "^...$", x = a) # strings made of exactly three characters
grepl(pattern = ".....", x = a) # strings with five or more characters (unanchored)


## Boundary

In [None]:
# `a` still holds the value assigned before this cell. TRUE for strings whose
# first and last characters are the same lowercase letter: "\\1" matches again
# whatever the first parenthesised group captured.
grepl(pattern = "^([a-z])(.){0,}\\1$", x = a)

a <- "we can see you in the dark, and you cannot excape."
# "\\b" marks a word boundary, so each match has to cover a whole word.
str_view_all(string = a, pattern = "\\b([a-z]){3,3}\\b") # words with exactly three lowercase letters
str_view_all(string = a, pattern = "\\b([a-zA-Z]){3,}\\b") # words with three or more letters

## Or and Not Operation

In [11]:
# Character classes and alternation:
#   "[...]"  matches one character from the set,
#   "[^...]" matches one character NOT in the set,
#   "(x|y)"  matches either alternative.
b <- c("1a3sd", "23alp390jfa93", "p3209f34ph8", "39jefawoi30r")
# These calls test `b`, the vector defined just above.
grepl("[abc][jkl]", b) # contains one of a/b/c directly followed by one of j/k/l
grepl("[^abc]", b) # contains at least one character other than a, b, c
grepl("[0-9]", b) # contains at least one digit
grepl("a(3|9)s", b) # contains "a3s" or "a9s"

a <- c("indeed", "you", "are", "so", "sepcial", "and", "gifted")
grepl("^[aiueo]", a) # words starting with a vowel
grepl("[^e]ed$", a) # words ending in "ed" but not in "eed"
grepl("i(ng|ze)", a) # words containing "ing" or "ize"; append "$" to require it at the end

a <- c("1", "12", "123", "1234")
grepl("^\\d{3}$", a) # strings made of exactly three digits

a <- c("1988-04-12", "04-12-88", "12-04-1988")
grepl("\\d{2}-\\d{2}-\\d{4}", a) # "\\d" is any digit; matches the dd-dd-dddd layout

## Repetition and Backreference

In [15]:
# Quantifiers repeat the preceding item: "*" zero or more times, "{3}" exactly
# three times, "{2,}" two or more times. Here the repeats must be consecutive.
a <- c("rhyzmn", "strike", "hair", "fake")
grepl(pattern = "^[^aiueo]*[^aiueo]$", x = a) # words without any vowel
grepl(pattern = "^[^aiueo]{3}", x = a) # words whose first three letters are all non-vowels
grepl(pattern = "[aiueo]{3,}", x = a) # words with three or more consecutive vowels
grepl(pattern = "([aiueo][^aiueo]){2,}", x = a) # two or more consecutive vowel/non-vowel pairs

# Repeats that need not be adjacent, for a character named in the pattern.
a <- c("bible", "bad")
grepl(pattern = "b.*b", x = a) # "b" occurs at least twice

# Backreferences: "\\1", "\\2", ... match again whatever the corresponding
# parenthesised group captured, so the repeated text need not be spelled out.
# This is a compact way to describe many patterns.

a <- c("poop", "aggregate", "abc is not cba", "dad", "abracadabra")

grepl(pattern = "(.)(.)\\2\\1", x = a) # an "xyyx" sequence
grepl(pattern = "(.).*\\1.*\\1", x = a) # some character that occurs at least three times
grepl(pattern = "(.)(.)(.).*\\3\\2\\1", x = a) # "xyz" ... "zyx"

grepl(pattern = "^(.).*\\1$", x = a) # first and last characters are identical
grepl(pattern = "(..).*\\1", x = a) # a two-character sequence that occurs twice


## Greedy and Lazy

In [33]:
# Greedy versus lazy quantifiers.
# Goal: pick out "(1st)" and "(2nd)" separately from the string below.
str_view_all(string = "(1st) other (2nd)", pattern = "\\(.+\\)") # Run this! greedy: ".+" takes the longest possible match
str_view_all(string = "(1st) other (2nd)", pattern = "\\(.+?\\)") # Run this! lazy: ".+?" stops at the first ")"

a <- "is there any island in Atlantis, New York, or Tokyo?"

str_view_all(string = a, pattern = "\\b[A-Z].*?\\b") # lazy: each capitalised word is matched on its own
str_view_all(string = a, pattern = "\\b[A-Z].*\\b") # greedy (wrong here): one match running to the last word boundary

## Some applications 

### Subset

In [None]:
# With the regex basics covered, put them to work on `words`
# (presumably the sample word vector bundled with stringr -- it is not
# defined in this notebook).
sum(str_detect(string = words, pattern = "^t")) # number of words beginning with "t"
words[str_detect(string = words, pattern = "^[^aiueo]{1,}$")] # the words that contain no vowel
str_subset(string = words, pattern = "^[^aiueo]{1,}$") # same words, selected by str_subset()
sum(grepl(pattern = "^[^aiueo]{1,}$", x = words)) # how many such words there are

str_subset(string = words, pattern = "(^x|x$)") # words that start or end with "x"
str_subset(string = words, pattern = "^[aiueo].*[^aiueo]$") # words that start with a vowel and end with a non-vowel
max(str_count(string = words, pattern = "[^aiueo]")) # largest number of non-vowel characters in a single word



### Extract

In [54]:
# str_extract_all() returns every piece of text that matches the pattern.
a <- c("I like swimming and dancing", "you like jogging")
# First word of each string: the lazy "{1,}?" stops at the first word boundary.
str_extract_all(a, "^(.){1,}?\\b")
# Words ending in "ing". "[A-Za-z]" is used rather than "[A-z]", because the
# range A-z also covers the punctuation that sits between "Z" and "a".
str_extract_all(a, "\\b[A-Za-z]*ing\\b")

a <- "the cat is under the desk"
# An article followed by the next word. The leading "\\b" stops "a"/"the"
# from matching at the tail of a longer word.
str_extract_all(a, "\\b(a|the)\\s[a-z]*")

# Build "\\b(one|two|three, either case)\\s<word>" out of its alternatives.
a <- paste(c("((o|O)ne)", "((t|T)wo)", "((t|T)hree)"), collapse = "|") %>%
  paste0("\\b(", ., ")\\s[a-z]*")
b <- "one banana is on two apples"
str_extract_all(b, a) # finds "one banana" and "two apples"

st <- list("amber johnson", "anhar link ari")
str_extract_all(st, "\\ba[a-z]+") # all the words starting with "a"
# Same matches, but str_match_all() returns one character matrix per input
# (full match plus one column per capture group) instead of a plain vector.
str_match_all(st, "\\ba[a-z]+")


0
amber

0
anhar
ari


### Count

In [43]:
# str_count() reports, for each input string, how many times the pattern
# matches; matches are counted without overlap.
str_count(string = "abababa", pattern = "aba")
a <- "we can see you in the dark, and you cannot excape."
str_count(string = a, pattern = "\\b([a-z]){3,3}\\b") # number of words with exactly three letters
a <- "is there any island in Atlantis, New York, or Tokyo?"
str_count(string = a, pattern = "\\bis\\b") # number of times the whole word "is" occurs
a <- c("123 plus 456 is 579, not 88", "you are 12 years old, not 9 years old")
str_count(string = a, pattern = "\\d{1,}") # number of digit runs (numbers) in each string

### Replace & Remove 

In [38]:
# Replace every vowel with a dash.
x <- c("apple", "pear", "banana")
str_replace_all(string = x, pattern = "[aeiou]", replacement = "-")
# Several replacements at once: a named vector whose names are the patterns
# and whose values are the replacements.
x <- c("1 house", "2 cars", "3 people")
str_replace_all(string = x, pattern = c("1" = "one", "2" = "two", "3" = "three"))
x <- c("1204", "01204", "001204B")
str_replace(string = x, pattern = "^[0]{1,}", replacement = "") # strip the leading zeros

# str_remove() deletes the first match of the pattern.
str_remove(string = "tokyo_intel_fortran", pattern = "_[^_]*$") # drops the last "_" part: "_fortran"
str_remove(string = "tokyo_intel_fortran", pattern = "_.*$") # drops everything from the first "_": "_intel_fortran"
str_remove(string = c("tokyo.bunkyo.x", "osaka.y", "nagoya"), pattern = "\\.[^\\.]*$") # drops the trailing ".x" / ".y"

### Swap

In [40]:
# Swapping pieces of a string: capture them with groups, then write the
# groups back in a different order in the replacement.

# Swap the second and third word of each sentence. Each "([^ ]+)"-style
# group is a run of one or more non-space characters, i.e. one word.
head(
  str_replace(
    string = sentences,
    pattern = "([^\\s]+)\\s([^ ]+)\\s([^ ]+)",
    replacement = "\\1 \\3 \\2"
  ),
  5
)

# Turn yyyy-mm-dd into mm-dd-yyyy; strings in another layout are left as is.
a <- c("1988-04-12", "04-12-1998", "12-04-1988")
str_replace(string = a, pattern = "(\\d{4})-(\\d{2})-(\\d{2})", replacement = "\\2-\\3-\\1")

a <- c("drug", "food", "great")
str_replace(string = a, pattern = "^([a-z])(.*)([a-z])$", replacement = "\\3\\2\\1") # "x..y" becomes "y..x"

x <- c("1978", "2017", "2035")
str_replace(string = x, pattern = "^\\d{2}", replacement = "") # drop the first two digits, keeping the last two of each year

###  Split

In [45]:
# Split a string on a literal "|" (escaped, because a bare "|" means "or").
# n = 2 caps the result at two pieces, so the remainder stays in one piece.
str_split(string = "a|b|c|d", pattern = "\\|", n = 2)

### Locate


In [44]:
# Start and end positions of every occurrence of "all" in each string.
x <- c("A falling ball", "Phone call.")
str_locate_all(string = x, pattern = "all")

start,end
4,6
12,14

start,end
8,10


###  Other examples: Identify the email address

In [88]:
# E-mail address: a local part, "@", then a domain made of at least two
# dot-separated labels. The pattern is anchored with "^" and "$" so the whole
# string has to be an address; an unanchored pattern would also accept
# fragments such as "abc@gmail.".
email_pattern <- "^[A-Za-z0-9._%+-]+@[A-Za-z0-9-]+(\\.[A-Za-z0-9-]+)+$"
a <- c("abc@gmail.com", "youtube.com", "bdc2@yahoo.co.jp")
grepl(email_pattern, a) # identify the e-mail addresses

# Web address:
# - it starts with "www.", "http://", "https://", or nothing, hence
#   ^(www\\.|https{0,1}://){0,1}
# - it contains no "@" anywhere and at least one ".", hence [^@]{1,}\\.[^@]{1,}$
#   The closing "[^@]{1,}$" matters: without it, an e-mail address whose
#   local part contains a dot (e.g. "john.doe@gmail.com") would pass as well.
web_pattern <- "^(www\\.|https{0,1}://){0,1}[^@]{1,}\\.[^@]{1,}$"
a <- c("abc@gmail.com", "www.youtube.com", "bdc2@yahoo.co.jp", "https://yahoo.co.jp", "google.com")
grepl(web_pattern, a) # identify the web addresses
# Note: str_extract() returns the matched text, while str_subset() returns
# the whole string that contains a match.