`a data table is in tidy format if each row represents one observation and columns represent the different variables available for each of these observations`

In [6]:
# For the tidyverse packages to be optimally used, data need to be reshaped into tidy format
library(tidyverse)

-- Attaching packages --------------------------------------- tidyverse 1.2.1 --
v ggplot2 3.2.1     v purrr   0.3.3
v tibble  3.0.1     v dplyr   0.8.3
v tidyr   1.0.0     v stringr 1.4.0
v readr   1.3.1     v forcats 0.4.0
"package 'stringr' was built under R version 3.6.3"
-- Conflicts ------------------------------------------ tidyverse_conflicts() --
x dplyr::filter() masks stats::filter()
x dplyr::lag()    masks stats::lag()


In [2]:
library(dslabs)
data("murders")

"package 'dslabs' was built under R version 3.6.3"

In [3]:
head(murders)

state,abb,region,population,total
<chr>,<chr>,<fct>,<dbl>,<dbl>
Alabama,AL,South,4779736,135
Alaska,AK,West,710231,19
Arizona,AZ,West,6392017,232
Arkansas,AR,South,2915918,93
California,CA,West,37253956,1257
Colorado,CO,West,5029196,65


In [8]:
16 %>% sqrt() %>% log2()
# the pipeline above is equivalent to
log2(sqrt(16))

In [9]:
# Remember that the pipe sends values to the first argument, 
# so we can define other arguments as if the first argument is already defined:
16 %>% sqrt() %>% log(base = 2)

In [11]:
# summarizing data
# More informative summaries can often be achieved by first splitting data into groups
# summarize and group_by

In [33]:
# Add two columns to murders: the murder rate per 100,000 residents and each
# state's rank by that rate (rank 1 = highest rate).
murders <- murders %>%
  mutate(
    rate = total / population * 100000,
    rank = rank(-rate)
  )

In [12]:
data(heights)

In [13]:
heights %>% head

sex,height
<fct>,<dbl>
Male,75
Male,70
Male,68
Male,74
Male,61
Female,65


In [15]:
# Mean and standard deviation of height over the rows where sex is "Female".
s <- heights %>% 
  filter(sex == "Female") %>%
  summarize(average = mean(height), standard_deviation = sd(height))  # summarize can also be used without group_by
s

average,standard_deviation
<dbl>,<dbl>
64.93942,3.760656


In [24]:
# Try to put three quantiles into a single summarize column. If that raises an
# error, the handler prints a fixed explanation instead of the raw error.
tryCatch(
  {
    heights %>%
      filter(sex == "Female") %>%
      summarize(range = quantile(height, probs = c(0, 0.5, 1)))
  },
  error = function(cond) {
    message("error occurred")
    message("With the function summarize, we can only call functions that return a single value.")
  }
)

error occurred
With the function summarize, we can only call functions that return a single value.


In [25]:
us_murder_rate <- murders %>%
  summarize(rate = sum(total) / sum(population) * 100000)
us_murder_rate

rate
<dbl>
3.034555


In [26]:
# Like most dplyr functions, summarize always returns a data frame.
# To get just the value, use pull.
us_murder_rate %>% pull(rate)

`A common operation in data exploration is to first split data into groups and then compute summaries for each group`

In [27]:
heights %>% 
  group_by(sex) %>%
  summarize(average = mean(height), standard_deviation = sd(height))

sex,average,standard_deviation
<fct>,<dbl>,<dbl>
Female,64.93942,3.760656
Male,69.31475,3.611024


In [29]:
# sorting
# for ordering entire tables, the dplyr function arrange is useful

In [31]:
murders %>%
  arrange(population) %>%
  head()
# desc
murders %>%
  arrange(desc(population)) %>%
  head()

state,abb,region,population,total
<chr>,<chr>,<fct>,<dbl>,<dbl>
Wyoming,WY,West,563626,5
District of Columbia,DC,South,601723,99
Vermont,VT,Northeast,625741,2
North Dakota,ND,North Central,672591,4
Alaska,AK,West,710231,19
South Dakota,SD,North Central,814180,8


state,abb,region,population,total
<chr>,<chr>,<fct>,<dbl>,<dbl>
California,CA,West,37253956,1257
Texas,TX,South,25145561,805
Florida,FL,South,19687653,669
New York,NY,Northeast,19378102,517
Illinois,IL,North Central,12830632,364
Pennsylvania,PA,Northeast,12702379,457


In [34]:
# nested sorting
murders %>% 
  arrange(region, rate) %>% 
  head()

state,abb,region,population,total,rate,rank
<chr>,<chr>,<fct>,<dbl>,<dbl>,<dbl>,<dbl>
Vermont,VT,Northeast,625741,2,0.3196211,51
New Hampshire,NH,Northeast,1316470,5,0.3798036,50
Maine,ME,Northeast,1328361,11,0.8280881,44
Rhode Island,RI,Northeast,1052567,16,1.5200933,35
Massachusetts,MA,Northeast,6547629,118,1.8021791,32
New York,NY,Northeast,19378102,517,2.6679599,29


In [40]:
# Top N: keep the 5 rows with the highest rate.
# Note: top_n only filters; the rows are not sorted by rate.
murders %>% top_n(5, rate)
# To also sort the result, add arrange.
murders %>% top_n(5, rate) %>% arrange(desc(rate))

state,abb,region,population,total,rate,rank
<chr>,<chr>,<fct>,<dbl>,<dbl>,<dbl>,<dbl>
District of Columbia,DC,South,601723,99,16.452753,1
Louisiana,LA,South,4533372,351,7.742581,2
Maryland,MD,South,5773552,293,5.074866,4
Missouri,MO,North Central,5988927,321,5.359892,3
South Carolina,SC,South,4625364,207,4.475323,5


state,abb,region,population,total,rate,rank
<chr>,<chr>,<fct>,<dbl>,<dbl>,<dbl>,<dbl>
District of Columbia,DC,South,601723,99,16.452753,1
Louisiana,LA,South,4533372,351,7.742581,2
Missouri,MO,North Central,5988927,321,5.359892,3
Maryland,MD,South,5773552,293,5.074866,4
South Carolina,SC,South,4625364,207,4.475323,5


In [42]:
# Tibbles
# Tibbles can hold complex entries such as lists and functions, which data.frame() does not accept directly
tibble(id = c(1, 2, 3), func = c(mean, median, sd))

id,func
<dbl>,<list>
1,"function (x, ...) , UseMethod(""mean"")"
2,"function (x, na.rm = FALSE, ...) , UseMethod(""median"")"
3,"function (x, na.rm = FALSE) , sqrt(var(if (is.vector(x) || is.factor(x)) x else as.double(x), , na.rm = na.rm))"


In [43]:
grades <- data.frame(names = c("John", "Juan", "Jean", "Yao"), 
                     exam_1 = c(95, 80, 90, 85), 
                     exam_2 = c(90, 85, 85, 90))

In [46]:
# Convert the existing data frame to a tibble and display it.
grades <- grades %>% as_tibble()
grades
# Alternatively, build the tibble directly.
grades <- tibble(
  names = c("John", "Juan", "Jean", "Yao"),
  exam_1 = c(95, 80, 90, 85),
  exam_2 = c(90, 85, 85, 90)
)


names,exam_1,exam_2
<fct>,<dbl>,<dbl>
John,95,90
Juan,80,85
Jean,90,85
Yao,85,90


**The dot operator**

`One of the advantages of using the pipe %>% is that we do not have to keep naming new objects as we manipulate the data frame`

In [49]:
rates <-   filter(murders, region == "South") %>% 
  mutate(rate = total / population * 10^5) %>% 
  .$rate
# that is same as
rates <-   filter(murders, region == "South") %>% 
  mutate(rate = total / population * 10^5) %>% 
  pull(rate)

In [50]:
median(rates)

# do function


In [57]:
# Let's revisit the quantile issue encountered earlier: summarize only accepts functions that return a single value.
# Try to solve it without calling quantile inside summarize.

# Tidyverse-style helper: it receives a data frame and returns a data frame.
# dat must have a `height` column; the result is a one-row tibble holding the
# 0, 0.5 and 1 quantiles of that column as min, median and max.
my_summary <- function(dat) {
  height_quantiles <- quantile(dat$height, probs = c(0, 0.5, 1))
  tibble(
    min = height_quantiles[1],
    median = height_quantiles[2],
    max = height_quantiles[3]
  )
}


In [54]:
# This does not produce per-group results: my_summary is not part of the tidyverse and does not know how to handle grouped tibbles, so the grouping is ignored
heights %>% 
  group_by(sex) %>% 
  my_summary

min,median,max
<dbl>,<dbl>,<dbl>
50,68.5,82.67717


In [59]:
# We can use the do() function to connect group_by with my_summary
heights %>% 
  group_by(sex) %>% 
  do(my_summary(.))
# Ta-da! Note how the dot operator comes into play: it passes the piped-in data to my_summary

sex,min,median,max
<fct>,<dbl>,<dbl>,<dbl>
Female,51,64.98031,79.0
Male,50,69.0,82.67717


In [60]:
# What I see here is a strong set of use cases for do(); I will think more about those use cases.

# The purrr package contains map, map2, pmap, modify, modify_if, map_df, map_dbl, map_chr, etc.
<a href = "3-Programming-basics.ipynb" target = new>Example codes</a>

`A particularly useful purrr function for interacting with the rest of the tidyverse is map_df, which always returns a tibble data frame. However, the function being called needs to return a vector or a list with names`

In [61]:
# Sum of the integers 1..n, returned as a one-row tibble with a single `sum`
# column.
# n: a non-negative whole number.
compute_s_n <- function(n) {
  # seq_len(n) is empty when n is 0, whereas 1:n would count down as c(1, 0)
  # and report a sum of 1 instead of 0.
  x <- seq_len(n)
  tibble(sum = sum(x))
}
n <- 1:25

In [66]:
s_n <- map_df(n, compute_s_n)
# the same computation written inline with purrr's formula shorthand (.x is the current element)
map_df(n, ~{
    tibble(sum=sum(1:.x))
})


sum
<int>
1
3
6
10
15
21
28
36
45
55


In [63]:
s_n

sum
<int>
1
3
6
10
15
21
28
36
45
55


In [65]:
map(n, ~{
    sum(1:.x)
})

### Tidyverse conditionals


In [None]:
#one example is using ifelse()

`case_when function is useful for vectorizing conditional statements. It is similar to ifelse but can output any number of values, as opposed to just TRUE or FALSE`

In [67]:
x <- c(-2, -1, 0, 1, 2)
case_when(x < 0 ~ "Negative", 
          x > 0 ~ "Positive", 
          TRUE  ~ "Zero")

In [70]:
# Not correct: ifelse has only two outcomes, so 0 is labelled 'Positive'. Use case_when instead.
ifelse(x<0, 'Negative', 'Positive')

In [72]:
# case when similar to switch, but good for vectorised inputs or dataframes

# A common use for this case_when is to define categorical variables based on existing variables

In [76]:
# compare the murder rates in four groups of states: New England, West Coast, South, and other.
murders %>% 
  mutate(group = case_when(
    abb %in% c("ME", "NH", "VT", "MA", "RI", "CT") ~ "New England",
    abb %in% c("WA", "OR", "CA") ~ "West Coast",
    region == "South" ~ "South",
    TRUE ~ "Other")) %>%
  group_by(group) %>%
  summarize(rate = sum(total) / sum(population) * 10^5) 

group,rate
<chr>,<dbl>
New England,1.723796
Other,2.708144
South,3.626558
West Coast,2.899001


In [77]:
# between
# A common operation in data analysis is to determine if a value falls inside an interval


In [81]:
between(x, a, b) # equivalent to x >= a & x <= b; good for the tidyverse approach (a and b are placeholders, so this line fails as written)

ERROR: Error in between(x, a, b): object 'a' not found


In [82]:
murders %>% head

state,abb,region,population,total,rate,rank
<chr>,<chr>,<fct>,<dbl>,<dbl>,<dbl>,<dbl>
Alabama,AL,South,4779736,135,2.824424,23
Alaska,AK,West,710231,19,2.675186,27
Arizona,AZ,West,6392017,232,3.629527,10
Arkansas,AR,South,2915918,93,3.18939,17
California,CA,West,37253956,1257,3.374138,14
Colorado,CO,West,5029196,65,1.292453,38


In [89]:
# check the rate between interval 1 - 4
murders %>% filter(between(rate, 1, 4)) %>% head # here filter needs to be used

state,abb,region,population,total,rate,rank
<chr>,<chr>,<fct>,<dbl>,<dbl>,<dbl>,<dbl>
Alabama,AL,South,4779736,135,2.824424,23
Alaska,AK,West,710231,19,2.675186,27
Arizona,AZ,West,6392017,232,3.629527,10
Arkansas,AR,South,2915918,93,3.18939,17
California,CA,West,37253956,1257,3.374138,14
Colorado,CO,West,5029196,65,1.292453,38


In [85]:
x <- c(1,2,3,4,5,6)
between(x, 1, 3)