From 094fd0272fc703a866defa2e88afbccd11922f55 Mon Sep 17 00:00:00 2001 From: jrnold Date: Mon, 9 Jan 2017 09:23:50 -0800 Subject: [PATCH] Working on functions chapter --- _bookdown.yml | 12 +- _main.rds | Bin 995 -> 1123 bytes communicate.Rmd | 3 + docs/communicate-intro.html | 403 +++++++++++++++++++++++++++ docs/communicate.md | 4 + docs/data-import.html | 332 ++++++++++++----------- docs/data-transformation.html | 342 +++++++++++------------ docs/dates-and-times.html | 349 ++++++++++++------------ docs/exploratory-data-analysis.html | 346 ++++++++++++------------ docs/explore-intro.html | 404 +++++++++++++++++++++++++++ docs/explore.md | 6 + docs/factors.html | 336 ++++++++++++----------- docs/index.html | 325 +++++++++++----------- docs/iteration.html | 389 +++++++++++++------------- docs/iteration.md | 30 +- docs/model-basics.html | 351 ++++++++++++------------ docs/model-intro.html | 406 ++++++++++++++++++++++++++++ docs/model.md | 4 +- docs/pipes.html | 404 +++++++++++++++++++++++++++ docs/pipes.md | 4 + docs/program-intro.html | 403 +++++++++++++++++++++++++++ docs/program.md | 4 + docs/r-markdown-formats.html | 405 +++++++++++++++++++++++++++ docs/r-markdown-workflow.html | 324 +++++++++++----------- docs/r-markdown.html | 334 ++++++++++++----------- docs/relational-data.html | 338 ++++++++++++----------- docs/rmarkdown-formats.md | 6 + docs/search_index.json | 39 +-- docs/strings.html | 370 +++++++++++++------------ docs/tibbles.html | 332 ++++++++++++----------- docs/tidy-data.html | 344 +++++++++++------------ docs/vectors.html | 340 ++++++++++++----------- docs/visualize.html | 350 ++++++++++++------------ docs/workflow-basics.html | 324 +++++++++++----------- docs/wrangle.md | 3 + explore.Rmd | 5 + functions.Rmd | 250 +++++++++++++++++ iteration.Rmd | 18 +- model.Rmd | 4 +- pipes.Rmd | 3 + program.Rmd | 3 + rmarkdown-formats.Rmd | 5 + wrangle.Rmd | 2 + 43 files changed, 5785 insertions(+), 2871 deletions(-) create mode 100644 communicate.Rmd create mode 100644 docs/communicate-intro.html create mode 100644 docs/communicate.md create mode 100644 docs/explore-intro.html create mode 100644 docs/explore.md create mode 100644 docs/model-intro.html create mode 100644 docs/pipes.html create mode 100644 docs/pipes.md create mode 100644 docs/program-intro.html create mode 100644 docs/program.md create mode 100644 docs/r-markdown-formats.html create mode 100644 docs/rmarkdown-formats.md create mode 100644 docs/wrangle.md create mode 100644 explore.Rmd create mode 100644 functions.Rmd create mode 100644 pipes.Rmd create mode 100644 program.Rmd create mode 100644 rmarkdown-formats.Rmd create mode 100644 wrangle.Rmd diff --git a/_bookdown.yml b/_bookdown.yml index 17c3389d..770f6bef 100644 --- a/_bookdown.yml +++ b/_bookdown.yml @@ -5,7 +5,7 @@ rmd_files: [ "index.rmd", # "intro.Rmd", -# "explore.Rmd", + "explore.Rmd", "visualize.Rmd", "workflow-basics.Rmd", "transform.Rmd", @@ -13,7 +13,7 @@ rmd_files: [ "EDA.Rmd", # "workflow-projects.Rmd", -# "wrangle.Rmd", + "wrangle.Rmd", "tibble.Rmd", "import.Rmd", "tidy.Rmd", @@ -22,8 +22,8 @@ rmd_files: [ "factors.Rmd", "datetimes.Rmd", -# "program.Rmd", -# "pipes.Rmd", + "program.Rmd", + "pipes.Rmd", # "functions.Rmd", "vectors.Rmd", "iteration.Rmd", @@ -33,10 +33,10 @@ rmd_files: [ #"model-building.Rmd", #"model-many.Rmd", - #"communicate.Rmd", + "communicate.Rmd", "rmarkdown.Rmd", #"communicate-plots.Rmd", - #"rmarkdown-formats.Rmd", + "rmarkdown-formats.Rmd", "rmarkdown-workflow.Rmd", ] diff --git a/_main.rds b/_main.rds index 
1e57351d7db51e2f66e1f862e733d7dd80073ea9..ae9213f6a62f0dad8676e0df7157f6f866cfc9f0 100644 GIT binary patch literal 1123 zcmb2|=3oE==C^aai!U1p9KWk_K~2|p;U0S?*S_O5@_FT-%7XQELNr zb6M9v`INg>{7=z0qlNZip66axnOkq)_T#~|i!)d8L}$vryJoc`?NOJ?o8|4doaZLL zmSn!$)!#KIUT^Xqi%5ZkN*njSGd8~*w%_I3wVcHPM34aZBoAC zw!5`VwL5>?bjuaXZ!7t(d48*SmcYqRJu2#k6OTN;wR`5dt>x|2dYu#Iq&}CH+Zpa? zu*UBA+HI$9?4OVswBg4buZKLm{^|MA#&-+b?AqtwXcDdyu6v@g^lGu@p`SS`_qD`p zh(GOf{gL)fZBFpS_>Uc32AP|AE^g4^I$rob^{LUjJU8!Y7nVKwbzH{URacMb}f49t=gS8YmwQeJK8Vj#~Mdje5_u+ z`TFL|d{g583#1l5-qE^6hI{VecZXgqvI#Ifqm@yIxGK{>GqDB#6sD%ID;f5s zaxcPa_RK2@Dt)MY>G6{IJ7>3+Z%B0d?lRJ?ib^fJk7X=}EhEq?mX zcfaSlXwFsJCKT6BEPd`KYf~nzCH1-VytC|`wNWCk*8g5Lc>$mAt!edcOWvMcu3TNJ zY?~~eS8wg?xVNNdLY28h_0?~=XM*)sJ3l@q&}(^Wd(YM_TO~SsxnCTZ__gEJk1I7z iJ+5C>uOCz|e?M2QX4gZdGqb%vNbhq_?|m1-zyJUdbuJbF literal 995 zcmb2|=3oE==C^ZB=G`_BXnU&K`&OrHlAeS7!iKdhJD)@?c)Rk&!S-$D&C_1Y=$+P( z@@5zR{Say2iCbB?Un&NlIT&bj_~QA8>VNm0{oeT4B6s=aOzXFPzuvokdTz6`Z&LQ@ zmsWqD+)AG4U+?F1zi`*vpgr~ChL+qbRZdRda$%#`oDCj9azXP2Jza$)QnPNSR9*if z_i}CTgXWk!DOKn4v-OW_tI2sBsJr;KE~s8uC8ML{%e7|ZiSq5y88XSx7mDl zUWIvLtjoH(u5oPqVE1fOV?ba6r|8A3h0Z7coc(8X{AN+GO8-@xt~qQjn=F(L-cX-p zQX%rz>viCk%U7!8IPdS6Z^4=QYaK_qRpbi&cj2d8Hzhxe^IvyYQ}iSA_Z0=ri&m+;T6aDB^h~}* zPt3mM?cx@@u;=NU;B#}Qy)*x*=(YON`2`sZrk*aj^6~rCxxtoS=dAhEyINV`PVj0Y zZQXnJ4-3omzb?#Y(wa79l}EW}=OS~3AfbH8OCG7eRXEZz%@4i1&E2u#x7@P5XKSkb zg+KT1@x6D&Z`$6Le>q2A-0G5#@mr@hT~uSgtzhcEBZbVFlZw|KUUw*Cu}zq*k4+%k z&pV1`Z{4H|q*u;6{`rZxnWgXS`=Px1U+#atUES)i+qN58@%GQ{RlfJ!*(|7cqwS|i z(u;`(I~Te;skX~;_DWQ#_~gBq7;q9Opxh?MIa#_&C8X%Z#0wn@-48th3QU!$QZb>c z7AA%5#Zngo&I%YU7sx$qdbwTf$YjHOgO=m+wQ74}r+&Sc(d9GlF0_fqGq zcVSyH%U2zG<=2|MD|ynVoM*w!?@D^F&h-n?{&wib@`L}f)PGiGJH*JX36o6YmO7tR zojGOZth9B{JSshY%#~k1TW00DjeQ~Nt3=~-#e1(B?EJg*{yPn`?;EmPH%cC!d2VX( z6XQQS(v}^6 + + + + + + + Exercise Solutions and Notes for R for Data Science + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
19 Introduction
diff --git a/docs/communicate.md b/docs/communicate.md
new file mode 100644
index 00000000..d5dfdb76
--- /dev/null
+++ b/docs/communicate.md
@@ -0,0 +1,4 @@
+
+# (PART) Communicate {-}
+
+# Introduction {#communicate-intro}
diff --git a/docs/data-import.html b/docs/data-import.html
index 26ec1af5..f8ed4ca0 100644
--- a/docs/data-import.html
+++ b/docs/data-import.html
@@ -108,218 +108,228 @@
[sidebar navigation diff omitted]
diff --git a/docs/iteration.html b/docs/iteration.html
index 613b431c..ed7bb79c 100644
--- a/docs/iteration.html
+++ b/docs/iteration.html
@@ -108,218 +108,228 @@
[sidebar navigation diff omitted]

    13.3 For loop variations

    +

    16.3 For loop variations

    -

    13.3.1

    +

    16.3.1

    Ex Imagine you have a directory full of CSV files that you want to read in. You have their paths in a vector, files <- dir("data/", pattern = "\\.csv$", full.names = TRUE), and now want to read each one with read_csv(). Write the for loop that will load them into a single data frame.

I pre-allocate a list and read each file into an element of that list as a data frame, creating a list of data frames. I then use bind_rows() to combine them into a single data frame.

    -
    df <- vector("list", lenght(files))
    +
    df <- vector("list", length(files))
 for (i in seq_along(files)) {
   df[[i]] <- read_csv(files[[i]])
 }
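 # combine the list of data frames into one data frame, as described above
 df <- bind_rows(df)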
    @@ -802,8 +813,8 @@ 

    13.3.1

am = function(x) {
   factor(x, labels = c("auto", "manual"))
 }
-)
-for (var in names(trans)) {
+)
    +
    for (var in names(trans)) {
       mtcars[[var]] <- trans[[var]](mtcars[[var]])
     }

    This code mutates the disp and am columns:

    @@ -813,18 +824,13 @@

    13.3.1

The code works by looping over a named list of functions. For each name, it calls the corresponding function on the mtcars column of the same name and replaces that column’s values with the result.

    E.g. this is a function:

    -
    trans[["disp"]]
    -#> function(x) x * 0.0163871
    +
    trans[["disp"]]

This applies the function to the column of mtcars with the same name:

    -
    trans[["disp"]](mtcars[["disp"]])
    -#>  [1] 0.0430 0.0430 0.0290 0.0693 0.0967 0.0604 0.0967 0.0394 0.0378 0.0450
    -#> [11] 0.0450 0.0741 0.0741 0.0741 0.1267 0.1235 0.1182 0.0211 0.0203 0.0191
    -#> [21] 0.0323 0.0854 0.0816 0.0940 0.1074 0.0212 0.0323 0.0255 0.0943 0.0389
    -#> [31] 0.0808 0.0325
    +
    trans[["disp"]](mtcars[["disp"]])
    -

    13.4 For loops vs. functionals

    +

    16.4 For loops vs. functionals

    col_summary <- function(df, fun) {
       out <- vector("double", length(df))
       for (i in seq_along(df)) {
    @@ -833,7 +839,7 @@ 

    13.4 For loops vs. functionals

    -

    13.4.1 Exercises

    +

    16.4.1 Exercises

Ex. 21.4.1.1 Read the documentation for apply(). In the 2d case, what two for loops does it generalise?

    It generalises looping over the rows or columns of a matrix or data-frame.
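For example, a quick illustration of my own (not from the original answer): with MARGIN = 1, apply() replaces a loop over rows; with MARGIN = 2, a loop over columns.

X <- matrix(c(1, 2, 3, 4, 5, 6), nrow = 2)

# loop over rows, generalised by apply(X, 1, ...)
row_sums <- vector("double", nrow(X))
for (i in seq_len(nrow(X))) {
  row_sums[[i]] <- sum(X[i, ])
}
identical(row_sums, apply(X, 1, sum))
#> [1] TRUE

# loop over columns, generalised by apply(X, 2, ...)
col_sums <- vector("double", ncol(X))
for (j in seq_len(ncol(X))) {
  col_sums[[j]] <- sum(X[, j])
}
identical(col_sums, apply(X, 2, sum))
#> [1] TRUE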

Ex. 21.4.1.2 Adapt col_summary() so that it only applies to numeric columns. You might want to start with an is_numeric() function that returns a logical vector that has a TRUE corresponding to each numeric column.
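The rest of this answer is cut off by the diff hunk; below is a minimal sketch of one approach (my reconstruction, reusing the col_summary() pattern shown above):

col_summary2 <- function(df, fun) {
  # find the numeric columns, as the suggested is_numeric() helper would
  numeric_cols <- vector("logical", length(df))
  for (i in seq_along(df)) {
    numeric_cols[[i]] <- is.numeric(df[[i]])
  }
  df <- df[, numeric_cols, drop = FALSE]
  # then summarise each remaining column exactly as col_summary() does
  out <- vector("double", length(df))
  for (i in seq_along(df)) {
    out[[i]] <- fun(df[[i]])
  }
  names(out) <- names(df)
  out
}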

    @@ -865,13 +871,13 @@

    13.4.1 Exercises

    -

    13.5 The map functions

    +

    16.5 The map functions

    -

    13.5.1 Shortcuts

    +

    16.5.1 Shortcuts

    Notes The lm() function runs a linear regression. It is covered in the Model Basics chapter.
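For example (a minimal illustration of my own):

lm(mpg ~ wt, data = mtcars)
#> 
#> Call:
#> lm(formula = mpg ~ wt, data = mtcars)
#> 
#> Coefficients:
#> (Intercept)           wt  
#>      37.285       -5.344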

    -

    13.5.2 Exercises

    +

    16.5.2 Exercises

    Ex Write code that uses one of the map functions to:

    1. Compute the mean of every column in `mtcars`.
     1. Determine the type of each column in `nycflights13::flights`.
    @@ -879,10 +885,8 @@ 

    13.5.2 Exercises

    1. Generate 10 random normals for each of $\mu = -10$, $0$, $10$, and $100$.

    The mean of every column in mtcars:

    map_dbl(mtcars, mean)
    -#> Warning in mean.default(.x[[i]], ...): argument is not numeric or logical:
    -#> returning NA
     #>     mpg     cyl    disp      hp    drat      wt    qsec      vs      am 
    -#>  20.091   6.188   3.781 146.688   3.597   3.217  17.849   0.438      NA 
    +#>  20.091   6.188 230.722 146.688   3.597   3.217  17.849   0.438   0.406 
     #>    gear    carb 
     #>   3.688   2.812

    The type of every column in nycflights13::flights.
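The code for this answer is truncated by the diff hunk; a sketch of what it would look like (my reconstruction):

map_chr(nycflights13::flights, typeof)

# and, for the last item, ten random normals for each mean:
map(c(-10, 0, 10, 100), ~ rnorm(10, mean = .x))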

    @@ -974,7 +978,7 @@

    13.5.2 Exercises

    Use map_lgl with the function is.factor,

    map_lgl(mtcars, is.factor)
     #>   mpg   cyl  disp    hp  drat    wt  qsec    vs    am  gear  carb 
    -#> FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE
    +#> FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE

    Ex What happens when you use the map functions on vectors that aren’t lists? What does map(1:5, runif) do? Why?

The map function applies the function to each element of the vector, passing each element as the function’s first argument. So map(1:5, runif) calls runif(1), runif(2), …, runif(5) and returns a list of five numeric vectors with lengths one through five.

    map(1:5, runif)
    @@ -1030,18 +1034,18 @@ 

    13.5.2 Exercises

    -

    13.6 Dealing with Failure

    +

    16.6 Dealing with Failure

    -

    13.7 Mapping over multiple arguments

    +

    16.7 Mapping over multiple arguments

    -

    13.8 Walk

    +

    16.8 Walk

    -

    13.9 Other patterns of for loops

    +

    16.9 Other patterns of for loops

    -

    13.9.1 Exercises

    +

    16.9.1 Exercises

    Ex Implement your own version of every() using a for loop. Compare it with purrr::every(). What does purrr’s version do that your version doesn’t?

    # Use ... to pass arguments to the function
     every2 <- function(.x, .p, ...) {
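   # The remainder of the definition is cut off by the diff hunk; a plausible
   # completion (my sketch, not the original): return FALSE as soon as the
   # predicate fails for any element, otherwise return TRUE.
   for (i in .x) {
     if (!.p(i, ...)) {
       return(FALSE)
     }
   }
   TRUE
 }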
    @@ -1110,13 +1114,16 @@ 

    13.9.1 Exercises

    + + +
    - + diff --git a/docs/iteration.md b/docs/iteration.md index ec748b56..62e56d53 100644 --- a/docs/iteration.md +++ b/docs/iteration.md @@ -25,6 +25,12 @@ library("tidyverse") library("stringr") ``` +The package **microbenchmark** is used for timing code + +```r +library("microbenchmark") +``` + ## For Loops @@ -434,7 +440,6 @@ Microbenchmark will run an R expression a number of times and time it. Define a function that appends to an integer vector. ```r -library("microbenchmark") add_to_vector <- function(n) { output <- vector("integer", 0) for (i in seq_len(n)) { @@ -445,7 +450,7 @@ add_to_vector <- function(n) { microbenchmark(add_to_vector(10000), times = 3) #> Unit: milliseconds #> expr min lq mean median uq max neval -#> add_to_vector(10000) 156 158 172 159 180 201 3 +#> add_to_vector(10000) 185 196 201 206 209 211 3 ``` And one that pre-allocates it. @@ -460,8 +465,8 @@ add_to_vector_2 <- function(n) { } microbenchmark(add_to_vector_2(10000), times = 3) #> Unit: milliseconds -#> expr min lq mean median uq max neval -#> add_to_vector_2(10000) 7.1 7.25 7.33 7.39 7.44 7.49 3 +#> expr min lq mean median uq max neval +#> add_to_vector_2(10000) 7.05 7.14 8.02 7.23 8.5 9.77 3 ``` The pre-allocated vector is about **100** times faster! @@ -484,7 +489,7 @@ This creates a list of data frames. I then use `bind_rows` to create a single data frame from the list of data frames. ```r -df <- vector("list", lenght(files)) +df <- vector("list", length(files)) for (fname in seq_along(files)) { df[[i]] <- read_csv(files[[i]]) } @@ -607,6 +612,9 @@ trans <- list( factor(x, labels = c("auto", "manual")) } ) +``` + +```r for (var in names(trans)) { mtcars[[var]] <- trans[[var]](mtcars[[var]]) } @@ -624,21 +632,15 @@ E.g. this is a function: ```r trans[["disp"]] -#> function(x) x * 0.0163871 ``` This applies the function to the column of `mtcars` with the same name ```r trans[["disp"]](mtcars[["disp"]]) -#> [1] 0.0430 0.0430 0.0290 0.0693 0.0967 0.0604 0.0967 0.0394 0.0378 0.0450 -#> [11] 0.0450 0.0741 0.0741 0.0741 0.1267 0.1235 0.1182 0.0211 0.0203 0.0191 -#> [21] 0.0323 0.0854 0.0816 0.0940 0.1074 0.0212 0.0323 0.0255 0.0943 0.0389 -#> [31] 0.0808 0.0325 ``` - ## For loops vs. functionals @@ -714,10 +716,8 @@ The mean of every column in `mtcars`: ```r map_dbl(mtcars, mean) -#> Warning in mean.default(.x[[i]], ...): argument is not numeric or logical: -#> returning NA #> mpg cyl disp hp drat wt qsec vs am -#> 20.091 6.188 3.781 146.688 3.597 3.217 17.849 0.438 NA +#> 20.091 6.188 230.722 146.688 3.597 3.217 17.849 0.438 0.406 #> gear carb #> 3.688 2.812 ``` @@ -833,7 +833,7 @@ Use `map_lgl` with the function `is.factor`, ```r map_lgl(mtcars, is.factor) #> mpg cyl disp hp drat wt qsec vs am gear carb -#> FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE +#> FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE ``` diff --git a/docs/model-basics.html b/docs/model-basics.html index f114989b..1f57c3c0 100644 --- a/docs/model-basics.html +++ b/docs/model-basics.html @@ -32,8 +32,8 @@ - - + + @@ -108,218 +108,228 @@
[sidebar navigation diff omitted: the book’s table of contents is renumbered after the part headings I Explore, II Wrangle, III Program, and IV Model are inserted]
@@ -338,17 +348,17 @@

                                                                                                                      -

                                                                                                                      15 Model Basics

                                                                                                                      +

                                                                                                                      18 Model Basics

The distinction between a family of models and a fitted model is a useful way to think about models, especially because some families of models can themselves be treated as fitted models of a more flexible family. For example, linear regression is a special case of GLMs or Gaussian processes.

                                                                                                                      -

                                                                                                                      15.1 Prerequisites

                                                                                                                      +

                                                                                                                      18.1 Prerequisites

                                                                                                                      library(tidyverse)
                                                                                                                       library(modelr)
                                                                                                                       options(na.action = na.warn)

The na.action option is a function that determines how missing values are handled when fitting a model. Setting it to na.warn produces a warning whenever missing values are present (by default, R silently drops them).
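A minimal illustration (my own; the warning text is what modelr::na.warn emits, so treat it as approximate):

df <- tibble(x = c(1, 2, NA), y = c(2, 4, 6))
mod <- lm(y ~ x, data = df)
#> Warning: Dropping 1 rows with missing values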

                                                                                                                      -

                                                                                                                      15.2 A simple model

                                                                                                                      +

                                                                                                                      18.2 A simple model

                                                                                                                      ggplot(sim1, aes(x, y)) +
                                                                                                                         geom_point()

                                                                                                                      @@ -429,7 +439,7 @@

                                                                                                                      15.2 A simple model

                                                                                                                      #> (Intercept) x #> 4.22 2.05
                                                                                                                      -

                                                                                                                      15.2.1 Exercises

                                                                                                                      +

                                                                                                                      18.2.1 Exercises

                                                                                                                      sim1a <- tibble(
                                                                                                                         x = rep(1:10, each = 3),
                                                                                                                         y = x * 1.5 + 6 + rt(length(x), df = 2)
                                                                                                                      @@ -507,7 +517,7 @@ 

                                                                                                                      15.2.1 Exercises

                                                                                                                      -

                                                                                                                      15.3 Visualizing Models

                                                                                                                      +

                                                                                                                      18.3 Visualizing Models

                                                                                                                      More complicated models can be visualized with

                                                                                                                      1. predictions
                                                                                                                      2. @@ -569,7 +579,7 @@

                                                                                                                        15.3 Visualizing Models

                                                                                                                        geom_point()

                                                                                                                      -

                                                                                                                      15.3.1 Exercises

                                                                                                                      +

                                                                                                                      18.3.1 Exercises

1. Instead of using lm() to fit a straight line, you can use loess() to fit a smooth curve. Repeat the process of model fitting, grid generation, predictions, and visualisation on sim1 using loess() instead of lm(). How does the result compare to geom_smooth()?
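The answer code is cut off here; a minimal sketch (mine, assuming modelr and ggplot2 are loaded as in the Prerequisites):

sim1_loess <- loess(y ~ x, data = sim1)

grid <- sim1 %>%
  data_grid(x) %>%
  add_predictions(sim1_loess)

ggplot(sim1, aes(x)) +
  geom_point(aes(y = y)) +
  geom_line(aes(y = pred), data = grid, colour = "red")

Since geom_smooth() defaults to a loess fit for small datasets like sim1, the red line should look essentially the same as what geom_smooth() draws.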
                                                                                                                      @@ -615,7 +625,7 @@

                                                                                                                      15.3.1 Exercises

                                                                                                                      -

                                                                                                                      15.4 Formulas and Model Families

                                                                                                                      +

                                                                                                                      18.4 Formulas and Model Families

                                                                                                                      df <- tribble(
                                                                                                                         ~y, ~x1, ~x2,
                                                                                                                         4, 2, 5,
                                                                                                                      @@ -634,7 +644,7 @@ 

                                                                                                                      15.4 Formulas and Model Families< #> 1 2 #> 2 1

                                                                                                                      -

                                                                                                                      15.4.1 Categorical Variables

                                                                                                                      +

                                                                                                                      18.4.1 Categorical Variables

                                                                                                                      df <- tribble(
                                                                                                                         ~ sex, ~ response,
                                                                                                                         "male", 1,
                                                                                                                      @@ -730,15 +740,15 @@ 

                                                                                                                      15.4.1 Categorical Variables

                                                                                                                      TODO We should visualize interactions with plotly

                                                                                                                      -

                                                                                                                      15.4.2 Exercises

                                                                                                                      +

                                                                                                                      18.4.2 Exercises

                                                                                                                      -

                                                                                                                      15.5 Missing values

                                                                                                                      +

                                                                                                                      18.5 Missing values

TODO Need to write a tidyverse-compliant na.omit() function.

                                                                                                                      -

                                                                                                                      15.6 Other model families

                                                                                                                      +

                                                                                                                      18.6 Other model families

NOTE It’s worth mentioning these as more general models, even though they don’t appear as much in social science work. I should try to explain why; I can think of several reasons:

  • a preference for easy-to-explain models (though I think that preference is misplaced: most people can’t visualize high-dimensional space well, and they interpret results marginally even though the results are conditional)
                                                                                                                      • @@ -748,13 +758,16 @@

                                                                                                                        15.6 Other model families

                                                                                                                      + + +
                                                                                                                      - - + + diff --git a/docs/model-intro.html b/docs/model-intro.html new file mode 100644 index 00000000..5a3f2240 --- /dev/null +++ b/docs/model-intro.html @@ -0,0 +1,406 @@ + + + + + + + + Exercise Solutions and Notes for R for Data Science + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                                                                                                                      17 Introduction


Some of the discussion of models here differs slightly, and has a different emphasis, from most social science research. This is largely because this book speaks to data scientists, for whom the primary goal is prediction rather than theory testing (that I don’t view these as very different is another story).


The discussion of hypothesis generation vs. confirmation is interesting. Social science places too little emphasis on hypothesis generation, and political science in particular gives too little weight to out-of-sample testing; a minimal sketch follows.
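Here is what out-of-sample testing looks like at its simplest, using only base R and the built-in mtcars data (my illustration, not from the book):

```r
set.seed(123)
# Hold out 30% of the rows as a test set.
idx   <- sample(seq_len(nrow(mtcars)), size = floor(0.7 * nrow(mtcars)))
train <- mtcars[idx, ]
test  <- mtcars[-idx, ]

fit  <- lm(mpg ~ wt + hp, data = train)
pred <- predict(fit, newdata = test)
sqrt(mean((test$mpg - pred)^2))  # out-of-sample RMSE
```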


                                                                                                                      And from this discussion it should be clear that many papers in social science are hypothesis generation masquerading as hypothesis confirmation.
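One concrete guard against that, sketched on the built-in mtcars data (again my illustration): split the data once up front, explore freely on one half, and reserve the other half for a single confirmatory test.

```r
set.seed(42)
n    <- nrow(mtcars)
half <- sample(seq_len(n), size = n %/% 2)

exploration  <- mtcars[half, ]   # browse, plot, generate hypotheses freely
confirmation <- mtcars[-half, ]  # touch once, to test the final model
```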

diff --git a/docs/model.md b/docs/model.md
index 617a936b..6b67a393 100644
--- a/docs/model.md
+++ b/docs/model.md
@@ -1,5 +1,7 @@
-# Model Introduction
+# (PART) Model {-}
+
+# Introduction {#model-intro}
 
 Some of the discussion of models is slightly different, ...
diff --git a/docs/pipes.html b/docs/pipes.html
new file mode 100644
index 00000000..f63d5d0e
--- /dev/null
+++ b/docs/pipes.html
@@ -0,0 +1,404 @@
[generated HTML scaffolding omitted; page text follows]

                                                                                                                      14 Pipes


                                                                                                                      No exercises in this chapter.

diff --git a/docs/pipes.md b/docs/pipes.md
new file mode 100644
index 00000000..e3025f96
--- /dev/null
+++ b/docs/pipes.md
@@ -0,0 +1,4 @@
+
+# Pipes
+
+No exercises in this chapter.
diff --git a/docs/program-intro.html b/docs/program-intro.html
new file mode 100644
index 00000000..3340050b
--- /dev/null
+++ b/docs/program-intro.html
@@ -0,0 +1,403 @@
[generated HTML scaffolding omitted; page text follows]

                                                                                                                      13 Introduction

diff --git a/docs/program.md b/docs/program.md
new file mode 100644
index 00000000..1e38ceac
--- /dev/null
+++ b/docs/program.md
@@ -0,0 +1,4 @@
+
+# (PART) Program {-}
+
+# Introduction {#program-intro}
diff --git a/docs/r-markdown-formats.html b/docs/r-markdown-formats.html
new file mode 100644
index 00000000..1ef4f602
--- /dev/null
+++ b/docs/r-markdown-formats.html
@@ -0,0 +1,405 @@
[generated HTML scaffolding omitted; page text follows]

                                                                                                                      21 R Markdown Formats


                                                                                                                      No exercises.


                                                                                                                      This document was built with bookdown. You can see the source at https://github.com/jrnold/e4qf.
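For reference, a site like this is presumably rebuilt with something along these lines (an assumption on my part; the repo's actual build step may differ):

```r
# Render every chapter listed in _bookdown.yml into the gitbook site.
bookdown::render_book("index.rmd", output_format = "bookdown::gitbook")
```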

diff --git a/docs/r-markdown-workflow.html b/docs/r-markdown-workflow.html
index a80793da..9a8a47de 100644
--- a/docs/r-markdown-workflow.html
+++ b/docs/r-markdown-workflow.html
@@ -32,7 +32,7 @@
@@ -108,218 +108,228 @@
 • Welcome
-• 1 Visualize
[sidebar TOC renumbered; the full before/after listing appears under relational-data.html below]
diff --git a/docs/r-markdown.html b/docs/r-markdown.html
index 7af49bf0..a116439a 100644
--- a/docs/r-markdown.html
+++ b/docs/r-markdown.html
@@ -32,8 +32,8 @@
@@ -108,218 +108,228 @@
 • Welcome
-• 1 Visualize
[sidebar TOC renumbered, as above]
diff --git a/docs/relational-data.html b/docs/relational-data.html
index 8d557708..1d2abb37 100644
--- a/docs/relational-data.html
+++ b/docs/relational-data.html
@@ -108,218 +108,228 @@
 • Welcome
-• 1 Visualize
-• 2 Workflow Basics
-• 3 Data Transformation
-• 4 Exploratory Data Analysis
-• 5 Tibbles
-• 6 Data Import
-• 7 Tidy Data
-• 8 Relational Data
-• 9 Strings
-• 10 Factors
-• 11 Dates and Times
-• 12 Vectors
+• I Explore
+• 1 Introduction
+• 2 Visualize
+• 3 Workflow Basics
+• 4 Data Transformation
+• 5 Exploratory Data Analysis
+• II Wrangle
+• 6 Tibbles
+• 7 Data Import
+• 8 Tidy Data
+• 9 Relational Data
+• 10 Strings
+• 11 Factors
+• 12 Dates and Times
[subsection entries renumbered to match their chapters; the diff is truncated in the source after "12 Vectors"]
                                                                                                                                                                                                          • 12.1 Introduction
                                                                                                                                                                                                          • -
                                                                                                                                                                                                          • 12.2 Important types of Atomic Vector
                                                                                                                                                                                                              -
                                                                                                                                                                                                            • 12.2.1 Exercises
                                                                                                                                                                                                            • +
                                                                                                                                                                                                            • III Program
                                                                                                                                                                                                            • +
                                                                                                                                                                                                            • 13 Introduction
                                                                                                                                                                                                            • +
                                                                                                                                                                                                            • 14 Pipes
                                                                                                                                                                                                            • +
                                                                                                                                                                                                            • 15 Vectors
                                                                                                                                                                                                                +
                                                                                                                                                                                                              • 15.1 Introduction
                                                                                                                                                                                                              • +
                                                                                                                                                                                                              • 15.2 Important types of Atomic Vector
                                                                                                                                                                                                              • -
                                                                                                                                                                                                              • 12.3 Using atomic vectors
                                                                                                                                                                                                              • -
                                                                                                                                                                                                              • 12.4 Recursive Vectors (lists)
                                                                                                                                                                                                                  -
                                                                                                                                                                                                                • 12.4.1 Exercises
                                                                                                                                                                                                                • +
                                                                                                                                                                                                                • 15.3 Using atomic vectors
                                                                                                                                                                                                                • +
                                                                                                                                                                                                                • 15.4 Recursive Vectors (lists)
                                                                                                                                                                                                                • -
                                                                                                                                                                                                                • 12.5 Augmented Vectors
                                                                                                                                                                                                                • -
                                                                                                                                                                                                                • 13 Iteration
                                                                                                                                                                                                                    -
                                                                                                                                                                                                                  • 13.1 Introduction
                                                                                                                                                                                                                  • -
                                                                                                                                                                                                                  • 13.2 For Loops
                                                                                                                                                                                                                      -
                                                                                                                                                                                                                    • 13.2.1 Exercises
                                                                                                                                                                                                                    • +
                                                                                                                                                                                                                    • 16 Iteration
                                                                                                                                                                                                                        +
                                                                                                                                                                                                                      • 16.1 Introduction
                                                                                                                                                                                                                      • +
                                                                                                                                                                                                                      • 16.2 For Loops
                                                                                                                                                                                                                      • -
                                                                                                                                                                                                                      • 13.3 For loop variations
                                                                                                                                                                                                                          -
                                                                                                                                                                                                                        • 13.3.1
                                                                                                                                                                                                                        • +
                                                                                                                                                                                                                        • 16.3 For loop variations
                                                                                                                                                                                                                        • -
                                                                                                                                                                                                                        • 13.4 For loops vs. functionals
                                                                                                                                                                                                                            -
                                                                                                                                                                                                                          • 13.4.1 Exercises
                                                                                                                                                                                                                          • +
                                                                                                                                                                                                                          • 16.4 For loops vs. functionals
                                                                                                                                                                                                                          • -
                                                                                                                                                                                                                          • 13.5 The map functions
                                                                                                                                                                                                                              -
                                                                                                                                                                                                                            • 13.5.1 Shortcuts
                                                                                                                                                                                                                            • -
                                                                                                                                                                                                                            • 13.5.2 Exercises
                                                                                                                                                                                                                            • +
                                                                                                                                                                                                                            • 16.5 The map functions
                                                                                                                                                                                                                            • -
                                                                                                                                                                                                                            • 13.6 Dealing with Failure
                                                                                                                                                                                                                            • -
                                                                                                                                                                                                                            • 13.7 Mapping over multiple arguments
                                                                                                                                                                                                                            • -
                                                                                                                                                                                                                            • 13.8 Walk
                                                                                                                                                                                                                            • -
                                                                                                                                                                                                                            • 13.9 Other patterns of for loops
                                                                                                                                                                                                                            • -
                                                                                                                                                                                                                            • 14 Model Introduction
                                                                                                                                                                                                                            • -
                                                                                                                                                                                                                            • 15 Model Basics
                                                                                                                                                                                                                                -
                                                                                                                                                                                                                              • 15.1 Prerequisites
                                                                                                                                                                                                                              • -
                                                                                                                                                                                                                              • 15.2 A simple model
                                                                                                                                                                                                                                  -
                                                                                                                                                                                                                                • 15.2.1 Exercises
                                                                                                                                                                                                                                • +
                                                                                                                                                                                                                                • IV Model
                                                                                                                                                                                                                                • +
                                                                                                                                                                                                                                • 17 Introduction
                                                                                                                                                                                                                                • +
                                                                                                                                                                                                                                • 18 Model Basics
                                                                                                                                                                                                                                    +
                                                                                                                                                                                                                                  • 18.1 Prerequisites
                                                                                                                                                                                                                                  • +
                                                                                                                                                                                                                                  • 18.2 A simple model
                                                                                                                                                                                                                                  • -
                                                                                                                                                                                                                                  • 15.3 Visualizing Models
                                                                                                                                                                                                                                      -
                                                                                                                                                                                                                                    • 15.3.1 Exercises
                                                                                                                                                                                                                                    • +
                                                                                                                                                                                                                                    • 18.3 Visualizing Models
                                                                                                                                                                                                                                    • -
                                                                                                                                                                                                                                    • 15.4 Formulas and Model Families
                                                                                                                                                                                                                                    • -
                                                                                                                                                                                                                                    • 16 R Markdown
                                                                                                                                                                                                                                        -
                                                                                                                                                                                                                                      • 16.1 R Markdown Basics @@ -338,9 +348,9 @@

- 8 Relational Data
+ 9 Relational Data

- 8.1 Prerequisites
+ 9.1 Prerequisites

library("tidyverse")
library("nycflights13")

Topics, functions

@@ -354,7 +364,7 @@

8.1 Prerequisites

TODO: fuzzy joining
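One possible approach for this TODO, sketched with the fuzzyjoin package (the package choice and the typos table are assumptions, not something the note specifies):

library("fuzzyjoin")
# Hypothetical lookup table with misspelled airline names.
typos <- tibble(name = c("United Air Line Inc.", "JetBlu Airways"))
# Match each misspelled name to the closest name in airlines,
# allowing up to three single-character edits.
stringdist_left_join(typos, airlines, by = "name", max_dist = 3)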

- 8.2 nycflights13
+ 9.2 nycflights13

NOTES

nycflights13 is an example of a data-only R package. R packages can contain both functions and data, and since datasets can get large, they are often distributed as their own package. These sorts of data-only R packages make it convenient for R users to access your data, but they should not be the only way you provide your research data. Not everyone uses R, so the original data should also be provided in a program-agnostic format (e.g. CSV files), as sketched below. The same holds for Stata users: they should not distribute data only in Stata-specific .dta files (even though, as we saw earlier, other programs can read that format). Another example of a data-only R package is gapminder.
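For example, a minimal sketch of exporting the same tables to a program-agnostic format with readr (the file names are illustrative):

library("readr")
# CSV versions of the package data that any statistics package can read.
write_csv(nycflights13::flights, "flights.csv")
write_csv(nycflights13::airports, "airports.csv")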

                                                                                                                                                                                                                                        How does Hadley create his diagrams?

@@ -406,7 +416,7 @@

8.2 nycflights13

#> # ... with 2.612e+04 more rows, and 5 more variables: wind_gust <dbl>,
#> #   precip <dbl>, pressure <dbl>, visib <dbl>, time_hour <dttm>

- 8.2.1 Exercises
+ 9.2.1 Exercises

                                                                                                                                                                                                                                        1. Imagine you wanted to draw (approximately) the route each plane flies from its origin to its destination. What variables would you need? What tables would you need to combine?
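A sketch of one answer: origin and dest in flights give the endpoints, and joining airports twice (once per endpoint) supplies their latitudes and longitudes; the suffixes below are just one way to keep the two sets of coordinates apart.

flights %>%
  select(origin, dest) %>%
  # coordinates of the origin airport ...
  left_join(airports, by = c("origin" = "faa")) %>%
  # ... and of the destination airport
  left_join(airports, by = c("dest" = "faa"),
            suffix = c("_origin", "_dest")) %>%
  select(origin, dest, lat_origin, lon_origin, lat_dest, lon_dest)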
@@ -430,7 +440,7 @@

8.2.1 Exercises

- 8.3 Keys
+ 9.3 Keys

                                                                                                                                                                                                                                        1. Add a surrogate key to flights.
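A minimal sketch, assuming a row number suffices as the key (flight_id is an arbitrary name):

flights %>%
  # row_number() assigns each row a unique integer
  mutate(flight_id = row_number()) %>%
  select(flight_id, everything())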
@@ -541,7 +551,7 @@

8.3 Keys

- 8.4 Mutating Joins
+ 9.4 Mutating Joins

flights2 <- flights %>%
  select(year:day, hour, origin, dest, tailnum, carrier)
flights2 %>%
  select(-origin, -dest) %>%
  left_join(airlines, by = "carrier")

@@ -558,7 +568,7 @@

8.4 Mutating Joins

#> 6  2013     1     1     5 N39463  UA      United Air Lines Inc.
#> # ... with 3.368e+05 more rows

- 8.4.1 Exercises
+ 9.4.1 Exercises

                                                                                                                                                                                                                                        1. Compute the average delay by destination, then join on the airports data frame so you can show the spatial distribution of delays. Here’s an easy way to draw a map of the United States:
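The map snippet the exercise refers to is elided by the diff; below is a sketch of one solution, combining the summary, the join, and ggplot2's borders() map (borders() needs the maps package installed).

avg_dest_delays <- flights %>%
  group_by(dest) %>%
  # mean arrival delay per destination
  summarise(delay = mean(arr_delay, na.rm = TRUE)) %>%
  # keep only destinations that appear in the airports table
  inner_join(airports, by = c("dest" = "faa"))

avg_dest_delays %>%
  ggplot(aes(lon, lat, colour = delay)) +
  borders("state") +
  geom_point() +
  coord_quickmap()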
@@ -665,13 +675,13 @@

8.4.1 Exercises

- 8.5 Filtering Joins
+ 9.5 Filtering Joins

• semi_join: keep all observations in x that have a match in y
• anti_join: drop all observations in x that have a match in y (both verbs are demonstrated in the sketch below)
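A brief demonstration with the tables loaded above (the top-ten cutoff is arbitrary):

# Ten most common destinations.
top_dest <- flights %>%
  count(dest, sort = TRUE) %>%
  head(10)
# semi_join: the flights to those destinations; no columns are added.
flights %>% semi_join(top_dest, by = "dest")
# anti_join: the flights whose aircraft has no record in planes.
flights %>% anti_join(planes, by = "tailnum")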
- 8.5.1 Exercises
+ 9.5.1 Exercises

                                                                                                                                                                                                                                        1. What does it mean for a flight to have a missing tailnum? What do the tail numbers that don’t have a matching record in planes have in common? (Hint: one variable explains ~90% of the problems.)
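A starting point for the hint, counting unmatched flights by carrier (a sketch, not the full answer):

flights %>%
  anti_join(planes, by = "tailnum") %>%
  count(carrier, sort = TRUE)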
@@ -783,7 +793,7 @@

8.5.1 Exercises

- 8.6 Set operations
+ 9.6 Set operations

                                                                                                                                                                                                                                        No exercises

diff --git a/docs/rmarkdown-formats.md b/docs/rmarkdown-formats.md
new file mode 100644
index 00000000..7e71c156
--- /dev/null
+++ b/docs/rmarkdown-formats.md
@@ -0,0 +1,6 @@
+
+# R Markdown Formats
+
+No exercises.
+
+This document was built with **bookdown**. You can see the source at https://github.com/jrnold/e4qf.
diff --git a/docs/search_index.json b/docs/search_index.json
index 1049140c..f8904d08 100644
--- a/docs/search_index.json
+++ b/docs/search_index.json
@@ -1,20 +1,25 @@
 [
 ["index.html", "Exercise Solutions and Notes for “R for Data Science” Welcome", " …"],
-["visualize.html", "1 Visualize 1.1 Introduction 1.2 Position Adjustments 1.3 Coordinate Systems", " …"],
mpg #> # A tibble: 234 × 11 #> manufacturer model displ year cyl trans drv cty hwy fl #> <chr> <chr> <dbl> <int> <int> <chr> <chr> <int> <int> <chr> #> 1 audi a4 1.8 1999 4 auto(l5) f 18 29 p #> 2 audi a4 1.8 1999 4 manual(m5) f 21 29 p #> 3 audi a4 2.0 2008 4 manual(m6) f 20 31 p #> 4 audi a4 2.0 2008 4 auto(av) f 21 30 p #> 5 audi a4 2.8 1999 6 auto(l5) f 16 26 p #> 6 audi a4 2.8 1999 6 manual(m5) f 18 26 p #> # ... with 228 more rows, and 1 more variables: class <chr> The glimpse command from “mpg” shows this: glimpse(mpg) #> Observations: 234 #> Variables: 11 #> $ manufacturer <chr> "audi", "audi", "audi", "audi", "audi", "audi", "... #> $ model <chr> "a4", "a4", "a4", "a4", "a4", "a4", "a4", "a4 qua... #> $ displ <dbl> 1.8, 1.8, 2.0, 2.0, 2.8, 2.8, 3.1, 1.8, 1.8, 2.0,... #> $ year <int> 1999, 1999, 2008, 2008, 1999, 1999, 2008, 1999, 1... #> $ cyl <int> 4, 4, 4, 4, 6, 6, 6, 4, 4, 4, 4, 6, 6, 6, 6, 6, 6... #> $ trans <chr> "auto(l5)", "manual(m5)", "manual(m6)", "auto(av)... #> $ drv <chr> "f", "f", "f", "f", "f", "f", "f", "4", "4", "4",... #> $ cty <int> 18, 21, 20, 21, 16, 18, 18, 18, 16, 20, 19, 15, 1... #> $ hwy <int> 29, 29, 31, 30, 26, 26, 27, 26, 25, 28, 27, 25, 2... #> $ fl <chr> "p", "p", "p", "p", "p", "p", "p", "p", "p", "p",... #> $ class <chr> "compact", "compact", "compact", "compact", "comp... Map a continuous variable to color, size, and shape. How do these aesthetics behave differently for categorical vs. continuous variables? The variable cty, city highway miles per gallon, is a continuous variable: ggplot(mpg, aes(x = displ, y = hwy, color = cty)) + geom_point() Instead of using discrete colors, the continous variable uses a scale that goes from black to bluish. ggplot(mpg, aes(x = displ, y = hwy, size = cty)) + geom_point() When mapped to size, the sizes of the points vary continuously with respect to the size (although the legend shows a few representative values) ggplot(mpg, aes(x = displ, y = hwy, shape = cty)) + geom_point() #> Error: A continuous variable can not be mapped to shape When a continuous value is mapped to shape, it gives an error. Though we could split a continuous variable into discrete categories and use shape, this would conceptually not make sense. A continuous numeric variable is ordered, but shapes have no natural order. It is clear that smaller points correspond to smaller values, or once the color scale is given, which points are larger or smaller. But it is not clear whether a square is greater or less than a circle. What happens if you map the same variable to multiple aesthetics? ggplot(mpg, aes(x = displ, y = hwy, color = hwy, size = displ)) + geom_point() In the above plot, hwy is mapped to both location on the y-axis and color, and displ is mapped to both location on the x-axis and size. The code works and produces a plot, even if it is a bad one. Mapping a single variable to multiple aesthetics is redundant. Because it is redundant information, in most cases avoid mapping a single variable to multiple aesthetics. What does the stroke aesthetic do? What shapes does it work with? (Hint: use ?geom_point) The following example is given in ?geom_point: ggplot(mtcars, aes(wt, mpg)) + geom_point(shape = 21, colour = "black", fill = "white", size = 5, stroke = 5) Stroke changes the color of the border for shapes (22-24). What happens if you map an aesthetic to something other than a variable name, like aes(colour = displ < 5)? 
ggplot(mpg, aes(x = displ, y = hwy, colour = displ < 5)) + geom_point() Aesthetics can also be mapped to expressions (code like displ < 5). It will create a temporary variable which takes values from the result of the expression. In this case, it is logical variable which is TRUE or FALSE. This also explains exercise 1, color = "blue" created a categorical variable that only had one category: “blue”. 1.1.4 Facets 1.1.4.1 Exercises What happens if you facet on a continuous variable? Let’s see ggplot(mpg, aes(x = displ, y = hwy)) + geom_point() + facet_grid(. ~ cty) It converts the continuous varible to a factor and creates facets for all unique values of it. What do the empty cells in plot with facet_grid(drv ~ cyl) mean? How do they relate to this plot? They are cells in which there are no values of the combination of drv and cyl. ggplot(data = mpg) + geom_point(mapping = aes(x = drv, y = cyl)) The locations in the above plot without points are the same cells in facet_grid(drv ~ cyl) tha have no points. What plots does the following code make? What does . do? The symbol . ignores that dimension for faceting. This plot facets by values of drv on the y-axis: ggplot(data = mpg) + geom_point(mapping = aes(x = displ, y = hwy)) + facet_grid(drv ~ .) This plot facets by values of cyl on the x-axis: ggplot(data = mpg) + geom_point(mapping = aes(x = displ, y = hwy)) + facet_grid(. ~ cyl) Read ?facet_wrap. What does nrow do? What does ncol do? What other options control the layout of the individual panels? Why doesn’t facet_grid() have nrow and ncol variables? The arguments nrow (ncol) determines the number of rows (columns) to use when laying out the facets. It is necessary since facet_wrap only facets on one variable. These arguments are unnecessary for facet_grid since the number of rows and columns are determined by the number of unique values of the variables specified. When using facet_grid() you should usually put the variable with more unique levels in the columns. Why? You should put the variable with more unique levels in the columns if the plot is laid out landscape. It is easier to compare relative levels of y by scanning horizontally, so it may be easier to visually compare these levels. I’m actually not sure about the correct answer to this. 1.1.5 Geometric Objects What does show.legend = FALSE do? What happens if you remove it? Why do you think I used it earlier in the chapter? NOTE This doesn’t appear earlier in the chapter Issue #510 What does the se argument to geom_smooth() do? It adds standard error bands to the lines. ggplot(data = mpg, mapping = aes(x = displ, y = hwy, color = drv)) + geom_point() + geom_smooth(se = TRUE) #> `geom_smooth()` using method = 'loess' By default se = TRUE: ggplot(data = mpg, mapping = aes(x = displ, y = hwy, color = drv)) + geom_point() + geom_smooth() #> `geom_smooth()` using method = 'loess' Will these two graphs look different? Why/why not? No. Because both geom_point and geom_smooth use the same data and mappings. They will inherit those options from the ggplot object, and thus don’t need to specified again (or twice). ggplot(data = mpg, mapping = aes(x = displ, y = hwy)) + geom_point() + geom_smooth() #> `geom_smooth()` using method = 'loess' ggplot() + geom_point(data = mpg, mapping = aes(x = displ, y = hwy)) + geom_smooth(data = mpg, mapping = aes(x = displ, y = hwy)) #> `geom_smooth()` using method = 'loess' Recreate the R code necessary to generate the following graphs. 
ggplot(mpg, aes(x = displ, y = hwy)) + geom_point() + geom_smooth(se = FALSE) #> `geom_smooth()` using method = 'loess' ggplot(mpg, aes(x = displ, y = hwy)) + geom_point() + geom_smooth(mapping = aes(group = drv), se = FALSE) #> `geom_smooth()` using method = 'loess' ggplot(mpg, aes(x = displ, y = hwy, colour = drv)) + geom_point() + geom_smooth(se = FALSE) #> `geom_smooth()` using method = 'loess' ggplot(mpg, aes(x = displ, y = hwy)) + geom_point(mapping = aes(colour = drv)) + geom_smooth(se = FALSE) #> `geom_smooth()` using method = 'loess' ggplot(mpg, aes(x = displ, y = hwy)) + geom_point(aes(colour = drv)) + geom_smooth(aes(linetype = drv), se = FALSE) #> `geom_smooth()` using method = 'loess' ggplot(mpg, aes(x = displ, y = hwy, fill = drv)) + geom_point(color = "white", shape = 21) 1.1.6 Statistical Transformations What is the default geom associated with stat_summary()? How could you rewrite the previous plot to use that geom function instead of the stat function? The default geom for stat_summary() is geom_pointrange (see its geom argument). But the default stat for geom_pointrange is identity, so use geom_pointrange(stat = "summary"). ggplot(data = diamonds) + geom_pointrange( mapping = aes(x = cut, y = depth), stat = "summary", ) #> No summary function supplied, defaulting to `mean_se() The message tells us that when no summary function is supplied, stat_summary defaults to mean_se(), which uses the mean for the point and the mean plus or minus one standard error for the range of the line. So let’s use the previous values of fun.ymin, fun.ymax, and fun.y: ggplot(data = diamonds) + geom_pointrange( mapping = aes(x = cut, y = depth), stat = "summary", fun.ymin = min, fun.ymax = max, fun.y = median ) What does geom_col() do? How is it different to geom_bar()? geom_col differs from geom_bar in its default stat. geom_col uses the identity stat, so it expects that a variable already exists for the height of the bars. geom_bar uses the count stat, and so will count observations in groups in order to generate the variable to use for the height of the bars. Most geoms and stats come in pairs that are almost always used in concert. Read through the documentation and make a list of all the pairs. What do they have in common? See the ggplot2 documentation. What variables does stat_smooth() compute? What parameters control its behaviour? stat_smooth calculates four variables: y (predicted value), ymin (lower value of the confidence interval), ymax (upper value of the confidence interval), and se (standard error). Its behaviour is controlled by parameters such as method, which determines the method used to calculate the predictions and confidence interval, along with other arguments that are passed to that method. In our proportion bar chart, we need to set group = 1. Why? In other words, what is the problem with these two graphs? If group is not set to 1, then all the bars have prop == 1. The function geom_bar assumes that the groups are equal to the x values, since the stat computes the counts within the group. ggplot(data = diamonds) + geom_bar(mapping = aes(x = cut, y = ..prop..)) The problem with these two plots is that the proportions are calculated within the groups. ggplot(data = diamonds) + geom_bar(mapping = aes(x = cut, y = ..prop..)) ggplot(data = diamonds) + geom_bar(mapping = aes(x = cut, fill = color, y = ..prop..)) This is more likely what was intended: ggplot(data = diamonds) + geom_bar(mapping = aes(x = cut, y = ..prop.., group = 1)) ggplot(data = diamonds) + geom_bar(mapping = aes(x = cut, fill = color, y = ..prop.., group = color)) 1.2 Position Adjustments What is the problem with this plot? How could you improve it?
There is overplotting because there are multiple observations for each combination of cty and hwy. ggplot(data = mpg, mapping = aes(x = cty, y = hwy)) + geom_point() I’d fix it by using a jitter position adjustment. ggplot(data = mpg, mapping = aes(x = cty, y = hwy)) + geom_point(position = "jitter") What parameters to geom_jitter() control the amount of jittering? From the position_jitter documentation, there are two arguments to jitter: width and height, which control the amount of horizontal and vertical jitter, respectively. No horizontal jitter: ggplot(data = mpg, mapping = aes(x = cty, y = hwy)) + geom_point(position = position_jitter(width = 0)) Way too much vertical jitter: ggplot(data = mpg, mapping = aes(x = cty, y = hwy)) + geom_point(position = position_jitter(width = 0, height = 15)) Only horizontal jitter: ggplot(data = mpg, mapping = aes(x = cty, y = hwy)) + geom_point(position = position_jitter(height = 0)) Way too much horizontal jitter: ggplot(data = mpg, mapping = aes(x = cty, y = hwy)) + geom_point(position = position_jitter(height = 0, width = 20)) Compare and contrast geom_jitter() with geom_count(). geom_jitter() avoids overplotting by adding random noise to the location of each point, while geom_count() counts the number of observations at each location and maps that count to point size; jittering moves the points, whereas counting keeps them in place but changes their area. What’s the default position adjustment for geom_boxplot()? Create a visualisation of the mpg dataset that demonstrates it. The default position for geom_boxplot is position_dodge (see its docs). When we add color = class to the boxplot, the different classes within drv are placed side by side, i.e. dodged. If it were position_identity, they would be overlapping. ggplot(data = mpg, aes(x = drv, y = hwy, color = class)) + geom_boxplot() ggplot(data = mpg, aes(x = drv, y = hwy, color = class)) + geom_boxplot(position = "identity") 1.3 Coordinate Systems 1.3.1 Exercises Turn a stacked bar chart into a pie chart using coord_polar(). This is a stacked bar chart with a single category: ggplot(mpg, aes(x = factor(1), fill = drv)) + geom_bar() See the documentation for coord_polar for an example of making a pie chart. In particular, theta = "y" has to be specified, meaning that the angle of the chart is mapped to the y variable. ggplot(mpg, aes(x = factor(1), fill = drv)) + geom_bar(width = 1) + coord_polar(theta = "y") If theta = "y" is not specified, then you get a bullseye chart: ggplot(mpg, aes(x = factor(1), fill = drv)) + geom_bar(width = 1) + coord_polar() If you had a multiple stacked bar chart, like ggplot(data = diamonds) + geom_bar(mapping = aes(x = cut, fill = clarity), position = "fill") you end up with a multi-donut chart: ggplot(data = diamonds) + geom_bar(mapping = aes(x = cut, fill = clarity), position = "fill") + coord_polar(theta = "y") What does labs() do? Read the documentation. labs is a shortcut function for adding labels to different scales. ggplot(data = mpg, mapping = aes(x = class, y = hwy)) + geom_boxplot() + coord_flip() + labs(y = "Highway MPG", x = "") What’s the difference between coord_quickmap() and coord_map()? See the docs: coord_map uses a 2D map projection: by default, the Mercator projection of the sphere onto the plot. But this requires transforming all geoms. coord_quickmap instead uses a quick approximation, setting the aspect ratio from the lat/long ratio. This is “quick” because the shapes don’t need to be transformed. What does the plot below tell you about the relationship between city and highway mpg? Why is coord_fixed() important? What does geom_abline() do? coord_fixed ensures that the line produced by geom_abline() is at a 45-degree angle, which makes it easy to compare the highway and city mileage against what it would be if they were exactly the same.
ggplot(data = mpg, mapping = aes(x = cty, y = hwy)) + geom_point() + geom_abline() + coord_fixed() If we didn’t include coord_fixed(), then the line would no longer be at 45 degrees: ggplot(data = mpg, mapping = aes(x = cty, y = hwy)) + geom_point() + geom_abline() "], -["workflow-basics.html", "2 Workflow Basics 2.1 Practice", " 2 Workflow Basics 2.1 Practice 2.1.1 Exercises Why does this code not work? my_variable <- 10 my_varıable #> Error in eval(expr, envir, enclos): object 'my_varıable' not found The variable being printed is my_varıable, not my_variable: the seventh character is “ı” (LATIN SMALL LETTER DOTLESS I), not “i”. While it wouldn’t have helped much in this case, the need to distinguish similar characters in code is the reason why fonts that clearly distinguish them are preferred in programming. Especially important are distinguishing between zero (0), Latin small letter O (o), and Latin capital letter O (O); and between the numeral one (1), Latin small letter I (i), Latin capital letter I (I), and Latin small letter L (l). In these fonts, zero and the Latin letter O are often distinguished by using a glyph for zero that has either a dot in the interior or a slash through it. Also note that error messages of the form “object ‘…’ not found” mean just what they say: the object can’t be found by R. This is usually because you either (1) forgot to define the object (or had an error that prevented it from being defined earlier), (2) didn’t load a package with the object, or (3) made a typo in the object’s name (either when using it or when you originally defined it). Tweak each of the following R commands so that they run correctly: library(tidyverse) #> Loading tidyverse: ggplot2 #> Loading tidyverse: tibble #> Loading tidyverse: tidyr #> Loading tidyverse: readr #> Loading tidyverse: purrr #> Loading tidyverse: dplyr #> Conflicts with tidy packages ---------------------------------------------- #> filter(): dplyr, stats #> lag(): dplyr, stats ggplot(dota = mpg) + geom_point(mapping = aes(x = displ, y = hwy)) #> Error in structure(list(data = data, layers = list(), scales = scales_list(), : argument "data" is missing, with no default The error message is argument "data" is missing, with no default. It looks like a typo: dota instead of data. ggplot(data = mpg) + geom_point(mapping = aes(x = displ, y = hwy)) fliter(mpg, cyl = 8) #> Error in eval(expr, envir, enclos): could not find function "fliter" R could not find the function fliter because we made a typo: fliter instead of filter. filter(mpg, cyl = 8) #> Error: filter() takes unnamed arguments. Do you need `==`? We aren’t done yet. But the error message gives a suggestion. Let’s follow it. filter(mpg, cyl == 8) #> # A tibble: 70 × 11 #> manufacturer model displ year cyl trans drv cty #> <chr> <chr> <dbl> <int> <int> <chr> <chr> <int> #> 1 audi a6 quattro 4.2 2008 8 auto(s6) 4 16 #> 2 chevrolet c1500 suburban 2wd 5.3 2008 8 auto(l4) r 14 #> 3 chevrolet c1500 suburban 2wd 5.3 2008 8 auto(l4) r 11 #> 4 chevrolet c1500 suburban 2wd 5.3 2008 8 auto(l4) r 14 #> 5 chevrolet c1500 suburban 2wd 5.7 1999 8 auto(l4) r 13 #> 6 chevrolet c1500 suburban 2wd 6.0 2008 8 auto(l4) r 12 #> # ... with 64 more rows, and 3 more variables: hwy <int>, fl <chr>, #> # class <chr> filter(diamond, carat > 3) #> Error in filter_(.data, .dots = lazyeval::lazy_dots(...)): object 'diamond' not found R says it can’t find the object diamond. This is a typo; the data frame is named diamonds.
filter(diamonds, carat > 3) #> # A tibble: 32 × 10 #> carat cut color clarity depth table price x y z #> <dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl> #> 1 3.01 Premium I I1 62.7 58 8040 9.10 8.97 5.67 #> 2 3.11 Fair J I1 65.9 57 9823 9.15 9.02 5.98 #> 3 3.01 Premium F I1 62.2 56 9925 9.24 9.13 5.73 #> 4 3.05 Premium E I1 60.9 58 10453 9.26 9.25 5.66 #> 5 3.02 Fair I I1 65.2 56 10577 9.11 9.02 5.91 #> 6 3.01 Fair H I1 56.1 62 10761 9.54 9.38 5.31 #> # ... with 26 more rows How did I know? I started typing in diamond and RStudio autocorrected it to diamonds. Since diamonds includes the variable carat and the code works, that appears to have been the problem. Press Alt + Shift + K. What happens? How can you get to the same place using the menus? This gives a menu with keyboard shortcuts. This can be found in the menu under Tools -> Keyboard Shortcuts Help. "], -["data-transformation.html", "3 Data Transformation 3.1 Prerequisites 3.2 Filter 3.3 Exercises 3.4 Arrange 3.5 Mutate 3.6 Grouped summaries with summarise() 3.7 Grouped mutates and filters", " 3 Data Transformation 3.1 Prerequisites library(nycflights13) library(tidyverse) 3.2 Filter glimpse(flights) #> Observations: 336,776 #> Variables: 19 #> $ year <int> 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013,... #> $ month <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,... #> $ day <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,... #> $ dep_time <int> 517, 533, 542, 544, 554, 554, 555, 557, 557, 55... #> $ sched_dep_time <int> 515, 529, 540, 545, 600, 558, 600, 600, 600, 60... #> $ dep_delay <dbl> 2, 4, 2, -1, -6, -4, -5, -3, -3, -2, -2, -2, -2... #> $ arr_time <int> 830, 850, 923, 1004, 812, 740, 913, 709, 838, 7... #> $ sched_arr_time <int> 819, 830, 850, 1022, 837, 728, 854, 723, 846, 7... #> $ arr_delay <dbl> 11, 20, 33, -18, -25, 12, 19, -14, -8, 8, -2, -... #> $ carrier <chr> "UA", "UA", "AA", "B6", "DL", "UA", "B6", "EV",... #> $ flight <int> 1545, 1714, 1141, 725, 461, 1696, 507, 5708, 79... #> $ tailnum <chr> "N14228", "N24211", "N619AA", "N804JB", "N668DN... #> $ origin <chr> "EWR", "LGA", "JFK", "JFK", "LGA", "EWR", "EWR"... #> $ dest <chr> "IAH", "IAH", "MIA", "BQN", "ATL", "ORD", "FLL"... #> $ air_time <dbl> 227, 227, 160, 183, 116, 150, 158, 53, 140, 138... #> $ distance <dbl> 1400, 1416, 1089, 1576, 762, 719, 1065, 229, 94... #> $ hour <dbl> 5, 5, 5, 5, 6, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5,... #> $ minute <dbl> 15, 29, 40, 45, 0, 58, 0, 0, 0, 0, 0, 0, 0, 0, ... #> $ time_hour <dttm> 2013-01-01 05:00:00, 2013-01-01 05:00:00, 2013... 3.3 Exercises Find all flights that Had an arrival delay of two or more hours Flew to Houston (IAH or HOU) Were operated by United, American, or Delta Departed in summer (July, August, and September) Arrived more than two hours late, but didn’t leave late Were delayed by at least an hour, but made up over 30 minutes in flight Departed between midnight and 6am (inclusive) Had an arrival delay of two or more hours Since delay is in minutes, we are looking for flights where arr_delay > 120: flights %>% filter(arr_delay > 120) #> # A tibble: 10,034 × 19 #> year month day dep_time sched_dep_time dep_delay arr_time #> <int> <int> <int> <int> <int> <dbl> <int> #> 1 2013 1 1 811 630 101 1047 #> 2 2013 1 1 848 1835 853 1001 #> 3 2013 1 1 957 733 144 1056 #> 4 2013 1 1 1114 900 134 1447 #> 5 2013 1 1 1505 1310 115 1638 #> 6 2013 1 1 1525 1340 105 1831 #> # ... 
with 1.003e+04 more rows, and 12 more variables: #> # sched_arr_time <int>, arr_delay <dbl>, carrier <chr>, flight <int>, #> # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, #> # distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm> Flew to Houston (IAH or HOU): flights %>% filter(dest %in% c("IAH", "HOU")) #> # A tibble: 9,313 × 19 #> year month day dep_time sched_dep_time dep_delay arr_time #> <int> <int> <int> <int> <int> <dbl> <int> #> 1 2013 1 1 517 515 2 830 #> 2 2013 1 1 533 529 4 850 #> 3 2013 1 1 623 627 -4 933 #> 4 2013 1 1 728 732 -4 1041 #> 5 2013 1 1 739 739 0 1104 #> 6 2013 1 1 908 908 0 1228 #> # ... with 9,307 more rows, and 12 more variables: sched_arr_time <int>, #> # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>, #> # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>, #> # minute <dbl>, time_hour <dttm> Were operated by United, American, or Delta The variable carrier has the airline: but it is in two-digit carrier codes. However, we can look it up in the airlines dataset. airlines #> # A tibble: 16 × 2 #> carrier name #> <chr> <chr> #> 1 9E Endeavor Air Inc. #> 2 AA American Airlines Inc. #> 3 AS Alaska Airlines Inc. #> 4 B6 JetBlue Airways #> 5 DL Delta Air Lines Inc. #> 6 EV ExpressJet Airlines Inc. #> # ... with 10 more rows Since there are only 16 rows, its not even worth filtering. Delta is DL, American is AA, and United is UA: filter(flights, carrier %in% c("AA", "DL", "UA")) #> # A tibble: 139,504 × 19 #> year month day dep_time sched_dep_time dep_delay arr_time #> <int> <int> <int> <int> <int> <dbl> <int> #> 1 2013 1 1 517 515 2 830 #> 2 2013 1 1 533 529 4 850 #> 3 2013 1 1 542 540 2 923 #> 4 2013 1 1 554 600 -6 812 #> 5 2013 1 1 554 558 -4 740 #> 6 2013 1 1 558 600 -2 753 #> # ... with 1.395e+05 more rows, and 12 more variables: #> # sched_arr_time <int>, arr_delay <dbl>, carrier <chr>, flight <int>, #> # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, #> # distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm> Departed in summer (July, August, and September) The variable month has the month, and it is numeric. filter(flights, between(month, 7, 9)) #> # A tibble: 86,326 × 19 #> year month day dep_time sched_dep_time dep_delay arr_time #> <int> <int> <int> <int> <int> <dbl> <int> #> 1 2013 7 1 1 2029 212 236 #> 2 2013 7 1 2 2359 3 344 #> 3 2013 7 1 29 2245 104 151 #> 4 2013 7 1 43 2130 193 322 #> 5 2013 7 1 44 2150 174 300 #> 6 2013 7 1 46 2051 235 304 #> # ... with 8.632e+04 more rows, and 12 more variables: #> # sched_arr_time <int>, arr_delay <dbl>, carrier <chr>, flight <int>, #> # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, #> # distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm> Arrived more than two hours late, but didn’t leave late filter(flights, !is.na(dep_delay), dep_delay <= 0, arr_delay > 120) #> # A tibble: 29 × 19 #> year month day dep_time sched_dep_time dep_delay arr_time #> <int> <int> <int> <int> <int> <dbl> <int> #> 1 2013 1 27 1419 1420 -1 1754 #> 2 2013 10 7 1350 1350 0 1736 #> 3 2013 10 7 1357 1359 -2 1858 #> 4 2013 10 16 657 700 -3 1258 #> 5 2013 11 1 658 700 -2 1329 #> 6 2013 3 18 1844 1847 -3 39 #> # ... 
with 23 more rows, and 12 more variables: sched_arr_time <int>, #> # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>, #> # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>, #> # minute <dbl>, time_hour <dttm> Were delayed by at least an hour, but made up over 30 minutes in flight filter(flights, !is.na(dep_delay), dep_delay >= 60, arr_delay < 30) #> # A tibble: 206 × 19 #> year month day dep_time sched_dep_time dep_delay arr_time #> <int> <int> <int> <int> <int> <dbl> <int> #> 1 2013 1 3 1850 1745 65 2148 #> 2 2013 1 3 1950 1845 65 2228 #> 3 2013 1 3 2015 1915 60 2135 #> 4 2013 1 6 1019 900 79 1558 #> 5 2013 1 7 1543 1430 73 1758 #> 6 2013 1 11 1020 920 60 1311 #> # ... with 200 more rows, and 12 more variables: sched_arr_time <int>, #> # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>, #> # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>, #> # minute <dbl>, time_hour <dttm> Departed between midnight and 6am (inclusive). filter(flights, dep_time >= 0, dep_time <= 600) #> # A tibble: 9,344 × 19 #> year month day dep_time sched_dep_time dep_delay arr_time #> <int> <int> <int> <int> <int> <dbl> <int> #> 1 2013 1 1 517 515 2 830 #> 2 2013 1 1 533 529 4 850 #> 3 2013 1 1 542 540 2 923 #> 4 2013 1 1 544 545 -1 1004 #> 5 2013 1 1 554 600 -6 812 #> 6 2013 1 1 554 558 -4 740 #> # ... with 9,338 more rows, and 12 more variables: sched_arr_time <int>, #> # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>, #> # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>, #> # minute <dbl>, time_hour <dttm> or using between (see next question) filter(flights, between(dep_time, 0, 600)) #> # A tibble: 9,344 × 19 #> year month day dep_time sched_dep_time dep_delay arr_time #> <int> <int> <int> <int> <int> <dbl> <int> #> 1 2013 1 1 517 515 2 830 #> 2 2013 1 1 533 529 4 850 #> 3 2013 1 1 542 540 2 923 #> 4 2013 1 1 544 545 -1 1004 #> 5 2013 1 1 554 600 -6 812 #> 6 2013 1 1 554 558 -4 740 #> # ... with 9,338 more rows, and 12 more variables: sched_arr_time <int>, #> # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>, #> # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>, #> # minute <dbl>, time_hour <dttm> Another useful dplyr filtering helper is between(). What does it do? Can you use it to simplify the code needed to answer the previous challenges? between(x, left, right) is equivalent to x >= left & x <= right. I already used it in 1.4. How many flights have a missing dep_time? What other variables are missing? What might these rows represent? filter(flights, is.na(dep_time)) #> # A tibble: 8,255 × 19 #> year month day dep_time sched_dep_time dep_delay arr_time #> <int> <int> <int> <int> <int> <dbl> <int> #> 1 2013 1 1 NA 1630 NA NA #> 2 2013 1 1 NA 1935 NA NA #> 3 2013 1 1 NA 1500 NA NA #> 4 2013 1 1 NA 600 NA NA #> 5 2013 1 2 NA 1540 NA NA #> 6 2013 1 2 NA 1620 NA NA #> # ... with 8,249 more rows, and 12 more variables: sched_arr_time <int>, #> # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>, #> # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>, #> # minute <dbl>, time_hour <dttm> Since arr_time is also missing, these are cancelled flights. Why is NA ^ 0 not missing? Why is NA | TRUE not missing? Why is FALSE & NA not missing? Can you figure out the general rule? (NA * 0 is a tricky counterexample!) NA ^ 0 == 1 since for all numeric values \\(x ^ 0 = 1\\). 
NA ^ 0 #> [1] 1 NA | TRUE is TRUE because it doesn’t matter whether the missing value is TRUE or FALSE; \\(x \\lor T = T\\) for all values of \\(x\\). NA | TRUE #> [1] TRUE Likewise, anything and FALSE is always FALSE. NA & FALSE #> [1] FALSE Because the value of the missing element matters in NA | FALSE and NA & TRUE, these are missing: NA | FALSE #> [1] NA NA & TRUE #> [1] NA Since \\(x \\times 0 = 0\\) for all finite \\(x\\), we might expect NA * 0 = 0, but that’s not the case. NA * 0 #> [1] NA This is because \\(x \\times 0\\) is NaN, not 0, when \\(x\\) is infinite (Inf or -Inf), so the result of NA * 0 depends on the unknown value of the missing element. 3.4 Arrange Note that missing values are always sorted at the end. 3.4.1 Exercises How could you use arrange() to sort all missing values to the start? (Hint: use is.na()). This sorts by increasing dep_time, but with all missing values put first. arrange(flights, desc(is.na(dep_time)), dep_time) #> # A tibble: 336,776 × 19 #> year month day dep_time sched_dep_time dep_delay arr_time #> <int> <int> <int> <int> <int> <dbl> <int> #> 1 2013 1 1 NA 1630 NA NA #> 2 2013 1 1 NA 1935 NA NA #> 3 2013 1 1 NA 1500 NA NA #> 4 2013 1 1 NA 600 NA NA #> 5 2013 1 2 NA 1540 NA NA #> 6 2013 1 2 NA 1620 NA NA #> # ... with 3.368e+05 more rows, and 12 more variables: #> # sched_arr_time <int>, arr_delay <dbl>, carrier <chr>, flight <int>, #> # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, #> # distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm> Sort flights to find the most delayed flights. Find the flights that left earliest. The most delayed flights are found by sorting by dep_delay in descending order. arrange(flights, desc(dep_delay)) #> # A tibble: 336,776 × 19 #> year month day dep_time sched_dep_time dep_delay arr_time #> <int> <int> <int> <int> <int> <dbl> <int> #> 1 2013 1 9 641 900 1301 1242 #> 2 2013 6 15 1432 1935 1137 1607 #> 3 2013 1 10 1121 1635 1126 1239 #> 4 2013 9 20 1139 1845 1014 1457 #> 5 2013 7 22 845 1600 1005 1044 #> 6 2013 4 10 1100 1900 960 1342 #> # ... with 3.368e+05 more rows, and 12 more variables: #> # sched_arr_time <int>, arr_delay <dbl>, carrier <chr>, flight <int>, #> # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, #> # distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm> If we sort dep_delay in ascending order, we get those that left earliest. There was a flight that left 43 minutes early. arrange(flights, dep_delay) #> # A tibble: 336,776 × 19 #> year month day dep_time sched_dep_time dep_delay arr_time #> <int> <int> <int> <int> <int> <dbl> <int> #> 1 2013 12 7 2040 2123 -43 40 #> 2 2013 2 3 2022 2055 -33 2240 #> 3 2013 11 10 1408 1440 -32 1549 #> 4 2013 1 11 1900 1930 -30 2233 #> 5 2013 1 29 1703 1730 -27 1947 #> 6 2013 8 9 729 755 -26 1002 #> # ... with 3.368e+05 more rows, and 12 more variables: #> # sched_arr_time <int>, arr_delay <dbl>, carrier <chr>, flight <int>, #> # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, #> # distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm> Sort flights to find the fastest flights. I assume that by “fastest flights” it means the flights with the minimum air time, so I sort by air_time. The fastest flights are a couple of flights between EWR and BDL with an air time of 20 minutes. arrange(flights, air_time) #> # A tibble: 336,776 × 19 #> year month day dep_time sched_dep_time dep_delay arr_time #> <int> <int> <int> <int> <int> <dbl> <int> #> 1 2013 1 16 1355 1315 40 1442 #> 2 2013 4 13 537 527 10 622 #> 3 2013 12 6 922 851 31 1021 #> 4 2013 2 3 2153 2129 24 2247 #> 5 2013 2 5 1303 1315 -12 1342 #> 6 2013 2 12 2123 2130 -7 2211 #> # ...
with 3.368e+05 more rows, and 12 more variables: #> # sched_arr_time <int>, arr_delay <dbl>, carrier <chr>, flight <int>, #> # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, #> # distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm> Which flights travelled the longest? Which travelled the shortest? I’ll assume that travelled the longest or shortest refers to distance, rather than air time. The longest flights are the Hawaiian Airlines flights (HA 51) between JFK and HNL (Honolulu) at 4,983 miles. arrange(flights, desc(distance)) #> # A tibble: 336,776 × 19 #> year month day dep_time sched_dep_time dep_delay arr_time #> <int> <int> <int> <int> <int> <dbl> <int> #> 1 2013 1 1 857 900 -3 1516 #> 2 2013 1 2 909 900 9 1525 #> 3 2013 1 3 914 900 14 1504 #> 4 2013 1 4 900 900 0 1516 #> 5 2013 1 5 858 900 -2 1519 #> 6 2013 1 6 1019 900 79 1558 #> # ... with 3.368e+05 more rows, and 12 more variables: #> # sched_arr_time <int>, arr_delay <dbl>, carrier <chr>, flight <int>, #> # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, #> # distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm> Apart from an EWR to LGA flight that was cancelled, the shortest flights are the Envoy Air flights between EWR and PHL at 80 miles. arrange(flights, distance) #> # A tibble: 336,776 × 19 #> year month day dep_time sched_dep_time dep_delay arr_time #> <int> <int> <int> <int> <int> <dbl> <int> #> 1 2013 7 27 NA 106 NA NA #> 2 2013 1 3 2127 2129 -2 2222 #> 3 2013 1 4 1240 1200 40 1333 #> 4 2013 1 4 1829 1615 134 1937 #> 5 2013 1 4 2128 2129 -1 2218 #> 6 2013 1 5 1155 1200 -5 1241 #> # ... with 3.368e+05 more rows, and 12 more variables: #> # sched_arr_time <int>, arr_delay <dbl>, carrier <chr>, flight <int>, #> # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, #> # distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm> Brainstorm as many ways as possible to select dep_time, dep_delay, arr_time, and arr_delay from flights. A few ways include: select(flights, dep_time, dep_delay, arr_time, arr_delay) #> # A tibble: 336,776 × 4 #> dep_time dep_delay arr_time arr_delay #> <int> <dbl> <int> <dbl> #> 1 517 2 830 11 #> 2 533 4 850 20 #> 3 542 2 923 33 #> 4 544 -1 1004 -18 #> 5 554 -6 812 -25 #> 6 554 -4 740 12 #> # ... with 3.368e+05 more rows select(flights, starts_with("dep_"), starts_with("arr_")) #> # A tibble: 336,776 × 4 #> dep_time dep_delay arr_time arr_delay #> <int> <dbl> <int> <dbl> #> 1 517 2 830 11 #> 2 533 4 850 20 #> 3 542 2 923 33 #> 4 544 -1 1004 -18 #> 5 554 -6 812 -25 #> 6 554 -4 740 12 #> # ... with 3.368e+05 more rows select(flights, matches("^(dep|arr)_(time|delay)$")) #> # A tibble: 336,776 × 4 #> dep_time dep_delay arr_time arr_delay #> <int> <dbl> <int> <dbl> #> 1 517 2 830 11 #> 2 533 4 850 20 #> 3 542 2 923 33 #> 4 544 -1 1004 -18 #> 5 554 -6 812 -25 #> 6 554 -4 740 12 #> # ... with 3.368e+05 more rows Using ends_with() doesn’t work well, since it would also get sched_arr_time and sched_dep_time. What happens if you include the name of a variable multiple times in a select() call? It ignores the duplicates, and that variable is only included once. No error, warning, or message is emitted. select(flights, year, month, day, year, year) #> # A tibble: 336,776 × 3 #> year month day #> <int> <int> <int> #> 1 2013 1 1 #> 2 2013 1 1 #> 3 2013 1 1 #> 4 2013 1 1 #> 5 2013 1 1 #> 6 2013 1 1 #> # ... with 3.368e+05 more rows What does the one_of() function do? Why might it be helpful in conjunction with this vector?
The one_of() function allows you to select variables with a character vector rather than as unquoted variable names. It’s useful because then you can easily pass vectors to select(). vars <- c("year", "month", "day", "dep_delay", "arr_delay") select(flights, one_of(vars)) #> # A tibble: 336,776 × 5 #> year month day dep_delay arr_delay #> <int> <int> <int> <dbl> <dbl> #> 1 2013 1 1 2 11 #> 2 2013 1 1 4 20 #> 3 2013 1 1 2 33 #> 4 2013 1 1 -1 -18 #> 5 2013 1 1 -6 -25 #> 6 2013 1 1 -4 12 #> # ... with 3.368e+05 more rows Does the result of running the following code surprise you? How do the select helpers deal with case by default? How can you change that default? select(flights, contains("TIME")) #> # A tibble: 336,776 × 6 #> dep_time sched_dep_time arr_time sched_arr_time air_time #> <int> <int> <int> <int> <dbl> #> 1 517 515 830 819 227 #> 2 533 529 850 830 227 #> 3 542 540 923 850 160 #> 4 544 545 1004 1022 183 #> 5 554 600 812 837 116 #> 6 554 558 740 728 150 #> # ... with 3.368e+05 more rows, and 1 more variables: time_hour <dttm> The default behavior for contains is to ignore case. Yes, it surprises me. Upon reflection, I realized that this is likely the default behavior because dplyr is designed to deal with a variety of data backends, and some database engines don’t differentiate case. To change the behavior, add the argument ignore.case = FALSE. Now no variables are selected. select(flights, contains("TIME", ignore.case = FALSE)) #> # A tibble: 336,776 × 0 3.5 Mutate 3.5.1 Exercises Currently dep_time and sched_dep_time are convenient to look at, but hard to compute with because they’re not really continuous numbers. Convert them to a more convenient representation of number of minutes since midnight. To get the departure times in the number of minutes, (integer) divide dep_time by 100 to get the hours since midnight, multiply by 60, and add the remainder of dep_time divided by 100. mutate(flights, dep_time_mins = dep_time %/% 100 * 60 + dep_time %% 100, sched_dep_time_mins = sched_dep_time %/% 100 * 60 + sched_dep_time %% 100) %>% select(dep_time, dep_time_mins, sched_dep_time, sched_dep_time_mins) #> # A tibble: 336,776 × 4 #> dep_time dep_time_mins sched_dep_time sched_dep_time_mins #> <int> <dbl> <int> <dbl> #> 1 517 317 515 315 #> 2 533 333 529 329 #> 3 542 342 540 340 #> 4 544 344 545 345 #> 5 554 354 600 360 #> 6 554 354 558 358 #> # ... with 3.368e+05 more rows This would be more cleanly done by first defining a function and reusing it: time2mins <- function(x) { x %/% 100 * 60 + x %% 100 } mutate(flights, dep_time_mins = time2mins(dep_time), sched_dep_time_mins = time2mins(sched_dep_time)) %>% select(dep_time, dep_time_mins, sched_dep_time, sched_dep_time_mins) #> # A tibble: 336,776 × 4 #> dep_time dep_time_mins sched_dep_time sched_dep_time_mins #> <int> <dbl> <int> <dbl> #> 1 517 317 515 315 #> 2 533 333 529 329 #> 3 542 342 540 340 #> 4 544 344 545 345 #> 5 554 354 600 360 #> 6 554 354 558 358 #> # ... with 3.368e+05 more rows Compare air_time with arr_time - dep_time. What do you expect to see? What do you see? What do you need to do to fix it? Since arr_time and dep_time are stored as clock times in HHMM format, and the departure and arrival airports may be in different time zones, air_time doesn’t equal the difference. We would need to convert the times to minutes and account for time zones in these calculations.
mutate(flights, air_time2 = arr_time - dep_time, air_time_diff = air_time2 - air_time) %>% filter(air_time_diff != 0) %>% select(air_time, air_time2, dep_time, arr_time, dest) #> # A tibble: 326,128 × 5 #> air_time air_time2 dep_time arr_time dest #> <dbl> <int> <int> <int> <chr> #> 1 227 313 517 830 IAH #> 2 227 317 533 850 IAH #> 3 160 381 542 923 MIA #> 4 183 460 544 1004 BQN #> 5 116 258 554 812 ATL #> 6 150 186 554 740 ORD #> # ... with 3.261e+05 more rows Compare dep_time, sched_dep_time, and dep_delay. How would you expect those three numbers to be related? I’d expect dep_time, sched_dep_time, and dep_delay to be related so that dep_time - sched_dep_time = dep_delay. mutate(flights, dep_delay2 = dep_time - sched_dep_time) %>% filter(dep_delay2 != dep_delay) %>% select(dep_time, sched_dep_time, dep_delay, dep_delay2) #> # A tibble: 99,777 × 4 #> dep_time sched_dep_time dep_delay dep_delay2 #> <int> <int> <dbl> <int> #> 1 554 600 -6 -46 #> 2 555 600 -5 -45 #> 3 557 600 -3 -43 #> 4 557 600 -3 -43 #> 5 558 600 -2 -42 #> 6 558 600 -2 -42 #> # ... with 9.977e+04 more rows Oops, I forgot to convert to minutes. I’ll reuse the time2mins function I wrote earlier. mutate(flights, dep_delay2 = time2mins(dep_time) - time2mins(sched_dep_time)) %>% filter(dep_delay2 != dep_delay) %>% select(dep_time, sched_dep_time, dep_delay, dep_delay2) #> # A tibble: 1,207 × 4 #> dep_time sched_dep_time dep_delay dep_delay2 #> <int> <int> <dbl> <dbl> #> 1 848 1835 853 -587 #> 2 42 2359 43 -1397 #> 3 126 2250 156 -1284 #> 4 32 2359 33 -1407 #> 5 50 2145 185 -1255 #> 6 235 2359 156 -1284 #> # ... with 1,201 more rows Well, that solved most of the problems, but these two numbers don’t match because we aren’t accounting for flights where the departure time is on the next day from the scheduled departure time. Find the 10 most delayed flights using a ranking function. How do you want to handle ties? Carefully read the documentation for min_rank(). I’d want to handle ties by taking the minimum of tied values. If three flights have the same value and are the most delayed, we would say they are tied for first, not tied for third or second. mutate(flights, dep_delay_rank = min_rank(-dep_delay)) %>% arrange(dep_delay_rank) %>% filter(dep_delay_rank <= 10) #> # A tibble: 10 × 20 #> year month day dep_time sched_dep_time dep_delay arr_time #> <int> <int> <int> <int> <int> <dbl> <int> #> 1 2013 1 9 641 900 1301 1242 #> 2 2013 6 15 1432 1935 1137 1607 #> 3 2013 1 10 1121 1635 1126 1239 #> 4 2013 9 20 1139 1845 1014 1457 #> 5 2013 7 22 845 1600 1005 1044 #> 6 2013 4 10 1100 1900 960 1342 #> # ... with 4 more rows, and 13 more variables: sched_arr_time <int>, #> # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>, #> # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>, #> # minute <dbl>, time_hour <dttm>, dep_delay_rank <int> What does 1:3 + 1:10 return? Why? It returns c(1 + 1, 2 + 2, 3 + 3, 1 + 4, 2 + 5, 3 + 6, 1 + 7, 2 + 8, 3 + 9, 1 + 10). When adding two vectors, R recycles the shorter vector’s values to get vectors of the same length. We get a warning since the length of the longer vector is not a multiple of the length of the shorter one; this often, but not necessarily, means we made an error somewhere (a short rep() illustration appears at the end of this section). 1:3 + 1:10 #> Warning in 1:3 + 1:10: longer object length is not a multiple of shorter #> object length #> [1] 2 4 6 5 7 9 8 10 12 11 What trigonometric functions does R provide? All the classics: cos, sin, tan, acos, asin, atan, plus a few others, such as atan2, that address numerical or computational issues.
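As a side note (my own sketch, not part of the original answer), the recycling in the 1:3 + 1:10 example above can be made explicit with rep(), which reproduces what R does internally and avoids the warning:
rep(1:3, length.out = 10)  # the shorter vector, recycled to the longer length
#> [1] 1 2 3 1 2 3 1 2 3 1
rep(1:3, length.out = 10) + 1:10  # same result as 1:3 + 1:10, but no warning
#> [1] 2 4 6 5 7 9 8 10 12 11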
3.6 Grouped summaries with summarise() 3.6.1 Exercises Brainstorm at least 5 different ways to assess the typical delay characteristics of a group of flights. Consider the following scenarios: A flight is 15 minutes early 50% of the time, and 15 minutes late 50% of the time. A flight is always 10 minutes late. A flight is 30 minutes early 50% of the time, and 30 minutes late 50% of the time. 99% of the time a flight is on time. 1% of the time it’s 2 hours late. Which is more important: arrival delay or departure delay? Arrival delay is more important. Arriving early is nice, but not nearly as good as arriving late is bad. Variation is worse than consistency; if I know the plane will always arrive 10 minutes late, then I can plan for it arriving as if the actual arrival time was 10 minutes later than the scheduled arrival time. So I’d try something that calculates the expected time of the flight, and then aggregates over any delays from that time. I would ignore any early arrival times. A better ranking would also consider cancellations, and need a way to convert them to a delay time (perhaps using the arrival time of the next flight to the same destination). Come up with another approach that will give you the same output as not_cancelled %>% count(dest) and not_cancelled %>% count(tailnum, wt = distance) (without using count()). Our definition of cancelled flights (is.na(dep_delay) | is.na(arr_delay)) is slightly suboptimal. Why? Which is the most important column? If a flight doesn’t depart, then it won’t arrive. A flight can also depart and not arrive if it crashes; I’m not sure how this dataset would handle flights that are redirected and land at other airports for whatever reason. The more important column is arr_delay, so we could just use that. filter(flights, !is.na(dep_delay), is.na(arr_delay)) %>% select(dep_time, arr_time, sched_arr_time, dep_delay, arr_delay) #> # A tibble: 1,175 × 5 #> dep_time arr_time sched_arr_time dep_delay arr_delay #> <int> <int> <int> <dbl> <dbl> #> 1 1525 1934 1805 -5 NA #> 2 1528 2002 1647 29 NA #> 3 1740 2158 2020 -5 NA #> 4 1807 2251 2103 29 NA #> 5 1939 29 2151 59 NA #> 6 1952 2358 2207 22 NA #> # ... with 1,169 more rows Okay, I’m not sure what’s going on in this data. dep_time can be non-missing and arr_delay missing but arr_time not missing. They may be combining different flights? Look at the number of cancelled flights per day. Is there a pattern? Is the proportion of cancelled flights related to the average delay? cancelled_delayed <- flights %>% mutate(cancelled = (is.na(arr_delay) | is.na(dep_delay))) %>% group_by(year, month, day) %>% summarise(prop_cancelled = mean(cancelled), avg_dep_delay = mean(dep_delay, na.rm = TRUE)) ggplot(cancelled_delayed, aes(x = avg_dep_delay, prop_cancelled)) + geom_point() + geom_smooth() #> `geom_smooth()` using method = 'loess' Which carrier has the worst delays? Challenge: can you disentangle the effects of bad airports vs. bad carriers? Why/why not? (Hint: think about flights %>% group_by(carrier, dest) %>% summarise(n())) flights %>% group_by(carrier) %>% summarise(arr_delay = mean(arr_delay, na.rm = TRUE)) %>% arrange(desc(arr_delay)) #> # A tibble: 16 × 2 #> carrier arr_delay #> <chr> <dbl> #> 1 F9 21.9 #> 2 FL 20.1 #> 3 EV 15.8 #> 4 YV 15.6 #> 5 OO 11.9 #> 6 MQ 10.8 #> # ... with 10 more rows filter(airlines, carrier == "F9") #> # A tibble: 1 × 2 #> carrier name #> <chr> <chr> #> 1 F9 Frontier Airlines Inc. Frontier Airlines (F9) has the worst delays.
You can get part of the way to disentangling the effects of airports vs. carriers by comparing each flight’s delay to the average delay of its destination airport. However, you’d really want to compare it to the average delay of the destination airport after removing other flights from the same airline. 538 has done something like this: http://fivethirtyeight.com/features/the-best-and-worst-airlines-airports-and-flights-summer-2015-update/. For each plane, count the number of flights before the first delay of greater than 1 hour. I think this requires a grouped mutate (but I may be wrong): flights %>% arrange(tailnum, year, month, day) %>% group_by(tailnum) %>% mutate(delay_gt1hr = dep_delay > 60) %>% mutate(before_delay = cumsum(delay_gt1hr)) %>% filter(before_delay < 1) %>% count(sort = TRUE) #> # A tibble: 3,755 × 2 #> tailnum n #> <chr> <int> #> 1 N954UW 206 #> 2 N952UW 163 #> 3 N957UW 142 #> 4 N5FAAA 117 #> 5 N38727 99 #> 6 N3742C 98 #> # ... with 3,749 more rows What does the sort argument to count() do? When might you use it? The sort argument to count sorts the results in order of n. You could use it anytime you would do count followed by arrange. 3.7 Grouped mutates and filters 3.7.1 Exercises Refer back to the table of useful mutate and filtering functions. Describe how each operation changes when you combine it with grouping. They operate within each group rather than over the entire data frame. E.g. mean will calculate the mean within each group. Which plane (tailnum) has the worst on-time record? flights %>% group_by(tailnum) %>% summarise(arr_delay = mean(arr_delay, na.rm = TRUE)) %>% ungroup() %>% filter(rank(desc(arr_delay)) <= 1) #> # A tibble: 1 × 2 #> tailnum arr_delay #> <chr> <dbl> #> 1 N844MH 320 What time of day should you fly if you want to avoid delays as much as possible? Let’s group by hour. The earlier you fly, the better. This is intuitive, as delays early in the morning are likely to propagate throughout the day. flights %>% group_by(hour) %>% summarise(arr_delay = mean(arr_delay, na.rm = TRUE)) %>% ungroup() %>% arrange(arr_delay) #> # A tibble: 20 × 2 #> hour arr_delay #> <dbl> <dbl> #> 1 7 -5.304 #> 2 5 -4.797 #> 3 6 -3.384 #> 4 9 -1.451 #> 5 8 -1.113 #> 6 10 0.954 #> # ... with 14 more rows For each destination, compute the total minutes of delay. For each flight, compute the proportion of the total delay for its destination. flights %>% filter(!is.na(arr_delay), arr_delay > 0) %>% group_by(dest) %>% mutate(total_delay = sum(arr_delay), prop_delay = arr_delay / sum(arr_delay)) #> Source: local data frame [133,004 x 21] #> Groups: dest [103] #> #> year month day dep_time sched_dep_time dep_delay arr_time #> <int> <int> <int> <int> <int> <dbl> <int> #> 1 2013 1 1 517 515 2 830 #> 2 2013 1 1 533 529 4 850 #> 3 2013 1 1 542 540 2 923 #> 4 2013 1 1 554 558 -4 740 #> 5 2013 1 1 555 600 -5 913 #> 6 2013 1 1 558 600 -2 753 #> # ... with 1.33e+05 more rows, and 14 more variables: #> # sched_arr_time <int>, arr_delay <dbl>, carrier <chr>, flight <int>, #> # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, #> # distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>, #> # total_delay <dbl>, prop_delay <dbl> Alternatively, consider the delay as relative to the minimum delay for any flight to that destination. Now all non-cancelled flights have a proportion.
flights %>% filter(!is.na(arr_delay), arr_delay > 0) %>% group_by(dest) %>% mutate(total_delay = sum(arr_delay - min(arr_delay)), prop_delay = arr_delay / sum(arr_delay)) #> Source: local data frame [133,004 x 21] #> Groups: dest [103] #> #> year month day dep_time sched_dep_time dep_delay arr_time #> <int> <int> <int> <int> <int> <dbl> <int> #> 1 2013 1 1 517 515 2 830 #> 2 2013 1 1 533 529 4 850 #> 3 2013 1 1 542 540 2 923 #> 4 2013 1 1 554 558 -4 740 #> 5 2013 1 1 555 600 -5 913 #> 6 2013 1 1 558 600 -2 753 #> # ... with 1.33e+05 more rows, and 14 more variables: #> # sched_arr_time <int>, arr_delay <dbl>, carrier <chr>, flight <int>, #> # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, #> # distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>, #> # total_delay <dbl>, prop_delay <dbl> Delays are typically temporally correlated: even once the problem that caused the initial delay has been resolved, later flights are delayed to allow earlier flights to leave. Using lag() explore how the delay of a flight is related to the delay of the immediately preceding flight. We want to group by day to avoid taking the lag from the previous day. I also want to use departure delay, since this mechanism is relevant for departures, and I remove missing values both before and after calculating the lag delay. However, it would be interesting to ask the probability or average delay after a cancellation. flights %>% group_by(year, month, day) %>% filter(!is.na(dep_delay)) %>% mutate(lag_delay = lag(dep_delay)) %>% filter(!is.na(lag_delay)) %>% ggplot(aes(x = dep_delay, y = lag_delay)) + geom_point() + geom_smooth() #> `geom_smooth()` using method = 'gam' Look at each destination. Can you find flights that are suspiciously fast? (i.e. flights that represent a potential data entry error). Compute the air time of a flight relative to the shortest flight to that destination. Which flights were most delayed in the air? The short BOS and PHL flights, which take around 20 minutes on routes that usually take 30+ minutes, seem plausible, though entries off by a few minutes can easily create large relative changes. I assume that departure time has a standardized definition, but I’m not sure; if there is some discretion, that could create errors that are small in absolute time, but large in relative time for short flights. The ATL, GSP, and BNA flights look a little suspicious, as they take almost half the usual time for those routes. flights %>% filter(!is.na(air_time)) %>% group_by(dest) %>% mutate(med_time = median(air_time), fast = (air_time - med_time) / med_time) %>% arrange(fast) %>% select(air_time, med_time, fast, dep_time, sched_dep_time, arr_time, sched_arr_time) %>% head(15) #> Adding missing grouping variables: `dest` #> Source: local data frame [15 x 8] #> Groups: dest [9] #> #> dest air_time med_time fast dep_time sched_dep_time arr_time #> <chr> <dbl> <dbl> <dbl> <int> <int> <int> #> 1 BOS 21 38 -0.447 1450 1500 1547 #> 2 ATL 65 112 -0.420 1709 1700 1923 #> 3 GSP 55 92 -0.402 2040 2025 2225 #> 4 BOS 23 38 -0.395 1954 2000 2131 #> 5 BNA 70 113 -0.381 1914 1910 2045 #> 6 MSP 93 149 -0.376 1558 1513 1745 #> # ... with 9 more rows, and 1 more variables: sched_arr_time <int> I could also try a z-score. Though the sd and mean will be affected by large delays.
flights %>% filter(!is.na(air_time)) %>% group_by(dest) %>% mutate(air_time_mean = mean(air_time), air_time_sd = sd(air_time), z_score = (air_time - air_time_mean) / air_time_sd) %>% arrange(z_score) %>% select(z_score, air_time_mean, air_time_sd, air_time, dep_time, sched_dep_time, arr_time, sched_arr_time) #> Adding missing grouping variables: `dest` #> Source: local data frame [327,346 x 9] #> Groups: dest [104] #> #> dest z_score air_time_mean air_time_sd air_time dep_time sched_dep_time #> <chr> <dbl> <dbl> <dbl> <dbl> <int> <int> #> 1 MSP -4.90 150.6 11.75 93 1558 1513 #> 2 ATL -4.88 112.9 9.81 65 1709 1700 #> 3 GSP -4.72 93.4 8.13 55 2040 2025 #> 4 BNA -4.05 114.4 10.96 70 1914 1910 #> 5 CVG -3.98 96.0 8.52 62 1359 1343 #> 6 BOS -3.63 39.0 4.95 21 1450 1500 #> # ... with 3.273e+05 more rows, and 2 more variables: arr_time <int>, #> # sched_arr_time <int> flights %>% filter(!is.na(air_time)) %>% group_by(dest) %>% mutate(air_time_diff = air_time - min(air_time)) %>% arrange(desc(air_time_diff)) %>% select(dest, year, month, day, carrier, flight, air_time_diff, air_time, dep_time, arr_time) %>% head() #> Source: local data frame [6 x 10] #> Groups: dest [5] #> #> dest year month day carrier flight air_time_diff air_time dep_time #> <chr> <int> <int> <int> <chr> <int> <dbl> <dbl> <int> #> 1 SFO 2013 7 28 DL 841 195 490 1727 #> 2 LAX 2013 11 22 DL 426 165 440 1812 #> 3 EGE 2013 1 28 AA 575 163 382 1806 #> 4 DEN 2013 9 10 UA 745 149 331 1513 #> 5 LAX 2013 7 10 DL 17 147 422 1814 #> 6 LAS 2013 11 22 UA 587 143 399 2142 #> # ... with 1 more variables: arr_time <int> Find all destinations that are flown by at least two carriers. Use that information to rank the carriers. The carrier that flies to the most locations is ExpressJet Airlines (EV). ExpressJet is a regional airline and a partner for major airlines, so it’s one of those that fly small planes to nearby airports. flights %>% group_by(dest, carrier) %>% count(carrier) %>% group_by(carrier) %>% count(sort = TRUE) #> # A tibble: 16 × 2 #> carrier nn #> <chr> <int> #> 1 EV 61 #> 2 9E 49 #> 3 UA 47 #> 4 B6 42 #> 5 DL 40 #> 6 MQ 20 #> # ... with 10 more rows filter(airlines, carrier == "EV") #> # A tibble: 1 × 2 #> carrier name #> <chr> <chr> #> 1 EV ExpressJet Airlines Inc. "], -["exploratory-data-analysis.html", "4 Exploratory Data Analysis 4.1 Introduction 4.2 Missing Values 4.3 Covariation", " 4 Exploratory Data Analysis 4.1 Introduction library("tidyverse") library("viridis") library("forcats") This will also use data from nycflights13, library("nycflights13") 4.1.1 Questions 4.1.2 Variation 4.1.2.1 Exercises 1. Explore the distribution of each of the x, y, and z variables in diamonds. What do you learn? Think about a diamond and how you might decide which dimension is the length, width, and depth. In order to make it easier to plot them, I’ll reshape the dataset so that I can use the variables as facets. diamonds %>% mutate(id = row_number()) %>% select(x, y, z, id) %>% gather(variable, value, -id) %>% ggplot(aes(x = value)) + geom_density() + geom_rug() + facet_grid(variable ~ .) There are several noticeable features of the distributions: they are right skewed, with most diamonds small but a few very large ones; there is an outlier in y and z (see the rug); and all three distributions are bimodal (perhaps due to some sort of threshold). According to the documentation for diamonds: x is length, y is width, and z is depth. I don’t know if I would have figured that out before; maybe if there was data on the type of cuts. 2.
Explore the distribution of price. Do you discover anything unusual or surprising? (Hint: Carefully think about the binwidth and make sure you try a wide range of values.) The price data is spiky, but I can’t tell what that corresponds to, as the following plots don’t show much difference in the distributions in the last one or two digits. There are no diamonds with a price of around $1,500, and there’s a bulge in the distribution around $7,500. ggplot(filter(diamonds, price < 2500), aes(x = price)) + geom_histogram(binwidth = 10, center = 0) ggplot(filter(diamonds), aes(x = price)) + geom_histogram(binwidth = 100, center = 0) Distribution of last digit diamonds %>% mutate(ending = price %% 10) %>% ggplot(aes(x = ending)) + geom_histogram(binwidth = 1, center = 0) + geom_bar() diamonds %>% mutate(ending = price %% 100) %>% ggplot(aes(x = ending)) + geom_histogram(binwidth = 1) + geom_bar() diamonds %>% mutate(ending = price %% 1000) %>% filter(ending >= 500, ending <= 800) %>% ggplot(aes(x = ending)) + geom_histogram(binwidth = 1) + geom_bar() 3. How many diamonds are 0.99 carat? How many are 1 carat? What do you think is the cause of the difference? There are almost 70 times as many 1 carat diamonds as 0.99 carat diamonds. diamonds %>% filter(carat >= 0.99, carat <= 1) %>% count(carat) #> # A tibble: 2 × 2 #> carat n #> <dbl> <int> #> 1 0.99 23 #> 2 1.00 1558 I don’t know exactly the process behind how carats are measured, but some way or another some diamonds’ carat values are being “rounded up”, because presumably there is a premium for a 1 carat diamond vs. a 0.99 carat diamond beyond the expected increase in price due to a 0.01 carat increase. To check this intuition, we’d want to look at the number of diamonds in each carat range to see if there is an abnormally low number at 0.99 carats, and an abnormally high number at 1 carat. diamonds %>% filter(carat >= 0.9, carat <= 1.1) %>% count(carat) %>% print(n = 30) #> # A tibble: 21 × 2 #> carat n #> <dbl> <int> #> 1 0.90 1485 #> 2 0.91 570 #> 3 0.92 226 #> 4 0.93 142 #> 5 0.94 59 #> 6 0.95 65 #> 7 0.96 103 #> 8 0.97 59 #> 9 0.98 31 #> 10 0.99 23 #> 11 1.00 1558 #> 12 1.01 2242 #> 13 1.02 883 #> 14 1.03 523 #> 15 1.04 475 #> 16 1.05 361 #> 17 1.06 373 #> 18 1.07 342 #> 19 1.08 246 #> 20 1.09 287 #> 21 1.10 278 Question: can you think of other examples of similar phenomena, in areas related to your research, where you might expect to see similar discontinuities? Compare and contrast coord_cartesian() vs xlim() or ylim() when zooming in on a histogram. What happens if you leave binwidth unset? What happens if you try and zoom so only half a bar shows? coord_cartesian simply zooms in on the area specified by the limits. The calculation of the histogram is unaffected. ggplot(diamonds) + geom_histogram(mapping = aes(x = price)) + coord_cartesian(xlim = c(100, 5000), ylim = c(0, 3000)) #> `stat_bin()` using `bins = 30`. Pick better value with `binwidth`. However, the xlim and ylim functions first drop any values outside the limits (the ylim doesn’t matter in this case), then calculate the histogram, and draw the graph with the given limits. ggplot(diamonds) + geom_histogram(mapping = aes(x = price)) + xlim(100, 5000) + ylim(0, 3000) #> `stat_bin()` using `bins = 30`. Pick better value with `binwidth`. #> Warning: Removed 14714 rows containing non-finite values (stat_bin). #> Warning: Removed 5 rows containing missing values (geom_bar). 4.2 Missing Values 4.2.1 Exercises What happens to missing values in a histogram?
What happens to missing values in a bar chart? Why is there a difference? Missing values are removed when the number of observations in each bin is calculated. See the warning message: Removed 9 rows containing non-finite values (stat_bin) diamonds2 <- diamonds %>% mutate(y = ifelse(y < 3 | y > 20, NA, y)) ggplot(diamonds2, aes(x = y)) + geom_histogram() #> `stat_bin()` using `bins = 30`. Pick better value with `binwidth`. #> Warning: Removed 9 rows containing non-finite values (stat_bin). In geom_bar, NA is treated as another category. This is because the x aesthetic in geom_bar should be a discrete (categorical) variable, and missing values are just another category. diamonds %>% mutate(cut = if_else(runif(n()) < 0.1, NA_character_, as.character(cut))) %>% ggplot() + geom_bar(mapping = aes(x = cut)) In a histogram, the x aesthetic variable needs to be numeric, and stat_bin groups the observations by ranges into bins. Since the numeric value of the NA observations is unknown, they cannot be placed in a particular bin, and are dropped. What does na.rm = TRUE do in mean() and sum()? This option removes NA values from the vector prior to calculating the mean and sum. mean(c(0, 1, 2, NA), na.rm = TRUE) #> [1] 1 sum(c(0, 1, 2, NA), na.rm = TRUE) #> [1] 3 4.3 Covariation 4.3.1 A categorical and continuous variable For a history of the boxplot see Wickham, “40 years of boxplots” (http://vita.had.co.nz/papers/boxplots.pdf), and Krzywinski, Martin, and Naomi Altman. 2014. “Points of Significance: Visualizing samples with box plots.” Nature Methods. Where does the 1.5 x IQR come from? It’s kind of arbitrary. But in a standard normal distribution, the IQR is approximately 1.35 standard deviations, so the whiskers at 1.5 x IQR beyond the quartiles extend to roughly 2.7 standard deviations from the median (mean), and points beyond that are flagged as outliers. 4.3.1.1 Exercises Use what you’ve learned to improve the visualisation of the departure times of cancelled vs. non-cancelled flights. Instead of a frequency polygon, use a boxplot: nycflights13::flights %>% mutate( cancelled = is.na(dep_time), sched_hour = sched_dep_time %/% 100, sched_min = sched_dep_time %% 100, sched_dep_time = sched_hour + sched_min / 60 ) %>% ggplot() + geom_boxplot(mapping = aes(y = sched_dep_time, x = cancelled)) What variable in the diamonds dataset is most important for predicting the price of a diamond? How is that variable correlated with cut? Why does the combination of those two relationships lead to lower quality diamonds being more expensive? I’m not exactly sure what this question is asking conditional on using only the tools introduced in the book thus far. Install the ggstance package, and create a horizontal boxplot. How does this compare to using coord_flip()? Earlier we created a horizontal boxplot of the distribution of hwy by class, using geom_boxplot and coord_flip: ggplot(data = mpg) + geom_boxplot(mapping = aes(x = reorder(class, hwy, FUN = median), y = hwy)) + coord_flip() In this case the output looks the same, but in the aesthetics the x and y are flipped from the previous case. library("ggstance") ggplot(data = mpg) + geom_boxploth(mapping = aes(y = reorder(class, hwy, FUN = median), x = hwy)) One problem with boxplots is that they were developed in an era of much smaller datasets and tend to display a prohibitively large number of “outlying values”. One approach to remedy this problem is the letter value plot. Install the lvplot package, and try using geom_lv() to display the distribution of price vs cut. What do you learn? How do you interpret the plots?
The boxes of the letter-value plot correspond to many more quantiles. They are useful for larger datasets because (1) larger datasets can give precise estimates of quantiles beyond the quartiles, and (2) in expectation, larger datasets should have many more outliers. The letter-value plot is described in: Heike Hofmann, Karen Kafadar, and Hadley Wickham. 2011. “Letter-value plots: Boxplots for large data” http://vita.had.co.nz/papers/letter-value-plot.pdf library("lvplot") ggplot(diamonds, aes(x = cut, y = price)) + geom_lv() Compare and contrast geom_violin() with a facetted geom_histogram(), or a coloured geom_freqpoly(). What are the pros and cons of each method? I produce plots for these three methods below. The geom_freqpoly is better for look-up: meaning that given a price, it is easy to tell which cut has the highest density. However, the overlapping lines make it difficult to distinguish how the overall distributions relate to each other. The geom_violin and facetted geom_histogram have similar strengths and weaknesses. It is easy to visually distinguish differences in the overall shape of the distributions (skewness, central values, variance, etc). However, since we can’t easily compare the vertical values of the distribution, it’s difficult to look up which category has the highest density for a given price. All of these methods depend on tuning parameters to determine the level of smoothness of the distribution. ggplot(data = diamonds, mapping = aes(x = price, y = ..density..)) + geom_freqpoly(mapping = aes(colour = cut), binwidth = 500) ggplot(data = diamonds, mapping = aes(x = price)) + geom_histogram() + facet_wrap(~ cut, ncol = 1, scales = "free_y") #> `stat_bin()` using `bins = 30`. Pick better value with `binwidth`. ggplot(data = diamonds, mapping = aes(x = cut, y = price)) + geom_violin() + coord_flip() The violin plot was first described in Hintze JL, Nelson RD (1998). “Violin Plots: A Box Plot-Density Trace Synergism.” The American Statistician, 52(2), 181–184 If you have a small dataset, it’s sometimes useful to use geom_jitter() to see the relationship between a continuous and categorical variable. The ggbeeswarm package provides a number of methods similar to geom_jitter(). List them and briefly describe what each one does. There are two geoms: geom_quasirandom, which produces plots that resemble something between jitter and violin plots (several different methods determine exactly how the random location of the points is generated); and geom_beeswarm, which creates a shape similar to a violin plot by offsetting the points. I’ll use the mpg boxplot example; since these methods display individual points, they are better suited for smaller datasets.
library("ggbeeswarm") ggplot(data = mpg) + geom_quasirandom(mapping = aes(x = reorder(class, hwy, FUN = median), y = hwy)) ggplot(data = mpg) + geom_quasirandom(mapping = aes(x = reorder(class, hwy, FUN = median), y = hwy), method = "tukey") ggplot(data = mpg) + geom_quasirandom(mapping = aes(x = reorder(class, hwy, FUN = median), y = hwy), method = "tukeyDense") ggplot(data = mpg) + geom_quasirandom(mapping = aes(x = reorder(class, hwy, FUN = median), y = hwy), method = "frowney") ggplot(data = mpg) + geom_quasirandom(mapping = aes(x = reorder(class, hwy, FUN = median), y = hwy), method = "smiley") ggplot(data = mpg) + geom_beeswarm(mapping = aes(x = reorder(class, hwy, FUN = median), y = hwy)) 4.3.2 Two categorical variables How could you rescale the count dataset above to more clearly show the distribution of cut within colour, or colour within cut? TO clearly show the distribution of cut within color, calculate a new variable prop which is the proportion of each cut within a color. This is done using a grouped mutate. diamonds %>% count(color, cut) %>% group_by(color) %>% mutate(prop = n / sum(n)) %>% ggplot(mapping = aes(x = color, y = cut)) + geom_tile(mapping = aes(fill = prop)) + scale_fill_viridis(limits = c(0, 1)) Similarly, to scale by the distribution of color within cut, diamonds %>% count(color, cut) %>% group_by(cut) %>% mutate(prop = n / sum(n)) %>% ggplot(mapping = aes(x = color, y = cut)) + geom_tile(mapping = aes(fill = prop)) + scale_fill_viridis(limits = c(0, 1)) I add limit = c(0, 1) to put the color scale between (0, 1). These are the logical boundaries of proportions. This makes it possible to compare each cell to its actual value, and would improve comparisons across multiple plots. However, it ends up limiting the colors and makes it harder to compare within the dataset. However, using the default limits of the minimum and maximum values makes it easier to compare within the dataset the emphasizing relative differences, but harder to compare across datasets. Use geom_tile() together with dplyr to explore how average flight delays vary by destination and month of year. What makes the plot difficult to read? How could you improve it? flights %>% group_by(month, dest) %>% summarise(dep_delay = mean(dep_delay, na.rm = TRUE)) %>% ggplot(aes(x = factor(month), y = dest, fill = dep_delay)) + geom_tile() + labs(x = "Month", y = "Destination", fill = "Departure Delay") There are several things that could be done to improve it, sort destinations by a meaningful quanity (distance, number of flights, average delay) remove missing values better color scheme (viridis) How to treat missing values is difficult. In this case, missing values correspond to airports which don’t have regular flights (at least one flight each month) from NYC. These are likely smaller airports (with higher variance in their average due to fewer observations). library("viridis") flights %>% group_by(month, dest) %>% summarise(dep_delay = mean(dep_delay, na.rm = TRUE)) %>% group_by(dest) %>% filter(n() == 12) %>% ungroup() %>% mutate(dest = fct_reorder(dest, dep_delay)) %>% ggplot(aes(x = factor(month), y = dest, fill = dep_delay)) + geom_tile() + scale_fill_viridis() + labs(x = "Month", y = "Destination", fill = "Departure Delay") Why is it slightly better to use aes(x = color, y = cut) rather than aes(x = cut, y = color) in the example above? It’s usually better to use the categorical variable with a larger number of categories or the longer labels on the y axis. 
If at all possible, labels should be horizontal because that is easier to read. However, switching the order doesn’t result in overlapping labels. diamonds %>% count(color, cut) %>% ggplot(mapping = aes(y = color, x = cut)) + geom_tile(mapping = aes(fill = n)) Another justification for switching the order is that the larger numbers are at the top when x = color and y = cut, and that lowers the cognitive burden of interpreting the plot. 4.3.3 Two continuous variables Instead of summarising the conditional distribution with a boxplot, you could use a frequency polygon. What do you need to consider when using cut_width() vs cut_number()? How does that impact a visualisation of the 2d distribution of carat and price? When using cut_width, the number of observations in each bin may be unequal. The distribution of carat is right skewed, so there are few diamonds in those bins. ggplot(data = diamonds, mapping = aes(x = price, colour = cut_width(carat, 0.3))) + geom_freqpoly() #> `stat_bin()` using `bins = 30`. Pick better value with `binwidth`. Plotting the density instead of counts will make the distributions comparable, although the bins with few observations will still be hard to interpret. ggplot(data = diamonds, mapping = aes(x = price, y = ..density.., colour = cut_width(carat, 0.3))) + geom_freqpoly() #> `stat_bin()` using `bins = 30`. Pick better value with `binwidth`. When using cut_number, each bin contains the same number of observations: ggplot(data = diamonds, mapping = aes(x = price, colour = cut_number(carat, 10))) + geom_freqpoly() #> `stat_bin()` using `bins = 30`. Pick better value with `binwidth`. Since there are equal numbers in each bin, the plot looks the same if density is used for the y aesthetic (although the values are on a different scale). ggplot(data = diamonds, mapping = aes(x = price, y = ..density.., colour = cut_number(carat, 10))) + geom_freqpoly() #> `stat_bin()` using `bins = 30`. Pick better value with `binwidth`. Visualise the distribution of carat, partitioned by price. With a boxplot, partitioning into 10 bins with the same number of observations: ggplot(diamonds, aes(x = cut_number(price, 10), y = carat)) + geom_boxplot() + coord_flip() + xlab("Price") With a boxplot, partitioning into bins of $2,000, with the width of the box determined by the number of observations. I use boundary = 0 to ensure the first bin goes from $0–$2,000. ggplot(diamonds, aes(x = cut_width(price, 2000, boundary = 0), y = carat)) + geom_boxplot(varwidth = TRUE) + coord_flip() + xlab("Price") How does the price distribution of very large diamonds compare to small diamonds? Is it as you expect, or does it surprise you? The distribution of very large diamonds is more variable. I’m not surprised, since I had a very weak prior about diamond prices. Ex post, I would reason that above a certain size other factors such as cut, clarity, and color play more of a role in the price. Combine two of the techniques you’ve learned to visualise the combined distribution of cut, carat, and price. There are lots of options to try; here are a couple. What else did you try? What’s the best way?
ggplot(diamonds, aes(x = carat, y = price)) + geom_hex() + facet_wrap(~ cut, ncol = 1) + scale_fill_viridis() #> Loading required package: methods ggplot(diamonds, aes(x = cut_number(carat, 5), y = price, color = cut)) + geom_boxplot() ggplot(diamonds, aes(color = cut_number(carat, 5), y = price, x = cut)) + geom_boxplot() "], -["tibbles.html", "5 Tibbles 5.1 Prerequisites 5.2 Creating Tibbles 5.3 Tibbles vs. data.frame 5.4 Subsetting 5.5 Interacting with older code 5.6 Exercises", " 5 Tibbles 5.1 Prerequisites library("tidyverse") Functions and packages covered in this chapter: package tibble as_tibble, tibble 5.2 Creating Tibbles Why might you want to create non-syntactic variable names? Since variable names are often used in plots (e.g. axis titles) or as headers in tables, having spaces or other characters that are not valid in R variable names can be useful. The functions that use them will have ways to refer to those columns other than by their bare names. 5.3 Tibbles vs. data.frame Discuss the definition of a data frame. What is the traditional R data.frame? In general, discuss how this “dialect” of R relates to base R and other R that they will see. Also, need to discuss types of variables. If nycflights13::flights were printed in the console it would be much worse. Just try it, I dare you. as.data.frame(nycflights13::flights) 5.4 Subsetting Note Warnings about partial matching! What is it and why is it dangerous? 5.5 Interacting with older code Note Not all older functions work with tibbles (one example is the Amelia package); usually because they rely on quirks in data.frame behavior that tibbles “fix”. Use as.data.frame() to turn a tibble back into a data.frame. This is usually because of [ and the way it inconsistently returns a vector or a data frame. With tibbles, [ always returns a data frame. 5.6 Exercises How can you tell if an object is a tibble? (Hint: try printing mtcars, which is a regular data frame).
mtcars #> mpg cyl disp hp drat wt qsec vs am gear carb #> Mazda RX4 21.0 6 160.0 110 3.90 2.62 16.5 0 1 4 4 #> Mazda RX4 Wag 21.0 6 160.0 110 3.90 2.88 17.0 0 1 4 4 #> Datsun 710 22.8 4 108.0 93 3.85 2.32 18.6 1 1 4 1 #> Hornet 4 Drive 21.4 6 258.0 110 3.08 3.21 19.4 1 0 3 1 #> Hornet Sportabout 18.7 8 360.0 175 3.15 3.44 17.0 0 0 3 2 #> Valiant 18.1 6 225.0 105 2.76 3.46 20.2 1 0 3 1 #> Duster 360 14.3 8 360.0 245 3.21 3.57 15.8 0 0 3 4 #> Merc 240D 24.4 4 146.7 62 3.69 3.19 20.0 1 0 4 2 #> Merc 230 22.8 4 140.8 95 3.92 3.15 22.9 1 0 4 2 #> Merc 280 19.2 6 167.6 123 3.92 3.44 18.3 1 0 4 4 #> Merc 280C 17.8 6 167.6 123 3.92 3.44 18.9 1 0 4 4 #> Merc 450SE 16.4 8 275.8 180 3.07 4.07 17.4 0 0 3 3 #> Merc 450SL 17.3 8 275.8 180 3.07 3.73 17.6 0 0 3 3 #> Merc 450SLC 15.2 8 275.8 180 3.07 3.78 18.0 0 0 3 3 #> Cadillac Fleetwood 10.4 8 472.0 205 2.93 5.25 18.0 0 0 3 4 #> Lincoln Continental 10.4 8 460.0 215 3.00 5.42 17.8 0 0 3 4 #> Chrysler Imperial 14.7 8 440.0 230 3.23 5.34 17.4 0 0 3 4 #> Fiat 128 32.4 4 78.7 66 4.08 2.20 19.5 1 1 4 1 #> Honda Civic 30.4 4 75.7 52 4.93 1.61 18.5 1 1 4 2 #> Toyota Corolla 33.9 4 71.1 65 4.22 1.83 19.9 1 1 4 1 #> Toyota Corona 21.5 4 120.1 97 3.70 2.46 20.0 1 0 3 1 #> Dodge Challenger 15.5 8 318.0 150 2.76 3.52 16.9 0 0 3 2 #> AMC Javelin 15.2 8 304.0 150 3.15 3.44 17.3 0 0 3 2 #> Camaro Z28 13.3 8 350.0 245 3.73 3.84 15.4 0 0 3 4 #> Pontiac Firebird 19.2 8 400.0 175 3.08 3.85 17.1 0 0 3 2 #> Fiat X1-9 27.3 4 79.0 66 4.08 1.94 18.9 1 1 4 1 #> Porsche 914-2 26.0 4 120.3 91 4.43 2.14 16.7 0 1 5 2 #> Lotus Europa 30.4 4 95.1 113 3.77 1.51 16.9 1 1 5 2 #> Ford Pantera L 15.8 8 351.0 264 4.22 3.17 14.5 0 1 5 4 #> Ferrari Dino 19.7 6 145.0 175 3.62 2.77 15.5 0 1 5 6 #> Maserati Bora 15.0 8 301.0 335 3.54 3.57 14.6 0 1 5 8 #> Volvo 142E 21.4 4 121.0 109 4.11 2.78 18.6 1 1 4 2 class(mtcars) #> [1] "data.frame" class(as_tibble(mtcars)) #> [1] "tbl_df" "tbl" "data.frame" Tibbles only print a limited number of rows and show the type at the top of each column. Additionally, tibbles have the classes "tbl_df" and "tbl" in addition to "data.frame". Compare and contrast the following operations on a data.frame and equivalent tibble. What is different? Why might the default data frame behaviours cause you frustration? df <- data.frame(abc = 1, xyz = "a") df$x #> [1] a #> Levels: a df[, "xyz"] #> [1] a #> Levels: a df[, c("abc", "xyz")] #> abc xyz #> 1 1 a tbl <- as_tibble(df) tbl$x #> Warning: Unknown column 'x' #> NULL tbl[, "xyz"] #> # A tibble: 1 × 1 #> xyz #> <fctr> #> 1 a tbl[, c("abc", "xyz")] #> # A tibble: 1 × 2 #> abc xyz #> <dbl> <fctr> #> 1 1 a Using $, a data.frame will partially match the column name. So even though we wrote df$x, it returned df$xyz. This saves a few keystrokes, but can result in accidentally using a different variable than you thought you were using. With data.frames, the type of object that [ returns depends on the number of columns: if it is one column, it won’t return a data.frame, but instead will return a vector; with more than one column, it will return a data.frame. This is fine if you know what you are passing in, but suppose you did df[ , vars] where vars was a variable. Then what that code does depends on length(vars), and you’d have to write code to account for those situations or risk bugs. If you have the name of a variable stored in an object, e.g. var <- "mpg", how can you extract the referenced variable from a tibble? You can use the double bracket, like df[[var]].
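For example (a minimal sketch using the tbl tibble created above): var <- "xyz" tbl[[var]] #> [1] a #> Levels: a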
You cannot use the dollar sign, because df$var would look for a column named var. Practice referring to non-syntactic names in the following data frame by: Extracting the variable called 1. Plotting a scatterplot of 1 vs 2. Creating a new column called 3 which is 2 divided by 1. Renaming the columns to one, two and three. annoying <- tibble( `1` = 1:10, `2` = `1` * 2 + rnorm(length(`1`)) ) Extract the variable called 1: annoying[["1"]] #> [1] 1 2 3 4 5 6 7 8 9 10 or annoying$`1` #> [1] 1 2 3 4 5 6 7 8 9 10 A scatterplot of 1 vs. 2: ggplot(annoying, aes(x = `1`, y = `2`)) + geom_point() A new column 3 which is 2 divided by 1: annoying[["3"]] <- annoying$`2` / annoying$`1` or annoying[["3"]] <- annoying[["2"]] / annoying[["1"]] Renaming the columns to one, two, and three: annoying <- rename(annoying, one = `1`, two = `2`, three = `3`) glimpse(annoying) #> Observations: 10 #> Variables: 3 #> $ one <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 #> $ two <dbl> 0.60, 4.26, 3.56, 7.99, 10.62, 13.15, 12.18, 15.75, 17.7... #> $ three <dbl> 0.60, 2.13, 1.19, 2.00, 2.12, 2.19, 1.74, 1.97, 1.97, 1.97 What does tibble::enframe() do? When might you use it? It converts named vectors to a data frame with a column of names and a column of values. ?tibble::enframe enframe(c(a = 1, b = 2, c = 3)) #> # A tibble: 3 × 2 #> name value #> <chr> <dbl> #> 1 a 1 #> 2 b 2 #> 3 c 3 What option controls how many additional column names are printed at the footer of a tibble? The print function for tibbles is print.tbl_df: ?print.tbl_df The option n_extra determines the number of extra columns to print information for. "], -["data-import.html", "6 Data Import 6.1 Introduction 6.2 Getting started 6.3 Parsing a vector 6.4 Other Types of Data", " 6 Data Import 6.1 Introduction Functions and packages used: readr, feather, haven, rio read_csv parse_* type_convert save, load readRDS, saveRDS write_csv, write_tsv, write_feather read_lines, read_file library("tidyverse") 6.2 Getting started Note: read_log() is important for web data, but likely to be irrelevant to most political science research TODO fill in the links and add any missing 6.2.1 Exercises What function would you use to read a file where fields were separated with “|”? I’d use read_delim with delim="|": read_delim(file, delim = "|") Apart from file, skip, and comment, what other arguments do read_csv() and read_tsv() have in common? They have the following arguments in common: union(names(formals(read_csv)), names(formals(read_tsv))) #> [1] "file" "col_names" "col_types" "locale" "na" #> [6] "quoted_na" "comment" "trim_ws" "skip" "n_max" #> [11] "guess_max" "progress" col_names and col_types are used to specify the column names and how to parse the columns locale is important for determining things like the encoding and whether “.” or “,” is used as a decimal mark. na and quoted_na control which strings are treated as missing values when parsing vectors trim_ws trims whitespace before and after cells before parsing n_max sets how many rows to read guess_max sets how many rows to use when guessing the column type progress determines whether a progress bar is shown. What are the most important arguments to read_fwf()? The most important argument to read_fwf, which reads “fixed-width formats”, is col_positions, which tells the function where data columns begin and end. Sometimes strings in a CSV file contain commas. To prevent them from causing problems, they need to be surrounded by a quoting character, like " or '.
By convention, read_csv() assumes that the quoting character will be ", and if you want to change it you’ll need to use read_delim() instead. What arguments do you need to specify to read the following text into a data frame? "x,y\\n1,'a,b'" x <- "x,y\\n1,'a,b'" read_delim(x, ",", quote = "'") #> # A tibble: 1 × 2 #> x y #> <int> <chr> #> 1 1 a,b Identify what is wrong with each of the following inline CSV files. What happens when you run the code? read_csv("a,b\\n1,2,3\\n4,5,6") #> Warning: 2 parsing failures. #> row col expected actual #> 1 -- 2 columns 3 columns #> 2 -- 2 columns 3 columns #> # A tibble: 2 × 2 #> a b #> <int> <int> #> 1 1 2 #> 2 4 5 Only two columns are specified in the header, “a” and “b”, but the rows have three columns, so the last column is dropped. read_csv("a,b,c\\n1,2\\n1,2,3,4") #> Warning: 2 parsing failures. #> row col expected actual #> 1 -- 3 columns 2 columns #> 2 -- 3 columns 4 columns #> # A tibble: 2 × 3 #> a b c #> <int> <int> <int> #> 1 1 2 NA #> 2 1 2 3 The numbers of columns in the data do not match the number of columns in the header (three). In row one, there are only two values, so column c is set to missing. In row two, there is an extra value, and that value is dropped. read_csv("a,b\\n\\"1") #> Warning: 2 parsing failures. #> row col expected actual #> 1 a closing quote at end of file #> 1 -- 2 columns 1 columns #> # A tibble: 1 × 2 #> a b #> <int> <chr> #> 1 1 <NA> It’s not clear what the intent was here. The opening quote before the 1 is dropped because it is never closed, and a is treated as an integer. read_csv("a,b\\n1,2\\na,b") #> # A tibble: 2 × 2 #> a b #> <chr> <chr> #> 1 1 2 #> 2 a b Both “a” and “b” are treated as character vectors since they contain non-numeric strings. This may have been intentional, or the author may have intended the values of the columns to be “1,2” and “a,b”. read_csv("a;b\\n1;3") #> # A tibble: 1 × 1 #> `a;b` #> <chr> #> 1 1;3 The values are separated by “;” rather than “,”. Use read_csv2 instead: read_csv2("a;b\\n1;3") #> # A tibble: 1 × 2 #> a b #> <int> <int> #> 1 1 3 6.3 Parsing a vector Notes This is detailed, but these details can make your life hell. Skim now, but be aware that what should be simple actually is not. In data analysis, …% is data cleaning, …% is modeling, and the rest is character encoding issues — Jeffrey B. Arnold (@jrnld) July 31, 2016 This Computerphile video on Unicode is great: Characters, Symbols and the Unicode Miracle - Computerphile Note that these issues are real. Reusing one of Chris Adolph’s csv files from an earlier version of this course gave me problems, resulting in me filing this bug report. The suggested reading is very useful: http://kunststube.net/encoding/ This becomes especially useful when you take “Text as Data”. charToRaw("Jeff") #> [1] 4a 65 66 66 class(charToRaw("Jeff")) #> [1] "raw" 6.3.1 Exercises What are the most important arguments to locale()? The locale broadly controls the following: date and time formats: date_names, date_format, and time_format time zone: tz numbers: decimal_mark, grouping_mark encoding: encoding What happens if you try to set decimal_mark and grouping_mark to the same character? What happens to the default value of grouping_mark when you set decimal_mark to “,”? What happens to the default value of decimal_mark when you set the grouping_mark to “.”?
If the decimal and grouping marks are set to the same character, locale throws an error: locale(decimal_mark = ".", grouping_mark = ".") #> Error: `decimal_mark` and `grouping_mark` must be different If the decimal_mark is set to the comma ",", then the grouping mark is set to the period ".": locale(decimal_mark = ",") #> <locale> #> Numbers: 123.456,78 #> Formats: %AD / %AT #> Timezone: UTC #> Encoding: UTF-8 #> <date_names> #> Days: Sunday (Sun), Monday (Mon), Tuesday (Tue), Wednesday (Wed), #> Thursday (Thu), Friday (Fri), Saturday (Sat) #> Months: January (Jan), February (Feb), March (Mar), April (Apr), May #> (May), June (Jun), July (Jul), August (Aug), September #> (Sep), October (Oct), November (Nov), December (Dec) #> AM/PM: AM/PM If the grouping mark is set to a period, then the decimal mark is set to a comma: locale(grouping_mark = ".") #> <locale> #> Numbers: 123.456,78 #> Formats: %AD / %AT #> Timezone: UTC #> Encoding: UTF-8 #> <date_names> #> Days: Sunday (Sun), Monday (Mon), Tuesday (Tue), Wednesday (Wed), #> Thursday (Thu), Friday (Fri), Saturday (Sat) #> Months: January (Jan), February (Feb), March (Mar), April (Apr), May #> (May), June (Jun), July (Jul), August (Aug), September #> (Sep), October (Oct), November (Nov), December (Dec) #> AM/PM: AM/PM I didn’t discuss the date_format and time_format options to locale(). What do they do? Construct an example that shows when they might be useful. They provide default date and time formats. The readr vignette discusses using these to parse dates, since dates can include language-specific weekday and month names, and different conventions for specifying AM/PM. locale() #> <locale> #> Numbers: 123,456.78 #> Formats: %AD / %AT #> Timezone: UTC #> Encoding: UTF-8 #> <date_names> #> Days: Sunday (Sun), Monday (Mon), Tuesday (Tue), Wednesday (Wed), #> Thursday (Thu), Friday (Fri), Saturday (Sat) #> Months: January (Jan), February (Feb), March (Mar), April (Apr), May #> (May), June (Jun), July (Jul), August (Aug), September #> (Sep), October (Oct), November (Nov), December (Dec) #> AM/PM: AM/PM Examples from the readr vignette of parsing French dates: parse_date("1 janvier 2015", "%d %B %Y", locale = locale("fr")) #> [1] "2015-01-01" parse_date("14 oct. 1979", "%d %b %Y", locale = locale("fr")) #> [1] "1979-10-14" Apparently the time format is not used for anything, but the date format is used for guessing column types. If you live outside the US, create a new locale object that encapsulates the settings for the types of file you read most commonly. ?locale What’s the difference between read_csv() and read_csv2()? The delimiter. The function read_csv uses a comma, while read_csv2 uses a semi-colon (;). Using a semi-colon is useful when commas are used as the decimal point (as in Europe). What are the most common encodings used in Europe? What are the most common encodings used in Asia? Do some googling to find out. UTF-8 is standard now, and ASCII has been around forever. For the European languages, there are separate encodings for Romance languages and Eastern European languages using Latin script, Cyrillic, Greek, Hebrew, Turkish: usually with separate ISO and Windows encoding standards. There is also Mac OS Roman. For Asian languages, Arabic and Vietnamese have ISO and Windows standards.
The other major Asian scripts have their own: Japanese: JIS X 0208, Shift JIS, ISO-2022-JP Chinese: GB 2312, GBK, GB 18030 Korean: KS X 1001, EUC-KR, ISO-2022-KR The list in the documentation for stringi::stri_enc_detect is pretty good since it supports the most common encodings: Western European Latin script languages: ISO-8859-1, Windows-1252 (also CP-1252 for code-point) Eastern European Latin script languages: ISO-8859-2, Windows-1250 Greek: ISO-8859-7 Turkish: ISO-8859-9, Windows-1254 Hebrew: ISO-8859-8, IBM424, Windows 1255 Russian: Windows 1251 Japanese: Shift JIS, ISO-2022-JP, EUC-JP Korean: ISO-2022-KR, EUC-KR Chinese: GB18030, ISO-2022-CN (Simplified), Big5 (Traditional) Arabic: ISO-8859-6, IBM420, Windows 1256 For more information: https://en.wikipedia.org/wiki/Character_encoding has a good list http://stackoverflow.com/questions/8509339/what-is-the-most-common-encoding-of-each-language http://kunststube.net/encoding/ Some of the more useful programs for this: In R, see readr::guess_encoding and the stringi package with stri_enc_detect iconv: https://en.wikipedia.org/wiki/Iconv chardet: https://github.com/chardet/chardet (Python) Generate the correct format string to parse each of the following dates and times: 6.4 Other Types of Data NOTES Expand on what’s in this section: The rio package is very useful for loading different types of data files. Other useful functions and packages not mentioned here: Stata: haven, read_dta. While the foreign package in R reads Stata files, it cannot read files created by the most recent version of Stata (> 13). SPSS: haven::read_spss SAS: haven::read_sas XLS: readxl::read_excel JSON: jsonlite package. However, often there are APIs and clients which make this easier, e.g. pollstR which returns data from the Huffington Post Pollster API. XML: xml2 HTML: rvest Databases: DBI and backends PDF: This is really a different thing as you are extracting data from an unstructured form. It also depends on whether the PDF text is actually an image as from a scan, in which case you need to use OCR to first identify words. tabulizer: extracts tables from PDF documents pdftools: extracts text from pdf documents Also see general text analysis packages like tm, quanteda, etc. which often have functions to assist with getting text from PDFs. Links to Jenny Bryan’s tutorials: purrr-tutorial worked examples Discussing csvy as an example of a csv with metadata "], -["tidy-data.html", "7 Tidy Data 7.1 Introduction 7.2 Tidy Data 7.3 Spreading and Gathering 7.4 Separating and Uniting 7.5 Missing Values 7.6 Case Study 7.7 Non-Tidy Data", " 7 Tidy Data 7.1 Introduction Functions used in this chapter spread gather separate unite complete fill library(tidyverse) 7.2 Tidy Data NOTES Add Tidy Data to reading Use COW war dataset as an example of non-tidy data Also WDI data for non-tidy data Replication datasets are often non-tidy. Why? See this post by Jeff Leek The Rules Each variable has its own column Each observation must have its own row Each value must have its own cell Or, even simpler: Put each dataset in a tibble. Put each variable in a column. These seem obvious at first, so we need to see examples of not following tidy data and what happens. Some nuances: The definitions of variable, observation, and value are not always clear. And how you store and arrange the data can depend on how you aim to use it. Generally, aim for storing the data in a tidy format that ensures minimal errors. When you model it, you can transform the data later. See non-tidy data.
It is easier to work with variables in columns because of mutate and summary functions. It will also work better with tidyverse functions: e.g. using group_by to group and summarize, or facet_* and aesthetics in ggplot2. The tidy data ideas are adapted from database normalization, but simplified and adapted to the general uses of practicing data scientists. 7.2.1 Exercises Using prose, describe how the variables and observations are organised in each of the sample tables. In table1, each row is a (country, year) combination, with variables cases and population. table1 #> # A tibble: 6 × 4 #> country year cases population #> <chr> <int> <int> <int> #> 1 Afghanistan 1999 745 19987071 #> 2 Afghanistan 2000 2666 20595360 #> 3 Brazil 1999 37737 172006362 #> 4 Brazil 2000 80488 174504898 #> 5 China 1999 212258 1272915272 #> 6 China 2000 213766 1280428583 In table2, each row is a country, year, variable (“cases”, “population”) combination, and there is a count variable with the numeric value of the variable. table2 #> # A tibble: 12 × 4 #> country year type count #> <chr> <int> <chr> <int> #> 1 Afghanistan 1999 cases 745 #> 2 Afghanistan 1999 population 19987071 #> 3 Afghanistan 2000 cases 2666 #> 4 Afghanistan 2000 population 20595360 #> 5 Brazil 1999 cases 37737 #> 6 Brazil 1999 population 172006362 #> # ... with 6 more rows In table3, each row is a (country, year) combination with the column rate having the rate of cases to population as a character string in the format "cases/population". table3 #> # A tibble: 6 × 3 #> country year rate #> * <chr> <int> <chr> #> 1 Afghanistan 1999 745/19987071 #> 2 Afghanistan 2000 2666/20595360 #> 3 Brazil 1999 37737/172006362 #> 4 Brazil 2000 80488/174504898 #> 5 China 1999 212258/1272915272 #> 6 China 2000 213766/1280428583 Table 4 is split into two tables, one table for each variable: table4a is the table for cases, while table4b is the table for population. Within each table, each row is a country, each column is a year, and the cells are the value of the variable for the table. table4a #> # A tibble: 3 × 3 #> country `1999` `2000` #> * <chr> <int> <int> #> 1 Afghanistan 745 2666 #> 2 Brazil 37737 80488 #> 3 China 212258 213766 table4b #> # A tibble: 3 × 3 #> country `1999` `2000` #> * <chr> <int> <int> #> 1 Afghanistan 19987071 20595360 #> 2 Brazil 172006362 174504898 #> 3 China 1272915272 1280428583 Compute the rate for table2, and table4a + table4b. You will need to perform four operations: Extract the number of TB cases per country per year. Extract the matching population per country per year. Divide cases by population, and multiply by 10000. Store back in the appropriate place. Which representation is easiest to work with? Which is hardest? Why? Without using the joins covered in the Relational Data chapter: tb2_cases <- filter(table2, type == "cases")[["count"]] tb2_country <- filter(table2, type == "cases")[["country"]] tb2_year <- filter(table2, type == "cases")[["year"]] tb2_population <- filter(table2, type == "population")[["count"]] table2_clean <- tibble(country = tb2_country, year = tb2_year, rate = tb2_cases / tb2_population) table2_clean #> # A tibble: 6 × 3 #> country year rate #> <chr> <int> <dbl> #> 1 Afghanistan 1999 3.73e-05 #> 2 Afghanistan 2000 1.29e-04 #> 3 Brazil 1999 2.19e-04 #> 4 Brazil 2000 4.61e-04 #> 5 China 1999 1.67e-04 #> 6 China 2000 1.67e-04 Note that this assumes that all observations are sorted so that each country, year will have the observation for cases followed by population.
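An alternative that does not rely on row order (a minimal sketch using spread(), the function covered later in this chapter): spread(table2, type, count) %>% mutate(rate = cases / population) For table4a and table4b, the rate can be computed by combining the corresponding columns of the two tables: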
tibble(country = table4a[["country"]], `1999` = table4a[["1999"]] / table4b[["1999"]], `2000` = table4a[["2000"]] / table4b[["2000"]]) #> # A tibble: 3 × 3 #> country `1999` `2000` #> <chr> <dbl> <dbl> #> 1 Afghanistan 3.73e-05 1.29e-04 #> 2 Brazil 2.19e-04 4.61e-04 #> 3 China 1.67e-04 1.67e-04 or tibble(country = rep(table4a[["country"]], 2), year = rep(c(1999, 2000), each = nrow(table4a)), `rate` = c(table4a[["1999"]] / table4b[["1999"]], table4a[["2000"]] / table4b[["2000"]])) #> # A tibble: 6 × 3 #> country year rate #> <chr> <dbl> <dbl> #> 1 Afghanistan 1999 3.73e-05 #> 2 Brazil 1999 2.19e-04 #> 3 China 1999 1.67e-04 #> 4 Afghanistan 2000 1.29e-04 #> 5 Brazil 2000 4.61e-04 #> 6 China 2000 1.67e-04 Recreate the plot showing change in cases over time using table2 instead of table1. What do you need to do first? First, I needed to filter the tibble to only include those rows that represented the “cases” variable. table2 %>% filter(type == "cases") %>% ggplot(aes(year, count)) + geom_line(aes(group = country), colour = "grey50") + geom_point(aes(colour = country)) 7.3 Spreading and Gathering tidy4a <- table4a %>% gather(`1999`, `2000`, key = "year", value = "cases") tidy4b <- table4b %>% gather(`1999`, `2000`, key = "year", value = "population") left_join(tidy4a, tidy4b) #> Joining, by = c("country", "year") #> # A tibble: 6 × 4 #> country year cases population #> <chr> <chr> <int> <int> #> 1 Afghanistan 1999 745 19987071 #> 2 Brazil 1999 37737 172006362 #> 3 China 1999 212258 1272915272 #> 4 Afghanistan 2000 2666 20595360 #> 5 Brazil 2000 80488 174504898 #> 6 China 2000 213766 1280428583 spread(table2, key = type, value = count) #> # A tibble: 6 × 4 #> country year cases population #> * <chr> <int> <int> <int> #> 1 Afghanistan 1999 745 19987071 #> 2 Afghanistan 2000 2666 20595360 #> 3 Brazil 1999 37737 172006362 #> 4 Brazil 2000 80488 174504898 #> 5 China 1999 212258 1272915272 #> 6 China 2000 213766 1280428583 7.3.1 Exercises Why are gather() and spread() not perfectly symmetrical? Carefully consider the following example: stocks <- tibble( year = c(2015, 2015, 2016, 2016), half = c( 1, 2, 1, 2), return = c(1.88, 0.59, 0.92, 0.17) ) stocks %>% spread(year, return) %>% gather("year", "return", `2015`:`2016`) #> # A tibble: 4 × 3 #> half year return #> <dbl> <chr> <dbl> #> 1 1 2015 1.88 #> 2 2 2015 0.59 #> 3 1 2016 0.92 #> 4 2 2016 0.17 The functions spread and gather are not perfectly symmetrical because column type information is not transferred between them. In the original table the column year was numeric, but after the spread-gather cycle it is character, because with gather, variable names are always converted to a character vector. The convert argument tries to convert character vectors to the appropriate type. In the background this uses the type.convert function. stocks %>% spread(year, return) %>% gather("year", "return", `2015`:`2016`, convert = TRUE) #> # A tibble: 4 × 3 #> half year return #> <dbl> <int> <dbl> #> 1 1 2015 1.88 #> 2 2 2015 0.59 #> 3 1 2016 0.92 #> 4 2 2016 0.17 Why does this code fail? table4a %>% gather(1999, 2000, key = "year", value = "cases") #> Error in eval(expr, envir, enclos): Position must be between 0 and n The code fails because the column names 1999 and 2000 are not standard and thus need to be quoted. The tidyverse functions will interpret 1999 and 2000 without quotes as looking for the 1999th and 2000th columns of the data frame.
This will work: table4a %>% gather(`1999`, `2000`, key = "year", value = "cases") #> # A tibble: 6 × 3 #> country year cases #> <chr> <chr> <int> #> 1 Afghanistan 1999 745 #> 2 Brazil 1999 37737 #> 3 China 1999 212258 #> 4 Afghanistan 2000 2666 #> 5 Brazil 2000 80488 #> 6 China 2000 213766 Why does spreading this tibble fail? How could you add a new column to fix the problem? people <- tribble( ~name, ~key, ~value, #-----------------|--------|------ "Phillip Woods", "age", 45, "Phillip Woods", "height", 186, "Phillip Woods", "age", 50, "Jessica Cordero", "age", 37, "Jessica Cordero", "height", 156 ) glimpse(people) #> Observations: 5 #> Variables: 3 #> $ name <chr> "Phillip Woods", "Phillip Woods", "Phillip Woods", "Jess... #> $ key <chr> "age", "height", "age", "age", "height" #> $ value <dbl> 45, 186, 50, 37, 156 spread(people, key, value) #> Error: Duplicate identifiers for rows (1, 3) Spreading the data frame fails because there are two rows with “age” for “Phillip Woods”. We would need to add another column indicating which observation number it is: people <- tribble( ~name, ~key, ~value, ~obs, #-----------------|--------|------|------ "Phillip Woods", "age", 45, 1, "Phillip Woods", "height", 186, 1, "Phillip Woods", "age", 50, 2, "Jessica Cordero", "age", 37, 1, "Jessica Cordero", "height", 156, 1 ) spread(people, key, value) #> # A tibble: 3 × 4 #> name obs age height #> * <chr> <dbl> <dbl> <dbl> #> 1 Jessica Cordero 1 37 156 #> 2 Phillip Woods 1 45 186 #> 3 Phillip Woods 2 50 NA Tidy the simple tibble below. Do you need to spread or gather it? What are the variables? preg <- tribble( ~pregnant, ~male, ~female, "yes", NA, 10, "no", 20, 12 ) You need to gather it. The variables are: pregnant: logical (“yes”, “no”) female: logical count: integer gather(preg, sex, count, male, female) %>% mutate(pregnant = pregnant == "yes", female = sex == "female") %>% select(-sex) #> # A tibble: 4 × 3 #> pregnant count female #> <lgl> <dbl> <lgl> #> 1 TRUE NA FALSE #> 2 FALSE 20 FALSE #> 3 TRUE 10 TRUE #> 4 FALSE 12 TRUE Converting pregnant and female from character vectors to logical was not necessary to tidy it, but it makes it easier to work with. 7.4 Separating and Uniting table3 %>% separate(rate, into = c("cases", "population"), sep = "/", convert = TRUE) %>% separate(year, into = c("century", "year"), sep = 2) #> # A tibble: 6 × 5 #> country century year cases population #> * <chr> <chr> <chr> <int> <int> #> 1 Afghanistan 19 99 745 19987071 #> 2 Afghanistan 20 00 2666 20595360 #> 3 Brazil 19 99 37737 172006362 #> 4 Brazil 20 00 80488 174504898 #> 5 China 19 99 212258 1272915272 #> 6 China 20 00 213766 1280428583 table5 %>% unite(new, century, year, sep = "") #> # A tibble: 6 × 3 #> country new rate #> * <chr> <chr> <chr> #> 1 Afghanistan 1999 745/19987071 #> 2 Afghanistan 2000 2666/20595360 #> 3 Brazil 1999 37737/172006362 #> 4 Brazil 2000 80488/174504898 #> 5 China 1999 212258/1272915272 #> 6 China 2000 213766/1280428583 7.4.1 Exercises What do the extra and fill arguments do in separate()? Experiment with the various options for the following two toy datasets.
tibble(x = c("a,b,c", "d,e,f,g", "h,i,j")) %>% separate(x, c("one", "two", "three")) #> Warning: Too many values at 1 locations: 2 #> # A tibble: 3 × 3 #> one two three #> * <chr> <chr> <chr> #> 1 a b c #> 2 d e f #> 3 h i j tibble(x = c("a,b,c", "d,e", "f,g,i")) %>% separate(x, c("one", "two", "three")) #> Warning: Too few values at 1 locations: 2 #> # A tibble: 3 × 3 #> one two three #> * <chr> <chr> <chr> #> 1 a b c #> 2 d e <NA> #> 3 f g i ?separate The extra argument tells separate what to do if there are too many pieces, and the fill argument if there aren’t enough. tibble(x = c("a,b,c", "d,e,f,g", "h,i,j")) %>% separate(x, c("one", "two", "three")) #> Warning: Too many values at 1 locations: 2 #> # A tibble: 3 × 3 #> one two three #> * <chr> <chr> <chr> #> 1 a b c #> 2 d e f #> 3 h i j By default separate drops the extra values with a warning. tibble(x = c("a,b,c", "d,e,f,g", "h,i,j")) %>% separate(x, c("one", "two", "three"), extra = "drop") #> # A tibble: 3 × 3 #> one two three #> * <chr> <chr> <chr> #> 1 a b c #> 2 d e f #> 3 h i j This produces the same result as above, dropping extra values, but without the warning. tibble(x = c("a,b,c", "d,e,f,g", "h,i,j")) %>% separate(x, c("one", "two", "three"), extra = "merge") #> # A tibble: 3 × 3 #> one two three #> * <chr> <chr> <chr> #> 1 a b c #> 2 d e f,g #> 3 h i j In this, the extra values are not split, so “f,g” appears in column three. In this, one of the entries for column, “d,e”, has too few elements. The default for fill is similar to separate; it fills with missing values but emits a warning. In this, row 2 of column “three”, is NA. tibble(x = c("a,b,c", "d,e", "f,g,i")) %>% separate(x, c("one", "two", "three")) #> Warning: Too few values at 1 locations: 2 #> # A tibble: 3 × 3 #> one two three #> * <chr> <chr> <chr> #> 1 a b c #> 2 d e <NA> #> 3 f g i Alternative options for fill are "right", to fill with missing values from the right, but without a warning tibble(x = c("a,b,c", "d,e", "f,g,i")) %>% separate(x, c("one", "two", "three"), fill = "right") #> # A tibble: 3 × 3 #> one two three #> * <chr> <chr> <chr> #> 1 a b c #> 2 d e <NA> #> 3 f g i The option fill = "left" also fills with missing values without a warning, but this time from the left side. Now, column “one” of row 2 will be missing, and the other values in that row are shifted over. tibble(x = c("a,b,c", "d,e", "f,g,i")) %>% separate(x, c("one", "two", "three"), fill = "left") #> # A tibble: 3 × 3 #> one two three #> * <chr> <chr> <chr> #> 1 a b c #> 2 <NA> d e #> 3 f g i Both unite() and separate() have a remove argument. What does it do? Why would you set it to FALSE? You would set it to FALSE if you want to create a new variable, but keep the old one. Compare and contrast separate() and extract(), Why are there three variations of separation (by position, by separator, and with groups), but only one unite? The function extract uses a regular expression to find groups and split into columns. In unite it is unambigous since it is many columns to one, and once the columns are specified, there is only one way to do it, the only choice is the sep. In separate, it is one to many, and there are multiple ways to split the character string. 7.5 Missing Values 7.5.1 Exercises Compare and contrast the fill arguments to spread() and complete(). ?spread ?complete In spread, the fill argument explicitly sets the value to replace NAs. In complete, the fill argument also sets a value to replace NAs but it is named list, allowing for different values for different variables. 
Also, in both cases both implicit and explicit missing values are replaced. What does the direction argument to fill() do? With fill, it determines whether NA values should be replaced by the previous non-missing value ("down") or the next non-missing value ("up"). 7.6 Case Study who1 <- who %>% gather(new_sp_m014:newrel_f65, key = "key", value = "cases", na.rm = TRUE) glimpse(who1) #> Observations: 76,046 #> Variables: 6 #> $ country <chr> "Afghanistan", "Afghanistan", "Afghanistan", "Afghanis... #> $ iso2 <chr> "AF", "AF", "AF", "AF", "AF", "AF", "AF", "AF", "AF", ... #> $ iso3 <chr> "AFG", "AFG", "AFG", "AFG", "AFG", "AFG", "AFG", "AFG"... #> $ year <int> 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, ... #> $ key <chr> "new_sp_m014", "new_sp_m014", "new_sp_m014", "new_sp_m... #> $ cases <int> 0, 30, 8, 52, 129, 90, 127, 139, 151, 193, 186, 187, 2... who2 <- who1 %>% mutate(key = stringr::str_replace(key, "newrel", "new_rel")) who3 <- who2 %>% separate(key, c("new", "type", "sexage"), sep = "_") who3 #> # A tibble: 76,046 × 8 #> country iso2 iso3 year new type sexage cases #> * <chr> <chr> <chr> <int> <chr> <chr> <chr> <int> #> 1 Afghanistan AF AFG 1997 new sp m014 0 #> 2 Afghanistan AF AFG 1998 new sp m014 30 #> 3 Afghanistan AF AFG 1999 new sp m014 8 #> 4 Afghanistan AF AFG 2000 new sp m014 52 #> 5 Afghanistan AF AFG 2001 new sp m014 129 #> 6 Afghanistan AF AFG 2002 new sp m014 90 #> # ... with 7.604e+04 more rows who3 %>% count(new) #> # A tibble: 1 × 2 #> new n #> <chr> <int> #> 1 new 76046 who4 <- who3 %>% select(-new, -iso2, -iso3) who5 <- who4 %>% separate(sexage, c("sex", "age"), sep = 1) who5 #> # A tibble: 76,046 × 6 #> country year type sex age cases #> * <chr> <int> <chr> <chr> <chr> <int> #> 1 Afghanistan 1997 sp m 014 0 #> 2 Afghanistan 1998 sp m 014 30 #> 3 Afghanistan 1999 sp m 014 8 #> 4 Afghanistan 2000 sp m 014 52 #> 5 Afghanistan 2001 sp m 014 129 #> 6 Afghanistan 2002 sp m 014 90 #> # ... with 7.604e+04 more rows 7.6.1 Exercises In this case study I set na.rm = TRUE just to make it easier to check that we had the correct values. Is this reasonable? Think about how missing values are represented in this dataset. Are there implicit missing values? What’s the difference between an NA and zero? Perhaps? I would need to know more about the data generation process. There are zeros in the data, which may explicitly indicate no cases. who1 %>% filter(cases == 0) %>% nrow() #> [1] 11080 So it appears that, within a given year, a country either has all of its values non-missing (if the WHO collected data for that country) or all of its values missing. So it is okay to treat explicitly and implicitly missing values the same, and we don’t lose any information by dropping them. gather(who, new_sp_m014:newrel_f65, key = "key", value = "cases") %>% group_by(country, year) %>% mutate(missing = is.na(cases)) %>% select(country, year, missing) %>% distinct() %>% group_by(country, year) %>% filter(n() > 1) #> Source: local data frame [0 x 2] #> Groups: country, year [0] #> #> # ... with 2 variables: country <chr>, year <int> What happens if you neglect the mutate() step? (mutate(key = stringr::str_replace(key, "newrel", "new_rel"))) separate emits the warning “too few values”, and if we check the rows for keys beginning with "newrel_", we see that sexage is missing, and type takes the values that should be in sexage (e.g. m014).
who3a <- who1 %>% separate(key, c("new", "type", "sexage"), sep = "_") #> Warning: Too few values at 2580 locations: 73467, 73468, 73469, 73470, #> 73471, 73472, 73473, 73474, 73475, 73476, 73477, 73478, 73479, 73480, #> 73481, 73482, 73483, 73484, 73485, 73486, ... filter(who3a, new == "newrel") %>% head() #> # A tibble: 6 × 8 #> country iso2 iso3 year new type sexage cases #> <chr> <chr> <chr> <int> <chr> <chr> <chr> <int> #> 1 Afghanistan AF AFG 2013 newrel m014 <NA> 1705 #> 2 Albania AL ALB 2013 newrel m014 <NA> 14 #> 3 Algeria DZ DZA 2013 newrel m014 <NA> 25 #> 4 Andorra AD AND 2013 newrel m014 <NA> 0 #> 5 Angola AO AGO 2013 newrel m014 <NA> 486 #> 6 Anguilla AI AIA 2013 newrel m014 <NA> 0 I claimed that iso2 and iso3 were redundant with country. Confirm this claim. select(who3, country, iso2, iso3) %>% distinct() %>% group_by(country) %>% filter(n() > 1) #> Source: local data frame [0 x 3] #> Groups: country [0] #> #> # ... with 3 variables: country <chr>, iso2 <chr>, iso3 <chr> For each country, year, and sex compute the total number of cases of TB. Make an informative visualisation of the data. who5 %>% group_by(country, year, sex) %>% filter(year > 1995) %>% summarise(cases = sum(cases)) %>% unite(country_sex, country, sex, remove = FALSE) %>% ggplot(aes(x = year, y = cases, group = country_sex, colour = sex)) + geom_line() A small-multiples plot faceting by country is difficult given the number of countries. Another option would be to focus on the countries with the largest changes or largest absolute magnitudes, after providing the context above. 7.7 Non-Tidy Data Corpus and text data is often stored in sparse matrices https://cran.r-project.org/web/packages/tm/tm.pdf Graphical data has its own format: http://igraph.org/r/doc/ "], -["relational-data.html", "8 Relational Data 8.1 Prerequisites 8.2 nycflights13 8.3 Keys 8.4 Mutating Joins 8.5 Filtering Joins 8.6 Set operations", " 8 Relational Data 8.1 Prerequisites library("tidyverse") library("nycflights13") Topics, functions keys: primary key, foreign key, mutating joins: left_join, right_join, inner_join, full_join merge vs. joins filtering joins: semi_join, anti_join set operations: intersect, union, setdiff TODO: fuzzy joining 8.2 nycflights13 NOTES nycflights13 is an example of a data-only R package. R packages can contain both functions and data. Since datasets can get large, they are often packaged as their own package. These sorts of data-only R packages make it convenient for R users to access your data, but it should not be the only way you provide your research data. Not everyone uses R, so the original data should be provided in a program-agnostic format (e.g. csv files). This also holds for those using Stata; they should not be distributing data in .dta format files specific to Stata (even if, as we saw earlier, other programs can read that data). Another example of a data-only R package is gapminder. How does Hadley create his diagrams? The four tables in the nycflights13 package: airlines #> # A tibble: 16 × 2 #> carrier name #> <chr> <chr> #> 1 9E Endeavor Air Inc. #> 2 AA American Airlines Inc. #> 3 AS Alaska Airlines Inc. #> 4 B6 JetBlue Airways #> 5 DL Delta Air Lines Inc. #> 6 EV ExpressJet Airlines Inc. #> # ...
with 10 more rows airports #> # A tibble: 1,458 × 8 #> faa name lat lon alt tz dst #> <chr> <chr> <dbl> <dbl> <int> <dbl> <chr> #> 1 04G Lansdowne Airport 41.1 -80.6 1044 -5 A #> 2 06A Moton Field Municipal Airport 32.5 -85.7 264 -6 A #> 3 06C Schaumburg Regional 42.0 -88.1 801 -6 A #> 4 06N Randall Airport 41.4 -74.4 523 -5 A #> 5 09J Jekyll Island Airport 31.1 -81.4 11 -5 A #> 6 0A9 Elizabethton Municipal Airport 36.4 -82.2 1593 -5 A #> # ... with 1,452 more rows, and 1 more variables: tzone <chr> planes #> # A tibble: 3,322 × 9 #> tailnum year type manufacturer model engines #> <chr> <int> <chr> <chr> <chr> <int> #> 1 N10156 2004 Fixed wing multi engine EMBRAER EMB-145XR 2 #> 2 N102UW 1998 Fixed wing multi engine AIRBUS INDUSTRIE A320-214 2 #> 3 N103US 1999 Fixed wing multi engine AIRBUS INDUSTRIE A320-214 2 #> 4 N104UW 1999 Fixed wing multi engine AIRBUS INDUSTRIE A320-214 2 #> 5 N10575 2002 Fixed wing multi engine EMBRAER EMB-145LR 2 #> 6 N105UW 1999 Fixed wing multi engine AIRBUS INDUSTRIE A320-214 2 #> # ... with 3,316 more rows, and 3 more variables: seats <int>, #> # speed <int>, engine <chr> weather #> # A tibble: 26,130 × 15 #> origin year month day hour temp dewp humid wind_dir wind_speed #> <chr> <dbl> <dbl> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl> #> 1 EWR 2013 1 1 0 37.0 21.9 54.0 230 10.4 #> 2 EWR 2013 1 1 1 37.0 21.9 54.0 230 13.8 #> 3 EWR 2013 1 1 2 37.9 21.9 52.1 230 12.7 #> 4 EWR 2013 1 1 3 37.9 23.0 54.5 230 13.8 #> 5 EWR 2013 1 1 4 37.9 24.1 57.0 240 15.0 #> 6 EWR 2013 1 1 6 39.0 26.1 59.4 270 10.4 #> # ... with 2.612e+04 more rows, and 5 more variables: wind_gust <dbl>, #> # precip <dbl>, pressure <dbl>, visib <dbl>, time_hour <dttm> 8.2.1 Exercises Imagine you wanted to draw (approximately) the route each plane flies from its origin to its destination. What variables would you need? What tables would you need to combine? flights table: origin and dest airports table: longitude and latitude variables We would merge flights with airports twice: once to get the location of the origin airport, and once to get the location of the dest airport. I forgot to draw the relationship between weather and airports. What is the relationship and how should it appear in the diagram? The variable origin in weather is matched with faa in airports. weather only contains information for the origin (NYC) airports. If it contained weather records for all airports in the USA, what additional relation would it define with flights? year, month, day, hour, origin in weather would be matched to year, month, day, hour, dest in flights (though it should use the arrival date-time values for dest if possible). We know that some days of the year are “special”, and fewer people than usual fly on them. How might you represent that data as a data frame? What would be the primary keys of that table? How would it connect to the existing tables? I would add a table of special dates. The primary key would be date. It would match to the year, month, day columns of flights. 8.3 Keys Add a surrogate key to flights. I add the column flight_id as a surrogate key. I sort the data prior to making the key, even though it is not strictly necessary, so the order of the rows has some meaning. flights %>% arrange(year, month, day, sched_dep_time, carrier, flight) %>% mutate(flight_id = row_number()) %>% glimpse() #> Observations: 336,776 #> Variables: 20 #> $ year <int> 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013,... #> $ month <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
#> $ day <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,... #> $ dep_time <int> 517, 533, 542, 544, 554, 559, 558, 559, 558, 55... #> $ sched_dep_time <int> 515, 529, 540, 545, 558, 559, 600, 600, 600, 60... #> $ dep_delay <dbl> 2, 4, 2, -1, -4, 0, -2, -1, -2, -2, -3, NA, 1, ... #> $ arr_time <int> 830, 850, 923, 1004, 740, 702, 753, 941, 849, 8... #> $ sched_arr_time <int> 819, 830, 850, 1022, 728, 706, 745, 910, 851, 8... #> $ arr_delay <dbl> 11, 20, 33, -18, 12, -4, 8, 31, -2, -3, -8, NA,... #> $ carrier <chr> "UA", "UA", "AA", "B6", "UA", "B6", "AA", "AA",... #> $ flight <int> 1545, 1714, 1141, 725, 1696, 1806, 301, 707, 49... #> $ tailnum <chr> "N14228", "N24211", "N619AA", "N804JB", "N39463... #> $ origin <chr> "EWR", "LGA", "JFK", "JFK", "EWR", "JFK", "LGA"... #> $ dest <chr> "IAH", "IAH", "MIA", "BQN", "ORD", "BOS", "ORD"... #> $ air_time <dbl> 227, 227, 160, 183, 150, 44, 138, 257, 149, 158... #> $ distance <dbl> 1400, 1416, 1089, 1576, 719, 187, 733, 1389, 10... #> $ hour <dbl> 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,... #> $ minute <dbl> 15, 29, 40, 45, 58, 59, 0, 0, 0, 0, 0, 0, 0, 0,... #> $ time_hour <dttm> 2013-01-01 05:00:00, 2013-01-01 05:00:00, 2013... #> $ flight_id <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, ... Identify the keys in the following datasets: Lahman::Batting babynames::babynames nasaweather::atmos fueleconomy::vehicles ggplot2::diamonds (You might need to install some packages and read some documentation.) The primary key for Lahman::Batting is playerID, yearID, stint. It is not simply playerID, yearID because players can have different stints in different leagues within the same year. Lahman::Batting %>% group_by(playerID, yearID, stint) %>% filter(n() > 1) %>% nrow() #> [1] 0 The primary key for babynames::babynames is year, sex, name. It is not simply year, name, since names can appear for both sexes with different counts. babynames::babynames %>% group_by(year, sex, name) %>% filter(n() > 1) %>% nrow() #> [1] 0 The primary key for nasaweather::atmos is the location and time of the measurement: lat, long, year, month. nasaweather::atmos %>% group_by(lat, long, year, month) %>% filter(n() > 1) %>% nrow() #> [1] 0 The column id (unique EPA identifier) is the primary key for fueleconomy::vehicles: fueleconomy::vehicles %>% group_by(id) %>% filter(n() > 1) %>% nrow() #> [1] 0 There is no primary key for ggplot2::diamonds. Using all variables in the data frame, the number of distinct rows is less than the total number of rows, meaning no combination of variables uniquely identifies the observations. ggplot2::diamonds %>% distinct() %>% nrow() #> [1] 53794 nrow(ggplot2::diamonds) #> [1] 53940 Draw a diagram illustrating the connections between the Batting, Master, and Salaries tables in the Lahman package. Draw another diagram that shows the relationship between Master, Managers, AwardsManagers.
Batting primary key: playerID, yearID, stint foreign keys: playerID -> Master.playerID Master primary key: playerID Salaries primary key: yearID, teamID, playerID foreign keys: playerID -> Master.playerID Managers: primary key: yearID, playerID, teamID, inseason foreign keys: playerID -> Master.playerID AwardsManagers: primary key: playerID, awardID, yearID (since there are ties, and while the tie variable distinguishes those awards, it has NA values) foreign keys: playerID -> Master.playerID playerID, yearID, lgID -> Managers.playerID, yearID, lgID lgID and teamID appear in multiple tables, but should be primary keys for league and team tables. How would you characterise the relationship between the Batting, Pitching, and Fielding tables? 8.4 Mutating Joins flights2 <- flights %>% select(year:day, hour, origin, dest, tailnum, carrier) flights2 %>% select(-origin, -dest) %>% left_join(airlines, by = "carrier") #> # A tibble: 336,776 × 7 #> year month day hour tailnum carrier name #> <int> <int> <int> <dbl> <chr> <chr> <chr> #> 1 2013 1 1 5 N14228 UA United Air Lines Inc. #> 2 2013 1 1 5 N24211 UA United Air Lines Inc. #> 3 2013 1 1 5 N619AA AA American Airlines Inc. #> 4 2013 1 1 5 N804JB B6 JetBlue Airways #> 5 2013 1 1 6 N668DN DL Delta Air Lines Inc. #> 6 2013 1 1 5 N39463 UA United Air Lines Inc. #> # ... with 3.368e+05 more rows 8.4.1 Exercises Compute the average delay by destination, then join on the airports data frame so you can show the spatial distribution of delays. Here’s an easy way to draw a map of the United States: airports %>% semi_join(flights, c("faa" = "dest")) %>% ggplot(aes(lon, lat)) + borders("state") + geom_point() + coord_quickmap() (Don’t worry if you don’t understand what semi_join() does — you’ll learn about it next.) avg_dest_delays <- flights %>% group_by(dest) %>% # arrival delay NA's are cancelled flights summarise(delay = mean(arr_delay, na.rm = TRUE)) %>% inner_join(airports, by = c(dest = "faa")) avg_dest_delays %>% ggplot(aes(lon, lat, colour = delay)) + borders("state") + geom_point() + coord_quickmap() You might want to use the size or colour of the points to display the average delay for each airport. Add the location of the origin and destination (i.e. the lat and lon) to flights. flights %>% left_join(airports, by = c(dest = "faa")) %>% left_join(airports, by = c(origin = "faa")) %>% head() #> # A tibble: 6 × 33 #> year month day dep_time sched_dep_time dep_delay arr_time #> <int> <int> <int> <int> <int> <dbl> <int> #> 1 2013 1 1 517 515 2 830 #> 2 2013 1 1 533 529 4 850 #> 3 2013 1 1 542 540 2 923 #> 4 2013 1 1 544 545 -1 1004 #> 5 2013 1 1 554 600 -6 812 #> 6 2013 1 1 554 558 -4 740 #> # ... with 26 more variables: sched_arr_time <int>, arr_delay <dbl>, #> # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>, #> # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, #> # time_hour <dttm>, name.x <chr>, lat.x <dbl>, lon.x <dbl>, alt.x <int>, #> # tz.x <dbl>, dst.x <chr>, tzone.x <chr>, name.y <chr>, lat.y <dbl>, #> # lon.y <dbl>, alt.y <int>, tz.y <dbl>, dst.y <chr>, tzone.y <chr> Is there a relationship between the age of a plane and its delays? Surprisingly, not.
If anything, (departure) delay seems to decrease slightly with age (perhaps because of selection): plane_ages <- planes %>% mutate(age = 2013 - year) %>% select(tailnum, age) flights %>% inner_join(plane_ages, by = "tailnum") %>% group_by(age) %>% filter(!is.na(dep_delay)) %>% summarise(delay = mean(dep_delay)) %>% ggplot(aes(x = age, y = delay)) + geom_point() + geom_line() #> Warning: Removed 1 rows containing missing values (geom_point). #> Warning: Removed 1 rows containing missing values (geom_path). What weather conditions make it more likely to see a delay? Almost any amount of precipitation is associated with a delay, though the trend above 0.02 in. is not as strong as one would expect flight_weather <- flights %>% inner_join(weather, by = c("origin" = "origin", "year" = "year", "month" = "month", "day" = "day", "hour" = "hour")) flight_weather %>% group_by(precip) %>% summarise(delay = mean(dep_delay, na.rm = TRUE)) %>% ggplot(aes(x = precip, y = delay)) + geom_line() + geom_point() What happened on June 13 2013? Display the spatial pattern of delays, and then use Google to cross-reference with the weather. There was a large series of storms (derechos) in the southeastern US (see June 12-13, 2013 derecho series). The largest delays are in Tennessee (Nashville) and the Southeast and Midwest (the location of the derechos). library(viridis) flights %>% filter(year == 2013, month == 6, day == 13) %>% group_by(dest) %>% summarise(delay = mean(arr_delay, na.rm = TRUE)) %>% inner_join(airports, by = c("dest" = "faa")) %>% ggplot(aes(y = lat, x = lon, size = delay, colour = delay)) + borders("state") + geom_point() + coord_quickmap() + scale_color_viridis() #> Warning: Removed 3 rows containing missing values (geom_point). 8.5 Filtering Joins semi_join: keep all obs in x with match in y anti_join: drop all obs in x with a match in y 8.5.1 Exercises What does it mean for a flight to have a missing tailnum? What do the tail numbers that don’t have a matching record in planes have in common? (Hint: one variable explains ~90% of the problems.) American Airlines (AA) and Envoy Airlines (MQ) don’t report tail numbers. flights %>% anti_join(planes, by = "tailnum") %>% count(carrier, sort = TRUE) #> # A tibble: 10 × 2 #> carrier n #> <chr> <int> #> 1 MQ 25397 #> 2 AA 22558 #> 3 UA 1693 #> 4 9E 1044 #> 5 B6 830 #> 6 US 699 #> # ... with 4 more rows Filter flights to only show flights with planes that have flown at least 100 flights. planes_gt100 <- flights %>% group_by(tailnum) %>% count() %>% filter(n > 100) flights %>% semi_join(planes_gt100, by = "tailnum") #> # A tibble: 229,202 × 19 #> year month day dep_time sched_dep_time dep_delay arr_time #> <int> <int> <int> <int> <int> <dbl> <int> #> 1 2013 1 1 1604 1510 54 1817 #> 2 2013 1 1 2100 2100 0 2307 #> 3 2013 1 2 827 835 -8 1059 #> 4 2013 1 2 2014 2020 -6 2256 #> 5 2013 1 4 1621 1625 -4 1853 #> 6 2013 1 5 834 835 -1 1050 #> # ... with 2.292e+05 more rows, and 12 more variables: #> # sched_arr_time <int>, arr_delay <dbl>, carrier <chr>, flight <int>, #> # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, #> # distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm> Combine fueleconomy::vehicles and fueleconomy::common to find only the records for the most common models. The table fueleconomy::common identifies vehicles by make and model: glimpse(fueleconomy::vehicles) #> Observations: 33,442 #> Variables: 12 #> $ id <int> 27550, 28426, 27549, 28425, 1032, 1033, 3347, 13309, 133...
#> $ make <chr> "AM General", "AM General", "AM General", "AM General", ... #> $ model <chr> "DJ Po Vehicle 2WD", "DJ Po Vehicle 2WD", "FJ8c Post Off... #> $ year <int> 1984, 1984, 1984, 1984, 1985, 1985, 1987, 1997, 1997, 19... #> $ class <chr> "Special Purpose Vehicle 2WD", "Special Purpose Vehicle ... #> $ trans <chr> "Automatic 3-spd", "Automatic 3-spd", "Automatic 3-spd",... #> $ drive <chr> "2-Wheel Drive", "2-Wheel Drive", "2-Wheel Drive", "2-Wh... #> $ cyl <int> 4, 4, 6, 6, 4, 6, 6, 4, 4, 6, 4, 4, 6, 4, 4, 6, 5, 5, 6,... #> $ displ <dbl> 2.5, 2.5, 4.2, 4.2, 2.5, 4.2, 3.8, 2.2, 2.2, 3.0, 2.3, 2... #> $ fuel <chr> "Regular", "Regular", "Regular", "Regular", "Regular", "... #> $ hwy <int> 17, 17, 13, 13, 17, 13, 21, 26, 28, 26, 27, 29, 26, 27, ... #> $ cty <int> 18, 18, 13, 13, 16, 13, 14, 20, 22, 18, 19, 21, 17, 20, ... glimpse(fueleconomy::common) #> Observations: 347 #> Variables: 4 #> $ make <chr> "Acura", "Acura", "Acura", "Acura", "Acura", "Audi", "Au... #> $ model <chr> "Integra", "Legend", "MDX 4WD", "NSX", "TSX", "A4", "A4 ... #> $ n <int> 42, 28, 12, 28, 27, 49, 49, 66, 20, 12, 46, 20, 30, 29, ... #> $ years <int> 16, 10, 12, 14, 11, 19, 15, 19, 19, 12, 20, 15, 16, 16, ... fueleconomy::vehicles %>% semi_join(fueleconomy::common, by = c("make", "model")) #> # A tibble: 14,531 × 12 #> id make model year class trans #> <int> <chr> <chr> <int> <chr> <chr> #> 1 1833 Acura Integra 1986 Subcompact Cars Automatic 4-spd #> 2 1834 Acura Integra 1986 Subcompact Cars Manual 5-spd #> 3 3037 Acura Integra 1987 Subcompact Cars Automatic 4-spd #> 4 3038 Acura Integra 1987 Subcompact Cars Manual 5-spd #> 5 4183 Acura Integra 1988 Subcompact Cars Automatic 4-spd #> 6 4184 Acura Integra 1988 Subcompact Cars Manual 5-spd #> # ... with 1.452e+04 more rows, and 6 more variables: drive <chr>, #> # cyl <int>, displ <dbl>, fuel <chr>, hwy <int>, cty <int> Find the 48 hours (over the course of the whole year) that have the worst delays. Cross-reference it with the weather data. Can you see any patterns? What does anti_join(flights, airports, by = c("dest" = "faa")) tell you? What does anti_join(airports, flights, by = c("faa" = "dest")) tell you? anti_join(flights, airports, by = c("dest" = "faa")) are flights that go to an airport that is not in the FAA list of destinations, likely foreign airports. anti_join(airports, flights, by = c("faa" = "dest")) are US airports that don’t have a flight in the data, meaning that there were no flights to that airport from New York in 2013. You might expect that there’s an implicit relationship between plane and airline, because each plane is flown by a single airline. Confirm or reject this hypothesis using the tools you’ve learned above. There isn’t such a relationship, since planes can be sold or airlines can merge. That said, a plane sold mid-year would not necessarily show up with two carriers in a single year of data. In fact, there are 18 planes in this data that are matched with more than one carrier: flights %>% group_by(tailnum, carrier) %>% count() %>% filter(n() > 1) %>% select(tailnum) %>% distinct() #> Source: local data frame [18 x 1] #> Groups: tailnum [18] #> #> tailnum #> <chr> #> 1 N146PQ #> 2 N153PQ #> 3 N176PQ #> 4 N181PQ #> 5 N197PQ #> 6 N200PQ #> # ...
with 12 more rows 8.6 Set operations No exercises "], -["strings.html", "9 Strings 9.1 Introduction 9.2 String Basics 9.3 Matching Patterns and Regular Expressions 9.4 Tools 9.5 Other types of patterns 9.6 stringi", " 9 Strings 9.1 Introduction Functions and packages covered stringr package str_length str_c str_replace_na str_sub str_to_upper, str_sort, str_to_lower, str_order str_length, str_pad, str_trim, str_sub For regex: str_view, str_view_all regex syntax str_detect str_subset str_count str_extract str_match tidyr::extract str_split str_locate str_sub the stringi package Ideas mention rex. A package with friendly regular expressions. Use it to match country names? Extract numbers from text? Discuss fuzzy joining and string distance, approximate matching. library(tidyverse) library(stringr) 9.2 String Basics 9.2.1 Exercises In code that doesn’t use stringr, you’ll often see paste() and paste0(). What’s the difference between the two functions? What stringr function are they equivalent to? How do the functions differ in their handling of NA? The function paste separates strings with spaces by default, while paste0 does not. paste("foo", "bar") #> [1] "foo bar" paste0("foo", "bar") #> [1] "foobar" Since str_c does not separate strings with spaces by default, it is closer in behavior to paste0. str_c("foo", "bar") #> [1] "foobar" However, str_c and the paste functions handle NA differently. The function str_c propagates NA: if any argument is a missing value, it returns a missing value. This is in line with how the numeric R functions, e.g. sum, mean, handle missing values. The paste functions, by contrast, convert NA to the string "NA" and then treat it like any other character vector. str_c("foo", NA) #> [1] NA paste("foo", NA) #> [1] "foo NA" paste0("foo", NA) #> [1] "fooNA" In your own words, describe the difference between the sep and collapse arguments to str_c(). The sep argument is the string inserted between arguments to str_c, while collapse is the string inserted between the elements when collapsing a character vector into a single string of length one. Use str_length() and str_sub() to extract the middle character from a string. What will you do if the string has an even number of characters? The following function extracts the middle character. If the string has an even number of characters the choice is arbitrary. We choose to select \\(\\lceil n / 2 \\rceil\\), because that case works even if the string is only of length one. A more general method would allow the user to select either the floor or ceiling for the middle character of an even string. x <- c("a", "abc", "abcd", "abcde", "abcdef") L <- str_length(x) m <- ceiling(L / 2) str_sub(x, m, m) #> [1] "a" "b" "b" "c" "c" What does str_wrap() do? When might you want to use it? The function str_wrap wraps text so that it fits within a certain width. This is useful for wrapping long strings of text to be typeset. What does str_trim() do? What’s the opposite of str_trim()? The function str_trim trims the whitespace from a string. str_trim(" abc ") #> [1] "abc" str_trim(" abc ", side = "left") #> [1] "abc " str_trim(" abc ", side = "right") #> [1] " abc" The opposite of str_trim is str_pad, which adds characters to each side. str_pad("abc", 5, side = "both") #> [1] " abc " str_pad("abc", 4, side = "right") #> [1] "abc " str_pad("abc", 4, side = "left") #> [1] " abc" Write a function that turns (e.g.) a vector c(“a”, “b”, “c”) into the string a, b, and c.
Think carefully about what it should do if given a vector of length 0, 1, or 2. Note: See Ch 19 for writing functions. str_commasep <- function(x, sep = ", ", last = ", and ") { if (length(x) > 1) { str_c(str_c(x[-length(x)], collapse = sep), x[length(x)], sep = last) } else { x } } str_commasep("") #> [1] "" str_commasep("a") #> [1] "a" str_commasep(c("a", "b")) #> [1] "a, and b" str_commasep(c("a", "b", "c")) #> [1] "a, b, and c" 9.3 Matching Patterns and Regular Expressions 9.3.1 Exercises Explain why each of these strings don’t match a \\: "\\", "\\\\", "\\\\\\". "\\": This will escape the next character in the R string. "\\\\": This will resolve to \\ in the regular expression, which will escape the next character in the regular expression. "\\\\\\": The first two backslashes will resolve to a literal backslash in the regular expression, the third will escape the next character. So in the regular expression, this will escape some escaped character. How would you match the sequence "'\\ ? What patterns will the regular expression \\..\\..\\.. match? How would you represent it as a string? It will match a dot followed by any character, repeated three times. 9.3.1.1 Exercises How would you match the literal string “$^$”? str_view(c("$^$", "ab$^$sfas"), "^\\\\$\\\\^\\\\$$") Given the corpus of common words in stringr::words, create regular expressions that find all words that: Start with “y”. End with “x” Are exactly three letters long. (Don’t cheat by using str_length()!) Have seven letters or more. Since this list is long, you might want to use the match argument to str_view() to show only the matching or non-matching words. head(stringr::words) #> [1] "a" "able" "about" "absolute" "accept" "account" A simpler way, using str_detect(), is shown later. 9.3.1.2 Character classes and alternatives 9.3.1.2.1 Exercises Create regular expressions to find all words that: Start with a vowel. That only contain consonants. (Hint: thinking about matching “not”-vowels.) End with ed, but not with eed. End with ing or ise. Words starting with vowels str_view(stringr::words, "^[aeiou]") Words that contain only consonants str_view(stringr::words, "^[^aeiou]+$", match = TRUE) This seems to require using the + pattern introduced later, unless one wants to be very verbose and specify words of certain lengths. Words that end with ed but not with eed. This handles the special case of “ed”, as well as words with length > 2. str_view(stringr::words, "^ed$|[^e]ed$", match = TRUE) Words ending in ing or ise: str_view(stringr::words, "i(ng|se)$", match = TRUE) Empirically verify the rule “i before e except after c”. Using only what has been introduced thus far: str_view(stringr::words, "(cei|[^c]ie)", match = TRUE) str_view(stringr::words, "(cie|[^c]ei)", match = TRUE) Using str_detect: sum(str_detect(stringr::words, "(cei|[^c]ie)")) #> [1] 14 sum(str_detect(stringr::words, "(cie|[^c]ei)")) #> [1] 3 Is “q” always followed by a “u”? In the stringr::words dataset, yes. In the full English language, no. str_view(stringr::words, "q[^u]", match = TRUE) Write a regular expression that matches a word if it’s probably written in British English, not American English. In the general case, this is hard.
But there are a few heuristics that get part of the way there: British English uses “ou” instead of “o”; uses “ae” and “oe” instead of “a” and “o”; ends words in “ise” instead of “ize”; and ends words in “yse”. A rough pattern combining these is ou|ise$|ae|oe|yse$. There are others (see https://en.wikipedia.org/wiki/American_and_British_English_spelling_differences), but this is not best handled by a regular expression. It would require a dictionary with differences in spellings for different words. And even then, a good algorithm would be statistical, inferring the probability that a text or word is using the British spelling, rather than some deterministic algorithm. Create a regular expression that will match telephone numbers as commonly written in your country. Using what has been covered in R4DS thus far, x <- c("123-456-7890", "1235-2351") str_view(x, "\\\\d\\\\d\\\\d-\\\\d\\\\d\\\\d-\\\\d\\\\d\\\\d\\\\d") Using what is covered in the next section, str_view(x, "\\\\d{3}-\\\\d{3}-\\\\d{4}") Note that this pattern doesn’t account for phone numbers that are invalid because of an unassigned area code, special numbers like 911, or extensions. See https://en.wikipedia.org/wiki/North_American_Numbering_Plan for the complexities of US phone numbers, and http://stackoverflow.com/questions/123559/a-comprehensive-regex-for-phone-number-validation for one discussion of using a regex for phone number validation. 9.3.2 Repetition 9.3.2.1 Exercises Describe the equivalents of ?, +, * in {m,n} form. The equivalent of ? is {0,1}, matching at most one. The equivalent of + is {1,}, matching one or more. The equivalent of * is {0,}, matching zero or more. Describe in words what these regular expressions match: (read carefully to see if I’m using a regular expression or a string that defines a regular expression.) ^.*$: Any string "\\\\{.+\\\\}": Any string with curly braces surrounding at least one character. \\d{4}-\\d{2}-\\d{2}: A date in “%Y-%m-%d” format: four digits followed by a dash, followed by two digits followed by a dash, followed by another two digits. "\\\\\\\\{4}": This resolves to the regex \\\\{4}, which is four backslashes. Create regular expressions to find all words that: Start with three consonants. Have three or more vowels in a row. Have two or more vowel-consonant pairs in a row. A regex to find all words starting with three consonants str_view(words, "^[^aeiou]{3}", match = TRUE) A regex to find three or more vowels in a row: str_view(words, "[aeiou]{3,}", match = TRUE) Two or more vowel-consonant pairs in a row. str_view(words, "([aeiou][^aeiou]){2,}", match = TRUE) Solve the beginner regexp crosswords at https://regexcrossword.com/challenges/beginner Nope. 9.3.3 Grouping and backreferences str_view(fruit, "(..)\\\\1", match = TRUE) 9.3.3.1 Exercises Describe, in words, what these expressions will match: (.)\\1\\1 : The same character appearing three times in a row. E.g. “aaa”. "(.)(.)\\\\2\\\\1": A pair of characters followed by the same pair of characters in reversed order. E.g. “abba”. (..)\\1: Any two characters repeated. E.g. “a1a1”. "(.).\\\\1.\\\\1": A character followed by any character, the original character, any other character, the original character again. E.g. “abaca”, “b8b.b”. "(.)(.)(.).*\\\\3\\\\2\\\\1": Three characters followed by zero or more characters of any kind followed by the same three characters but in reverse order. E.g. “abcsgasgddsadgsdgcba” or “abccba” or “abc1cba”.
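These descriptions are easy to sanity-check by running each pattern against a matching and a non-matching string (a minimal sketch; the example strings are the ones from the answers above): str_subset(c("aaa", "abc"), "(.)\\\\1\\\\1") #> [1] "aaa" str_subset(c("abba", "abab"), "(.)(.)\\\\2\\\\1") #> [1] "abba" str_subset(c("a1a1", "a1b1"), "(..)\\\\1") #> [1] "a1a1"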
Construct regular expressions to match words that: Start and end with the same character. Assuming the word is more than one character and all strings are considered words, ^(.).*\\1$ str_view(words, "^(.).*\\\\1$", match = TRUE) Contain a repeated pair of letters (e.g. “church” contains “ch” repeated twice.). # any two characters repeated str_view(words, "(..).*\\\\1", match = TRUE) # more stringent, letters only, but also allowing for differences in capitalization str_view(str_to_lower(words), "([a-z][a-z]).*\\\\1", match = TRUE) Contain one letter repeated in at least three places (e.g. “eleven” contains three “e”s.) str_view(words, "(.).*\\\\1.*\\\\1", match = TRUE) 9.4 Tools 9.4.1 Detect matches x <- c("apple", "banana", "pear") str_detect(x, "e") #> [1] TRUE FALSE TRUE Number of words starting with t? sum(str_detect(words, "^t")) #> [1] 65 Proportion of words ending with a vowel? mean(str_detect(words, "[aeiou]$")) #> [1] 0.277 To find all words with no vowels: no_vowels_1 <- !str_detect(words, "[aeiou]") no_vowels_2 <- str_detect(words, "^[^aeiou]+$") identical(no_vowels_1, no_vowels_2) #> [1] TRUE words[str_detect(words, "x$")] #> [1] "box" "sex" "six" "tax" str_subset(words, "x$") #> [1] "box" "sex" "six" "tax" df <- tibble( word = words, i = seq_along(word) ) df %>% filter(str_detect(words, "x$")) #> # A tibble: 4 × 2 #> word i #> <chr> <int> #> 1 box 108 #> 2 sex 747 #> 3 six 772 #> 4 tax 841 Number of matches in each string x <- c("apple", "banana", "pear") str_count(x, "a") #> [1] 1 3 1 Average vowels per word mean(str_count(words, "[aeiou]")) #> [1] 1.99 df %>% mutate( vowels = str_count(word, "[aeiou]"), consonants = str_count(word, "[^aeiou]") ) #> # A tibble: 980 × 4 #> word i vowels consonants #> <chr> <int> <int> <int> #> 1 a 1 1 0 #> 2 able 2 2 2 #> 3 about 3 3 2 #> 4 absolute 4 4 4 #> 5 accept 5 2 4 #> 6 account 6 3 4 #> # ... with 974 more rows Matches do not overlap, and patterns are greedy except when otherwise noted. The basic functions return only the first match; the _all() variants return all matches. 9.4.2 Exercises For each of the following challenges, try solving it by using both a single regular expression, and a combination of multiple str_detect() calls. Find all words that start or end with x. Find all words that start with a vowel and end with a consonant. Are there any words that contain at least one of each different vowel? Words that start or end with x? # one regex words[str_detect(words, "^x|x$")] #> [1] "box" "sex" "six" "tax" # split regex into parts start_with_x <- str_detect(words, "^x") end_with_x <- str_detect(words, "x$") words[start_with_x | end_with_x] #> [1] "box" "sex" "six" "tax" Find all words starting with vowel and ending with consonant. str_subset(words, "^[aeiou].*[^aeiou]$") %>% head() #> [1] "about" "accept" "account" "across" "act" "actual" start_with_vowel <- str_detect(words, "^[aeiou]") end_with_consonant <- str_detect(words, "[^aeiou]$") words[start_with_vowel & end_with_consonant] %>% head() #> [1] "about" "accept" "account" "across" "act" "actual" Words that contain at least one of each vowel. I can’t think of a good way of doing this without using a regex of all the permutations: pattern <- cross_n(rerun(5, c("a", "e", "i", "o", "u")), .filter = function(...)
{ x <- as.character(unlist(list(...))) length(x) != length(unique(x)) }) %>% map_chr(~ str_c(unlist(.x), collapse = ".*")) %>% str_c(collapse = "|") str_subset(words, pattern) #> character(0) words[str_detect(words, "a") & str_detect(words, "e") & str_detect(words, "i") & str_detect(words, "o") & str_detect(words, "u")] #> character(0) There appear to be none. To check that it works, str_subset("aseiouds", pattern) #> [1] "aseiouds" What word has the highest number of vowels? What word has the highest proportion of vowels? (Hint: what is the denominator?) prop_vowels <- str_count(words, "[aeiou]") / str_length(words) words[which(prop_vowels == max(prop_vowels))] #> [1] "a" 9.4.3 Extract Matches The Harvard sentences: length(sentences) #> [1] 720 head(sentences) #> [1] "The birch canoe slid on the smooth planks." #> [2] "Glue the sheet to the dark blue background." #> [3] "It's easy to tell the depth of a well." #> [4] "These days a chicken leg is a rare dish." #> [5] "Rice is often served in round bowls." #> [6] "The juice of lemons makes fine punch." colours <- c("red", "orange", "yellow", "green", "blue", "purple") colour_match <- str_c(colours, collapse = "|") colour_match #> [1] "red|orange|yellow|green|blue|purple" has_colour <- str_subset(sentences, colour_match) matches <- str_extract(has_colour, colour_match) head(matches) #> [1] "blue" "blue" "red" "red" "red" "blue" more <- sentences[str_count(sentences, colour_match) > 1] str_view_all(more, colour_match) str_extract(more, colour_match) #> [1] "blue" "green" "orange" The _all versions of functions return lists. str_extract_all(more, colour_match) #> [[1]] #> [1] "blue" "red" #> #> [[2]] #> [1] "green" "red" #> #> [[3]] #> [1] "orange" "red" str_extract_all(more, colour_match, simplify = TRUE) #> [,1] [,2] #> [1,] "blue" "red" #> [2,] "green" "red" #> [3,] "orange" "red" x <- c("a", "a b", "a b c") str_extract_all(x, "[a-z]", simplify = TRUE) #> [,1] [,2] [,3] #> [1,] "a" "" "" #> [2,] "a" "b" "" #> [3,] "a" "b" "c" 9.4.3.1 Exercises In the previous example, you might have noticed that the regular expression matched “flickered”, which is not a colour. Modify the regex to fix the problem. Add \\b before and after the pattern: colour_match2 <- str_c("\\\\b(", str_c(colours, collapse = "|"), ")\\\\b") colour_match2 #> [1] "\\\\b(red|orange|yellow|green|blue|purple)\\\\b" more2 <- sentences[str_count(sentences, colour_match) > 1] str_view_all(more2, colour_match2, match = TRUE) From the Harvard sentences data, extract: The first word from each sentence. All words ending in ing. All plurals. The first word in each sentence requires defining what a word is. I’ll consider a word to be any contiguous run of letters: str_extract(sentences, "[a-zA-Z]+") %>% head() #> [1] "The" "Glue" "It" "These" "Rice" "The" All words ending in ing: pattern <- "\\\\b[A-Za-z]+ing\\\\b" sentences_with_ing <- str_detect(sentences, pattern) unique(unlist(str_extract_all(sentences[sentences_with_ing], pattern))) %>% head() #> [1] "spring" "evening" "morning" "winding" "living" "king" All plurals. Doing this correctly requires linguistic information. But if we just want to say any word ending in an “s” is plural (requiring more than three characters, to exclude as, is, gas, etc.)
unique(unlist(str_extract_all(sentences, "\\\\b[A-Za-z]{3,}s\\\\b"))) %>% head() #> [1] "planks" "days" "bowls" "lemons" "makes" "hogs" 9.4.4 Grouped Matches noun <- "(a|the) ([^ ]+)" has_noun <- sentences %>% str_subset(noun) %>% head(10) has_noun %>% str_extract(noun) #> [1] "the smooth" "the sheet" "the depth" "a chicken" "the parked" #> [6] "the sun" "the huge" "the ball" "the woman" "a helps" has_noun %>% str_match(noun) #> [,1] [,2] [,3] #> [1,] "the smooth" "the" "smooth" #> [2,] "the sheet" "the" "sheet" #> [3,] "the depth" "the" "depth" #> [4,] "a chicken" "a" "chicken" #> [5,] "the parked" "the" "parked" #> [6,] "the sun" "the" "sun" #> [7,] "the huge" "the" "huge" #> [8,] "the ball" "the" "ball" #> [9,] "the woman" "the" "woman" #> [10,] "a helps" "a" "helps" tibble(sentence = sentences) %>% tidyr::extract( sentence, c("article", "noun"), "(a|the) ([^ ]+)", remove = FALSE ) #> # A tibble: 720 × 3 #> sentence article noun #> * <chr> <chr> <chr> #> 1 The birch canoe slid on the smooth planks. the smooth #> 2 Glue the sheet to the dark blue background. the sheet #> 3 It's easy to tell the depth of a well. the depth #> 4 These days a chicken leg is a rare dish. a chicken #> 5 Rice is often served in round bowls. <NA> <NA> #> 6 The juice of lemons makes fine punch. <NA> <NA> #> # ... with 714 more rows 9.4.4.1 Exercises Find all words that come after a “number” like “one”, “two”, “three” etc. Pull out both the number and the word. I’ll use the same following “word” pattern as used above numword <- "(one|two|three|four|five|six|seven|eight|nine|ten) +(\\\\S+)" sentences[str_detect(sentences, numword)] %>% str_extract(numword) #> [1] "ten served" "one over" "seven books" "two met" #> [5] "two factors" "one and" "three lists" "seven is" #> [9] "two when" "one floor." "ten inches." "one with" #> [13] "one war" "one button" "six minutes." "ten years" #> [17] "one in" "ten chased" "one like" "two shares" #> [21] "two distinct" "one costs" "ten two" "five robins." #> [25] "four kinds" "one rang" "ten him." "three story" #> [29] "ten by" "one wall." "three inches" "ten your" #> [33] "six comes" "one before" "three batches" "two leaves." Find all contractions. Separate out the pieces before and after the apostrophe. contraction <- "([A-Za-z]+)'([A-Za-z]+)" sentences %>% `[`(str_detect(sentences, contraction)) %>% str_extract(contraction) #> [1] "It's" "man's" "don't" "store's" "workmen's" #> [6] "Let's" "sun's" "child's" "king's" "It's" #> [11] "don't" "queen's" "don't" "pirate's" "neighbor's" 9.4.5 Splitting 9.4.5.1 Exercises Split up a string like "apples, pears, and bananas" into individual components. x <- c("apples, pears, and bananas") str_split(x, ", +(and +)?")[[1]] #> [1] "apples" "pears" "bananas" Why is it better to split up by boundary("word") than " "? Splitting by boundary("word") splits on punctuation and not just whitespace. What does splitting with an empty string ("") do? Experiment, and then read the documentation. str_split("ab. cd|agt", "")[[1]] #> [1] "a" "b" "." " " "c" "d" "|" "a" "g" "t" It splits the string into individual characters. 9.5 Other types of patterns 9.5.1 Exercises How would you find all strings containing \\ with regex() vs. with fixed()? str_subset(c("a\\\\b", "ab"), "\\\\\\\\") #> [1] "a\\\\b" str_subset(c("a\\\\b", "ab"), fixed("\\\\")) #> [1] "a\\\\b" What are the five most common words in sentences? 
str_extract_all(sentences, boundary("word")) %>% unlist() %>% str_to_lower() %>% tibble() %>% set_names("word") %>% group_by(word) %>% count(sort = TRUE) %>% head(5) #> # A tibble: 5 × 2 #> word n #> <chr> <int> #> 1 the 751 #> 2 a 202 #> 3 of 132 #> 4 to 123 #> 5 and 118 9.6 stringi 9.6.1 Exercises Find the stringi functions that: Count the number of words. stri_count_words Find duplicated strings. stri_duplicated Generate random text. There are several functions beginning with stri_rand_. stri_rand_lipsum generates lorem ipsum text, stri_rand_strings generates random strings, stri_rand_shuffle randomly shuffles the code points in the text. How do you control the language that stri_sort() uses for sorting? Set the locale argument of stri_opts_collator() and pass the result as the opts_collator argument of stri_sort(). "], -["factors.html", "10 Factors 10.1 Introduction 10.2 Creating Factors 10.3 General Social Survey 10.4 Modifying factor order 10.5 Modifying factor levels", " 10 Factors 10.1 Introduction Functions and packages: forcats factor fct_inorder levels readr::parse_factor fct_reorder fct_relevel fct_reorder2 fct_infreq fct_rev fct_recode fct_lump fct_collapse library("tidyverse") library("forcats") 10.2 Creating Factors No exercises 10.3 General Social Survey 10.3.1 Exercises Explore the distribution of rincome (reported income). What makes the default bar chart hard to understand? How could you improve the plot? rincome_plot <- gss_cat %>% ggplot(aes(rincome)) + geom_bar() rincome_plot The default bar chart labels are too squished to read. One solution is to change the angle of the labels: rincome_plot + theme(axis.text.x = element_text(angle = 90)) But that’s not natural either, because text is vertical, and we read horizontally. So with long labels, it is better to flip it. rincome_plot + coord_flip() This is better, but it unintuitively goes from low to high. It would help if the scale were reversed, and if the missing and non-response levels were distinguished from the income levels. What is the most common relig in this survey? What’s the most common partyid? The most common relig is “Protestant” gss_cat %>% count(relig) %>% arrange(-n) %>% head(1) #> # A tibble: 1 × 2 #> relig n #> <fctr> <int> #> 1 Protestant 10846 The most common partyid is “Independent” gss_cat %>% count(partyid) %>% arrange(-n) %>% head(1) #> # A tibble: 1 × 2 #> partyid n #> <fctr> <int> #> 1 Independent 4119 Which relig does denom (denomination) apply to? How can you find out with a table? How can you find out with a visualisation? levels(gss_cat$denom) #> [1] "No answer" "Don't know" "No denomination" #> [4] "Other" "Episcopal" "Presbyterian-dk wh" #> [7] "Presbyterian, merged" "Other presbyterian" "United pres ch in us" #> [10] "Presbyterian c in us" "Lutheran-dk which" "Evangelical luth" #> [13] "Other lutheran" "Wi evan luth synod" "Lutheran-mo synod" #> [16] "Luth ch in america" "Am lutheran" "Methodist-dk which" #> [19] "Other methodist" "United methodist" "Afr meth ep zion" #> [22] "Afr meth episcopal" "Baptist-dk which" "Other baptists" #> [25] "Southern baptist" "Nat bapt conv usa" "Nat bapt conv of am" #> [28] "Am bapt ch in usa" "Am baptist asso" "Not applicable" From the context it is clear that denom refers to “Protestant” (which is unsurprising, given that it is the largest religion category in the frequency counts). Let’s filter out the non-responses, no answers, others, not-applicable, or no denomination, to leave only answers to denominations. After doing that, the only remaining responses are “Protestant”.
gss_cat %>% filter(!denom %in% c("No answer", "Other", "Don't know", "Not applicable", "No denomination")) %>% count(relig) #> # A tibble: 1 × 2 #> relig n #> <fctr> <int> #> 1 Protestant 7025 This is also clear in a scatter plot of relig vs. denom where the size of the points is proportional to the number of answers (since otherwise there would be overplotting). gss_cat %>% count(relig, denom) %>% ggplot(aes(x = relig, y = denom, size = n)) + geom_point() + theme(axis.text.x = element_text(angle = 90)) 10.4 Modifying factor order 10.4.1 Exercises There are some suspiciously high numbers in tvhours. Is the mean a good summary? summary(gss_cat[["tvhours"]]) #> Min. 1st Qu. Median Mean 3rd Qu. Max. NA's #> 0 1 2 3 4 24 10146 gss_cat %>% filter(!is.na(tvhours)) %>% ggplot(aes(x = tvhours)) + geom_histogram(binwidth = 1) Whether the mean is a good summary depends on what you are using it for :-), i.e. your objective. But probably the median would be what most people prefer. And the distribution of TV hours doesn’t look that surprising to me. For each factor in gss_cat identify whether the order of the levels is arbitrary or principled. The following piece of code uses functions covered in Ch 21 to print out the names of only the factors. keep(gss_cat, is.factor) %>% names() #> [1] "marital" "race" "rincome" "partyid" "relig" "denom" There are six categorical variables: marital, race, rincome, partyid, relig, denom. The ordering of marital is “somewhat principled”. There is some sort of logic in that the levels are grouped “never married”, married at some point (separated, divorced, widowed), and “married”; though it would seem that “Never Married”, “Divorced”, “Widowed”, “Separated”, “Married” might be more natural. I find that the question of ordering can be determined by the level of aggregation in a categorical variable, and there can be more “partially ordered” factors than one would expect. levels(gss_cat[["marital"]]) #> [1] "No answer" "Never married" "Separated" "Divorced" #> [5] "Widowed" "Married" gss_cat %>% ggplot(aes(x = marital)) + geom_bar() The ordering of race is principled in that the categories are ordered by count of observations in the data. levels(gss_cat$race) #> [1] "Other" "Black" "White" "Not applicable" gss_cat %>% ggplot(aes(race)) + geom_bar(drop = FALSE) #> Warning: Ignoring unknown parameters: drop The levels of rincome are ordered in decreasing order of the income; however the placement of “No answer”, “Don’t know”, and “Refused” before, and “Not applicable” after the income levels is arbitrary. It would be better to place all the missing income level categories either before or after all the known values. levels(gss_cat$rincome) #> [1] "No answer" "Don't know" "Refused" "$25000 or more" #> [5] "$20000 - 24999" "$15000 - 19999" "$10000 - 14999" "$8000 to 9999" #> [9] "$7000 to 7999" "$6000 to 6999" "$5000 to 5999" "$4000 to 4999" #> [13] "$3000 to 3999" "$1000 to 2999" "Lt $1000" "Not applicable" The ordering of relig is arbitrary: there is no natural ordering, and the levels don’t appear to be ordered by any statistic within the dataset. levels(gss_cat$relig) #> [1] "No answer" "Don't know" #> [3] "Inter-nondenominational" "Native american" #> [5] "Christian" "Orthodox-christian" #> [7] "Moslem/islam" "Other eastern" #> [9] "Hinduism" "Buddhism" #> [11] "Other" "None" #> [13] "Jewish" "Catholic" #> [15] "Protestant" "Not applicable" gss_cat %>% ggplot(aes(relig)) + geom_bar() + coord_flip() The same goes for denom.
levels(gss_cat$denom) #> [1] "No answer" "Don't know" "No denomination" #> [4] "Other" "Episcopal" "Presbyterian-dk wh" #> [7] "Presbyterian, merged" "Other presbyterian" "United pres ch in us" #> [10] "Presbyterian c in us" "Lutheran-dk which" "Evangelical luth" #> [13] "Other lutheran" "Wi evan luth synod" "Lutheran-mo synod" #> [16] "Luth ch in america" "Am lutheran" "Methodist-dk which" #> [19] "Other methodist" "United methodist" "Afr meth ep zion" #> [22] "Afr meth episcopal" "Baptist-dk which" "Other baptists" #> [25] "Southern baptist" "Nat bapt conv usa" "Nat bapt conv of am" #> [28] "Am bapt ch in usa" "Am baptist asso" "Not applicable" Ignoring “No answer”, “Don’t know”, and “Other party”, the levels of partyid are ordered from “Strong Republican”" to “Strong Democrat”. levels(gss_cat$partyid) #> [1] "No answer" "Don't know" "Other party" #> [4] "Strong republican" "Not str republican" "Ind,near rep" #> [7] "Independent" "Ind,near dem" "Not str democrat" #> [10] "Strong democrat" Why did moving “Not applicable” to the front of the levels move it to the bottom of the plot? Because that gives the level “Not applicable” an integer value of 1. 10.5 Modifying factor levels 10.5.1 Exercises How have the proportions of people identifying as Democrat, Republican, and Independent changed over time? To answer that, we need to combine the multiple levels into Democrat, Republican, and Independent levels(gss_cat$partyid) #> [1] "No answer" "Don't know" "Other party" #> [4] "Strong republican" "Not str republican" "Ind,near rep" #> [7] "Independent" "Ind,near dem" "Not str democrat" #> [10] "Strong democrat" gss_cat %>% mutate(partyid = fct_collapse(partyid, other = c("No answer", "Don't know", "Other party"), rep = c("Strong republican", "Not str republican"), ind = c("Ind,near rep", "Independent", "Ind,near dem"), dem = c("Not str democrat", "Strong democrat"))) %>% count(year, partyid) %>% group_by(year) %>% mutate(p = n / sum(n)) %>% ggplot(aes(x = year, y = p, colour = fct_reorder2(partyid, year, p))) + geom_point() + geom_line() + labs(colour = "Party ID.") How could you collapse rincome into a small set of categories? Group all the non-responses into one category, and then group other categories into a smaller number. Since there is a clear ordering, we wouldn’t want to use something like fct_lump. levels(gss_cat$rincome) #> [1] "No answer" "Don't know" "Refused" "$25000 or more" #> [5] "$20000 - 24999" "$15000 - 19999" "$10000 - 14999" "$8000 to 9999" #> [9] "$7000 to 7999" "$6000 to 6999" "$5000 to 5999" "$4000 to 4999" #> [13] "$3000 to 3999" "$1000 to 2999" "Lt $1000" "Not applicable" library("stringr") gss_cat %>% mutate(rincome = fct_collapse( rincome, `Unknown` = c("No answer", "Don't know", "Refused", "Not applicable"), `Lt $5000` = c("Lt $1000", str_c("$", c("1000", "3000", "4000"), " to ", c("2999", "3999", "4999"))), `$5000 to 10000` = str_c("$", c("5000", "6000", "7000", "8000"), " to ", c("5999", "6999", "7999", "9999")) )) %>% ggplot(aes(x = rincome)) + geom_bar() + coord_flip() "], -["dates-and-times.html", "11 Dates and Times 11.1 Prerequisite 11.2 Creating date/times 11.3 Date-Time Components 11.4 Time Spans", " 11 Dates and Times lubridate today, now ymd etc., ymd_hms etc. 
make_datetime, make_date as_datetime, as_date year, month, mday, yday, wday and year<- floor_date, round_date, ceiling_date update as.duration, duration functions (ddays, etc) period functions (days, months, etc) interval creation with %--% with_tz, force_tz hms package has times Ideas for applications: CDB90 data, COW war start end and duration Read more on time-zones: https://en.wikipedia.org/wiki/Time_zone Computerphile The Problem with Time & Timezones - Computerphile The history of the tz database is itself interesting: https://en.wikipedia.org/wiki/Tz_database A literary appreciation of the Olson/Zoneinfo/tz database I think time-zones are likely a point for social science research in and of themselves. Policy choices. Coordination. Regression discontinuity designs. Just sayin… 11.1 Prerequisite library(tidyverse) library(lubridate) library(nycflights13) 11.2 Creating date/times NOTE %/% is integer division, divide and throw away the remainder. %% calculates the modulus (remainder of division). For example to test for an even number: x %% 2 == 0, or odd x %% 2 == 1. To get the thousands value of a number x %/% 1000. make_datetime_100 <- function(year, month, day, time) { make_datetime(year, month, day, time %/% 100, time %% 100) } flights_dt <- flights %>% filter(!is.na(dep_time), !is.na(arr_time)) %>% mutate( dep_time = make_datetime_100(year, month, day, dep_time), arr_time = make_datetime_100(year, month, day, arr_time), sched_dep_time = make_datetime_100(year, month, day, sched_dep_time), sched_arr_time = make_datetime_100(year, month, day, sched_arr_time) ) %>% select(origin, dest, ends_with("delay"), ends_with("time")) flights_dt %>% head #> # A tibble: 6 × 9 #> origin dest dep_delay arr_delay dep_time sched_dep_time #> <chr> <chr> <dbl> <dbl> <dttm> <dttm> #> 1 EWR IAH 2 11 2013-01-01 05:17:00 2013-01-01 05:15:00 #> 2 LGA IAH 4 20 2013-01-01 05:33:00 2013-01-01 05:29:00 #> 3 JFK MIA 2 33 2013-01-01 05:42:00 2013-01-01 05:40:00 #> 4 JFK BQN -1 -18 2013-01-01 05:44:00 2013-01-01 05:45:00 #> 5 LGA ATL -6 -25 2013-01-01 05:54:00 2013-01-01 06:00:00 #> 6 EWR ORD -4 12 2013-01-01 05:54:00 2013-01-01 05:58:00 #> # ... with 3 more variables: arr_time <dttm>, sched_arr_time <dttm>, #> # air_time <dbl> Times are often stored as integers counting from a reference time, called an epoch. The most common epoch is the Unix (or POSIX) epoch of 1970-01-01 00:00:00. So, internally, times are stored as the number of days, seconds, milliseconds, etc. since 1970-01-01 00:00:00.000. Calculate dates and datetimes from number of seconds (as_datetime) or days (as_date) from the Unix epoch. as_datetime(60 * 60 * 10) #> [1] "1970-01-01 10:00:00 UTC" as_date(365 * 10 + 2) #> [1] "1980-01-01" 11.2.1 Exercises What happens if you parse a string that contains invalid dates? ret <- ymd(c("2010-10-10", "bananas")) #> Warning: 1 failed to parse. print(class(ret)) #> [1] "Date" ret #> [1] "2010-10-10" NA It produces an NA and a warning message. What does the tzone argument to today() do? Why is it important? It determines the time-zone of the date. Since different time-zones can have different dates, the value of today() can vary depending on the time-zone specified.
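For example (a minimal illustration; the actual values depend on when it is run): late in the evening in New York it is already the next day in New Zealand, so these two calls can return different dates. today(tzone = "America/New_York") today(tzone = "Pacific/Auckland")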
Use the appropriate lubridate function to parse each of the following dates: d1 <- "January 1, 2010" mdy(d1) #> [1] "2010-01-01" d2 <- "2015-Mar-07" ymd(d2) #> [1] "2015-03-07" d3 <- "06-Jun-2017" dmy(d3) #> [1] "2017-06-06" d4 <- c("August 19 (2015)", "July 1 (2015)") mdy(d4) #> [1] "2015-08-19" "2015-07-01" d5 <- "12/30/14" # Dec 30, 2014 mdy(d5) #> [1] "2014-12-30" 11.3 Date-Time Components sched_dep <- flights_dt %>% mutate(minute = minute(sched_dep_time)) %>% group_by(minute) %>% summarise( avg_delay = mean(arr_delay, na.rm = TRUE), n = n()) Note The difference between rounded and unrounded dates provides the within period time. (datetime <- ymd_hms("2016-07-08 12:34:56")) #> [1] "2016-07-08 12:34:56 UTC" year(datetime) <- 2020 datetime #> [1] "2020-07-08 12:34:56 UTC" month(datetime) <- 01 datetime #> [1] "2020-01-08 12:34:56 UTC" hour(datetime) <- hour(datetime) + 1 datetime #> [1] "2020-01-08 13:34:56 UTC" 11.3.1 Exercises How does the distribution of flight times within a day change over the course of the year? Let’s try plotting this by month: flights_dt %>% mutate(time = hour(dep_time) * 100 + minute(dep_time), mon = as.factor(month(dep_time))) %>% ggplot(aes(x = time, group = mon, color = mon)) + geom_freqpoly(binwidth = 100) This will look better if everything is normalized within groups. The reason that February is lower is that there are fewer days and thus fewer flights. flights_dt %>% mutate(time = hour(dep_time) * 100 + minute(dep_time), mon = as.factor(month(dep_time))) %>% ggplot(aes(x = time, y = ..density.., group = mon, color = mon)) + geom_freqpoly(binwidth = 100) At least to me there doesn’t appear to be much difference in the within-day distribution over the course of the year, but I may be thinking about it incorrectly. Compare dep_time, sched_dep_time and dep_delay. Are they consistent? Explain your findings. If they are consistent, then dep_time = sched_dep_time + dep_delay. flights_dt %>% mutate(dep_time_ = sched_dep_time + dep_delay * 60) %>% filter(dep_time_ != dep_time) %>% select(dep_time_, dep_time, sched_dep_time, dep_delay) #> # A tibble: 1,205 × 4 #> dep_time_ dep_time sched_dep_time dep_delay #> <dttm> <dttm> <dttm> <dbl> #> 1 2013-01-02 08:48:00 2013-01-01 08:48:00 2013-01-01 18:35:00 853 #> 2 2013-01-03 00:42:00 2013-01-02 00:42:00 2013-01-02 23:59:00 43 #> 3 2013-01-03 01:26:00 2013-01-02 01:26:00 2013-01-02 22:50:00 156 #> 4 2013-01-04 00:32:00 2013-01-03 00:32:00 2013-01-03 23:59:00 33 #> 5 2013-01-04 00:50:00 2013-01-03 00:50:00 2013-01-03 21:45:00 185 #> 6 2013-01-04 02:35:00 2013-01-03 02:35:00 2013-01-03 23:59:00 156 #> # ... with 1,199 more rows There are discrepancies. It looks like there are mistakes in the dates. These are flights in which the actual departure time is on the next day relative to the scheduled departure time. We forgot to account for this when creating the date-times. The code would have had to check whether the departure time is less than the scheduled departure time. Alternatively, simply adding the delay time is more robust because it will automatically account for crossing into the next day. Compare air_time with the duration between the departure and arrival. Explain your findings.
flights_dt %>% mutate(flight_duration = as.numeric(arr_time - dep_time), air_time_mins = air_time, diff = flight_duration - air_time_mins) %>% select(origin, dest, flight_duration, air_time_mins, diff) #> # A tibble: 328,063 × 5 #> origin dest flight_duration air_time_mins diff #> <chr> <chr> <dbl> <dbl> <dbl> #> 1 EWR IAH 193 227 -34 #> 2 LGA IAH 197 227 -30 #> 3 JFK MIA 221 160 61 #> 4 JFK BQN 260 183 77 #> 5 LGA ATL 138 116 22 #> 6 EWR ORD 106 150 -44 #> # ... with 3.281e+05 more rows The two are not the same, likely because dep_time and arr_time are recorded in the local time zones of the airports (so flights crossing time zones are off by the zone difference), and because air_time does not include time spent taxiing on the ground. How does the average delay time change over the course of a day? Should you use dep_time or sched_dep_time? Why? Use sched_dep_time because that is the relevant metric for someone scheduling a flight. Also, using dep_time will always bias delays to later in the day since delays will push flights later. flights_dt %>% mutate(sched_dep_hour = hour(sched_dep_time)) %>% group_by(sched_dep_hour) %>% summarise(dep_delay = mean(dep_delay)) %>% ggplot(aes(y = dep_delay, x = sched_dep_hour)) + geom_point() + geom_smooth() #> `geom_smooth()` using method = 'loess' On what day of the week should you leave if you want to minimise the chance of a delay? Sunday has the lowest average departure delay time and the lowest average arrival delay time. flights_dt %>% mutate(dow = wday(sched_dep_time)) %>% group_by(dow) %>% summarise(dep_delay = mean(dep_delay), arr_delay = mean(arr_delay, na.rm = TRUE)) #> # A tibble: 7 × 3 #> dow dep_delay arr_delay #> <dbl> <dbl> <dbl> #> 1 1 11.5 4.82 #> 2 2 14.7 9.65 #> 3 3 10.6 5.39 #> 4 4 11.7 7.05 #> 5 5 16.1 11.74 #> 6 6 14.7 9.07 #> # ... with 1 more rows What makes the distribution of diamonds$carat and flights$sched_dep_time similar? ggplot(diamonds, aes(x = carat)) + geom_density() In both carat and sched_dep_time there are abnormally large numbers of values at nice “human” numbers. In sched_dep_time it is at 00 and 30 minutes. In carats, it is at 0, 1/3, 1/2, 2/3, and so on. ggplot(diamonds, aes(x = carat %% 1 * 100)) + geom_histogram(binwidth = 1) In scheduled departure times it is 00 and 30 minutes, and minutes ending in 0 and 5. ggplot(flights_dt, aes(x = minute(sched_dep_time))) + geom_histogram(binwidth = 1) Confirm my hypothesis that the early departures of flights in minutes 20-30 and 50-60 are caused by scheduled flights that leave early. Hint: create a binary variable that tells you whether or not a flight was delayed. At the minute level, there doesn’t appear to be anything: flights_dt %>% mutate(early = dep_delay < 0, minute = minute(sched_dep_time)) %>% group_by(minute) %>% summarise(early = mean(early)) %>% ggplot(aes(x = minute, y = early)) + geom_point() But if grouped in 10 minute intervals, there is a higher proportion of early flights during those minutes. flights_dt %>% mutate(early = dep_delay < 0, minute = minute(sched_dep_time) %% 10) %>% group_by(minute) %>% summarise(early = mean(early)) %>% ggplot(aes(x = minute, y = early)) + geom_point() 11.4 Time Spans duration: exact number of seconds period: human time periods - e.g. weeks, months interval: start and end points 11.4.1 Durations No exercises 11.4.2 Periods Define overnight when arr_time < dep_time (no flights > 24 hours): flights_dt <- flights_dt %>% mutate( overnight = arr_time < dep_time, arr_time = arr_time + days(overnight * 1), sched_arr_time = sched_arr_time + days(overnight * 1) ) 11.4.3 Intervals NOTE This section seems less complete than the others. Refer to the lubridate vignette for more information. 11.4.4 Exercises Why is there months() but no dmonths()?
There is no direct unambiguous value of months in seconds: 31 days: Jan, Mar, May, Jul, Aug, Oct, Dec; 30 days: Apr, Jun, Sep, Nov; 28 or 29 days: Feb. Though in the past, in the pre-computer era, for arithmetic convenience, bankers adopted a 360-day year with twelve 30-day months. Explain days(overnight * 1) to someone who has just started learning R. How does it work? overnight is equal to TRUE (1) or FALSE (0). So if it is an overnight flight, this becomes 1 day, and if not, then overnight = 0, and no days are added to the date. Create a vector of dates giving the first day of every month in 2015. Create a vector of dates giving the first day of every month in the current year. A vector of the first day of the month for every month in 2015: ymd("2015-01-01") + months(0:11) #> [1] "2015-01-01" "2015-02-01" "2015-03-01" "2015-04-01" "2015-05-01" #> [6] "2015-06-01" "2015-07-01" "2015-08-01" "2015-09-01" "2015-10-01" #> [11] "2015-11-01" "2015-12-01" To get the vector of the first day of the month for this year, we first need to figure out what this year is, and get January 1st of it. I can do that by taking today() and truncating it to the year using floor_date: floor_date(today(), unit = "year") + months(0:11) #> [1] "2017-01-01" "2017-02-01" "2017-03-01" "2017-04-01" "2017-05-01" #> [6] "2017-06-01" "2017-07-01" "2017-08-01" "2017-09-01" "2017-10-01" #> [11] "2017-11-01" "2017-12-01" Write a function that given your birthday (as a date), returns how old you are in years. age <- function(bday) { (bday %--% today()) %/% years(1) } age(ymd("1990-10-12")) #> Note: method with signature 'Timespan#Timespan' chosen for function '%/%', #> target signature 'Interval#Period'. #> "Interval#ANY", "ANY#Period" would also be valid #> [1] 26 Why can’t (today() %--% (today() + years(1)) / months(1) work? As written the expression is missing a closing parenthesis, but once the parentheses are balanced it works. today() is a date, today() + years(1) is a valid endpoint for an interval, and months(1) is a period, so the interval can be divided by it. (today() %--% (today() + years(1))) %/% months(1) #> [1] 12 (today() %--% (today() + years(1))) / months(1) #> [1] 12 11.4.5 Time Zones No exercises. But time-zones are hell. Be happy you aren’t dealing with financial data. "], -["vectors.html", "12 Vectors 12.1 Introduction 12.2 Important types of Atomic Vector 12.3 Using atomic vectors 12.4 Recursive Vectors (lists) 12.5 Augmented Vectors", " 12 Vectors 12.1 Introduction Functions mentioned typeof dplyr::near is.finite, is.nan, is.na attributes library("tidyverse") #> Loading tidyverse: ggplot2 #> Loading tidyverse: tibble #> Loading tidyverse: tidyr #> Loading tidyverse: readr #> Loading tidyverse: purrr #> Loading tidyverse: dplyr #> Conflicts with tidy packages ---------------------------------------------- #> filter(): dplyr, stats #> lag(): dplyr, stats 12.2 Important types of Atomic Vector Why does this matter? 99% of the time in the work you do, it won’t. Someone else has written the numerical methods and (hopefully) accounted for these issues. And the types of problems you encounter in social science generally don’t run into these issues. However, if you aren’t even aware that “floating point numbers” are a “thing”, then if something goes wrong, it will seem like magic. Also, at least being aware of these problems will help you understand error messages from optimization routines that complain about “numerical precision”. 12.2.1 Exercises Describe the difference between is.finite(x) and !is.infinite(x).
To find out, try the functions on a numeric vector that includes a number and the four special values (NA, NaN, Inf, -Inf). x <- c(0, NA, NaN, Inf, -Inf) is.finite(x) #> [1] TRUE FALSE FALSE FALSE FALSE !is.infinite(x) #> [1] TRUE TRUE TRUE FALSE FALSE is.finite considers only an ordinary number to be finite, and considers missing (NA), not a number (NaN), and positive and negative infinity to be not finite. However, since is.infinite only considers Inf and -Inf to be infinite, !is.infinite considers 0 as well as missing and not-a-number to be not infinite. So NA and NaN are neither finite nor infinite. Mind blown. Read the source code for dplyr::near() (Hint: to see the source code, drop the ()). How does it work? The source for dplyr::near is: dplyr::near #> function (x, y, tol = .Machine$double.eps^0.5) #> { #> abs(x - y) < tol #> } #> <environment: namespace:dplyr> Instead of checking for exact equality, it checks that two numbers are within a certain tolerance, tol. By default the tolerance is set to the square root of .Machine$double.eps, the machine epsilon: the smallest double \\(x\\) such that \\(1 + x \\neq 1\\). A logical vector can take 3 possible values. How many possible values can an integer vector take? How many possible values can a double take? Use google to do some research. The help for .Machine describes some of this: all current implementations of R use 32-bit integers and use IEC 60559 floating-point (double precision) arithmetic. So an integer vector can take about \\(2^{32}\\) values (one bit pattern is reserved for NA), and since the IEC 60559 (IEEE 754) double format uses 64 bits, a double can take about \\(2^{64}\\) values, less the bit patterns reserved for NA, NaN, and \\(\\pm\\infty\\). Brainstorm at least four functions that allow you to convert a double to an integer. How do they differ? Be precise. Broadly, one could convert a double to an integer by truncating or rounding to the nearest integer. For truncation, or for handling ties (doubles ending in 0.5), there are multiple methods for determining which integer value to go to. For the values 0.5, -0.5, 1.5, -1.5 the methods give: towards zero: 0, 0, 1, -1; away from zero: 1, -1, 2, -2; largest (towards \\(+\\infty\\)): 1, 0, 2, -1; smallest (towards \\(-\\infty\\)): 0, -1, 1, -2; ties to even: 0, 0, 2, -2; ties to odd: 1, -1, 1, -1. See the Wikipedia article IEEE floating point for rounding rules. For rounding, R and many programming languages use the IEEE standard. This is “round to nearest, ties to even”. This is not the same as the “always round half up” rule commonly taught in school. See the value of .Machine$double.rounding and its documentation. round2 <- function(x) { # round half towards +Inf q <- x %/% 1 r <- x %% 1 q + (r >= 0.5) } x <- c(-12.5, -11.5, 11.5, 12.5) round(x) #> [1] -12 -12 12 12 round2(x) #> [1] -12 -11 12 13 The problem with the “always round 0.5 up” rule is that it is biased upwards. Rounding to nearest with ties towards even is not. Consider the sequence \\(-100.5, -99.5, \\dots, 99.5, 100.5\\). Its sum is 0. It would be nice if rounding preserved that sum. Using “ties towards even”, the sum is still zero. However, “ties towards \\(+\\infty\\)” produces a non-zero number. x <- seq(-100.5, 100.5, by = 1) sum(x) #> [1] 0 sum(round(x)) #> [1] 0 sum(round2(x)) #> [1] 101 Here’s a real-world non-engineering example of rounding going terribly wrong. In 1983, the Vancouver stock exchange adjusted its index from 524.811 to 1098.892 to correct for accumulated error due to rounding to three decimal points (see Vancouver Stock Exchange). Here’s a list of a few more.
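Base R’s round() follows this “round half to even” rule, which is easy to verify on exact halves (the values below are exactly representable as doubles, so there are no floating-point surprises): round(c(0.5, 1.5, 2.5, 3.5)) #> [1] 0 2 2 4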
What functions from the readr package allow you to turn a string into logical, integer, and double vector? The functions parse_logical, parse_integer, and parse_number. parse_logical(c("TRUE", "FALSE", "1", "0", "true", "t", "NA")) #> [1] TRUE FALSE TRUE FALSE TRUE TRUE NA parse_integer(c("1235", "0134", "NA")) #> [1] 1235 134 NA parse_number(c("1.0", "3.5", "1,000", "NA")) #> [1] 1.0 3.5 1000.0 NA Read the documentation of parse_number: in order to ignore things like currency symbols and comma separators in number strings, it uses a heuristic. 12.3 Using atomic vectors What does mean(is.na(x)) tell you about a vector x? What about sum(!is.finite(x))? The expression mean(is.na(x)) calculates the proportion of missing values in a vector x <- c(1:10, NA, NaN, Inf, -Inf) mean(is.na(x)) #> [1] 0.143 The expression mean(!is.finite(x)) calculates the proportion of values that are NA, NaN, or infinite; sum(!is.finite(x)) would count them instead. mean(!is.finite(x)) #> [1] 0.286 Carefully read the documentation of is.vector(). What does it actually test for? Why does is.atomic() not agree with the definition of atomic vectors above? The function is.vector only checks whether the object has no attributes other than names. Thus a list is a vector: is.vector(list(a = 1, b = 2)) #> [1] TRUE But any object that has an attribute (other than names) is not: x <- 1:10 attr(x, "something") <- TRUE is.vector(x) #> [1] FALSE The idea behind this is that object-oriented classes will include attributes, including, but not limited to, "class". The function is.atomic explicitly checks whether an object is one of the atomic types (“logical”, “integer”, “numeric”, “complex”, “character”, and “raw”) or NULL. is.atomic(1:10) #> [1] TRUE is.atomic(list(a = 1)) #> [1] FALSE The function is.atomic will consider objects to be atomic even if they have extra attributes. is.atomic(x) #> [1] TRUE Compare and contrast setNames() with purrr::set_names(). These are simple functions, so we can just print out their source code: setNames #> function (object = nm, nm) #> { #> names(object) <- nm #> object #> } #> <bytecode: 0x7fb54902f638> #> <environment: namespace:stats> purrr::set_names #> function (x, nm = x) #> { #> if (!is_vector(x)) { #> stop("`x` must be a vector", call. = FALSE) #> } #> if (length(x) != length(nm)) { #> stop("`x` and `nm` must be the same length", call. = FALSE) #> } #> names(x) <- nm #> x #> } #> <environment: namespace:purrr> From the code we can see that set_names adds a few sanity checks: x has to be a vector, and the lengths of the object and the names have to be the same. Create functions that take a vector as input and return: The last value. Should you use [ or [[? The elements at even-numbered positions. Every element except the last value. Only even numbers (and no missing values).
last_value <- function(x) { # check for case with no length if (length(x)) { # Use [[ as suggested because it returns one element x[[length(x)]] } else { x } } last_value(numeric()) #> numeric(0) last_value(1) #> [1] 1 last_value(1:10) #> [1] 10 even_indices <- function(x) { if (length(x)) { x[seq_along(x) %% 2 == 0] } else { x } } even_indices(numeric()) #> numeric(0) even_indices(1) #> numeric(0) even_indices(1:10) #> [1] 2 4 6 8 10 # test with a character vector to ensure that values, not indices, are returned even_indices(letters) #> [1] "b" "d" "f" "h" "j" "l" "n" "p" "r" "t" "v" "x" "z" not_last <- function(x) { if (length(x)) { x[-length(x)] } else { x } } not_last(1:5) #> [1] 1 2 3 4 even_numbers <- function(x) { x[!is.na(x) & (x %% 2 == 0)] } even_numbers(-10:10) #> [1] -10 -8 -6 -4 -2 0 2 4 6 8 10 Why is x[-which(x > 0)] not the same as x[x <= 0]? They will treat missing values differently. x <- c(-5:5, Inf, -Inf, NaN, NA) x[-which(x > 0)] #> [1] -5 -4 -3 -2 -1 0 -Inf NaN NA -which(x > 0) #> [1] -7 -8 -9 -10 -11 -12 x[x <= 0] #> [1] -5 -4 -3 -2 -1 0 -Inf NA NA x <= 0 #> [1] TRUE TRUE TRUE TRUE TRUE TRUE FALSE FALSE FALSE FALSE FALSE #> [12] FALSE TRUE NA NA The expression which(x > 0) gives the indexes of values for which the comparison is TRUE and ignores NA and NaN, so x[-which(x > 0)] keeps NA and NaN: the comparison is not TRUE for them, and their indexes are never dropped. x <= 0 works slightly differently. Where x <= 0 returns TRUE or FALSE it works the same way. However, where the comparison generates an NA, subsetting will keep that entry but set it to NA. This is why the last two values of x[x <= 0] are NA rather than c(NaN, NA). What happens when you subset with a positive integer that’s bigger than the length of the vector? What happens when you subset with a name that doesn’t exist? When you subset with positive integers that are larger than the length of the vector, NA values are returned for those integers larger than the length of the vector. (1:10)[11:12] #> [1] NA NA When a vector is subset with [[ using a name that doesn’t exist, an error is generated. c(a = 1, 2)[["b"]] #> Error in c(a = 1, 2)[["b"]]: subscript out of bounds 12.4 Recursive Vectors (lists) 12.4.1 Exercises Draw the following lists as nested sets: list(a, b, list(c, d), list(e, f)) list(list(list(list(list(list(a)))))) What happens if you subset a tibble as if you’re subsetting a list? What are the key differences between a list and a tibble? Subsetting a tibble works the same way as a list; a data frame can be thought of as a list of columns. The key difference between a list and a tibble is that a tibble (data frame) has the restriction that all its elements (columns) must have the same length. x <- tibble(a = 1:2, b = 3:4) x[["a"]] #> [1] 1 2 x["a"] #> # A tibble: 2 × 1 #> a #> <int> #> 1 1 #> 2 2 x[1] #> # A tibble: 2 × 1 #> a #> <int> #> 1 1 #> 2 2 x[1, ] #> # A tibble: 1 × 2 #> a b #> <int> <int> #> 1 1 3 12.5 Augmented Vectors 12.5.1 Exercises What does hms::hms(3600) return? How does it print? What primitive type is the augmented vector built on top of? What attributes does it use? x <- hms::hms(3600) class(x) #> [1] "hms" "difftime" x #> 01:00:00 hms::hms returns an object of class hms (built on difftime), and prints the time in “%H:%M:%S” format. The primitive type is a double: typeof(x) #> [1] "double" The attributes it uses are "units" and "class". attributes(x) #> $units #> [1] "secs" #> #> $class #> [1] "hms" "difftime" Try and make a tibble that has columns with different lengths. What happens?
If I try to create a tibble with a scalar and a column of a different length there are no issues; the scalar is recycled to the length of the longer vector. tibble(x = 1, y = 1:5) #> # A tibble: 5 × 2 #> x y #> <dbl> <int> #> 1 1 1 #> 2 1 2 #> 3 1 3 #> 4 1 4 #> 5 1 5 However, if I try to create a tibble with two vectors of different lengths (other than one), the tibble function throws an error. tibble(x = 1:3, y = 1:4) #> Error: Variables must be length 1 or 4. #> Problem variables: 'x' Based on the definition above, is it ok to have a list as a column of a tibble? If I didn’t already know the answer, what I would do is try it out. From the above, the error message was about vectors having different lengths. But there is nothing that prevents a tibble from having vectors of different types: doubles, character, integers, logical, factor, date. The latter two are still atomic, but they have additional attributes. So, maybe there won’t be an issue with a list vector as long as it is the same length. tibble(x = 1:3, y = list("a", 1, list(1:3))) #> # A tibble: 3 × 2 #> x y #> <int> <list> #> 1 1 <chr [1]> #> 2 2 <dbl [1]> #> 3 3 <list [1]> It works! I even used a list with heterogeneous types and there wasn’t an issue. In the following chapters we’ll see that list vectors can be very useful: for example, when processing many different models. "], -["iteration.html", "13 Iteration 13.1 Introduction 13.2 For Loops 13.3 For loop variations 13.4 For loops vs. functionals 13.5 The map functions 13.6 Dealing with Failure 13.7 Mapping over multiple arguments 13.8 Walk 13.9 Other patterns of for loops", " 13 Iteration 13.1 Introduction purrr package for loop while seq_len, seq_along unlist bind_rows, bind_cols, purrr::flatten_dbl Map functions in purrr: map and type-specific variants map_lgl, map_chr, map_int, map_dbl. col_summary apply functions in base R: lapply, sapply, vapply safely, quietly, possibly walk and variants keep, discard, some, every, head_while, tail_while, detect, detect_index reduce library("tidyverse") library("stringr") 13.2 For Loops 13.2.1 Exercises Write for loops to: Compute the mean of every column in mtcars. Determine the type of each column in nycflights13::flights. Compute the number of unique values in each column of iris. Generate 10 random normals for each of \\(\\mu = -10\\), 0, 10, and 100. Think about the output, sequence, and body before you start writing the loop. To compute the mean of every column in mtcars: output <- vector("double", ncol(mtcars)) names(output) <- names(mtcars) for (i in names(mtcars)) { output[i] <- mean(mtcars[[i]]) } output #> mpg cyl disp hp drat wt qsec vs am #> 20.091 6.188 230.722 146.688 3.597 3.217 17.849 0.438 0.406 #> gear carb #> 3.688 2.812 Determine the type of each column in nycflights13::flights. Note that we need to use a list, not a character vector, since the class can have multiple values.
data("flights", package = "nycflights13") output <- vector("list", ncol(flights)) names(output) <- names(flights) for (i in names(flights)) { output[[i]] <- class(flights[[i]]) } output #> $year #> [1] "integer" #> #> $month #> [1] "integer" #> #> $day #> [1] "integer" #> #> $dep_time #> [1] "integer" #> #> $sched_dep_time #> [1] "integer" #> #> $dep_delay #> [1] "numeric" #> #> $arr_time #> [1] "integer" #> #> $sched_arr_time #> [1] "integer" #> #> $arr_delay #> [1] "numeric" #> #> $carrier #> [1] "character" #> #> $flight #> [1] "integer" #> #> $tailnum #> [1] "character" #> #> $origin #> [1] "character" #> #> $dest #> [1] "character" #> #> $air_time #> [1] "numeric" #> #> $distance #> [1] "numeric" #> #> $hour #> [1] "numeric" #> #> $minute #> [1] "numeric" #> #> $time_hour #> [1] "POSIXct" "POSIXt" data(iris) iris_uniq <- vector("double", ncol(iris)) names(iris_uniq) <- names(iris) for (i in names(iris)) { iris_uniq[i] <- length(unique(iris[[i]])) } iris_uniq #> Sepal.Length Sepal.Width Petal.Length Petal.Width Species #> 35 23 43 22 3 # number to draw n <- 10 # values of the mean mu <- c(-10, 0, 10, 100) normals <- vector("list", length(mu)) for (i in seq_along(normals)) { normals[[i]] <- rnorm(n, mean = mu[i]) } normals #> [[1]] #> [1] -11.40 -9.74 -12.44 -10.01 -9.38 -8.85 -11.82 -10.25 -10.24 -10.28 #> #> [[2]] #> [1] -0.5537 0.6290 2.0650 -1.6310 0.5124 -1.8630 -0.5220 -0.0526 #> [9] 0.5430 -0.9141 #> #> [[3]] #> [1] 10.47 10.36 8.70 10.74 11.89 9.90 9.06 9.98 9.17 8.49 #> #> [[4]] #> [1] 100.9 100.2 100.2 101.6 100.1 99.9 98.1 99.7 99.7 101.1 However, we don’t need a for loop for this since rnorm recycles means. matrix(rnorm(n * length(mu), mean = mu), ncol = n) #> [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] #> [1,] -9.930 -9.56 -9.88 -10.2061 -12.27 -8.926 -11.178 -9.51 -8.663 #> [2,] -0.639 2.76 -1.91 0.0192 2.68 -0.665 -0.976 -1.70 0.237 #> [3,] 9.950 10.05 10.86 10.0296 9.64 11.114 11.065 8.53 11.318 #> [4,] 99.749 100.58 99.76 100.5498 100.21 99.754 100.132 100.28 100.524 #> [,10] #> [1,] -9.39 #> [2,] -0.11 #> [3,] 10.17 #> [4,] 99.91 Eliminate the for loop in each of the following examples by taking advantage of an existing function that works with vectors: out <- "" for (x in letters) { out <- stringr::str_c(out, x) } out #> [1] "abcdefghijklmnopqrstuvwxyz" str_c already works with vectors, so simply use str_c with the collapse argument to return a single string. stringr::str_c(letters, collapse = "") #> [1] "abcdefghijklmnopqrstuvwxyz" For this I’m going to rename the variable sd to something different because sd is the name of the function we want to use. x <- sample(100) sd. <- 0 for (i in seq_along(x)) { sd. <- sd. + (x[i] - mean(x)) ^ 2 } sd. <- sqrt(sd. / (length(x) - 1)) sd. #> [1] 29 We could simply use the sd function. sd(x) #> [1] 29 Or if there was a need to use the equation (e.g. 
for pedagogical reasons), then the functions mean and sum already work with vectors: sqrt(sum((x - mean(x)) ^ 2) / (length(x) - 1)) #> [1] 29 x <- runif(100) out <- vector("numeric", length(x)) out[1] <- x[1] for (i in 2:length(x)) { out[i] <- out[i - 1] + x[i] } out #> [1] 0.126 1.064 1.865 2.623 3.156 3.703 3.799 4.187 4.359 5.050 #> [11] 5.725 6.672 6.868 7.836 8.224 8.874 9.688 9.759 10.286 11.050 #> [21] 11.485 12.038 12.242 12.273 13.242 13.421 14.199 15.085 15.921 16.527 #> [31] 17.434 17.470 17.601 17.695 18.392 18.797 18.863 18.989 19.927 20.143 #> [41] 20.809 21.013 21.562 22.389 22.517 22.778 23.066 23.081 23.935 24.349 #> [51] 25.100 25.819 26.334 27.309 27.670 27.840 28.623 28.654 29.444 29.610 #> [61] 29.639 30.425 31.250 32.216 32.594 32.769 33.372 34.178 34.215 34.947 #> [71] 35.163 35.179 35.307 35.993 36.635 36.963 37.350 38.058 38.755 39.681 #> [81] 40.140 40.736 40.901 41.468 42.366 42.960 43.792 44.386 45.165 45.562 #> [91] 46.412 47.154 47.472 47.583 47.685 48.485 48.865 48.917 49.904 50.508 The code above is calculating a cumulative sum, so we can use the function cumsum: all.equal(cumsum(x), out) #> [1] TRUE Ex. 21.2.1.3 Combine your function writing and for loop skills: 1. Write a for loop that `prints()` the lyrics to the children's song "Alice the camel". 1. Convert the nursery rhyme "ten in the bed" to a function. Generalise it to any number of people in any sleeping structure. 1. Convert the song "99 bottles of beer on the wall" to a function. Generalise to any number of any vessel containing any liquid on any surface. I don’t know what the deal is with Hadley and nursery rhymes. Here are the lyrics for Alice the Camel. We’ll count down from five humps to none, and print a different last line when there are no humps. This uses cat instead of print, so it looks nicer, and it switches between “hump” and “humps” so the one-hump verse is grammatical. humps <- c("five", "four", "three", "two", "one", "no") for (i in humps) { hump <- if (i == "one") " hump." else " humps." cat(str_c("Alice the camel has ", rep(i, 3), hump, collapse = "\\n"), "\\n") if (i == "no") { cat("Now Alice is a horse.\\n") } else { cat("So go, Alice, go.\\n") } cat("\\n") } #> Alice the camel has five humps. #> Alice the camel has five humps. #> Alice the camel has five humps. #> So go, Alice, go. #> #> Alice the camel has four humps. #> Alice the camel has four humps. #> Alice the camel has four humps. #> So go, Alice, go. #> #> Alice the camel has three humps. #> Alice the camel has three humps. #> Alice the camel has three humps. #> So go, Alice, go. #> #> Alice the camel has two humps. #> Alice the camel has two humps. #> Alice the camel has two humps. #> So go, Alice, go. #> #> Alice the camel has one hump. #> Alice the camel has one hump. #> Alice the camel has one hump. #> So go, Alice, go. #> #> Alice the camel has no humps. #> Alice the camel has no humps. #> Alice the camel has no humps. #> Now Alice is a horse. The lyrics for Ten in the Bed (with “was”/“were” chosen to match the number): numbers <- c("ten", "nine", "eight", "seven", "six", "five", "four", "three", "two", "one") for (i in numbers) { cat(str_c("There ", if (i == "one") "was " else "were ", i, " in the bed\\n")) cat("and the little one said\\n") if (i == "one") { cat("I'm lonely...") } else { cat("Roll over, roll over\\n") cat("So they all rolled over and one fell out.\\n") } cat("\\n") } #> There were ten in the bed #> and the little one said #> Roll over, roll over #> So they all rolled over and one fell out. #> #> There were nine in the bed #> and the little one said #> Roll over, roll over #> So they all rolled over and one fell out. 
#> #> There were eight in the bed #> and the little one said #> Roll over, roll over #> So they all rolled over and one fell out. #> #> There were seven in the bed #> and the little one said #> Roll over, roll over #> So they all rolled over and one fell out. #> #> There were six in the bed #> and the little one said #> Roll over, roll over #> So they all rolled over and one fell out. #> #> There were five in the bed #> and the little one said #> Roll over, roll over #> So they all rolled over and one fell out. #> #> There were four in the bed #> and the little one said #> Roll over, roll over #> So they all rolled over and one fell out. #> #> There were three in the bed #> and the little one said #> Roll over, roll over #> So they all rolled over and one fell out. #> #> There were two in the bed #> and the little one said #> Roll over, roll over #> So they all rolled over and one fell out. #> #> There was one in the bed #> and the little one said #> I'm lonely... For the bottles of beer, I define a helper function to correctly print the number of bottles. bottles <- function(n) { if (n > 1) { str_c(n, " bottles") } else if (n == 1) { "1 bottle" } else { "no more bottles" } } beer_bottles <- function(n) { # should test whether n >= 1. for (i in seq(n, 1)) { cat(str_c(bottles(i), " of beer on the wall, ", bottles(i), " of beer.\\n")) cat(str_c("Take one down and pass it around, ", bottles(i - 1), " of beer on the wall.\\n\\n")) } cat("No more bottles of beer on the wall, no more bottles of beer.\\n") cat(str_c("Go to the store and buy some more, ", bottles(n), " of beer on the wall.\\n")) } beer_bottles(3) #> 3 bottles of beer on the wall, 3 bottles of beer. #> Take one down and pass it around, 2 bottles of beer on the wall. #> #> 2 bottles of beer on the wall, 2 bottles of beer. #> Take one down and pass it around, 1 bottle of beer on the wall. #> #> 1 bottle of beer on the wall, 1 bottle of beer. #> Take one down and pass it around, no more bottles of beer on the wall. #> #> No more bottles of beer on the wall, no more bottles of beer. #> Go to the store and buy some more, 3 bottles of beer on the wall. Ex 21.2.1.4 It’s common to see for loops that don’t preallocate the output and instead increase the length of a vector at each step: output <- vector("integer", 0) for (i in seq_along(x)) { output <- c(output, lengths(x[[i]])) } output I’ll use the package microbenchmark to time this. Microbenchmark will run an R expression a number of times and time it. Define a function that appends to an integer vector. library("microbenchmark") add_to_vector <- function(n) { output <- vector("integer", 0) for (i in seq_len(n)) { output <- c(output, i) } output } microbenchmark(add_to_vector(10000), times = 3) #> Unit: milliseconds #> expr min lq mean median uq max neval #> add_to_vector(10000) 156 158 172 159 180 201 3 And one that pre-allocates it. add_to_vector_2 <- function(n) { output <- vector("integer", n) for (i in seq_len(n)) { output[[i]] <- i } output } microbenchmark(add_to_vector_2(10000), times = 3) #> Unit: milliseconds #> expr min lq mean median uq max neval #> add_to_vector_2(10000) 7.1 7.25 7.33 7.39 7.44 7.49 3 The pre-allocated vector is about 20 times faster in this benchmark! YMMV, but the longer the vector and the bigger the objects, the more that pre-allocation will outperform appending. 13.3 For loop variations 13.3.1 Ex Imagine you have a directory full of CSV files that you want to read in. 
You have their paths in a vector, files <- dir("data/", pattern = "\\\\.csv$", full.names = TRUE), and now want to read each one with read_csv(). Write the for loop that will load them into a single data frame. I pre-allocate a list and read each file as a data frame into an element of that list, which creates a list of data frames. I then use bind_rows to create a single data frame from the list of data frames. df <- vector("list", length(files)) for (i in seq_along(files)) { df[[i]] <- read_csv(files[[i]]) } df <- bind_rows(df) Ex What happens if you use for (nm in names(x)) and x has no names? What if only some of the elements are named? What if the names are not unique? Let’s try it out and see what happens. When there are no names for the vector, it does not run the code in the loop (it runs zero iterations of the loop): x <- 1:3 print(names(x)) #> NULL for (nm in names(x)) { print(nm) print(x[[nm]]) } Note that the length of NULL is zero: length(NULL) #> [1] 0 If only some of the elements are named, then we get an error when we try to access an element without a name. Oddly, nm == "" for the unnamed elements. x <- c(a = 1, 2, c = 3) names(x) #> [1] "a" "" "c" for (nm in names(x)) { print(nm) print(x[[nm]]) } #> [1] "a" #> [1] 1 #> [1] "" #> Error in x[[nm]]: subscript out of bounds Finally, if there are duplicate names, then x[[nm]] will give the first element with that name. There is no way to access elements with duplicated names this way. x <- c(a = 1, a = 2, c = 3) names(x) #> [1] "a" "a" "c" for (nm in names(x)) { print(nm) print(x[[nm]]) } #> [1] "a" #> [1] 1 #> [1] "a" #> [1] 1 #> [1] "c" #> [1] 3 Ex Write a function that prints the mean of each numeric column in a data frame, along with its name. For example, show_mean(iris) would print: show_mean(iris) #> Sepal.Length: 5.84 #> Sepal.Width: 3.06 #> Petal.Length: 3.76 #> Petal.Width: 1.20 (Extra challenge: what function did I use to make sure that the numbers lined up nicely, even though the variable names had different lengths?) There may be other functions to do this, but I’ll use str_pad and str_length to ensure that the space given to the variable names is the same. I experimented with the options to format until I got two digits. show_mean <- function(df, digits = 2) { # Get max length of any variable name in the dataset maxstr <- max(str_length(names(df))) for (nm in names(df)) { if (is.numeric(df[[nm]])) { cat(str_c(str_pad(str_c(nm, ":"), maxstr + 1L, side = "right"), format(mean(df[[nm]]), digits = digits, nsmall = digits), sep = " "), "\\n") } } } show_mean(iris) #> Sepal.Length: 5.84 #> Sepal.Width: 3.06 #> Petal.Length: 3.76 #> Petal.Width: 1.20 Ex What does this code do? How does it work? trans <- list( disp = function(x) x * 0.0163871, am = function(x) { factor(x, labels = c("auto", "manual")) } ) for (var in names(trans)) { mtcars[[var]] <- trans[[var]](mtcars[[var]]) } This code mutates the disp and am columns: disp is multiplied by 0.0163871 (converting cubic inches to litres), and am is replaced by a factor variable. The code works by looping over a named list of functions. It calls the named function in the list on the column of mtcars with the same name, and replaces the values of that column. For example, 
this is a function: trans[["disp"]] #> function(x) x * 0.0163871 This applies the function to the column of mtcars with the same name: trans[["disp"]](mtcars[["disp"]]) #> [1] 0.0430 0.0430 0.0290 0.0693 0.0967 0.0604 0.0967 0.0394 0.0378 0.0450 #> [11] 0.0450 0.0741 0.0741 0.0741 0.1267 0.1235 0.1182 0.0211 0.0203 0.0191 #> [21] 0.0323 0.0854 0.0816 0.0940 0.1074 0.0212 0.0323 0.0255 0.0943 0.0389 #> [31] 0.0808 0.0325 13.4 For loops vs. functionals col_summary <- function(df, fun) { out <- vector("double", length(df)) for (i in seq_along(df)) { out[i] <- fun(df[[i]]) } out } 13.4.1 Exercises Ex. 21.4.1.1 Read the documentation for apply(). In the 2d case, what two for loops does it generalise? It generalises looping over the rows or columns of a matrix or data frame. Ex. 21.4.1.2 Adapt col_summary() so that it only applies to numeric columns. You might want to start with an is_numeric() function that returns a logical vector that has a TRUE corresponding to each numeric column. col_summary2 <- function(df, fun) { # test whether each column is numeric numeric_cols <- vector("logical", length(df)) for (i in seq_along(df)) { numeric_cols[[i]] <- is.numeric(df[[i]]) } # indexes of numeric columns idxs <- seq_along(df)[numeric_cols] # pre-allocate the output: one element per numeric column out <- vector("double", length(idxs)) for (i in seq_along(idxs)) { out[[i]] <- fun(df[[idxs[[i]]]]) } out } Let’s test that it works: df <- tibble( a = rnorm(10), b = rnorm(10), c = letters[1:10], d = rnorm(10) ) col_summary2(df, mean) #> [1] 0.859 0.555 -0.451 13.5 The map functions 13.5.1 Shortcuts Notes The lm() function runs a linear regression. It is covered in the Model Basics chapter. 13.5.2 Exercises Ex Write code that uses one of the map functions to: 1. Compute the mean of every column in `mtcars`. 1. Determine the type of each column in `nycflights13::flights`. 1. Compute the number of unique values in each column of `iris`. 1. Generate 10 random normals for each of $\\mu = -10$, $0$, $10$, and $100$. The mean of every column in mtcars: map_dbl(mtcars, mean) #> Warning in mean.default(.x[[i]], ...): argument is not numeric or logical: #> returning NA #> mpg cyl disp hp drat wt qsec vs am #> 20.091 6.188 3.781 146.688 3.597 3.217 17.849 0.438 NA #> gear carb #> 3.688 2.812 (The warning and the NA for am appear because the trans example above converted mtcars$am to a factor and rescaled disp.) The type of every column in nycflights13::flights. 
map(nycflights13::flights, class) #> $year #> [1] "integer" #> #> $month #> [1] "integer" #> #> $day #> [1] "integer" #> #> $dep_time #> [1] "integer" #> #> $sched_dep_time #> [1] "integer" #> #> $dep_delay #> [1] "numeric" #> #> $arr_time #> [1] "integer" #> #> $sched_arr_time #> [1] "integer" #> #> $arr_delay #> [1] "numeric" #> #> $carrier #> [1] "character" #> #> $flight #> [1] "integer" #> #> $tailnum #> [1] "character" #> #> $origin #> [1] "character" #> #> $dest #> [1] "character" #> #> $air_time #> [1] "numeric" #> #> $distance #> [1] "numeric" #> #> $hour #> [1] "numeric" #> #> $minute #> [1] "numeric" #> #> $time_hour #> [1] "POSIXct" "POSIXt" I had to use map rather than map_chr since the class of a column can be a character vector with more than one element, as with time_hour. Though if by “type” typeof is meant, then map_chr works: map_chr(nycflights13::flights, typeof) #> year month day dep_time sched_dep_time #> "integer" "integer" "integer" "integer" "integer" #> dep_delay arr_time sched_arr_time arr_delay carrier #> "double" "integer" "integer" "double" "character" #> flight tailnum origin dest air_time #> "integer" "character" "character" "character" "double" #> distance hour minute time_hour #> "double" "double" "double" "double" The number of unique values in each column of iris: map_int(iris, ~ length(unique(.))) #> Sepal.Length Sepal.Width Petal.Length Petal.Width Species #> 35 23 43 22 3 Generate 10 random normals for each of \\(\\mu = -10\\), \\(0\\), \\(10\\), and \\(100\\): map(c(-10, 0, 10, 100), rnorm, n = 10) #> [[1]] #> [1] -11.27 -9.46 -9.92 -9.44 -9.58 -11.45 -9.06 -10.34 -10.08 -9.96 #> #> [[2]] #> [1] 0.124 -0.998 1.233 0.340 -0.473 0.709 -1.529 0.237 -1.313 0.747 #> #> [[3]] #> [1] 8.44 10.07 9.36 9.15 10.68 11.15 8.31 9.10 11.32 11.10 #> #> [[4]] #> [1] 101.2 98.6 101.4 100.0 99.9 100.4 100.1 99.2 99.5 98.8 Ex How can you create a single vector that for each column in a data frame indicates whether or not it’s a factor? Use map_lgl with the function is.factor: map_lgl(mtcars, is.factor) #> mpg cyl disp hp drat wt qsec vs am gear carb #> FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE Ex What happens when you use the map functions on vectors that aren’t lists? What does map(1:5, runif) do? Why? The map functions treat an atomic vector like a list of its elements and apply the function to each element in turn. Thus map(1:5, runif) calls runif(1), runif(2), ..., runif(5), returning a list of uniform random vectors of lengths one through five. map(1:5, runif) #> [[1]] #> [1] 0.226 #> #> [[2]] #> [1] 0.133 0.927 #> #> [[3]] #> [1] 0.894 0.204 0.257 #> #> [[4]] #> [1] 0.614 0.441 0.316 0.101 #> #> [[5]] #> [1] 0.2726 0.6537 0.9279 0.0266 0.5595 Ex What does map(-2:2, rnorm, n = 5) do? Why? What does map_dbl(-2:2, rnorm, n = 5) do? Why? This takes samples of n = 5 from normal distributions with means -2, -1, 0, 1, and 2, and returns a list in which each element is a numeric vector of length 5. map(-2:2, rnorm, n = 5) #> [[1]] #> [1] -0.945 -2.821 -2.638 -2.153 -3.416 #> #> [[2]] #> [1] -0.393 -0.912 -2.570 -0.687 -0.347 #> #> [[3]] #> [1] -0.00796 1.72703 2.08647 -0.35835 -1.44212 #> #> [[4]] #> [1] 1.38 1.09 1.16 1.36 0.64 #> #> [[5]] #> [1] 1.8914 3.8278 0.0381 2.9460 2.5490 However, if we use map_dbl it throws an error. map_dbl expects the function to return a numeric vector of length one. 
map_dbl(-2:2, rnorm, n = 5) #> Error: Result 1 is not a length 1 atomic vector If we wanted a numeric vector, we could use map followed by flatten_dbl: flatten_dbl(map(-2:2, rnorm, n = 5)) #> [1] -1.402 -1.872 -3.717 -1.964 -0.993 -0.287 -2.110 -0.851 -1.386 -1.230 #> [11] 0.392 0.470 0.989 -0.714 1.270 1.709 2.047 -0.210 1.380 0.933 #> [21] 2.280 2.330 2.285 2.429 1.879 Ex Rewrite map(x, function(df) lm(mpg ~ wt, data = df)) to eliminate the anonymous function. map(list(mtcars), ~ lm(mpg ~ wt, data = .)) #> [[1]] #> #> Call: #> lm(formula = mpg ~ wt, data = .) #> #> Coefficients: #> (Intercept) wt #> 37.29 -5.34 13.6 Dealing with Failure 13.7 Mapping over multiple arguments 13.8 Walk 13.9 Other patterns of for loops 13.9.1 Exercises Ex Implement your own version of every() using a for loop. Compare it with purrr::every(). What does purrr’s version do that your version doesn’t? # Use ... to pass arguments to the function every2 <- function(.x, .p, ...) { for (i in .x) { if (!.p(i, ...)) { # If any element is FALSE we know not all of them were TRUE return(FALSE) } } # if nothing was FALSE, then it is TRUE TRUE } every2(1:3, function(x) {x > 1}) #> [1] FALSE every2(1:3, function(x) {x > 0}) #> [1] TRUE The function purrr::every does fancy things with .p: in addition to a function, it accepts the ~ formula shorthand and other inputs that purrr::as_mapper() can convert into a function, such as a string or an integer for extracting elements by name or position when the elements of .x are lists. Ex Create an enhanced col_sum() that applies a summary function to every numeric column in a data frame. Note this question has a typo; it is referring to col_summary. I will use map to apply the function to all the columns, and keep to select only the numeric columns. col_sum2 <- function(df, f, ...) { map(keep(df, is.numeric), f, ...) } col_sum2(iris, mean) #> $Sepal.Length #> [1] 5.84 #> #> $Sepal.Width #> [1] 3.06 #> #> $Petal.Length #> [1] 3.76 #> #> $Petal.Width #> [1] 1.2 Ex A possible base R equivalent of col_sum() is: col_sum3 <- function(df, f) { is_num <- sapply(df, is.numeric) df_num <- df[, is_num] sapply(df_num, f) } But it has a number of bugs as illustrated with the following inputs: df <- tibble( x = 1:3, y = 3:1, z = c("a", "b", "c") ) # OK col_sum3(df, mean) # Has problems: don't always return numeric vector col_sum3(df[1:2], mean) col_sum3(df[1], mean) col_sum3(df[0], mean) What causes the bugs? The problem is that sapply doesn’t always return numeric vectors. If no columns are selected, instead of gracefully exiting, it returns an empty list. This causes an error, since we can’t index a data frame with a list. sapply(df[0], is.numeric) #> named list() sapply(df[1], is.numeric) #> x #> TRUE sapply(df[1:2], is.numeric) #> x y #> TRUE TRUE "], -["model-introduction.html", "14 Model Introduction", " 14 Model Introduction Some of the discussion of models is slightly different, and has a different emphasis, than in most social science research. This is largely because this book is speaking to data scientists, for whom the primary goal is prediction rather than theory testing (that I don’t view these as too different is a different story). The discussion about hypothesis generation vs. confirmation is interesting. Too little emphasis is placed on hypothesis generation in social science. The importance of out-of-sample testing also receives too little emphasis in political science; a minimal sketch of an out-of-sample check follows this note. And from this discussion it should be clear that many papers in social science are hypothesis generation masquerading as hypothesis confirmation. 
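To make the out-of-sample point concrete, here is a minimal sketch (my addition, not part of the original notes; the 70/30 split, the seed, and the choice of mpg ~ wt on mtcars are arbitrary): library("modelr") set.seed(1) idx <- sample(seq_len(nrow(mtcars)), size = floor(0.7 * nrow(mtcars))) # hold out 30% of the rows as a test set train <- mtcars[idx, ] test <- mtcars[-idx, ] mod <- lm(mpg ~ wt, data = train) rmse(mod, train) # in-sample error is typically optimistic rmse(mod, test) # out-of-sample error is the honest check A model that looks good in-sample but does much worse on the held-out rows is overfitting; hypothesis confirmation needs this kind of held-out check. 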
"], -["model-basics.html", "15 Model Basics 15.1 Prerequisites 15.2 A simple model 15.3 Visualizing Models 15.4 Formulas and Model Families 15.5 Missing values 15.6 Other model families", " 15 Model Basics Distinction between family of models and fitted model is a useful way to think about models. Especially as we can abstract some families of models to be themselves a fitted model of a more flexible family of models. For example, linear regression is a special case of GLM or Gaussian Processes etc. 15.1 Prerequisites library(tidyverse) library(modelr) options(na.action = na.warn) The option na.action determines how missing values are handled. It is a function. na.warn sets it so that there is a warning if there are any missing values (by default, R will just silently drop them). 15.2 A simple model ggplot(sim1, aes(x, y)) + geom_point() models <- tibble( a1 = runif(250, -20, 40), a2 = runif(250, -5, 5) ) ggplot(sim1, aes(x, y)) + geom_abline(aes(intercept = a1, slope = a2), data = models, alpha = 1/4) + geom_point() model1 <- function(a, data) { a[1] + data$x * a[2] } model1(c(7, 1.5), sim1) #> [1] 8.5 8.5 8.5 10.0 10.0 10.0 11.5 11.5 11.5 13.0 13.0 13.0 14.5 14.5 #> [15] 14.5 16.0 16.0 16.0 17.5 17.5 17.5 19.0 19.0 19.0 20.5 20.5 20.5 22.0 #> [29] 22.0 22.0 measure_distance <- function(mod, data) { diff <- data$y - model1(mod, data) sqrt(mean(diff ^ 2)) } measure_distance(c(7, 1.5), sim1) #> [1] 2.67 sim1_dist <- function(a1, a2) { measure_distance(c(a1, a2), sim1) } models <- models %>% mutate(dist = purrr::map2_dbl(a1, a2, sim1_dist)) models #> # A tibble: 250 × 3 #> a1 a2 dist #> <dbl> <dbl> <dbl> #> 1 -15.15 0.0889 30.8 #> 2 30.06 -0.8274 13.2 #> 3 16.05 2.2695 13.2 #> 4 -10.57 1.3769 18.7 #> 5 -19.56 -1.0359 41.8 #> 6 7.98 4.5948 19.3 #> # ... with 244 more rows ggplot(sim1, aes(x, y)) + geom_point(size = 2, colour = "grey30") + geom_abline( aes(intercept = a1, slope = a2, colour = -dist), data = filter(models, rank(dist) <= 10) ) grid <- expand.grid( a1 = seq(-5, 20, length = 25), a2 = seq(1, 3, length = 25) ) %>% mutate(dist = purrr::map2_dbl(a1, a2, sim1_dist)) grid %>% ggplot(aes(a1, a2)) + geom_point(data = filter(grid, rank(dist) <= 10), size = 4, colour = "red") + geom_point(aes(colour = -dist)) ggplot(sim1, aes(x, y)) + geom_point(size = 2, colour = "grey30") + geom_abline( aes(intercept = a1, slope = a2, colour = -dist), data = filter(grid, rank(dist) <= 10) ) best <- optim(c(0, 0), measure_distance, data = sim1) best$par #> [1] 4.22 2.05 ggplot(sim1, aes(x, y)) + geom_point(size = 2, colour = "grey30") + geom_abline(intercept = best$par[1], slope = best$par[2]) sim1_mod <- lm(y ~ x, data = sim1) coef(sim1_mod) #> (Intercept) x #> 4.22 2.05 15.2.1 Exercises sim1a <- tibble( x = rep(1:10, each = 3), y = x * 1.5 + 6 + rt(length(x), df = 2) ) lm(y ~ x, data = sim1a) #> #> Call: #> lm(formula = y ~ x, data = sim1a) #> #> Coefficients: #> (Intercept) x #> 6.05 1.53 ggplot(sim1a, aes(x = x, y = y)) + geom_point() + geom_smooth(method = "lm", se = FALSE) To re-run this a few times using purrr simt <- function(i) { tibble( x = rep(1:10, each = 3), y = x * 1.5 + 6 + rt(length(x), df = 2), .id = i ) } lm_df <- function(.data) { mod <- lm(y ~ x, data = .data) beta <- coef(mod) tibble(intercept = beta[1], slope = beta[2]) } sims <- map(1:100, simt) %>% map_df(lm_df) ggplot(sims, aes(x = intercept, y = slope)) + geom_point() NOTE It’s not entirely clear what is meant by “visualize the results”. 
The data are generated from a t-distribution with a low degrees of freedom (df = 2), so there will be outliers, and linear regression is sensitive to outliers. r4ds suggests: One way to make linear models more robust is to use a different distance measure. For example, instead of root-mean-squared distance, you could use mean-absolute distance: measure_distance <- function(mod, data) { diff <- data$y - make_prediction(mod, data) mean(abs(diff)) } One challenge with performing numerical optimisation is that it’s only guaranteed to find one local optimum. What’s the problem with optimising a three parameter model like this? model1 <- function(a, data) { a[1] + data$x * a[2] + a[3] } The problem is that for any values a[1] = a1 and a[3] = a3, any other values of a[1] and a[3] where a[1] + a[3] == (a1 + a3) will have the same fit: the two intercept parameters are not separately identified, so the optimiser can wander along that ridge. 15.3 Visualizing Models More complicated models can be visualized with predictions and residuals. Notes: look at the tidyr::complete, tidyr::expand, and modelr::data_grid functions; modelr::add_residuals and modelr::add_predictions add residuals or predictions to the original data; geom_ref_line. grid <- sim1 %>% data_grid(x) grid #> # A tibble: 10 × 1 #> x #> <int> #> 1 1 #> 2 2 #> 3 3 #> 4 4 #> 5 5 #> 6 6 #> # ... with 4 more rows grid <- grid %>% add_predictions(sim1_mod) grid #> # A tibble: 10 × 2 #> x pred #> <int> <dbl> #> 1 1 6.27 #> 2 2 8.32 #> 3 3 10.38 #> 4 4 12.43 #> 5 5 14.48 #> 6 6 16.53 #> # ... with 4 more rows ggplot(sim1, aes(x)) + geom_point(aes(y = y)) + geom_line(aes(y = pred), data = grid, colour = "red", size = 1) sim1 <- sim1 %>% add_residuals(sim1_mod) sim1 #> # A tibble: 30 × 3 #> x y resid #> <int> <dbl> <dbl> #> 1 1 4.20 -2.072 #> 2 1 7.51 1.238 #> 3 1 2.13 -4.147 #> 4 2 8.99 0.665 #> 5 2 10.24 1.919 #> 6 2 11.30 2.973 #> # ... with 24 more rows ggplot(sim1, aes(resid)) + geom_freqpoly(binwidth = 0.5) ggplot(sim1, aes(x, resid)) + geom_ref_line(h = 0) + geom_point() 15.3.1 Exercises Instead of using lm() to fit a straight line, you can use loess() to fit a smooth curve. Repeat the process of model fitting, grid generation, predictions, and visualisation on sim1 using loess() instead of lm(). How does the result compare to geom_smooth()? I’ll use add_predictions and add_residuals to add the predictions and residuals from a loess regression to the sim1 data. sim1_loess <- loess(y ~ x, data = sim1) grid_loess <- sim1 %>% add_predictions(sim1_loess) sim1 <- sim1 %>% add_residuals(sim1_loess, var = "resid_loess") %>% add_predictions(sim1_loess, var = "pred_loess") This plots the loess predictions. The loess fit produces a smooth, nonlinear line through the data. plot_sim1_loess <- ggplot(sim1, aes(x = x, y = y)) + geom_point() + geom_line(aes(x = x, y = pred), data = grid_loess, colour = "red") plot_sim1_loess The predictions of loess are the same as the default method for geom_smooth because geom_smooth() uses loess() by default; the message even tells us that. plot_sim1_loess + geom_smooth(colour = "blue", se = FALSE, alpha = 0.20) #> `geom_smooth()` using method = 'loess' We can plot the residuals (red), and compare them to the residuals from lm (black). 
In general, the loess model has smaller residuals within the sample (out-of-sample performance is a different issue, and we haven’t considered the uncertainty of these estimates). ggplot(sim1, aes(x = x)) + geom_ref_line(h = 0) + geom_point(aes(y = resid)) + geom_point(aes(y = resid_loess), colour = "red") add_predictions() is paired with gather_predictions() and spread_predictions(). How do these three functions differ? The functions gather_predictions and spread_predictions allow for adding predictions from multiple models at once. What does geom_ref_line() do? What package does it come from? Why is displaying a reference line in plots showing residuals useful and important? The geom geom_ref_line() adds a reference line to a plot. Even though it alters a ggplot2 plot, it is in the modelr package. Putting a reference line at zero for residuals is important because good models (generally) should have residuals centered at zero, with approximately the same variance (or distribution) over the support of x, and no correlation. A zero reference line makes it easier to judge these characteristics visually. Why might you want to look at a frequency polygon of absolute residuals? What are the pros and cons compared to looking at the raw residuals? The frequency polygon makes it easier to judge whether the variance and/or absolute size of the residuals varies with respect to x. This is called heteroskedasticity, and it results in incorrect standard errors in inference. In prediction, this provides insight into where the model is working well and where it is not. What is lost is the sign: since only absolute values are shown, we can no longer tell whether the model is over-predicting, under-predicting, or on average correctly predicting in different regions of x. 15.4 Formulas and Model Families df <- tribble( ~y, ~x1, ~x2, 4, 2, 5, 5, 1, 6 ) model_matrix(df, y ~ x1) #> # A tibble: 2 × 2 #> `(Intercept)` x1 #> <dbl> <dbl> #> 1 1 2 #> 2 1 1 model_matrix(df, y ~ x1 - 1) #> # A tibble: 2 × 1 #> x1 #> <dbl> #> 1 2 #> 2 1 15.4.1 Categorical Variables df <- tribble( ~ sex, ~ response, "male", 1, "female", 2, "male", 1 ) model_matrix(df, response ~ sex) #> # A tibble: 3 × 2 #> `(Intercept)` sexmale #> <dbl> <dbl> #> 1 1 1 #> 2 1 0 #> 3 1 1 ggplot(sim2) + geom_point(aes(x, y)) mod2 <- lm(y ~ x, data = sim2) grid <- sim2 %>% data_grid(x) %>% add_predictions(mod2) grid #> # A tibble: 4 × 2 #> x pred #> <chr> <dbl> #> 1 a 1.15 #> 2 b 8.12 #> 3 c 6.13 #> 4 d 1.91 ggplot(sim3, aes(x1, y)) + geom_point(aes(colour = x2)) mod1 <- lm(y ~ x1 + x2, data = sim3) mod2 <- lm(y ~ x1 * x2, data = sim3) grid <- sim3 %>% data_grid(x1, x2) %>% gather_predictions(mod1, mod2) grid #> # A tibble: 80 × 4 #> model x1 x2 pred #> <chr> <int> <fctr> <dbl> #> 1 mod1 1 a 1.67 #> 2 mod1 1 b 4.56 #> 3 mod1 1 c 6.48 #> 4 mod1 1 d 4.03 #> 5 mod1 2 a 1.48 #> 6 mod1 2 b 4.37 #> # ... with 74 more rows ggplot(sim3, aes(x1, y, colour = x2)) + geom_point() + geom_line(data = grid, aes(y = pred)) + facet_wrap(~ model) sim3 <- sim3 %>% gather_residuals(mod1, mod2) ggplot(sim3, aes(x1, resid, colour = x2)) + geom_point() + facet_grid(model ~ x2) mod1 <- lm(y ~ x1 + x2, data = sim4) mod2 <- lm(y ~ x1 * x2, data = sim4) grid <- sim4 %>% data_grid( x1 = seq_range(x1, 5), x2 = seq_range(x2, 5) ) %>% gather_predictions(mod1, mod2) grid #> # A tibble: 50 × 4 #> model x1 x2 pred #> <chr> <dbl> <dbl> <dbl> #> 1 mod1 -1.0 -1.0 0.996 #> 2 mod1 -1.0 -0.5 -0.395 #> 3 mod1 -1.0 0.0 -1.786 #> 4 mod1 -1.0 0.5 -3.177 #> 5 mod1 -1.0 1.0 -4.569 #> 6 mod1 -0.5 -1.0 1.907 #> # ... 
with 44 more rows Function seq_range is useful. ggplot(grid, aes(x1, x2)) + geom_tile(aes(fill = pred)) + facet_wrap(~ model) ggplot(grid, aes(x1, pred, colour = x2, group = x2)) + geom_line() + facet_wrap(~ model) ggplot(grid, aes(x2, pred, colour = x1, group = x1)) + geom_line() + facet_wrap(~ model) TODO We should visualize interactions with plotly 15.4.2 Exercises 15.5 Missing values TODO Need to write a tidyverse compliant na.omit function. 15.6 Other model families NOTE It’s worth mentioning these as more general models, though they don’t appear as much in social science work. I should try to explain that. I can think of several reasons: a preference for easy-to-explain models (though I think that’s wrong; most people can’t visualize high-dimensional space well, and interpret results marginally even though they are conditional); status-quo bias and path dependence, combined with a lack of knowledge of work outside the field and the median lack of technical ability to understand or use these models. The most principled reason is that those more complicated models really excel in prediction, which is often not the goal in social science. If we take an agnostic approach to regression, as in the Angrist and Pischke books, then regression isn’t being used to fit \\(f(y | x)\\); it’s being used to fit \\(E(y | x)\\), and more specifically to get some sort of average effect for a change in a specific variable. "], -["r-markdown.html", "16 R Markdown 16.1 R Markdown Basics 16.2 Text formatting with R Markdown", " 16 R Markdown 16.1 R Markdown Basics Doesn’t describe what YAML is. https://en.wikipedia.org/wiki/YAML The Ansible Guide to YAML is pretty simple; you don’t need to know what Ansible is: http://docs.ansible.com/ansible/YAMLSyntax.html https://learnxinyminutes.com/docs/yaml/ http://codebeautify.org/yaml-validator https://docs.saltstack.com/en/latest/topics/yaml/ 16.1.1 Exercise Create a new notebook using File > New File > R Notebook. Read the instructions. Practice running the chunks. Verify that you can modify the code, re-run it, and see modified output. Nothing to show Create a new R Markdown document with File > New File > R Markdown… Knit it by clicking the appropriate button. Knit it by using the appropriate keyboard shortcut. Verify that you can modify the input and see the output update. Compare and contrast the R notebook and R markdown files you created above. How are the outputs similar? How are they different? How are the inputs similar? How are they different? What happens if you copy the YAML header from one to the other? R notebook files show the output inside the editor, while hiding the console. R markdown files show the output inside the console, and do not show output inside the editor. They differ in the value of output in their YAML headers. The YAML header for the R notebook is output: html_notebook while the header for the R markdown file is output: html_document Create one new R Markdown document for each of the three built-in formats: HTML, PDF and Word. Knit each of the three documents. How does the output differ? How does the input differ? (You may need to install LaTeX in order to build the PDF output — RStudio will prompt you if this is necessary.) They produce different outputs, both in the final documents and intermediate files (notably the type of plots produced). The only difference in the inputs is the value of output in the YAML header: word_document for Word documents, pdf_document for PDF documents, and html_document for HTML documents. 
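Since all of these exercises turn on the output field, here is a minimal sketch of switching formats programmatically (my addition, not from the original notes; "report.Rmd" is a placeholder filename): library("rmarkdown") # knit the same source document to each built-in format, overriding whatever the YAML header declares render("report.Rmd", output_format = "html_document") render("report.Rmd", output_format = "pdf_document") render("report.Rmd", output_format = "word_document") This is essentially what the RStudio Knit button does, with the format taken from the YAML header. 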
16.2 Text formatting with R Markdown Continue "], -["r-markdown-workflow.html", "17 R Markdown Workflow", " 17 R Markdown Workflow Notes Find reproducible research articles Need good documentation on YAML and what it is No exercises "] +["explore-intro.html", "1 Introduction", " 1 Introduction No exercises. "], +["visualize.html", "2 Visualize 2.1 Introduction 2.2 Position Adjustments 2.3 Coordinate Systems", " 2 Visualize 2.1 Introduction 2.1.1 Prerequisites library("tidyverse") 2.1.2 First Steps 2.1.2.1 Exercises Run ggplot(data = mpg). What do you see? ggplot(data = mpg) Nothing but an empty plot. The ggplot object is created with the data, but no layers (geoms) have been added, so there is nothing to draw. How many rows are in mtcars? How many columns? nrow(mtcars) #> [1] 32 ncol(mtcars) #> [1] 11 This can also be found by printing the dataset, or looking in the environment pane. What does the drv variable describe? Read the help for ?mpg to find out. ?mpg The drv variable takes the following values: f = front-wheel drive, r = rear-wheel drive, 4 = four-wheel drive. Make a scatterplot of hwy vs cyl. ggplot(mpg, aes(x = hwy, y = cyl)) + geom_point() What happens if you make a scatterplot of class vs drv? Why is the plot not useful? ggplot(mpg, aes(x = class, y = drv)) + geom_point() A scatterplot is not a useful way to plot these variables, since both drv and class are categorical variables, and the scatterplot cannot show how many observations overlap at each combination of values. count(mpg, drv, class) #> Source: local data frame [12 x 3] #> Groups: drv [?] #> #> drv class n #> <chr> <chr> <int> #> 1 4 compact 12 #> 2 4 midsize 3 #> 3 4 pickup 33 #> 4 4 subcompact 4 #> 5 4 suv 51 #> 6 f compact 35 #> # ... with 6 more rows 2.1.3 Aesthetic mappings 2.1.3.1 Exercises What’s gone wrong with this code? Why are the points not blue? ggplot(data = mpg) + geom_point(mapping = aes(x = displ, y = hwy, color = "blue")) Since color = "blue" was included within the mapping argument, it was treated as an aesthetic (a mapping between a variable and a visual property). In effect, it was treated as a variable which has only one value: “blue”. Which variables in mpg are categorical? Which variables are continuous? (Hint: type ?mpg to read the documentation for the dataset). How can you see this information when you run mpg? ?mpg When printing the data frame, this information is given at the top of each column within angled brackets. Categorical variables have a class of “character” (<chr>). mpg #> # A tibble: 234 × 11 #> manufacturer model displ year cyl trans drv cty hwy fl #> <chr> <chr> <dbl> <int> <int> <chr> <chr> <int> <int> <chr> #> 1 audi a4 1.8 1999 4 auto(l5) f 18 29 p #> 2 audi a4 1.8 1999 4 manual(m5) f 21 29 p #> 3 audi a4 2.0 2008 4 manual(m6) f 20 31 p #> 4 audi a4 2.0 2008 4 auto(av) f 21 30 p #> 5 audi a4 2.8 1999 6 auto(l5) f 16 26 p #> 6 audi a4 2.8 1999 6 manual(m5) f 18 26 p #> # ... with 228 more rows, and 1 more variables: class <chr> The glimpse() function shows this as well: glimpse(mpg) #> Observations: 234 #> Variables: 11 #> $ manufacturer <chr> "audi", "audi", "audi", "audi", "audi", "audi", "... #> $ model <chr> "a4", "a4", "a4", "a4", "a4", "a4", "a4", "a4 qua... #> $ displ <dbl> 1.8, 1.8, 2.0, 2.0, 2.8, 2.8, 3.1, 1.8, 1.8, 2.0,... #> $ year <int> 1999, 1999, 2008, 2008, 1999, 1999, 2008, 1999, 1... #> $ cyl <int> 4, 4, 4, 4, 6, 6, 6, 4, 4, 4, 4, 6, 6, 6, 6, 6, 6... #> $ trans <chr> "auto(l5)", "manual(m5)", "manual(m6)", "auto(av)... #> $ drv <chr> "f", "f", "f", "f", "f", "f", "f", "4", "4", "4",... #> $ cty <int> 18, 21, 20, 21, 16, 18, 18, 18, 16, 20, 19, 15, 1... 
#> $ hwy <int> 29, 29, 31, 30, 26, 26, 27, 26, 25, 28, 27, 25, 2... #> $ fl <chr> "p", "p", "p", "p", "p", "p", "p", "p", "p", "p",... #> $ class <chr> "compact", "compact", "compact", "compact", "comp... Map a continuous variable to color, size, and shape. How do these aesthetics behave differently for categorical vs. continuous variables? The variable cty, city miles per gallon, is a continuous variable: ggplot(mpg, aes(x = displ, y = hwy, color = cty)) + geom_point() Instead of using discrete colors, the continuous variable uses a gradient color scale. ggplot(mpg, aes(x = displ, y = hwy, size = cty)) + geom_point() When mapped to size, the sizes of the points vary continuously with respect to the value (although the legend shows a few representative values). ggplot(mpg, aes(x = displ, y = hwy, shape = cty)) + geom_point() #> Error: A continuous variable can not be mapped to shape When a continuous value is mapped to shape, it gives an error. Though we could split a continuous variable into discrete categories and use shape, this would conceptually not make sense. A continuous numeric variable is ordered, but shapes have no natural order. With size it is clear that smaller points correspond to smaller values, and with color, once the scale is given, it is clear which points have larger or smaller values. But it is not clear whether a square is greater or less than a circle. What happens if you map the same variable to multiple aesthetics? ggplot(mpg, aes(x = displ, y = hwy, color = hwy, size = displ)) + geom_point() In the above plot, hwy is mapped to both location on the y-axis and color, and displ is mapped to both location on the x-axis and size. The code works and produces a plot, even if it is a bad one. Mapping a single variable to multiple aesthetics is redundant, so in most cases it should be avoided. What does the stroke aesthetic do? What shapes does it work with? (Hint: use ?geom_point) The following example is given in ?geom_point: ggplot(mtcars, aes(wt, mpg)) + geom_point(shape = 21, colour = "black", fill = "white", size = 5, stroke = 5) Stroke changes the width of the border for the shapes that have a border, the filled shapes (21-25). What happens if you map an aesthetic to something other than a variable name, like aes(colour = displ < 5)? ggplot(mpg, aes(x = displ, y = hwy, colour = displ < 5)) + geom_point() Aesthetics can also be mapped to expressions (code like displ < 5). This creates a temporary variable which takes its values from the result of the expression; in this case, it is a logical variable which is TRUE or FALSE. This also explains exercise 1: color = "blue" created a categorical variable that only had one category, “blue”. 2.1.4 Facets 2.1.4.1 Exercises What happens if you facet on a continuous variable? Let’s see: ggplot(mpg, aes(x = displ, y = hwy)) + geom_point() + facet_grid(. ~ cty) It converts the continuous variable to a factor and creates facets for all of its unique values. What do the empty cells in a plot with facet_grid(drv ~ cyl) mean? How do they relate to this plot? They are cells for which there are no values of the combination of drv and cyl. ggplot(data = mpg) + geom_point(mapping = aes(x = drv, y = cyl)) The locations in the above plot without points are the same cells in facet_grid(drv ~ cyl) that have no points. What plots does the following code make? What does . do? The symbol . ignores that dimension for faceting. 
This plot facets by values of drv on the y-axis: ggplot(data = mpg) + geom_point(mapping = aes(x = displ, y = hwy)) + facet_grid(drv ~ .) This plot facets by values of cyl on the x-axis: ggplot(data = mpg) + geom_point(mapping = aes(x = displ, y = hwy)) + facet_grid(. ~ cyl) Read ?facet_wrap. What does nrow do? What does ncol do? What other options control the layout of the individual panels? Why doesn’t facet_grid() have nrow and ncol variables? The arguments nrow (ncol) determine the number of rows (columns) to use when laying out the facets. They are needed since facet_wrap only facets on one variable. These arguments are unnecessary for facet_grid since the numbers of rows and columns are determined by the numbers of unique values of the variables specified. When using facet_grid() you should usually put the variable with more unique levels in the columns. Why? You should put the variable with more unique levels in the columns if the plot is laid out landscape. It is easier to compare relative levels of y by scanning horizontally, so it may be easier to visually compare these levels. I’m actually not sure about the correct answer to this. 2.1.5 Geometric Objects What does show.legend = FALSE do? What happens if you remove it? Why do you think I used it earlier in the chapter? NOTE This doesn’t appear earlier in the chapter Issue #510 What does the se argument to geom_smooth() do? It adds standard error bands to the lines. ggplot(data = mpg, mapping = aes(x = displ, y = hwy, color = drv)) + geom_point() + geom_smooth(se = TRUE) #> `geom_smooth()` using method = 'loess' By default se = TRUE: ggplot(data = mpg, mapping = aes(x = displ, y = hwy, color = drv)) + geom_point() + geom_smooth() #> `geom_smooth()` using method = 'loess' Will these two graphs look different? Why/why not? No, because both geom_point and geom_smooth use the same data and mappings; they will inherit those options from the ggplot object, and thus the options don’t need to be specified again (or twice). ggplot(data = mpg, mapping = aes(x = displ, y = hwy)) + geom_point() + geom_smooth() #> `geom_smooth()` using method = 'loess' ggplot() + geom_point(data = mpg, mapping = aes(x = displ, y = hwy)) + geom_smooth(data = mpg, mapping = aes(x = displ, y = hwy)) #> `geom_smooth()` using method = 'loess' Recreate the R code necessary to generate the following graphs. ggplot(mpg, aes(x = displ, y = hwy)) + geom_point() + geom_smooth(se = FALSE) #> `geom_smooth()` using method = 'loess' ggplot(mpg, aes(x = displ, y = hwy)) + geom_point() + geom_smooth(mapping = aes(group = drv), se = FALSE) #> `geom_smooth()` using method = 'loess' ggplot(mpg, aes(x = displ, y = hwy, colour = drv)) + geom_point() + geom_smooth(se = FALSE) #> `geom_smooth()` using method = 'loess' ggplot(mpg, aes(x = displ, y = hwy)) + geom_point(mapping = aes(colour = drv)) + geom_smooth(se = FALSE) #> `geom_smooth()` using method = 'loess' ggplot(mpg, aes(x = displ, y = hwy)) + geom_point(aes(colour = drv)) + geom_smooth(aes(linetype = drv), se = FALSE) #> `geom_smooth()` using method = 'loess' ggplot(mpg, aes(x = displ, y = hwy, fill = drv)) + geom_point(color = "white", shape = 21) 2.1.6 Statistical Transformations What is the default geom associated with stat_summary()? How could you rewrite the previous plot to use that geom function instead of the stat function? The default geom for stat_summary is geom_pointrange (see its stat argument). But the default stat for geom_pointrange is identity, so use geom_pointrange(stat = "summary"). 
ggplot(data = diamonds) + geom_pointrange( mapping = aes(x = cut, y = depth), stat = "summary", ) #> No summary function supplied, defaulting to `mean_se()` The default message says that stat_summary uses mean_se, the mean and its standard error, to calculate the point and the range of the line. So let’s use the previous values of fun.ymin, fun.ymax, and fun.y: ggplot(data = diamonds) + geom_pointrange( mapping = aes(x = cut, y = depth), stat = "summary", fun.ymin = min, fun.ymax = max, fun.y = median ) What does geom_col() do? How is it different to geom_bar()? geom_col differs from geom_bar in its default stat. geom_col uses the identity stat, so it expects that a variable already exists for the height of the bars. geom_bar uses the count stat, and so will count observations in groups in order to generate the variable to use for the height of the bars. Most geoms and stats come in pairs that are almost always used in concert. Read through the documentation and make a list of all the pairs. What do they have in common? See the ggplot2 documentation. What variables does stat_smooth() compute? What parameters control its behaviour? stat_smooth calculates y: predicted value ymin: lower value of the confidence interval ymax: upper value of the confidence interval se: standard error There are parameters such as method, which determines the method used to calculate the predictions and confidence interval, and some other arguments that are passed on to that method. In our proportion bar chart, we need to set group = 1. Why? In other words, what is the problem with these two graphs? If group is not set to 1, then all the bars have prop == 1. The function geom_bar assumes that the groups are equal to the x values, since the stat computes the counts within the group. ggplot(data = diamonds) + geom_bar(mapping = aes(x = cut, y = ..prop..)) The problem with these two plots is that the proportions are calculated within the groups. ggplot(data = diamonds) + geom_bar(mapping = aes(x = cut, y = ..prop..)) ggplot(data = diamonds) + geom_bar(mapping = aes(x = cut, fill = color, y = ..prop..)) This is more likely what was intended: ggplot(data = diamonds) + geom_bar(mapping = aes(x = cut, y = ..prop.., group = 1)) ggplot(data = diamonds) + geom_bar(mapping = aes(x = cut, fill = color, y = ..prop.., group = color)) 2.2 Position Adjustments What is the problem with this plot? How could you improve it? There is overplotting because there are multiple observations for each combination of cty and hwy. ggplot(data = mpg, mapping = aes(x = cty, y = hwy)) + geom_point() I’d fix it by using a jitter position adjustment. ggplot(data = mpg, mapping = aes(x = cty, y = hwy)) + geom_point(position = "jitter") What parameters to geom_jitter() control the amount of jittering? From the position_jitter documentation, there are two arguments to jitter: width and height, which control the amount of horizontal and vertical jitter, respectively. No horizontal jitter: ggplot(data = mpg, mapping = aes(x = cty, y = hwy)) + geom_point(position = position_jitter(width = 0)) Way too much vertical jitter: ggplot(data = mpg, mapping = aes(x = cty, y = hwy)) + geom_point(position = position_jitter(width = 0, height = 15)) Only horizontal jitter: ggplot(data = mpg, mapping = aes(x = cty, y = hwy)) + geom_point(position = position_jitter(height = 0)) Way too much horizontal jitter: ggplot(data = mpg, mapping = aes(x = cty, y = hwy)) + geom_point(position = position_jitter(height = 0, width = 20)) Compare and contrast geom_jitter() with geom_count(). 
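The notes don’t answer this one, so here is a minimal sketch of a comparison (my addition): geom_jitter() adds a small amount of random noise to each point, which makes overlapping points visible at the cost of slightly inaccurate positions, while geom_count() keeps the positions exact and instead counts the observations at each location, mapping that count to the size of the point. # jitter: positions perturbed, all points the same size ggplot(data = mpg, mapping = aes(x = cty, y = hwy)) + geom_jitter() # count: positions exact, point size shows how many observations overlap ggplot(data = mpg, mapping = aes(x = cty, y = hwy)) + geom_count() 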
What’s the default position adjustment for geom_boxplot()? Create a visualisation of the mpg dataset that demonstrates it. The default position for geom_boxplot is position_dodge (see its docs). When we add color = class to the boxplot, the different classes within drv are placed side by side, i.e. dodged. If it were position_identity, they would be overlapping. ggplot(data = mpg, aes(x = drv, y = hwy, color = class)) + geom_boxplot() ggplot(data = mpg, aes(x = drv, y = hwy, color = class)) + geom_boxplot(position = "identity") 2.3 Coordinate Systems 2.3.1 Exercises Turn a stacked bar chart into a pie chart using coord_polar(). This is a stacked bar chart with a single category: ggplot(mpg, aes(x = factor(1), fill = drv)) + geom_bar() See the documentation for coord_polar for an example of making a pie chart. In particular, theta = "y" has to be specified, meaning that the angle of the chart is mapped to the y variable. ggplot(mpg, aes(x = factor(1), fill = drv)) + geom_bar(width = 1) + coord_polar(theta = "y") If theta = "y" is not specified, then you get a bullseye chart: ggplot(mpg, aes(x = factor(1), fill = drv)) + geom_bar(width = 1) + coord_polar() If you had a stacked bar chart with multiple categories, like ggplot(data = diamonds) + geom_bar(mapping = aes(x = cut, fill = clarity), position = "fill") you end up with a multi-doughnut chart: ggplot(data = diamonds) + geom_bar(mapping = aes(x = cut, fill = clarity), position = "fill") + coord_polar(theta = "y") What does labs() do? Read the documentation. labs is a shortcut function to add labels to different scales. ggplot(data = mpg, mapping = aes(x = class, y = hwy)) + geom_boxplot() + coord_flip() + labs(y = "Highway MPG", x = "") What’s the difference between coord_quickmap() and coord_map()? See the docs: coord_map uses a 2D projection: by default the Mercator projection of the sphere onto the plot. But this requires transforming all geoms. coord_quickmap uses a quick approximation, setting the aspect ratio from the latitude/longitude ratio. This is “quick” because the shapes don’t need to be transformed. What does the plot below tell you about the relationship between city and highway mpg? Why is coord_fixed() important? What does geom_abline() do? The coordinate system coord_fixed ensures that the abline is at a 45 degree angle, which makes it easy to compare the highway and city mileage against what it would be if they were exactly the same. ggplot(data = mpg, mapping = aes(x = cty, y = hwy)) + geom_point() + geom_abline() + coord_fixed() If we didn’t include coord_fixed(), then the line would no longer be at a 45 degree angle: ggplot(data = mpg, mapping = aes(x = cty, y = hwy)) + geom_point() + geom_abline() "], +["workflow-basics.html", "3 Workflow Basics 3.1 Practice", " 3 Workflow Basics 3.1 Practice 3.1.1 Exercises Why does this code not work? my_variable <- 10 my_varıable #> Error in eval(expr, envir, enclos): object 'my_varıable' not found The variable being printed is my_varıable, not my_variable: the seventh character is “ı” (LATIN SMALL LETTER DOTLESS I), not “i”. While it wouldn’t have helped much in this case, the importance of distinguishing characters in code is one reason why fonts which clearly distinguish similar characters are preferred in programming. Especially important is distinguishing between zero (0), Latin small letter O (o), and Latin capital letter O (O); and between the numeral one (1), Latin small letter I (i), Latin capital letter I (I), and Latin small letter L (l). 
In these fonts, zero and the Latin letter O are often distinguished by using a glyph for zero that has either a dot in the interior or a slash through it. Also note that error messages of the form “object ‘…’ not found” mean just what they say: the object can’t be found by R. This is usually because you either (1) forgot to define the object (or had an error that prevented it from being defined earlier), (2) didn’t load a package containing the object, or (3) made a typo in the object’s name (either when using it or when you originally defined it). Tweak each of the following R commands so that they run correctly: library(tidyverse) #> Loading tidyverse: ggplot2 #> Loading tidyverse: tibble #> Loading tidyverse: tidyr #> Loading tidyverse: readr #> Loading tidyverse: purrr #> Loading tidyverse: dplyr #> Conflicts with tidy packages ---------------------------------------------- #> filter(): dplyr, stats #> lag(): dplyr, stats ggplot(dota = mpg) + geom_point(mapping = aes(x = displ, y = hwy)) #> Error in structure(list(data = data, layers = list(), scales = scales_list(), : argument "data" is missing, with no default The error message is argument "data" is missing, with no default. It looks like a typo: dota instead of data. ggplot(data = mpg) + geom_point(mapping = aes(x = displ, y = hwy)) fliter(mpg, cyl = 8) #> Error in eval(expr, envir, enclos): could not find function "fliter" R could not find the function fliter because we made a typo: fliter instead of filter. filter(mpg, cyl = 8) #> Error: filter() takes unnamed arguments. Do you need `==`? We aren’t done yet. But the error message gives a suggestion. Let’s follow it. filter(mpg, cyl == 8) #> # A tibble: 70 × 11 #> manufacturer model displ year cyl trans drv cty #> <chr> <chr> <dbl> <int> <int> <chr> <chr> <int> #> 1 audi a6 quattro 4.2 2008 8 auto(s6) 4 16 #> 2 chevrolet c1500 suburban 2wd 5.3 2008 8 auto(l4) r 14 #> 3 chevrolet c1500 suburban 2wd 5.3 2008 8 auto(l4) r 11 #> 4 chevrolet c1500 suburban 2wd 5.3 2008 8 auto(l4) r 14 #> 5 chevrolet c1500 suburban 2wd 5.7 1999 8 auto(l4) r 13 #> 6 chevrolet c1500 suburban 2wd 6.0 2008 8 auto(l4) r 12 #> # ... with 64 more rows, and 3 more variables: hwy <int>, fl <chr>, #> # class <chr> filter(diamond, carat > 3) #> Error in filter_(.data, .dots = lazyeval::lazy_dots(...)): object 'diamond' not found R says it can’t find the object diamond. This is a typo; the data frame is named diamonds. filter(diamonds, carat > 3) #> # A tibble: 32 × 10 #> carat cut color clarity depth table price x y z #> <dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl> #> 1 3.01 Premium I I1 62.7 58 8040 9.10 8.97 5.67 #> 2 3.11 Fair J I1 65.9 57 9823 9.15 9.02 5.98 #> 3 3.01 Premium F I1 62.2 56 9925 9.24 9.13 5.73 #> 4 3.05 Premium E I1 60.9 58 10453 9.26 9.25 5.66 #> 5 3.02 Fair I I1 65.2 56 10577 9.11 9.02 5.91 #> 6 3.01 Fair H I1 56.1 62 10761 9.54 9.38 5.31 #> # ... with 26 more rows How did I know? I started typing diamond and RStudio autocompleted it to diamonds. Since diamonds includes the variable carat and the code works, that appears to have been the problem. Press Alt + Shift + K. What happens? How can you get to the same place using the menus? This gives a menu with keyboard shortcuts. It can be found in the menu under Tools -> Keyboard Shortcuts Help. 
"], +["data-transformation.html", "4 Data Transformation 4.1 Prerequisites 4.2 Filter 4.3 Exercises 4.4 Arrange 4.5 Mutate 4.6 Grouped summaries with summarise() 4.7 Grouped mutates and filters", " 4 Data Transformation 4.1 Prerequisites library(nycflights13) library(tidyverse) 4.2 Filter glimpse(flights) #> Observations: 336,776 #> Variables: 19 #> $ year <int> 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013,... #> $ month <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,... #> $ day <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,... #> $ dep_time <int> 517, 533, 542, 544, 554, 554, 555, 557, 557, 55... #> $ sched_dep_time <int> 515, 529, 540, 545, 600, 558, 600, 600, 600, 60... #> $ dep_delay <dbl> 2, 4, 2, -1, -6, -4, -5, -3, -3, -2, -2, -2, -2... #> $ arr_time <int> 830, 850, 923, 1004, 812, 740, 913, 709, 838, 7... #> $ sched_arr_time <int> 819, 830, 850, 1022, 837, 728, 854, 723, 846, 7... #> $ arr_delay <dbl> 11, 20, 33, -18, -25, 12, 19, -14, -8, 8, -2, -... #> $ carrier <chr> "UA", "UA", "AA", "B6", "DL", "UA", "B6", "EV",... #> $ flight <int> 1545, 1714, 1141, 725, 461, 1696, 507, 5708, 79... #> $ tailnum <chr> "N14228", "N24211", "N619AA", "N804JB", "N668DN... #> $ origin <chr> "EWR", "LGA", "JFK", "JFK", "LGA", "EWR", "EWR"... #> $ dest <chr> "IAH", "IAH", "MIA", "BQN", "ATL", "ORD", "FLL"... #> $ air_time <dbl> 227, 227, 160, 183, 116, 150, 158, 53, 140, 138... #> $ distance <dbl> 1400, 1416, 1089, 1576, 762, 719, 1065, 229, 94... #> $ hour <dbl> 5, 5, 5, 5, 6, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5,... #> $ minute <dbl> 15, 29, 40, 45, 0, 58, 0, 0, 0, 0, 0, 0, 0, 0, ... #> $ time_hour <dttm> 2013-01-01 05:00:00, 2013-01-01 05:00:00, 2013... 4.3 Exercises Find all flights that Had an arrival delay of two or more hours Flew to Houston (IAH or HOU) Were operated by United, American, or Delta Departed in summer (July, August, and September) Arrived more than two hours late, but didn’t leave late Were delayed by at least an hour, but made up over 30 minutes in flight Departed between midnight and 6am (inclusive) Had an arrival delay of two or more hours Since delay is in minutes, we are looking for flights where arr_delay > 120: flights %>% filter(arr_delay > 120) #> # A tibble: 10,034 × 19 #> year month day dep_time sched_dep_time dep_delay arr_time #> <int> <int> <int> <int> <int> <dbl> <int> #> 1 2013 1 1 811 630 101 1047 #> 2 2013 1 1 848 1835 853 1001 #> 3 2013 1 1 957 733 144 1056 #> 4 2013 1 1 1114 900 134 1447 #> 5 2013 1 1 1505 1310 115 1638 #> 6 2013 1 1 1525 1340 105 1831 #> # ... with 1.003e+04 more rows, and 12 more variables: #> # sched_arr_time <int>, arr_delay <dbl>, carrier <chr>, flight <int>, #> # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, #> # distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm> Flew to Houston (IAH or HOU): flights %>% filter(dest %in% c("IAH", "HOU")) #> # A tibble: 9,313 × 19 #> year month day dep_time sched_dep_time dep_delay arr_time #> <int> <int> <int> <int> <int> <dbl> <int> #> 1 2013 1 1 517 515 2 830 #> 2 2013 1 1 533 529 4 850 #> 3 2013 1 1 623 627 -4 933 #> 4 2013 1 1 728 732 -4 1041 #> 5 2013 1 1 739 739 0 1104 #> 6 2013 1 1 908 908 0 1228 #> # ... with 9,307 more rows, and 12 more variables: sched_arr_time <int>, #> # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>, #> # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>, #> # minute <dbl>, time_hour <dttm> Were operated by United, American, or Delta The variable carrier has the airline: but it is in two-digit carrier codes. 
However, we can look it up in the airlines dataset. airlines #> # A tibble: 16 × 2 #> carrier name #> <chr> <chr> #> 1 9E Endeavor Air Inc. #> 2 AA American Airlines Inc. #> 3 AS Alaska Airlines Inc. #> 4 B6 JetBlue Airways #> 5 DL Delta Air Lines Inc. #> 6 EV ExpressJet Airlines Inc. #> # ... with 10 more rows Since there are only 16 rows, its not even worth filtering. Delta is DL, American is AA, and United is UA: filter(flights, carrier %in% c("AA", "DL", "UA")) #> # A tibble: 139,504 × 19 #> year month day dep_time sched_dep_time dep_delay arr_time #> <int> <int> <int> <int> <int> <dbl> <int> #> 1 2013 1 1 517 515 2 830 #> 2 2013 1 1 533 529 4 850 #> 3 2013 1 1 542 540 2 923 #> 4 2013 1 1 554 600 -6 812 #> 5 2013 1 1 554 558 -4 740 #> 6 2013 1 1 558 600 -2 753 #> # ... with 1.395e+05 more rows, and 12 more variables: #> # sched_arr_time <int>, arr_delay <dbl>, carrier <chr>, flight <int>, #> # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, #> # distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm> Departed in summer (July, August, and September) The variable month has the month, and it is numeric. filter(flights, between(month, 7, 9)) #> # A tibble: 86,326 × 19 #> year month day dep_time sched_dep_time dep_delay arr_time #> <int> <int> <int> <int> <int> <dbl> <int> #> 1 2013 7 1 1 2029 212 236 #> 2 2013 7 1 2 2359 3 344 #> 3 2013 7 1 29 2245 104 151 #> 4 2013 7 1 43 2130 193 322 #> 5 2013 7 1 44 2150 174 300 #> 6 2013 7 1 46 2051 235 304 #> # ... with 8.632e+04 more rows, and 12 more variables: #> # sched_arr_time <int>, arr_delay <dbl>, carrier <chr>, flight <int>, #> # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, #> # distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm> Arrived more than two hours late, but didn’t leave late filter(flights, !is.na(dep_delay), dep_delay <= 0, arr_delay > 120) #> # A tibble: 29 × 19 #> year month day dep_time sched_dep_time dep_delay arr_time #> <int> <int> <int> <int> <int> <dbl> <int> #> 1 2013 1 27 1419 1420 -1 1754 #> 2 2013 10 7 1350 1350 0 1736 #> 3 2013 10 7 1357 1359 -2 1858 #> 4 2013 10 16 657 700 -3 1258 #> 5 2013 11 1 658 700 -2 1329 #> 6 2013 3 18 1844 1847 -3 39 #> # ... with 23 more rows, and 12 more variables: sched_arr_time <int>, #> # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>, #> # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>, #> # minute <dbl>, time_hour <dttm> Were delayed by at least an hour, but made up over 30 minutes in flight filter(flights, !is.na(dep_delay), dep_delay >= 60, arr_delay < 30) #> # A tibble: 206 × 19 #> year month day dep_time sched_dep_time dep_delay arr_time #> <int> <int> <int> <int> <int> <dbl> <int> #> 1 2013 1 3 1850 1745 65 2148 #> 2 2013 1 3 1950 1845 65 2228 #> 3 2013 1 3 2015 1915 60 2135 #> 4 2013 1 6 1019 900 79 1558 #> 5 2013 1 7 1543 1430 73 1758 #> 6 2013 1 11 1020 920 60 1311 #> # ... with 200 more rows, and 12 more variables: sched_arr_time <int>, #> # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>, #> # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>, #> # minute <dbl>, time_hour <dttm> Departed between midnight and 6am (inclusive). filter(flights, dep_time >= 0, dep_time <= 600) #> # A tibble: 9,344 × 19 #> year month day dep_time sched_dep_time dep_delay arr_time #> <int> <int> <int> <int> <int> <dbl> <int> #> 1 2013 1 1 517 515 2 830 #> 2 2013 1 1 533 529 4 850 #> 3 2013 1 1 542 540 2 923 #> 4 2013 1 1 544 545 -1 1004 #> 5 2013 1 1 554 600 -6 812 #> 6 2013 1 1 554 558 -4 740 #> # ... 
with 9,338 more rows, and 12 more variables: sched_arr_time <int>, #> # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>, #> # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>, #> # minute <dbl>, time_hour <dttm> or using between (see next question) filter(flights, between(dep_time, 0, 600)) #> # A tibble: 9,344 × 19 #> year month day dep_time sched_dep_time dep_delay arr_time #> <int> <int> <int> <int> <int> <dbl> <int> #> 1 2013 1 1 517 515 2 830 #> 2 2013 1 1 533 529 4 850 #> 3 2013 1 1 542 540 2 923 #> 4 2013 1 1 544 545 -1 1004 #> 5 2013 1 1 554 600 -6 812 #> 6 2013 1 1 554 558 -4 740 #> # ... with 9,338 more rows, and 12 more variables: sched_arr_time <int>, #> # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>, #> # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>, #> # minute <dbl>, time_hour <dttm> Another useful dplyr filtering helper is between(). What does it do? Can you use it to simplify the code needed to answer the previous challenges? between(x, left, right) is equivalent to x >= left & x <= right. I already used it in 1.4. How many flights have a missing dep_time? What other variables are missing? What might these rows represent? filter(flights, is.na(dep_time)) #> # A tibble: 8,255 × 19 #> year month day dep_time sched_dep_time dep_delay arr_time #> <int> <int> <int> <int> <int> <dbl> <int> #> 1 2013 1 1 NA 1630 NA NA #> 2 2013 1 1 NA 1935 NA NA #> 3 2013 1 1 NA 1500 NA NA #> 4 2013 1 1 NA 600 NA NA #> 5 2013 1 2 NA 1540 NA NA #> 6 2013 1 2 NA 1620 NA NA #> # ... with 8,249 more rows, and 12 more variables: sched_arr_time <int>, #> # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>, #> # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>, #> # minute <dbl>, time_hour <dttm> Since arr_time is also missing, these are cancelled flights. Why is NA ^ 0 not missing? Why is NA | TRUE not missing? Why is FALSE & NA not missing? Can you figure out the general rule? (NA * 0 is a tricky counterexample!) NA ^ 0 == 1 since for all numeric values \\(x ^ 0 = 1\\). NA ^ 0 #> [1] 1 NA | TRUE is TRUE because the it doesn’t matter whether the missing value is TRUE or FALSE, x \\lor T = T for all values of x. NA | TRUE #> [1] TRUE Likewise, anything and FALSE is always FALSE. NA & FALSE #> [1] FALSE Because the value of the missing element matters in NA | FALSE and NA & TRUE, these are missing: NA | FALSE #> [1] NA NA & TRUE #> [1] NA wut? Since x * 0 = 0 for all \\(x\\) we might expect NA * 0 = 0, but that’s not the case. NA * 0 #> [1] NA 4.4 Arrange missing values always at the end. 4.4.1 Exercises How could you use arrange() to sort all missing values to the start? (Hint: use is.na()). This sorts by increasing dep_time, but with all missing values put first. arrange(flights, desc(is.na(dep_time)), dep_time) #> # A tibble: 336,776 × 19 #> year month day dep_time sched_dep_time dep_delay arr_time #> <int> <int> <int> <int> <int> <dbl> <int> #> 1 2013 1 1 NA 1630 NA NA #> 2 2013 1 1 NA 1935 NA NA #> 3 2013 1 1 NA 1500 NA NA #> 4 2013 1 1 NA 600 NA NA #> 5 2013 1 2 NA 1540 NA NA #> 6 2013 1 2 NA 1620 NA NA #> # ... with 3.368e+05 more rows, and 12 more variables: #> # sched_arr_time <int>, arr_delay <dbl>, carrier <chr>, flight <int>, #> # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, #> # distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm> Sort flights to find the most delayed flights. Find the flights that left earliest. 
The most delayed flights are found by sorting by dep_delay in descending order. arrange(flights, desc(dep_delay)) #> # A tibble: 336,776 × 19 #> year month day dep_time sched_dep_time dep_delay arr_time #> <int> <int> <int> <int> <int> <dbl> <int> #> 1 2013 1 9 641 900 1301 1242 #> 2 2013 6 15 1432 1935 1137 1607 #> 3 2013 1 10 1121 1635 1126 1239 #> 4 2013 9 20 1139 1845 1014 1457 #> 5 2013 7 22 845 1600 1005 1044 #> 6 2013 4 10 1100 1900 960 1342 #> # ... with 3.368e+05 more rows, and 12 more variables: #> # sched_arr_time <int>, arr_delay <dbl>, carrier <chr>, flight <int>, #> # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, #> # distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm> If we sort dep_delay in ascending order, we get those that left earliest. There was a flight that left 43 minutes early. arrange(flights, dep_delay) #> # A tibble: 336,776 × 19 #> year month day dep_time sched_dep_time dep_delay arr_time #> <int> <int> <int> <int> <int> <dbl> <int> #> 1 2013 12 7 2040 2123 -43 40 #> 2 2013 2 3 2022 2055 -33 2240 #> 3 2013 11 10 1408 1440 -32 1549 #> 4 2013 1 11 1900 1930 -30 2233 #> 5 2013 1 29 1703 1730 -27 1947 #> 6 2013 8 9 729 755 -26 1002 #> # ... with 3.368e+05 more rows, and 12 more variables: #> # sched_arr_time <int>, arr_delay <dbl>, carrier <chr>, flight <int>, #> # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, #> # distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm> Sort flights to find the fastest flights. I assume that by “fastest flights” it means the flights with the minimum air time, so I sort by air_time. The fastest flights are a couple of flights between EWR and BDL with an air time of 20 minutes. arrange(flights, air_time) #> # A tibble: 336,776 × 19 #> year month day dep_time sched_dep_time dep_delay arr_time #> <int> <int> <int> <int> <int> <dbl> <int> #> 1 2013 1 16 1355 1315 40 1442 #> 2 2013 4 13 537 527 10 622 #> 3 2013 12 6 922 851 31 1021 #> 4 2013 2 3 2153 2129 24 2247 #> 5 2013 2 5 1303 1315 -12 1342 #> 6 2013 2 12 2123 2130 -7 2211 #> # ... with 3.368e+05 more rows, and 12 more variables: #> # sched_arr_time <int>, arr_delay <dbl>, carrier <chr>, flight <int>, #> # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, #> # distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm> Which flights travelled the longest? Which travelled the shortest? I’ll assume that travelled the longest or shortest refers to distance, rather than air time. The longest flights are the Hawaiian Airlines flights (HA 51) between JFK and HNL (Honolulu) at 4,983 miles. arrange(flights, desc(distance)) #> # A tibble: 336,776 × 19 #> year month day dep_time sched_dep_time dep_delay arr_time #> <int> <int> <int> <int> <int> <dbl> <int> #> 1 2013 1 1 857 900 -3 1516 #> 2 2013 1 2 909 900 9 1525 #> 3 2013 1 3 914 900 14 1504 #> 4 2013 1 4 900 900 0 1516 #> 5 2013 1 5 858 900 -2 1519 #> 6 2013 1 6 1019 900 79 1558 #> # ... with 3.368e+05 more rows, and 12 more variables: #> # sched_arr_time <int>, arr_delay <dbl>, carrier <chr>, flight <int>, #> # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, #> # distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm> Apart from an EWR to LGA flight that was cancelled, the shortest flights are the Envoy Air flights between EWR and PHL at 80 miles.
arrange(flights, distance) #> # A tibble: 336,776 × 19 #> year month day dep_time sched_dep_time dep_delay arr_time #> <int> <int> <int> <int> <int> <dbl> <int> #> 1 2013 7 27 NA 106 NA NA #> 2 2013 1 3 2127 2129 -2 2222 #> 3 2013 1 4 1240 1200 40 1333 #> 4 2013 1 4 1829 1615 134 1937 #> 5 2013 1 4 2128 2129 -1 2218 #> 6 2013 1 5 1155 1200 -5 1241 #> # ... with 3.368e+05 more rows, and 12 more variables: #> # sched_arr_time <int>, arr_delay <dbl>, carrier <chr>, flight <int>, #> # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, #> # distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm> Brainstorm as many ways as possible to select dep_time, dep_delay, arr_time, and arr_delay from flights. A few ways include: select(flights, dep_time, dep_delay, arr_time, arr_delay) #> # A tibble: 336,776 × 4 #> dep_time dep_delay arr_time arr_delay #> <int> <dbl> <int> <dbl> #> 1 517 2 830 11 #> 2 533 4 850 20 #> 3 542 2 923 33 #> 4 544 -1 1004 -18 #> 5 554 -6 812 -25 #> 6 554 -4 740 12 #> # ... with 3.368e+05 more rows select(flights, starts_with("dep_"), starts_with("arr_")) #> # A tibble: 336,776 × 4 #> dep_time dep_delay arr_time arr_delay #> <int> <dbl> <int> <dbl> #> 1 517 2 830 11 #> 2 533 4 850 20 #> 3 542 2 923 33 #> 4 544 -1 1004 -18 #> 5 554 -6 812 -25 #> 6 554 -4 740 12 #> # ... with 3.368e+05 more rows select(flights, matches("^(dep|arr)_(time|delay)$")) #> # A tibble: 336,776 × 4 #> dep_time dep_delay arr_time arr_delay #> <int> <dbl> <int> <dbl> #> 1 517 2 830 11 #> 2 533 4 850 20 #> 3 542 2 923 33 #> 4 544 -1 1004 -18 #> 5 554 -6 812 -25 #> 6 554 -4 740 12 #> # ... with 3.368e+05 more rows using ends_with() doesn’t work well since it would bget sched_arr_time and sched_dep_time. What happens if you include the name of a variable multiple times in a select() call? It ignores the duplicates, and that variable is only included once. No error, warning, or message is emited. select(flights, year, month, day, year, year) #> # A tibble: 336,776 × 3 #> year month day #> <int> <int> <int> #> 1 2013 1 1 #> 2 2013 1 1 #> 3 2013 1 1 #> 4 2013 1 1 #> 5 2013 1 1 #> 6 2013 1 1 #> # ... with 3.368e+05 more rows What does the one_of() function do? Why might it be helpful in conjunction with this vector? The one_of vector allows you to select variables with a character vector rather than as unquoted variable names. It’s useful because then you can easily pass vectors to select(). vars <- c("year", "month", "day", "dep_delay", "arr_delay") select(flights, one_of(vars)) #> # A tibble: 336,776 × 5 #> year month day dep_delay arr_delay #> <int> <int> <int> <dbl> <dbl> #> 1 2013 1 1 2 11 #> 2 2013 1 1 4 20 #> 3 2013 1 1 2 33 #> 4 2013 1 1 -1 -18 #> 5 2013 1 1 -6 -25 #> 6 2013 1 1 -4 12 #> # ... with 3.368e+05 more rows Does the result of running the following code surprise you? How do the select helpers deal with case by default? How can you change that default? select(flights, contains("TIME")) #> # A tibble: 336,776 × 6 #> dep_time sched_dep_time arr_time sched_arr_time air_time #> <int> <int> <int> <int> <dbl> #> 1 517 515 830 819 227 #> 2 533 529 850 830 227 #> 3 542 540 923 850 160 #> 4 544 545 1004 1022 183 #> 5 554 600 812 837 116 #> 6 554 558 740 728 150 #> # ... with 3.368e+05 more rows, and 1 more variables: time_hour <dttm> The default behavior for contains is to ignore case. Yes, it surprises me. Upon reflection, I realized that this is likely the default behavior because dplyr is designed to deal with a variety of data backends, and some database engines don’t differentiate case. 
To change the behavior, add the argument ignore.case = FALSE. Now no variables are selected. select(flights, contains("TIME", ignore.case = FALSE)) #> # A tibble: 336,776 × 0 4.5 Mutate 4.5.1 Exercises Currently dep_time and sched_dep_time are convenient to look at, but hard to compute with because they’re not really continuous numbers. Convert them to a more convenient representation of number of minutes since midnight. To get the departure times as minutes since midnight, integer-divide dep_time by 100 to get the hours since midnight, multiply by 60, and add the remainder of dep_time divided by 100. mutate(flights, dep_time_mins = dep_time %/% 100 * 60 + dep_time %% 100, sched_dep_time_mins = sched_dep_time %/% 100 * 60 + sched_dep_time %% 100) %>% select(dep_time, dep_time_mins, sched_dep_time, sched_dep_time_mins) #> # A tibble: 336,776 × 4 #> dep_time dep_time_mins sched_dep_time sched_dep_time_mins #> <int> <dbl> <int> <dbl> #> 1 517 317 515 315 #> 2 533 333 529 329 #> 3 542 342 540 340 #> 4 544 344 545 345 #> 5 554 354 600 360 #> 6 554 354 558 358 #> # ... with 3.368e+05 more rows This would be more cleanly done by first defining a function and reusing that: time2mins <- function(x) { x %/% 100 * 60 + x %% 100 } mutate(flights, dep_time_mins = time2mins(dep_time), sched_dep_time_mins = time2mins(sched_dep_time)) %>% select(dep_time, dep_time_mins, sched_dep_time, sched_dep_time_mins) #> # A tibble: 336,776 × 4 #> dep_time dep_time_mins sched_dep_time sched_dep_time_mins #> <int> <dbl> <int> <dbl> #> 1 517 317 515 315 #> 2 533 333 529 329 #> 3 542 342 540 340 #> 4 544 344 545 345 #> 5 554 354 600 360 #> 6 554 354 558 358 #> # ... with 3.368e+05 more rows Compare air_time with arr_time - dep_time. What do you expect to see? What do you see? What do you need to do to fix it? The values of arr_time and dep_time are in HHMM format rather than minutes, and the airports may be in different time zones, so air_time doesn’t equal their difference. We would need to convert both times to minutes since midnight, and account for time zones, to fix it. mutate(flights, air_time2 = arr_time - dep_time, air_time_diff = air_time2 - air_time) %>% filter(air_time_diff != 0) %>% select(air_time, air_time2, dep_time, arr_time, dest) #> # A tibble: 326,128 × 5 #> air_time air_time2 dep_time arr_time dest #> <dbl> <int> <int> <int> <chr> #> 1 227 313 517 830 IAH #> 2 227 317 533 850 IAH #> 3 160 381 542 923 MIA #> 4 183 460 544 1004 BQN #> 5 116 258 554 812 ATL #> 6 150 186 554 740 ORD #> # ... with 3.261e+05 more rows Compare dep_time, sched_dep_time, and dep_delay. How would you expect those three numbers to be related? I’d expect dep_time, sched_dep_time, and dep_delay to be related so that dep_time - sched_dep_time = dep_delay. mutate(flights, dep_delay2 = dep_time - sched_dep_time) %>% filter(dep_delay2 != dep_delay) %>% select(dep_time, sched_dep_time, dep_delay, dep_delay2) #> # A tibble: 99,777 × 4 #> dep_time sched_dep_time dep_delay dep_delay2 #> <int> <int> <dbl> <int> #> 1 554 600 -6 -46 #> 2 555 600 -5 -45 #> 3 557 600 -3 -43 #> 4 557 600 -3 -43 #> 5 558 600 -2 -42 #> 6 558 600 -2 -42 #> # ... with 9.977e+04 more rows Oops, I forgot to convert to minutes. I’ll reuse the time2mins function I wrote earlier.
mutate(flights, dep_delay2 = time2mins(dep_time) - time2mins(sched_dep_time)) %>% filter(dep_delay2 != dep_delay) %>% select(dep_time, sched_dep_time, dep_delay, dep_delay2) #> # A tibble: 1,207 × 4 #> dep_time sched_dep_time dep_delay dep_delay2 #> <int> <int> <dbl> <dbl> #> 1 848 1835 853 -587 #> 2 42 2359 43 -1397 #> 3 126 2250 156 -1284 #> 4 32 2359 33 -1407 #> 5 50 2145 185 -1255 #> 6 235 2359 156 -1284 #> # ... with 1,201 more rows Well, that solved most of the problems, but these two numbers don’t match because we aren’t accounting for flights where the departure time is the next day from the scheduled departure time. Find the 10 most delayed flights using a ranking function. How do you want to handle ties? Carefully read the documentation for min_rank(). I’d want to handle ties by taking the minimum of tied values. If three flights have the same value and are the most delayed, we would say they are tied for first, not tied for third or second. mutate(flights, dep_delay_rank = min_rank(-dep_delay)) %>% arrange(dep_delay_rank) %>% filter(dep_delay_rank <= 10) #> # A tibble: 10 × 20 #> year month day dep_time sched_dep_time dep_delay arr_time #> <int> <int> <int> <int> <int> <dbl> <int> #> 1 2013 1 9 641 900 1301 1242 #> 2 2013 6 15 1432 1935 1137 1607 #> 3 2013 1 10 1121 1635 1126 1239 #> 4 2013 9 20 1139 1845 1014 1457 #> 5 2013 7 22 845 1600 1005 1044 #> 6 2013 4 10 1100 1900 960 1342 #> # ... with 4 more rows, and 13 more variables: sched_arr_time <int>, #> # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>, #> # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>, #> # minute <dbl>, time_hour <dttm>, dep_delay_rank <int> What does 1:3 + 1:10 return? Why? It returns c(1 + 1, 2 + 2, 3 + 3, 1 + 4, 2 + 5, 3 + 6, 1 + 7, 2 + 8, 3 + 9, 1 + 10). When adding two vectors, R recycles the shorter vector’s values to get vectors of the same length. We get a warning since the longer vector’s length is not a multiple of the shorter vector’s length (this often, but not necessarily, means we made an error somewhere). 1:3 + 1:10 #> Warning in 1:3 + 1:10: longer object length is not a multiple of shorter #> object length #> [1] 2 4 6 5 7 9 8 10 12 11 What trigonometric functions does R provide? All the classics: cos, sin, tan, acos, asin, atan, plus a few others, such as atan2() and the sinpi(), cospi(), and tanpi() variants, that are driven by numerical or computational concerns. 4.6 Grouped summaries with summarise() 4.6.1 Exercises Brainstorm at least 5 different ways to assess the typical delay characteristics of a group of flights. Consider the following scenarios: A flight is 15 minutes early 50% of the time, and 15 minutes late 50% of the time. A flight is always 10 minutes late. A flight is 30 minutes early 50% of the time, and 30 minutes late 50% of the time. 99% of the time a flight is on time. 1% of the time it’s 2 hours late. Which is more important: arrival delay or departure delay? Arrival delay is more important. Arriving early is nice, but not nearly as good as arriving late is bad. Variation is worse than consistency; if I know the plane will always arrive 10 minutes late, then I can plan for it arriving as if the actual arrival time was 10 minutes later than the scheduled arrival time. So I’d try something that calculates the expected time of the flight, and then aggregates over any delays from that time. I would ignore any early arrival times. A better ranking would also consider cancellations, and need a way to convert them to a delay time (perhaps using the arrival time of the next flight to the same destination).
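One concrete way to start that brainstorm is to compute several of these delay characteristics at once. A sketch follows; grouping by carrier and flight is my assumption, and other groupings are just as defensible:

```r
flights %>%
  group_by(carrier, flight) %>%
  summarise(
    n = n(),                                          # times flown
    mean_arr_delay = mean(arr_delay, na.rm = TRUE),   # typical delay
    sd_arr_delay = sd(arr_delay, na.rm = TRUE),       # consistency
    p_late_15 = mean(arr_delay > 15, na.rm = TRUE),   # mild tail
    p_late_120 = mean(arr_delay > 120, na.rm = TRUE), # severe tail
    p_cancelled = mean(is.na(arr_delay))              # cancellations
  ) %>%
  arrange(desc(mean_arr_delay))
```

Each scenario above maps onto a different column: the always-10-minutes-late flight has a non-zero mean but near-zero sd, while the 99%-on-time flight only shows up in p_late_120. No single summary captures all four.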
Come up with another approach that will give you the same output as not_cancelled %>% count(dest) and not_cancelled %>% count(tailnum, wt = distance) (without using count()). Our definition of cancelled flights (is.na(dep_delay) | is.na(arr_delay)) is slightly suboptimal. Why? Which is the most important column? If a flight doesn’t depart, then it won’t arrive. A flight can also depart and not arrive if it crashes; I’m not sure how this data would handle flights that are redirected and land at other airports for whatever reason. The more important column is arr_delay so we could just use that. filter(flights, !is.na(dep_delay), is.na(arr_delay)) %>% select(dep_time, arr_time, sched_arr_time, dep_delay, arr_delay) #> # A tibble: 1,175 × 5 #> dep_time arr_time sched_arr_time dep_delay arr_delay #> <int> <int> <int> <dbl> <dbl> #> 1 1525 1934 1805 -5 NA #> 2 1528 2002 1647 29 NA #> 3 1740 2158 2020 -5 NA #> 4 1807 2251 2103 29 NA #> 5 1939 29 2151 59 NA #> 6 1952 2358 2207 22 NA #> # ... with 1,169 more rows Okay, I’m not sure what’s going on in this data. dep_time can be non-missing and arr_delay missing but arr_time not missing. They may be combining different flights? Look at the number of cancelled flights per day. Is there a pattern? Is the proportion of cancelled flights related to the average delay? cancelled_delayed <- flights %>% mutate(cancelled = (is.na(arr_delay) | is.na(dep_delay))) %>% group_by(year, month, day) %>% summarise(prop_cancelled = mean(cancelled), avg_dep_delay = mean(dep_delay, na.rm = TRUE)) ggplot(cancelled_delayed, aes(x = avg_dep_delay, prop_cancelled)) + geom_point() + geom_smooth() #> `geom_smooth()` using method = 'loess' Which carrier has the worst delays? Challenge: can you disentangle the effects of bad airports vs. bad carriers? Why/why not? (Hint: think about flights %>% group_by(carrier, dest) %>% summarise(n())) flights %>% group_by(carrier) %>% summarise(arr_delay = mean(arr_delay, na.rm = TRUE)) %>% arrange(desc(arr_delay)) #> # A tibble: 16 × 2 #> carrier arr_delay #> <chr> <dbl> #> 1 F9 21.9 #> 2 FL 20.1 #> 3 EV 15.8 #> 4 YV 15.6 #> 5 OO 11.9 #> 6 MQ 10.8 #> # ... with 10 more rows filter(airlines, carrier == "F9") #> # A tibble: 1 × 2 #> carrier name #> <chr> <chr> #> 1 F9 Frontier Airlines Inc. Frontier Airlines (FL) has the worst delays. You can get part of the way to disentangling the effects of airports vs. carriers by comparing each flight’s delay to the average delay of destination airport. However, you’d really want to compare it to the average delay of the desination airport, after removing other flights from the same airline. 538 has done something like this: http://fivethirtyeight.com/features/the-best-and-worst-airlines-airports-and-flights-summer-2015-update/. For each plane, count the number of flights before the first delay of greater than 1 hour. I think this requires grouped mutate (but I may be wrong): flights %>% arrange(tailnum, year, month, day) %>% group_by(tailnum) %>% mutate(delay_gt1hr = dep_delay > 60) %>% mutate(before_delay = cumsum(delay_gt1hr)) %>% filter(before_delay < 1) %>% count(sort = TRUE) #> # A tibble: 3,755 × 2 #> tailnum n #> <chr> <int> #> 1 N954UW 206 #> 2 N952UW 163 #> 3 N957UW 142 #> 4 N5FAAA 117 #> 5 N38727 99 #> 6 N3742C 98 #> # ... with 3,749 more rows What does the sort argument to count() do. When might you use it? The sort argument to count sorts the results in order of n. You could use this anytime you would do count followed by arrange. 
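The earlier question about reproducing not_cancelled %>% count(dest) and not_cancelled %>% count(tailnum, wt = distance) without count() is not answered above. Since count() is shorthand for a grouped summarise(), one equivalent approach is (a sketch, with not_cancelled defined as in the chapter):

```r
not_cancelled <- flights %>%
  filter(!is.na(dep_delay), !is.na(arr_delay))

# same output as not_cancelled %>% count(dest)
not_cancelled %>%
  group_by(dest) %>%
  summarise(n = n())

# same output as not_cancelled %>% count(tailnum, wt = distance):
# with a weight, count() sums the weight rather than counting rows
not_cancelled %>%
  group_by(tailnum) %>%
  summarise(n = sum(distance))
```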
4.7 Grouped mutates and filters 4.7.1 Exercises Refer back to the table of useful mutate and filtering functions. Describe how each operation changes when you combine it with grouping. They operate within each group rather than over the entire data frame. E.g. mean will calculate the mean within each group. Which plane (tailnum) has the worst on-time record? flights %>% group_by(tailnum) %>% summarise(arr_delay = mean(arr_delay, na.rm = TRUE)) %>% ungroup() %>% filter(rank(desc(arr_delay)) <= 1) #> # A tibble: 1 × 2 #> tailnum arr_delay #> <chr> <dbl> #> 1 N844MH 320 What time of day should you fly if you want to avoid delays as much as possible? Let’s group by hour. The earlier the better to fly. This is intuitive as delays early in the morning are likely to propogate throughout the day. flights %>% group_by(hour) %>% summarise(arr_delay = mean(arr_delay, na.rm = TRUE)) %>% ungroup() %>% arrange(arr_delay) #> # A tibble: 20 × 2 #> hour arr_delay #> <dbl> <dbl> #> 1 7 -5.304 #> 2 5 -4.797 #> 3 6 -3.384 #> 4 9 -1.451 #> 5 8 -1.113 #> 6 10 0.954 #> # ... with 14 more rows For each destination, compute the total minutes of delay. For each, flight, compute the proportion of the total delay for its destination. flights %>% filter(!is.na(arr_delay), arr_delay > 0) %>% group_by(dest) %>% mutate(total_delay = sum(arr_delay), prop_delay = arr_delay / sum(arr_delay)) #> Source: local data frame [133,004 x 21] #> Groups: dest [103] #> #> year month day dep_time sched_dep_time dep_delay arr_time #> <int> <int> <int> <int> <int> <dbl> <int> #> 1 2013 1 1 517 515 2 830 #> 2 2013 1 1 533 529 4 850 #> 3 2013 1 1 542 540 2 923 #> 4 2013 1 1 554 558 -4 740 #> 5 2013 1 1 555 600 -5 913 #> 6 2013 1 1 558 600 -2 753 #> # ... with 1.33e+05 more rows, and 14 more variables: #> # sched_arr_time <int>, arr_delay <dbl>, carrier <chr>, flight <int>, #> # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, #> # distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>, #> # total_delay <dbl>, prop_delay <dbl> Alternatively, consider the delay as relative to the minimum delay for any flight to that destination. Now all non-cancelled flights have a proportion. flights %>% filter(!is.na(arr_delay), arr_delay > 0) %>% group_by(dest) %>% mutate(total_delay = sum(arr_delay - min(arr_delay)), prop_delay = arr_delay / sum(arr_delay)) #> Source: local data frame [133,004 x 21] #> Groups: dest [103] #> #> year month day dep_time sched_dep_time dep_delay arr_time #> <int> <int> <int> <int> <int> <dbl> <int> #> 1 2013 1 1 517 515 2 830 #> 2 2013 1 1 533 529 4 850 #> 3 2013 1 1 542 540 2 923 #> 4 2013 1 1 554 558 -4 740 #> 5 2013 1 1 555 600 -5 913 #> 6 2013 1 1 558 600 -2 753 #> # ... with 1.33e+05 more rows, and 14 more variables: #> # sched_arr_time <int>, arr_delay <dbl>, carrier <chr>, flight <int>, #> # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, #> # distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>, #> # total_delay <dbl>, prop_delay <dbl> Delays are typically temporally correlated: even once the problem that caused the initial delay has been resolved, later flights are delayed to allow earlier flights to leave. Using lag() explore how the delay of a flight is related to the delay of the immediately preceding flight. We want to group by day to avoid taking the lag from the previous day. Also, I want to use departure delay, since this mechanism is relevant for departures. Also, I remove missing values both before and after calculating the lag delay. 
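For reference, lag() just shifts a vector back one position and pads the front with NA, which is why each day's first flight gets a missing lag_delay and is filtered out below. A one-line illustration:

```r
lag(c(2, 4, -1, 30))  # dplyr's lag(): previous value, NA at the start
#> [1] NA  2  4 -1
```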
However, it would be interesting to ask the probability or averge delay after a cancellation. flights %>% group_by(year, month, day) %>% filter(!is.na(dep_delay)) %>% mutate(lag_delay = lag(dep_delay)) %>% filter(!is.na(lag_delay)) %>% ggplot(aes(x = dep_delay, y = lag_delay)) + geom_point() + geom_smooth() #> `geom_smooth()` using method = 'gam' Look at each destination. Can you find flights that are suspiciously fast? (i.e. flights that represent a potential data entry error). Compute the air time a flight relative to the shortest flight to that destination. Which flights were most delayed in the air? The shorter BOS and PHL flights that are 20 minutes for 30+ minutes flights seem plausible - though maybe entries of +/- a few minutes can easily create large changes. I assume that departure time has a standardized definition, but I’m not sure; if there is some discretion, that could create errors that are small in absolute time, but large in relative time for small flights. The ATL, GSP, an BNA flights looks a little suspicious as it’s almost half the time for longer flights. flights %>% filter(!is.na(air_time)) %>% group_by(dest) %>% mutate(med_time = median(air_time), fast = (air_time - med_time) / med_time) %>% arrange(fast) %>% select(air_time, med_time, fast, dep_time, sched_dep_time, arr_time, sched_arr_time) %>% head(15) #> Adding missing grouping variables: `dest` #> Source: local data frame [15 x 8] #> Groups: dest [9] #> #> dest air_time med_time fast dep_time sched_dep_time arr_time #> <chr> <dbl> <dbl> <dbl> <int> <int> <int> #> 1 BOS 21 38 -0.447 1450 1500 1547 #> 2 ATL 65 112 -0.420 1709 1700 1923 #> 3 GSP 55 92 -0.402 2040 2025 2225 #> 4 BOS 23 38 -0.395 1954 2000 2131 #> 5 BNA 70 113 -0.381 1914 1910 2045 #> 6 MSP 93 149 -0.376 1558 1513 1745 #> # ... with 9 more rows, and 1 more variables: sched_arr_time <int> I could also try a z-score. Though the sd and mean will be affected by large delays. flights %>% filter(!is.na(air_time)) %>% group_by(dest) %>% mutate(air_time_mean = mean(air_time), air_time_sd = sd(air_time), z_score = (air_time - air_time_mean) / air_time_sd) %>% arrange(z_score) %>% select(z_score, air_time_mean, air_time_sd, air_time, dep_time, sched_dep_time, arr_time, sched_arr_time) #> Adding missing grouping variables: `dest` #> Source: local data frame [327,346 x 9] #> Groups: dest [104] #> #> dest z_score air_time_mean air_time_sd air_time dep_time sched_dep_time #> <chr> <dbl> <dbl> <dbl> <dbl> <int> <int> #> 1 MSP -4.90 150.6 11.75 93 1558 1513 #> 2 ATL -4.88 112.9 9.81 65 1709 1700 #> 3 GSP -4.72 93.4 8.13 55 2040 2025 #> 4 BNA -4.05 114.4 10.96 70 1914 1910 #> 5 CVG -3.98 96.0 8.52 62 1359 1343 #> 6 BOS -3.63 39.0 4.95 21 1450 1500 #> # ... with 3.273e+05 more rows, and 2 more variables: arr_time <int>, #> # sched_arr_time <int> flights %>% filter(!is.na(air_time)) %>% group_by(dest) %>% mutate(air_time_diff = air_time - min(air_time)) %>% arrange(desc(air_time_diff)) %>% select(dest, year, month, day, carrier, flight, air_time_diff, air_time, dep_time, arr_time) %>% head() #> Source: local data frame [6 x 10] #> Groups: dest [5] #> #> dest year month day carrier flight air_time_diff air_time dep_time #> <chr> <int> <int> <int> <chr> <int> <dbl> <dbl> <int> #> 1 SFO 2013 7 28 DL 841 195 490 1727 #> 2 LAX 2013 11 22 DL 426 165 440 1812 #> 3 EGE 2013 1 28 AA 575 163 382 1806 #> 4 DEN 2013 9 10 UA 745 149 331 1513 #> 5 LAX 2013 7 10 DL 17 147 422 1814 #> 6 LAS 2013 11 22 UA 587 143 399 2142 #> # ... 
with 1 more variables: arr_time <int> Find all destinations that are flown by at least two carriers. Use that information to rank the carriers. The carrier that flies to the most locations is ExpressJet Airlines (EV). ExpressJet is a regional airline and partner for major airlines, so it’s one of those that fly small planes to nearby airports. Note that the code below counts every destination a carrier serves; to answer the question exactly, one would first keep only destinations served by at least two carriers, e.g. with group_by(dest) %>% filter(n_distinct(carrier) >= 2). flights %>% group_by(dest, carrier) %>% count(carrier) %>% group_by(carrier) %>% count(sort = TRUE) #> # A tibble: 16 × 2 #> carrier nn #> <chr> <int> #> 1 EV 61 #> 2 9E 49 #> 3 UA 47 #> 4 B6 42 #> 5 DL 40 #> 6 MQ 20 #> # ... with 10 more rows filter(airlines, carrier == "EV") #> # A tibble: 1 × 2 #> carrier name #> <chr> <chr> #> 1 EV ExpressJet Airlines Inc. "], +["exploratory-data-analysis.html", "5 Exploratory Data Analysis 5.1 Introduction 5.2 Missing Values 5.3 Covariation", " 5 Exploratory Data Analysis 5.1 Introduction library("tidyverse") library("viridis") library("forcats") This will also use data from nycflights13: library("nycflights13") 5.1.1 Questions 5.1.2 Variation 5.1.2.1 Exercises 1. Explore the distribution of each of the x, y, and z variables in diamonds. What do you learn? Think about a diamond and how you might decide which dimension is the length, width, and depth. In order to make it easier to plot them, I’ll reshape the dataset so that I can use the variables as facets. diamonds %>% mutate(id = row_number()) %>% select(x, y, z, id) %>% gather(variable, value, -id) %>% ggplot(aes(x = value)) + geom_density() + geom_rug() + facet_grid(variable ~ .) There are several noticeable features of the distributions: They are right skewed, with most diamonds small, but a few very large ones. There are outliers in y and z (see the rug). All three distributions have a bimodality (perhaps due to some sort of threshold). According to the documentation for diamonds: x is length, y is width, and z is depth. I don’t know if I would have figured that out before; maybe if there was data on the type of cuts. 2. Explore the distribution of price. Do you discover anything unusual or surprising? (Hint: Carefully think about the binwidth and make sure you try a wide range of values.) The price data is spiky, but I can’t tell what that corresponds to, as the following plots don’t show much difference in the distributions of the last one and last two digits. There are no diamonds with a price of $1,500. There’s a bulge in the distribution around $7,500. ggplot(filter(diamonds, price < 2500), aes(x = price)) + geom_histogram(binwidth = 10, center = 0) ggplot(filter(diamonds), aes(x = price)) + geom_histogram(binwidth = 100, center = 0) Distribution of the last digit: diamonds %>% mutate(ending = price %% 10) %>% ggplot(aes(x = ending)) + geom_histogram(binwidth = 1, center = 0) + geom_bar() diamonds %>% mutate(ending = price %% 100) %>% ggplot(aes(x = ending)) + geom_histogram(binwidth = 1) + geom_bar() diamonds %>% mutate(ending = price %% 1000) %>% filter(ending >= 500, ending <= 800) %>% ggplot(aes(x = ending)) + geom_histogram(binwidth = 1) + geom_bar() 3. How many diamonds are 0.99 carat? How many are 1 carat? What do you think is the cause of the difference? There are more than 70 times as many 1 carat diamonds as 0.99 carat diamonds.
diamonds %>% filter(carat >= 0.99, carat <= 1) %>% count(carat) #> # A tibble: 2 × 2 #> carat n #> <dbl> <int> #> 1 0.99 23 #> 2 1.00 1558 I don’t know exactly the process behind how carats are measured, but some way or another some diamonds carat values are being “rounded up”, because presumably there is a premium for a 1 carat diamond vs. a 0.99 carat diamond beyond the expected increase in price due to a 0.01 carat increase. To check this intuition, we’d want to look at the number of diamonds in each carat range to seem if there is an abnormally low number at 0.99 carats, and an abnormally high number at 1 carat. diamonds %>% filter(carat >= 0.9, carat <= 1.1) %>% count(carat) %>% print(n = 30) #> # A tibble: 21 × 2 #> carat n #> <dbl> <int> #> 1 0.90 1485 #> 2 0.91 570 #> 3 0.92 226 #> 4 0.93 142 #> 5 0.94 59 #> 6 0.95 65 #> 7 0.96 103 #> 8 0.97 59 #> 9 0.98 31 #> 10 0.99 23 #> 11 1.00 1558 #> 12 1.01 2242 #> 13 1.02 883 #> 14 1.03 523 #> 15 1.04 475 #> 16 1.05 361 #> 17 1.06 373 #> 18 1.07 342 #> 19 1.08 246 #> 20 1.09 287 #> 21 1.10 278 Q Can you think of other examples of similar phenoma where you might expect to see similar discontinuities in areas related to your research. Compare and contrast coord_cartesian() vs xlim() or ylim() when zooming in on a histogram. What happens if you leave binwidth unset? What happens if you try and zoom so only half a bar shows? coord_cartesian simply zooms in on the area specified by the limits. The calculation of the histogram is unaffected. ggplot(diamonds) + geom_histogram(mapping = aes(x = price)) + coord_cartesian(xlim = c(100, 5000), ylim = c(0, 3000)) #> `stat_bin()` using `bins = 30`. Pick better value with `binwidth`. However, the xlim and ylim functions first drop any values outside the limits (the ylim doesn’t matter in this case), then calculates the histogram, and draws the graph with the given limits. ggplot(diamonds) + geom_histogram(mapping = aes(x = price)) + xlim(100, 5000) + ylim(0, 3000) #> `stat_bin()` using `bins = 30`. Pick better value with `binwidth`. #> Warning: Removed 14714 rows containing non-finite values (stat_bin). #> Warning: Removed 5 rows containing missing values (geom_bar). 5.2 Missing Values 5.2.1 Exercises What happens to missing values in a histogram? What happens to missing values in a bar chart? Why is there a difference? Missing values are removed when the number of observations in each bin are calculated. See the warning message: Removed 9 rows containing non-finite values (stat_bin) diamonds2 <- diamonds %>% mutate(y = ifelse(y < 3 | y > 20, NA, y)) ggplot(diamonds2, aes(x = y)) + geom_histogram() #> `stat_bin()` using `bins = 30`. Pick better value with `binwidth`. #> Warning: Removed 9 rows containing non-finite values (stat_bin). In geom_bar, NA is treated as another category. This is because the x aesthetic in geom_bar should be a discrete (categorical) variable, and missing values are just another category. diamonds %>% mutate(cut = if_else(runif(n()) < 0.1, NA_character_, as.character(cut))) %>% ggplot() + geom_bar(mapping = aes(x = cut)) In a histogram, the x aesthetic variable needs to be numeric, and stat_bin groups the observations by ranges into bins. Since the numeric value of the NA observations is unknown, they cannot be placed in a particular bin, and are dropped. What does na.rm = TRUE do in mean() and sum()? This option removes NA values from the vector prior to calculating the mean and sum. 
mean(c(0, 1, 2, NA), na.rm = TRUE) #> [1] 1 sum(c(0, 1, 2, NA), na.rm = TRUE) #> [1] 3 5.3 Covariation 5.3.1 A categorical and continuous variable For a history of the boxplot see Wickham, “40 years of the boxplot” (http://vita.had.co.nz/papers/boxplots.pdf), and Krzywinski, Martin, and Naomi Altman. 2014. “Points of Significance: Visualizing samples with box plots.” Nature Methods. Where does the 1.5 x IQR come from? It’s kind of arbitrary. But in a normal distribution the IQR is approximately 1.35 standard deviations, so 1.5 x IQR is about 2 standard deviations, and the whiskers extend to roughly 2.7 standard deviations from the median (mean); points beyond that are flagged as outliers. 5.3.1.1 Exercises Use what you’ve learned to improve the visualisation of the departure times of cancelled vs. non-cancelled flights. Instead of a freqpoly, use a boxplot: nycflights13::flights %>% mutate( cancelled = is.na(dep_time), sched_hour = sched_dep_time %/% 100, sched_min = sched_dep_time %% 100, sched_dep_time = sched_hour + sched_min / 60 ) %>% ggplot() + geom_boxplot(mapping = aes(y = sched_dep_time, x = cancelled)) What variable in the diamonds dataset is most important for predicting the price of a diamond? How is that variable correlated with cut? Why does the combination of those two relationships lead to lower quality diamonds being more expensive? I’m not exactly sure what this question is asking conditional on using only the tools introduced in the book thus far. Install the ggstance package, and create a horizontal boxplot. How does this compare to using coord_flip()? Earlier we created a horizontal boxplot of the distribution of hwy by class, using geom_boxplot and coord_flip: ggplot(data = mpg) + geom_boxplot(mapping = aes(x = reorder(class, hwy, FUN = median), y = hwy)) + coord_flip() In this case the output looks the same, but in the aesthetics the x and y are flipped from the previous case. library("ggstance") ggplot(data = mpg) + geom_boxploth(mapping = aes(y = reorder(class, hwy, FUN = median), x = hwy)) One problem with boxplots is that they were developed in an era of much smaller datasets and tend to display a prohibitively large number of “outlying values”. One approach to remedy this problem is the letter value plot. Install the lvplot package, and try using geom_lv() to display the distribution of price vs cut. What do you learn? How do you interpret the plots? The boxes of the letter-value plot correspond to many more quantiles. They are useful for larger datasets because larger datasets can give precise estimates of quantiles beyond the quartiles; in expectation, larger datasets should also have many more outliers. The letter-value plot is described in: Heike Hofmann, Karen Kafadar, and Hadley Wickham. 2011. “Letter-value plots: Boxplots for large data” http://vita.had.co.nz/papers/letter-value-plot.pdf library("lvplot") ggplot(diamonds, aes(x = cut, y = price)) + geom_lv() Compare and contrast geom_violin() with a facetted geom_histogram(), or a coloured geom_freqpoly(). What are the pros and cons of each method? I produce plots for these three methods below. The geom_freqpoly is better for look-up: meaning that given a price, it is easy to tell which cut has the highest density. However, the overlapping lines make it difficult to distinguish how the overall distributions relate to each other. The geom_violin and facetted geom_histogram have similar strengths and weaknesses. It is easy to visually distinguish differences in the overall shape of the distributions (skewness, central values, variance, etc).
However, since we can’t easily compare the vertical values of the distribution, its difficult to look up which category has the highest density for a given price. All of these methods depend on tuning parameters to determine the level of smoothness of the distribution. ggplot(data = diamonds, mapping = aes(x = price, y = ..density..)) + geom_freqpoly(mapping = aes(colour = cut), binwidth = 500) ggplot(data = diamonds, mapping = aes(x = price)) + geom_histogram() + facet_wrap(~ cut, ncol = 1, scales = "free_y") #> `stat_bin()` using `bins = 30`. Pick better value with `binwidth`. ggplot(data = diamonds, mapping = aes(x = cut, y = price)) + geom_violin() + coord_flip() The violin plot was first described in Hintze JL, Nelson RD (1998). “Violin Plots: A Box Plot-Density Trace Synergism.” The American Statistician, 52(2), 181–184 If you have a small dataset, it’s sometimes useful to use geom_jitter() to see the relationship between a continuous and categorical variable. The ggbeeswarm package provides a number of methods similar to geom_jitter(). List them and briefly describe what each one does. There are two methods: geom_quasirandom that produces plots that resemble something between jitter and violin. There are several different methods that determine exactly how the random location of the points is generated. geom_beeswarm creates a shape similar to a violin plot, but by offsetting the points. I’ll use the mpg boxplot example since these methods display individual points, they are better suited for smaller datasets. library("ggbeeswarm") ggplot(data = mpg) + geom_quasirandom(mapping = aes(x = reorder(class, hwy, FUN = median), y = hwy)) ggplot(data = mpg) + geom_quasirandom(mapping = aes(x = reorder(class, hwy, FUN = median), y = hwy), method = "tukey") ggplot(data = mpg) + geom_quasirandom(mapping = aes(x = reorder(class, hwy, FUN = median), y = hwy), method = "tukeyDense") ggplot(data = mpg) + geom_quasirandom(mapping = aes(x = reorder(class, hwy, FUN = median), y = hwy), method = "frowney") ggplot(data = mpg) + geom_quasirandom(mapping = aes(x = reorder(class, hwy, FUN = median), y = hwy), method = "smiley") ggplot(data = mpg) + geom_beeswarm(mapping = aes(x = reorder(class, hwy, FUN = median), y = hwy)) 5.3.2 Two categorical variables How could you rescale the count dataset above to more clearly show the distribution of cut within colour, or colour within cut? TO clearly show the distribution of cut within color, calculate a new variable prop which is the proportion of each cut within a color. This is done using a grouped mutate. diamonds %>% count(color, cut) %>% group_by(color) %>% mutate(prop = n / sum(n)) %>% ggplot(mapping = aes(x = color, y = cut)) + geom_tile(mapping = aes(fill = prop)) + scale_fill_viridis(limits = c(0, 1)) Similarly, to scale by the distribution of color within cut, diamonds %>% count(color, cut) %>% group_by(cut) %>% mutate(prop = n / sum(n)) %>% ggplot(mapping = aes(x = color, y = cut)) + geom_tile(mapping = aes(fill = prop)) + scale_fill_viridis(limits = c(0, 1)) I add limit = c(0, 1) to put the color scale between (0, 1). These are the logical boundaries of proportions. This makes it possible to compare each cell to its actual value, and would improve comparisons across multiple plots. However, it ends up limiting the colors and makes it harder to compare within the dataset. 
However, using the default limits of the minimum and maximum values makes it easier to compare within the dataset the emphasizing relative differences, but harder to compare across datasets. Use geom_tile() together with dplyr to explore how average flight delays vary by destination and month of year. What makes the plot difficult to read? How could you improve it? flights %>% group_by(month, dest) %>% summarise(dep_delay = mean(dep_delay, na.rm = TRUE)) %>% ggplot(aes(x = factor(month), y = dest, fill = dep_delay)) + geom_tile() + labs(x = "Month", y = "Destination", fill = "Departure Delay") There are several things that could be done to improve it, sort destinations by a meaningful quanity (distance, number of flights, average delay) remove missing values better color scheme (viridis) How to treat missing values is difficult. In this case, missing values correspond to airports which don’t have regular flights (at least one flight each month) from NYC. These are likely smaller airports (with higher variance in their average due to fewer observations). library("viridis") flights %>% group_by(month, dest) %>% summarise(dep_delay = mean(dep_delay, na.rm = TRUE)) %>% group_by(dest) %>% filter(n() == 12) %>% ungroup() %>% mutate(dest = fct_reorder(dest, dep_delay)) %>% ggplot(aes(x = factor(month), y = dest, fill = dep_delay)) + geom_tile() + scale_fill_viridis() + labs(x = "Month", y = "Destination", fill = "Departure Delay") Why is it slightly better to use aes(x = color, y = cut) rather than aes(x = cut, y = color) in the example above? It’s usually better to use the categorical variable with a larger number of categories or the longer labels on the y axis. If at all possible, labels should be horizontal because that is easier to read. However, switching the order doesn’t result in overlapping labels. diamonds %>% count(color, cut) %>% ggplot(mapping = aes(y = color, x = cut)) + geom_tile(mapping = aes(fill = n)) Another justification, for switching the order is that the larger numbers are at the top when x = color and y = cut, and that lowers the cognitive burden of interpreting the plot. 5.3.3 Two continuous variables Instead of summarising the conditional distribution with a boxplot, you could use a frequency polygon. What do you need to consider when using cut_width() vs cut_number()? How does that impact a visualisation of the 2d distribution of carat and price? When using cut_width the number in each bin may be unequal. The distribution of carat is right skewed so there are few diamonds in those bins. ggplot(data = diamonds, mapping = aes(x = price, colour = cut_width(carat, 0.3))) + geom_freqpoly() #> `stat_bin()` using `bins = 30`. Pick better value with `binwidth`. Plotting the density instead of counts will make the distributions comparable, although the bins with few observations will still be hard to interpret. ggplot(data = diamonds, mapping = aes(x = price, y = ..density.., colour = cut_width(carat, 0.3))) + geom_freqpoly() #> `stat_bin()` using `bins = 30`. Pick better value with `binwidth`. Plotting the density instead of counts will make the distributions comparable, although the bins with few observations will still be hard to interpret. ggplot(data = diamonds, mapping = aes(x = price, colour = cut_number(carat, 10))) + geom_freqpoly() #> `stat_bin()` using `bins = 30`. Pick better value with `binwidth`. Since there are equal numbers in each bin, the plot looks the same if density is used for the y aesthetic (although the values are on a different scale). 
ggplot(data = diamonds, mapping = aes(x = price, y = ..density.., colour = cut_number(carat, 10))) + geom_freqpoly() #> `stat_bin()` using `bins = 30`. Pick better value with `binwidth`. Visualise the distribution of carat, partitioned by price. With a boxplot, partitioning into 10 bins with the same number of observations: ggplot(diamonds, aes(x = cut_number(price, 10), y = carat)) + geom_boxplot() + coord_flip() + xlab("Price") With a boxplot, partitioning into bins of $2,000 with the width of the box determined by the number of observations. I use boundary = 0 to ensure the first bin goes from $0–$2,000. ggplot(diamonds, aes(x = cut_width(price, 2000, boundary = 0), y = carat)) + geom_boxplot(varwidth = TRUE) + coord_flip() + xlab("Price") How does the price distribution of very large diamonds compare to that of small diamonds? Is it as you expect, or does it surprise you? The distribution of very large diamonds is more variable. I’m not surprised, since I had a very weak prior about diamond prices. Ex post, I would reason that above a certain size other factors such as cut, clarity, and color play more of a role in the price. Combine two of the techniques you’ve learned to visualise the combined distribution of cut, carat, and price. There are lots of options to try; here are a couple. What else did you try? What’s the best way? ggplot(diamonds, aes(x = carat, y = price)) + geom_hex() + facet_wrap(~ cut, ncol = 1) + scale_fill_viridis() #> Loading required package: methods ggplot(diamonds, aes(x = cut_number(carat, 5), y = price, color = cut)) + geom_boxplot() ggplot(diamonds, aes(color = cut_number(carat, 5), y = price, x = cut)) + geom_boxplot() "], +["tibbles.html", "6 Tibbles 6.1 Prerequisites 6.2 Creating Tibbles 6.3 Tibbles vs. data.frame 6.4 Subsetting 6.5 Interacting with older code 6.6 Exercises", " 6 Tibbles 6.1 Prerequisites library("tidyverse") Functions and packages covered in this chapter: package tibble as_tibble, tibble 6.2 Creating Tibbles Why might you want to create non-syntactic variable names? Since variable names are often used in plots (e.g. as axis titles) or as headers in tables, it can be useful to have spaces or other characters that are invalid in R variable names. Those functions provide ways to use display text other than the column name. 6.3 Tibbles vs. data.frame Discuss the definition of a data frame. What is the traditional R data.frame? In general, discuss how this “dialect” of R relates to base R and other R that they will see. Also, need to discuss types of variables. If nycflights13::flights were printed in the console it would be much worse. Just try it, I dare you. as.data.frame(nycflights13::flights) 6.4 Subsetting Note: warnings about partial matching! What is partial matching, and why is it dangerous? 6.5 Interacting with older code Note: not all older functions work with tibbles (an example is Amelia); usually because they rely on quirks in data.frame behavior that tibbles “fix”. Use as.data.frame() to turn a tibble back into a data.frame. This is usually because of [ and the way it inconsistently returns a vector or a data frame. With tibbles, [ always returns a data frame. 6.6 Exercises How can you tell if an object is a tibble? (Hint: try printing mtcars, which is a regular data frame).
mtcars #> mpg cyl disp hp drat wt qsec vs am gear carb #> Mazda RX4 21.0 6 160.0 110 3.90 2.62 16.5 0 1 4 4 #> Mazda RX4 Wag 21.0 6 160.0 110 3.90 2.88 17.0 0 1 4 4 #> Datsun 710 22.8 4 108.0 93 3.85 2.32 18.6 1 1 4 1 #> Hornet 4 Drive 21.4 6 258.0 110 3.08 3.21 19.4 1 0 3 1 #> Hornet Sportabout 18.7 8 360.0 175 3.15 3.44 17.0 0 0 3 2 #> Valiant 18.1 6 225.0 105 2.76 3.46 20.2 1 0 3 1 #> Duster 360 14.3 8 360.0 245 3.21 3.57 15.8 0 0 3 4 #> Merc 240D 24.4 4 146.7 62 3.69 3.19 20.0 1 0 4 2 #> Merc 230 22.8 4 140.8 95 3.92 3.15 22.9 1 0 4 2 #> Merc 280 19.2 6 167.6 123 3.92 3.44 18.3 1 0 4 4 #> Merc 280C 17.8 6 167.6 123 3.92 3.44 18.9 1 0 4 4 #> Merc 450SE 16.4 8 275.8 180 3.07 4.07 17.4 0 0 3 3 #> Merc 450SL 17.3 8 275.8 180 3.07 3.73 17.6 0 0 3 3 #> Merc 450SLC 15.2 8 275.8 180 3.07 3.78 18.0 0 0 3 3 #> Cadillac Fleetwood 10.4 8 472.0 205 2.93 5.25 18.0 0 0 3 4 #> Lincoln Continental 10.4 8 460.0 215 3.00 5.42 17.8 0 0 3 4 #> Chrysler Imperial 14.7 8 440.0 230 3.23 5.34 17.4 0 0 3 4 #> Fiat 128 32.4 4 78.7 66 4.08 2.20 19.5 1 1 4 1 #> Honda Civic 30.4 4 75.7 52 4.93 1.61 18.5 1 1 4 2 #> Toyota Corolla 33.9 4 71.1 65 4.22 1.83 19.9 1 1 4 1 #> Toyota Corona 21.5 4 120.1 97 3.70 2.46 20.0 1 0 3 1 #> Dodge Challenger 15.5 8 318.0 150 2.76 3.52 16.9 0 0 3 2 #> AMC Javelin 15.2 8 304.0 150 3.15 3.44 17.3 0 0 3 2 #> Camaro Z28 13.3 8 350.0 245 3.73 3.84 15.4 0 0 3 4 #> Pontiac Firebird 19.2 8 400.0 175 3.08 3.85 17.1 0 0 3 2 #> Fiat X1-9 27.3 4 79.0 66 4.08 1.94 18.9 1 1 4 1 #> Porsche 914-2 26.0 4 120.3 91 4.43 2.14 16.7 0 1 5 2 #> Lotus Europa 30.4 4 95.1 113 3.77 1.51 16.9 1 1 5 2 #> Ford Pantera L 15.8 8 351.0 264 4.22 3.17 14.5 0 1 5 4 #> Ferrari Dino 19.7 6 145.0 175 3.62 2.77 15.5 0 1 5 6 #> Maserati Bora 15.0 8 301.0 335 3.54 3.57 14.6 0 1 5 8 #> Volvo 142E 21.4 4 121.0 109 4.11 2.78 18.6 1 1 4 2 class(mtcars) #> [1] "data.frame" class(as_tibble(mtcars)) #> [1] "tbl_df" "tbl" "data.frame" Tibbles will only print out a limited number of rows and show the class on top of each column. Addtionally, tibbles have class "tbl_df" and "tbl_" in addition to "data.frame". Compare and contrast the following operations on a data.frame and equivalent tibble. What is different? Why might the default data frame behaviours cause you frustration? df <- data.frame(abc = 1, xyz = "a") df$x #> [1] a #> Levels: a df[, "xyz"] #> [1] a #> Levels: a df[, c("abc", "xyz")] #> abc xyz #> 1 1 a tbl <- as_tibble(df) tbl$x #> Warning: Unknown column 'x' #> NULL tbl[, "xyz"] #> # A tibble: 1 × 1 #> xyz #> <fctr> #> 1 a tbl[, c("abc", "xyz")] #> # A tibble: 1 × 2 #> abc xyz #> <dbl> <fctr> #> 1 1 a Using $ a data.frame will partially complete the column. So even though we wrote df$x it returned df$xyz. This saves a few keystrokes, but can result in accidentally using a different variable than you thought you were using. With data.frames, with [ the type of object that is returned differs on the number of columns. If it is one column, it won’t return a data.frame, but instead will return a vector. With more than one column, then it will return a data.frame. This is fine if you know what you are passing in, but suppose you did df[ , vars] where vars was a variable. Then you what that code does depends on length(vars) and you’d have to write code to account for those situations or risk bugs. If you have the name of a variable stored in an object, e.g. var <- "mpg", how can you extract the reference variable from a tibble? You can use the double bracket, like df[[var]]. 
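A quick demonstration, reusing the tbl defined above (the name var is just illustrative):

```r
var <- "xyz"
tbl[[var]]  # extracts the column whose name is stored in var
#> [1] a
#> Levels: a
```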
You cannot use the dollar sign, becuase df$var would look for a column named var. Practice referring to non-syntactic names in the following data frame by: Extracting the variable called 1. Plotting a scatterplot of 1 vs 2. Creating a new column called 3 which is 2 divided by 1. Renaming the columns to one, two and three. annoying <- tibble( `1` = 1:10, `2` = `1` * 2 + rnorm(length(`1`)) ) Extract the variable called 1: annoying[["1"]] #> [1] 1 2 3 4 5 6 7 8 9 10 or annoying$`1` #> [1] 1 2 3 4 5 6 7 8 9 10 A scatterplot of 1 vs. 2: ggplot(annoying, aes(x = `1`, y = `2`)) + geom_point() A new column 3 with is 2 divided by 1: annoying[["3"]] <- annoying$`2` / annoying$`1` or annoying[["3"]] <- annoying[["2"]] / annoying[["1"]] Renaming the columns to one, two, and three: annoying <- rename(annoying, one = `1`, two = `2`, three = `3`) glimpse(annoying) #> Observations: 10 #> Variables: 3 #> $ one <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 #> $ two <dbl> 0.60, 4.26, 3.56, 7.99, 10.62, 13.15, 12.18, 15.75, 17.7... #> $ three <dbl> 0.60, 2.13, 1.19, 2.00, 2.12, 2.19, 1.74, 1.97, 1.97, 1.97 What does tibble::enframe() do? When might you use it? It converts named vectors to a data frame with names and values ?tibble::enframe enframe(c(a = 1, b = 2, c = 3)) #> # A tibble: 3 × 2 #> name value #> <chr> <dbl> #> 1 a 1 #> 2 b 2 #> 3 c 3 What option controls how many additional column names are printed at the footer of a tibble? The print function for tibbles is in print.tbl_df: ?print.tbl_df The option n_extra determines the number of extra columns to print information for. "], +["data-import.html", "7 Data Import 7.1 Introduction 7.2 Getting started 7.3 Parsing a vector 7.4 Other Types of Data", " 7 Data Import 7.1 Introduction Functions and packages used: readr, feather, haven, rio read_csv parse_* type_convert save, load readRDS, writeRDS write_csv, write_tsv, write_feather read_lines, read_file library("tidyverse") 7.2 Getting started Note: read_log() is important for web data, but likely to be irrelevant to most political science research TODO fill in the links and add any missing 7.2.1 Exercises What function would you use to read a file where fields were separated with “|”? I’d use read_delim with delim="|": read_delim(file, delim = "|") Apart from file, skip, and comment, what other arguments do read_csv() and read_tsv() have in common? They have the following arguments in common: union(names(formals(read_csv)), names(formals(read_tsv))) #> [1] "file" "col_names" "col_types" "locale" "na" #> [6] "quoted_na" "comment" "trim_ws" "skip" "n_max" #> [11] "guess_max" "progress" col_names and col_types are used to specify the column names and how to parse the columns locale is important for determining things like the enecoding and whether “.” or “,” is used as a decimal mark. na and quoted_na control which strings are treated as missing values when parsing vectors trim_ws trims whitespace before and after cells before parsing n_max sets how many rows to read guess_max sets how many rows to use when guessing the column type progress determines whether a progress bar is shown. What are the most important arguments to read_fwf()? The most important argument to read_fwf which reads “fixed-width formats”, is col_positions which tells the function where data columns begin and end. Sometimes strings in a CSV file contain commas. To prevent them from causing problems they need to be surrounded by a quoting character, like " or '. 
Sometimes strings in a CSV file contain commas. To prevent them from causing problems they need to be surrounded by a quoting character, like " or '. By convention, read_csv() assumes that the quoting character will be ", and if you want to change it you’ll need to use read_delim() instead. What arguments do you need to specify to read the following text into a data frame?
"x,y\n1,'a,b'"
x <- "x,y\n1,'a,b'"
read_delim(x, ",", quote = "'")
#> # A tibble: 1 × 2
#> x y
#> <int> <chr>
#> 1 1 a,b
Identify what is wrong with each of the following inline CSV files. What happens when you run the code?
read_csv("a,b\n1,2,3\n4,5,6")
#> Warning: 2 parsing failures.
#> row col expected actual
#> 1 -- 2 columns 3 columns
#> 2 -- 2 columns 3 columns
#> # A tibble: 2 × 2
#> a b
#> <int> <int>
#> 1 1 2
#> 2 4 5
Only two columns are specified in the header, “a” and “b”, but the rows have three columns, so the last column is dropped.
read_csv("a,b,c\n1,2\n1,2,3,4")
#> Warning: 2 parsing failures.
#> row col expected actual
#> 1 -- 3 columns 2 columns
#> 2 -- 3 columns 4 columns
#> # A tibble: 2 × 3
#> a b c
#> <int> <int> <int>
#> 1 1 2 NA
#> 2 1 2 3
The numbers of columns in the data do not match the number of columns in the header (three). In row one, there are only two values, so column c is set to missing. In row two, there is an extra value, and that value is dropped.
read_csv("a,b\n\"1")
#> Warning: 2 parsing failures.
#> row col expected actual
#> 1 a closing quote at end of file
#> 1 -- 2 columns 1 columns
#> # A tibble: 1 × 2
#> a b
#> <int> <chr>
#> 1 1 <NA>
It’s not clear what the intent was here. The opening quote before the 1 is never closed, so it is dropped, and a is parsed as an integer.
read_csv("a,b\n1,2\na,b")
#> # A tibble: 2 × 2
#> a b
#> <chr> <chr>
#> 1 1 2
#> 2 a b
Both columns “a” and “b” are treated as character vectors since they contain non-numeric strings. This may have been intentional, or the author may have intended the values of the columns to be “1,2” and “a,b”.
read_csv("a;b\n1;3")
#> # A tibble: 1 × 1
#> `a;b`
#> <chr>
#> 1 1;3
The values are separated by “;” rather than “,”. Use read_csv2 instead:
read_csv2("a;b\n1;3")
#> # A tibble: 1 × 2
#> a b
#> <int> <int>
#> 1 1 3
7.3 Parsing a vector
Notes
This is detailed, but these details can make your life hell. Skim now, but be aware that what should be simple actually is not.
In data analysis, …% is data cleaning, …% is modeling, and the rest is character encoding issues — Jeffrey B. Arnold (@jrnld) July 31, 2016
This Computerphile video on Unicode is great: Characters, Symbols and the Unicode Miracle - Computerphile
Note that these issues are real. Reusing one of Chris Adolph’s csv files from an earlier version of this course gave me problems, resulting in me filing this bug report.
The suggested reading is very useful: http://kunststube.net/encoding/ This becomes especially useful when you take “Text as Data”.
charToRaw("Jeff")
#> [1] 4a 65 66 66
class(charToRaw("Jeff"))
#> [1] "raw"
7.3.1 Exercises
What are the most important arguments to locale()? The locale broadly controls the following: date and time formats (date_names, date_format, and time_format); the time zone (tz); numbers (decimal_mark, grouping_mark); and the encoding (encoding).
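As a quick, concrete illustration of why these locale settings matter (the example number is our own), readr's parse_number respects the decimal and grouping marks:
parse_number("1.234,56",
             locale = locale(decimal_mark = ",", grouping_mark = "."))
#> [1] 1234.56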
What happens if you try to set decimal_mark and grouping_mark to the same character? What happens to the default value of grouping_mark when you set decimal_mark to “,”? What happens to the default value of decimal_mark when you set the grouping_mark to “.”? If the decimal and grouping marks are set to the same character, locale throws an error:
locale(decimal_mark = ".", grouping_mark = ".")
#> Error: `decimal_mark` and `grouping_mark` must be different
If decimal_mark is set to the comma ",", then the grouping mark is set to the period ".":
locale(decimal_mark = ",")
#> <locale>
#> Numbers: 123.456,78
#> Formats: %AD / %AT
#> Timezone: UTC
#> Encoding: UTF-8
#> <date_names>
#> Days: Sunday (Sun), Monday (Mon), Tuesday (Tue), Wednesday (Wed),
#> Thursday (Thu), Friday (Fri), Saturday (Sat)
#> Months: January (Jan), February (Feb), March (Mar), April (Apr), May
#> (May), June (Jun), July (Jul), August (Aug), September
#> (Sep), October (Oct), November (Nov), December (Dec)
#> AM/PM: AM/PM
If the grouping mark is set to a period, then the decimal mark is set to a comma:
locale(grouping_mark = ".")
#> <locale>
#> Numbers: 123.456,78
#> Formats: %AD / %AT
#> Timezone: UTC
#> Encoding: UTF-8
#> <date_names>
#> Days: Sunday (Sun), Monday (Mon), Tuesday (Tue), Wednesday (Wed),
#> Thursday (Thu), Friday (Fri), Saturday (Sat)
#> Months: January (Jan), February (Feb), March (Mar), April (Apr), May
#> (May), June (Jun), July (Jul), August (Aug), September
#> (Sep), October (Oct), November (Nov), December (Dec)
#> AM/PM: AM/PM
I didn’t discuss the date_format and time_format options to locale(). What do they do? Construct an example that shows when they might be useful. They provide default date and time formats. These are useful because dates can include language-specific weekday and month names, and different conventions for specifying AM/PM. The readr vignette discusses using them to parse dates:
locale()
#> <locale>
#> Numbers: 123,456.78
#> Formats: %AD / %AT
#> Timezone: UTC
#> Encoding: UTF-8
#> <date_names>
#> Days: Sunday (Sun), Monday (Mon), Tuesday (Tue), Wednesday (Wed),
#> Thursday (Thu), Friday (Fri), Saturday (Sat)
#> Months: January (Jan), February (Feb), March (Mar), April (Apr), May
#> (May), June (Jun), July (Jul), August (Aug), September
#> (Sep), October (Oct), November (Nov), December (Dec)
#> AM/PM: AM/PM
Examples from the readr vignette of parsing French dates:
parse_date("1 janvier 2015", "%d %B %Y", locale = locale("fr"))
#> [1] "2015-01-01"
parse_date("14 oct. 1979", "%d %b %Y", locale = locale("fr"))
#> [1] "1979-10-14"
Apparently the time format is not used for anything, but the date format is used for guessing column types.
If you live outside the US, create a new locale object that encapsulates the settings for the types of file you read most commonly.
?locale
What’s the difference between read_csv() and read_csv2()? The delimiter. The function read_csv uses a comma, while read_csv2 uses a semicolon (;). Using a semicolon is useful when commas are used as the decimal point (as in Europe).
What are the most common encodings used in Europe? What are the most common encodings used in Asia? Do some googling to find out. UTF-8 is standard now, and ASCII has been around forever. For the European languages, there are separate encodings for Romance languages and Eastern European languages using Latin script, Cyrillic, Greek, Hebrew, Turkish: usually with separate ISO and Windows encoding standards. There is also Mac OS Roman. Arabic and Vietnamese have ISO and Windows standards.
The other major Asian scripts have their own:
Japanese: JIS X 0208, Shift JIS, ISO-2022-JP
Chinese: GB 2312, GBK, GB 18030
Korean: KS X 1001, EUC-KR, ISO-2022-KR
The list in the documentation for stringi::stri_enc_detect is pretty good since it supports the most common encodings:
Western European Latin script languages: ISO-8859-1, Windows-1252 (also written CP-1252)
Eastern European Latin script languages: ISO-8859-2, Windows-1250
Greek: ISO-8859-7
Turkish: ISO-8859-9, Windows-1254
Hebrew: ISO-8859-8, IBM424, Windows-1255
Russian: Windows-1251
Japanese: Shift JIS, ISO-2022-JP, EUC-JP
Korean: ISO-2022-KR, EUC-KR
Chinese: GB18030, ISO-2022-CN (Simplified), Big5 (Traditional)
Arabic: ISO-8859-6, IBM420, Windows-1256
For more information: https://en.wikipedia.org/wiki/Character_encoding has a good list; http://stackoverflow.com/questions/8509339/what-is-the-most-common-encoding-of-each-language; http://kunststube.net/encoding/
Some of the more useful programs for this: in R, see readr::guess_encoding and the stringi package with stri_enc_detect; iconv: https://en.wikipedia.org/wiki/Iconv; chardet: https://github.com/chardet/chardet (Python)
Generate the correct format string to parse each of the following dates and times:
7.4 Other Types of Data
NOTES Expand on what’s in this section: The rio package is very useful for loading different types of data files. Other useful functions and packages not mentioned here:
Stata: haven, read_dta. While the foreign package in R reads Stata files, it cannot read files created by the most recent version of Stata (> 13).
SPSS: haven::read_spss
SAS: haven::read_sas
XLS: readxl::read_excel
JSON: the jsonlite package. However, often there are APIs and clients which make this easier, e.g. pollstR, which returns data from the Huffington Post Pollster API.
XML: xml2
HTML: rvest
Databases: DBI and backends
PDF: This is really a different thing, as you are extracting data from an unstructured form. It also depends on whether the PDF text is actually an image, as from a scan, in which case you need to use OCR to first identify words. tabulizer extracts tables from PDF documents; pdftools extracts text from PDF documents. Also see general text analysis packages like tm, quanteda, etc., which often have functions to assist with getting text from PDFs.
Links to Jenny Bryan’s tutorials: purrr-tutorial worked examples
Discussing csvy as an example of a CSV file with metadata
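The readers mentioned above share a similar one-call interface; a sketch with hypothetical file names (none of these files are part of this project):
haven::read_dta("survey.dta")       # Stata, including files from Stata 14+
haven::read_spss("survey.sav")      # SPSS
haven::read_sas("survey.sas7bdat")  # SAS
readxl::read_excel("budget.xlsx")   # Excel (xls/xlsx)
jsonlite::fromJSON("results.json")  # JSON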
8 Tidy Data
8.1 Introduction
Functions used in this chapter: spread, gather, separate, unite, complete, fill.
library(tidyverse)
8.2 Tidy Data
NOTES Add Tidy Data to reading. Use the COW war dataset as an example of non-tidy data; also WDI data. Replication datasets are often non-tidy. Why? See this post by Jeff Leek.
The Rules: Each variable must have its own column. Each observation must have its own row. Each value must have its own cell. Or, even more simply: put each dataset in a tibble; put each variable in a column.
These seem obvious at first, so we need to see examples of data that do not follow tidy principles, and what happens then. Some nuances: the definitions of variable, observation, and value are not always clear, and how you store and arrange the data can depend on how you aim to use it. Generally, aim for storing the data in a tidy format that ensures minimal errors. When you model it, you can transform the data later. See non-tidy data.
It is easier to work with variables in columns because of mutate and summary functions. It will also work better with tidyverse functions: e.g. using group_by to group and summarize, or facet_* and aesthetics in ggplot2. The tidy data ideas are adapted from database normalization, but simplified and adapted to the general uses of practicing data scientists.
8.2.1 Exercises
Using prose, describe how the variables and observations are organised in each of the sample tables. In table1 each row is a (country, year) with variables cases and population.
table1
#> # A tibble: 6 × 4
#> country year cases population
#> <chr> <int> <int> <int>
#> 1 Afghanistan 1999 745 19987071
#> 2 Afghanistan 2000 2666 20595360
#> 3 Brazil 1999 37737 172006362
#> 4 Brazil 2000 80488 174504898
#> 5 China 1999 212258 1272915272
#> 6 China 2000 213766 1280428583
In table2, each row is a country, year, variable (“cases”, “population”) combination, and there is a count variable with the numeric value of the variable.
table2
#> # A tibble: 12 × 4
#> country year type count
#> <chr> <int> <chr> <int>
#> 1 Afghanistan 1999 cases 745
#> 2 Afghanistan 1999 population 19987071
#> 3 Afghanistan 2000 cases 2666
#> 4 Afghanistan 2000 population 20595360
#> 5 Brazil 1999 cases 37737
#> 6 Brazil 1999 population 172006362
#> # ... with 6 more rows
In table3, each row is a (country, year) combination with the column rate having the rate of cases to population as a character string in the format "cases/population".
table3
#> # A tibble: 6 × 3
#> country year rate
#> * <chr> <int> <chr>
#> 1 Afghanistan 1999 745/19987071
#> 2 Afghanistan 2000 2666/20595360
#> 3 Brazil 1999 37737/172006362
#> 4 Brazil 2000 80488/174504898
#> 5 China 1999 212258/1272915272
#> 6 China 2000 213766/1280428583
Table 4 is split into two tables, one table for each variable: table4a is the table for cases, while table4b is the table for population. Within each table, each row is a country, each column is a year, and the cells are the value of the variable for the table.
table4a
#> # A tibble: 3 × 3
#> country `1999` `2000`
#> * <chr> <int> <int>
#> 1 Afghanistan 745 2666
#> 2 Brazil 37737 80488
#> 3 China 212258 213766
table4b
#> # A tibble: 3 × 3
#> country `1999` `2000`
#> * <chr> <int> <int>
#> 1 Afghanistan 19987071 20595360
#> 2 Brazil 172006362 174504898
#> 3 China 1272915272 1280428583
Compute the rate for table2, and table4a + table4b. You will need to perform four operations: Extract the number of TB cases per country per year. Extract the matching population per country per year. Divide cases by population, and multiply by 10000. Store back in the appropriate place. Which representation is easiest to work with? Which is hardest? Why?
Without using the joins covered in Ch 12:
tb2_cases <- filter(table2, type == "cases")[["count"]]
tb2_country <- filter(table2, type == "cases")[["country"]]
tb2_year <- filter(table2, type == "cases")[["year"]]
tb2_population <- filter(table2, type == "population")[["count"]]
table2_clean <- tibble(country = tb2_country,
                       year = tb2_year,
                       rate = tb2_cases / tb2_population)
table2_clean
#> # A tibble: 6 × 3
#> country year rate
#> <chr> <int> <dbl>
#> 1 Afghanistan 1999 3.73e-05
#> 2 Afghanistan 2000 1.29e-04
#> 3 Brazil 1999 2.19e-04
#> 4 Brazil 2000 4.61e-04
#> 5 China 1999 1.67e-04
#> 6 China 2000 1.67e-04
Note that this assumes that all observations are sorted so that each country, year will have the observation for cases followed by population.
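For comparison, once spread() (introduced just below) is available, the table2 version collapses to a short pipeline; multiplying by 10000 follows the exercise statement:
spread(table2, type, count) %>%
  mutate(rate = cases / population * 10000)
For table4a and table4b, the same computation has to be done column by column: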
tibble(country = table4a[["country"]],
       `1999` = table4a[["1999"]] / table4b[["1999"]],
       `2000` = table4a[["2000"]] / table4b[["2000"]])
#> # A tibble: 3 × 3
#> country `1999` `2000`
#> <chr> <dbl> <dbl>
#> 1 Afghanistan 3.73e-05 1.29e-04
#> 2 Brazil 2.19e-04 4.61e-04
#> 3 China 1.67e-04 1.67e-04
or
tibble(country = rep(table4a[["country"]], 2),
       year = rep(c(1999, 2000), each = nrow(table4a)),
       `rate` = c(table4a[["1999"]] / table4b[["1999"]],
                  table4a[["2000"]] / table4b[["2000"]]))
#> # A tibble: 6 × 3
#> country year rate
#> <chr> <dbl> <dbl>
#> 1 Afghanistan 1999 3.73e-05
#> 2 Brazil 1999 2.19e-04
#> 3 China 1999 1.67e-04
#> 4 Afghanistan 2000 1.29e-04
#> 5 Brazil 2000 4.61e-04
#> 6 China 2000 1.67e-04
Recreate the plot showing change in cases over time using table2 instead of table1. What do you need to do first? First, I needed to filter the tibble to only include those rows that represented the “cases” variable.
table2 %>%
  filter(type == "cases") %>%
  ggplot(aes(year, count)) +
  geom_line(aes(group = country), colour = "grey50") +
  geom_point(aes(colour = country))
8.3 Spreading and Gathering
tidy4a <- table4a %>%
  gather(`1999`, `2000`, key = "year", value = "cases")
tidy4b <- table4b %>%
  gather(`1999`, `2000`, key = "year", value = "population")
left_join(tidy4a, tidy4b)
#> Joining, by = c("country", "year")
#> # A tibble: 6 × 4
#> country year cases population
#> <chr> <chr> <int> <int>
#> 1 Afghanistan 1999 745 19987071
#> 2 Brazil 1999 37737 172006362
#> 3 China 1999 212258 1272915272
#> 4 Afghanistan 2000 2666 20595360
#> 5 Brazil 2000 80488 174504898
#> 6 China 2000 213766 1280428583
spread(table2, key = type, value = count)
#> # A tibble: 6 × 4
#> country year cases population
#> * <chr> <int> <int> <int>
#> 1 Afghanistan 1999 745 19987071
#> 2 Afghanistan 2000 2666 20595360
#> 3 Brazil 1999 37737 172006362
#> 4 Brazil 2000 80488 174504898
#> 5 China 1999 212258 1272915272
#> 6 China 2000 213766 1280428583
8.3.1 Exercises
Why are gather() and spread() not perfectly symmetrical? Carefully consider the following example:
stocks <- tibble(
  year = c(2015, 2015, 2016, 2016),
  half = c( 1, 2, 1, 2),
  return = c(1.88, 0.59, 0.92, 0.17)
)
stocks %>%
  spread(year, return) %>%
  gather("year", "return", `2015`:`2016`)
#> # A tibble: 4 × 3
#> half year return
#> <dbl> <chr> <dbl>
#> 1 1 2015 1.88
#> 2 2 2015 0.59
#> 3 1 2016 0.92
#> 4 2 2016 0.17
The functions spread and gather are not perfectly symmetrical because column type information is not transferred between them. In the original table the column year was numeric, but after the spread-gather cycle it is character, because with gather, variable names are always converted to a character vector. The convert argument tries to convert character vectors to the appropriate type. In the background this uses the type.convert function.
stocks %>%
  spread(year, return) %>%
  gather("year", "return", `2015`:`2016`, convert = TRUE)
#> # A tibble: 4 × 3
#> half year return
#> <dbl> <int> <dbl>
#> 1 1 2015 1.88
#> 2 2 2015 0.59
#> 3 1 2016 0.92
#> 4 2 2016 0.17
Why does this code fail?
table4a %>% gather(1999, 2000, key = "year", value = "cases")
#> Error in eval(expr, envir, enclos): Position must be between 0 and n
The code fails because the column names 1999 and 2000 are not standard and thus need to be quoted. The tidyverse functions will interpret 1999 and 2000 without quotes as looking for the 1999th and 2000th column of the data frame.
This will work:
table4a %>% gather(`1999`, `2000`, key = "year", value = "cases")
#> # A tibble: 6 × 3
#> country year cases
#> <chr> <chr> <int>
#> 1 Afghanistan 1999 745
#> 2 Brazil 1999 37737
#> 3 China 1999 212258
#> 4 Afghanistan 2000 2666
#> 5 Brazil 2000 80488
#> 6 China 2000 213766
Why does spreading this tibble fail? How could you add a new column to fix the problem?
people <- tribble(
  ~name, ~key, ~value,
  #-----------------|--------|------
  "Phillip Woods", "age", 45,
  "Phillip Woods", "height", 186,
  "Phillip Woods", "age", 50,
  "Jessica Cordero", "age", 37,
  "Jessica Cordero", "height", 156
)
glimpse(people)
#> Observations: 5
#> Variables: 3
#> $ name <chr> "Phillip Woods", "Phillip Woods", "Phillip Woods", "Jess...
#> $ key <chr> "age", "height", "age", "age", "height"
#> $ value <dbl> 45, 186, 50, 37, 156
spread(people, key, value)
#> Error: Duplicate identifiers for rows (1, 3)
Spreading the data frame fails because there are two rows with “age” for “Phillip Woods”. We would need to add another column with an index for which observation of that variable it is:
people <- tribble(
  ~name, ~key, ~value, ~obs,
  #-----------------|--------|------|------
  "Phillip Woods", "age", 45, 1,
  "Phillip Woods", "height", 186, 1,
  "Phillip Woods", "age", 50, 2,
  "Jessica Cordero", "age", 37, 1,
  "Jessica Cordero", "height", 156, 1
)
spread(people, key, value)
#> # A tibble: 3 × 4
#> name obs age height
#> * <chr> <dbl> <dbl> <dbl>
#> 1 Jessica Cordero 1 37 156
#> 2 Phillip Woods 1 45 186
#> 3 Phillip Woods 2 50 NA
Tidy the simple tibble below. Do you need to spread or gather it? What are the variables?
preg <- tribble(
  ~pregnant, ~male, ~female,
  "yes", NA, 10,
  "no", 20, 12
)
You need to gather it. The variables are: pregnant: logical (“yes”, “no”); female: logical; count: integer.
gather(preg, sex, count, male, female) %>%
  mutate(pregnant = pregnant == "yes",
         female = sex == "female") %>%
  select(-sex)
#> # A tibble: 4 × 3
#> pregnant count female
#> <lgl> <dbl> <lgl>
#> 1 TRUE NA FALSE
#> 2 FALSE 20 FALSE
#> 3 TRUE 10 TRUE
#> 4 FALSE 12 TRUE
Converting pregnant and female from character vectors to logical was not necessary to tidy it, but it makes it easier to work with.
8.4 Separating and Uniting
table3 %>%
  separate(rate, into = c("cases", "population"), sep = "/", convert = TRUE) %>%
  separate(year, into = c("century", "year"), sep = 2)
#> # A tibble: 6 × 5
#> country century year cases population
#> * <chr> <chr> <chr> <int> <int>
#> 1 Afghanistan 19 99 745 19987071
#> 2 Afghanistan 20 00 2666 20595360
#> 3 Brazil 19 99 37737 172006362
#> 4 Brazil 20 00 80488 174504898
#> 5 China 19 99 212258 1272915272
#> 6 China 20 00 213766 1280428583
table5 %>%
  unite(new, century, year, sep = "")
#> # A tibble: 6 × 3
#> country new rate
#> * <chr> <chr> <chr>
#> 1 Afghanistan 1999 745/19987071
#> 2 Afghanistan 2000 2666/20595360
#> 3 Brazil 1999 37737/172006362
#> 4 Brazil 2000 80488/174504898
#> 5 China 1999 212258/1272915272
#> 6 China 2000 213766/1280428583
8.4.1 Exercises
What do the extra and fill arguments do in separate()? Experiment with the various options for the following two toy datasets.
tibble(x = c("a,b,c", "d,e,f,g", "h,i,j")) %>%
  separate(x, c("one", "two", "three"))
#> Warning: Too many values at 1 locations: 2
#> # A tibble: 3 × 3
#> one two three
#> * <chr> <chr> <chr>
#> 1 a b c
#> 2 d e f
#> 3 h i j
tibble(x = c("a,b,c", "d,e", "f,g,i")) %>%
  separate(x, c("one", "two", "three"))
#> Warning: Too few values at 1 locations: 2
#> # A tibble: 3 × 3
#> one two three
#> * <chr> <chr> <chr>
#> 1 a b c
#> 2 d e <NA>
#> 3 f g i
?separate
The extra argument tells separate what to do if there are too many pieces, and the fill argument what to do if there aren’t enough.
tibble(x = c("a,b,c", "d,e,f,g", "h,i,j")) %>%
  separate(x, c("one", "two", "three"))
#> Warning: Too many values at 1 locations: 2
#> # A tibble: 3 × 3
#> one two three
#> * <chr> <chr> <chr>
#> 1 a b c
#> 2 d e f
#> 3 h i j
By default separate drops the extra values with a warning.
tibble(x = c("a,b,c", "d,e,f,g", "h,i,j")) %>%
  separate(x, c("one", "two", "three"), extra = "drop")
#> # A tibble: 3 × 3
#> one two three
#> * <chr> <chr> <chr>
#> 1 a b c
#> 2 d e f
#> 3 h i j
This produces the same result as above, dropping extra values, but without the warning.
tibble(x = c("a,b,c", "d,e,f,g", "h,i,j")) %>%
  separate(x, c("one", "two", "three"), extra = "merge")
#> # A tibble: 3 × 3
#> one two three
#> * <chr> <chr> <chr>
#> 1 a b c
#> 2 d e f,g
#> 3 h i j
In this, the extra values are not split, so “f,g” appears in column three.
In the next dataset, one of the entries, “d,e”, has too few pieces. The default behaviour of fill is similar to that of extra: it fills with missing values and emits a warning. Here, row 2 of column “three” is NA.
tibble(x = c("a,b,c", "d,e", "f,g,i")) %>%
  separate(x, c("one", "two", "three"))
#> Warning: Too few values at 1 locations: 2
#> # A tibble: 3 × 3
#> one two three
#> * <chr> <chr> <chr>
#> 1 a b c
#> 2 d e <NA>
#> 3 f g i
An alternative option for fill is "right", which fills with missing values from the right, but without a warning:
tibble(x = c("a,b,c", "d,e", "f,g,i")) %>%
  separate(x, c("one", "two", "three"), fill = "right")
#> # A tibble: 3 × 3
#> one two three
#> * <chr> <chr> <chr>
#> 1 a b c
#> 2 d e <NA>
#> 3 f g i
The option fill = "left" also fills with missing values without a warning, but this time from the left side. Now, column “one” of row 2 will be missing, and the other values in that row are shifted over.
tibble(x = c("a,b,c", "d,e", "f,g,i")) %>%
  separate(x, c("one", "two", "three"), fill = "left")
#> # A tibble: 3 × 3
#> one two three
#> * <chr> <chr> <chr>
#> 1 a b c
#> 2 <NA> d e
#> 3 f g i
Both unite() and separate() have a remove argument. What does it do? Why would you set it to FALSE? You would set it to FALSE if you want to create a new variable, but keep the old one.
Compare and contrast separate() and extract(). Why are there three variations of separation (by position, by separator, and with groups), but only one unite? The function extract uses a regular expression to find groups and split into columns. In unite it is unambiguous, since it is many columns to one, and once the columns are specified, there is only one way to do it; the only choice is the sep. In separate, it is one to many, and there are multiple ways to split the character string.
8.5 Missing Values
8.5.1 Exercises
Compare and contrast the fill arguments to spread() and complete().
?spread
?complete
In spread, the fill argument explicitly sets the value to replace NAs. In complete, the fill argument also sets a value to replace NAs, but it is a named list, allowing for different values for different variables. Also, in both cases it replaces both implicit and explicit missing values.
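A small sketch of the difference on made-up data: spread() takes a single fill value for every gap, while complete() takes a named list with one value per variable:
df <- tibble(g = c("a", "a", "b"), key = c("x", "y", "x"), val = 1:3)
spread(df, key, val, fill = 0)              # one value fills every gap
complete(df, g, key, fill = list(val = 0))  # per-variable fill values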
What does the direction argument to fill() do? With fill, it determines whether NA values should be replaced by the previous non-missing value ("down") or the next non-missing value ("up").
8.6 Case Study
who1 <- who %>%
  gather(new_sp_m014:newrel_f65, key = "key", value = "cases", na.rm = TRUE)
glimpse(who1)
#> Observations: 76,046
#> Variables: 6
#> $ country <chr> "Afghanistan", "Afghanistan", "Afghanistan", "Afghanis...
#> $ iso2 <chr> "AF", "AF", "AF", "AF", "AF", "AF", "AF", "AF", "AF", ...
#> $ iso3 <chr> "AFG", "AFG", "AFG", "AFG", "AFG", "AFG", "AFG", "AFG"...
#> $ year <int> 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, ...
#> $ key <chr> "new_sp_m014", "new_sp_m014", "new_sp_m014", "new_sp_m...
#> $ cases <int> 0, 30, 8, 52, 129, 90, 127, 139, 151, 193, 186, 187, 2...
who2 <- who1 %>%
  mutate(key = stringr::str_replace(key, "newrel", "new_rel"))
who3 <- who2 %>%
  separate(key, c("new", "type", "sexage"), sep = "_")
who3
#> # A tibble: 76,046 × 8
#> country iso2 iso3 year new type sexage cases
#> * <chr> <chr> <chr> <int> <chr> <chr> <chr> <int>
#> 1 Afghanistan AF AFG 1997 new sp m014 0
#> 2 Afghanistan AF AFG 1998 new sp m014 30
#> 3 Afghanistan AF AFG 1999 new sp m014 8
#> 4 Afghanistan AF AFG 2000 new sp m014 52
#> 5 Afghanistan AF AFG 2001 new sp m014 129
#> 6 Afghanistan AF AFG 2002 new sp m014 90
#> # ... with 7.604e+04 more rows
who3 %>% count(new)
#> # A tibble: 1 × 2
#> new n
#> <chr> <int>
#> 1 new 76046
who4 <- who3 %>%
  select(-new, -iso2, -iso3)
who5 <- who4 %>%
  separate(sexage, c("sex", "age"), sep = 1)
who5
#> # A tibble: 76,046 × 6
#> country year type sex age cases
#> * <chr> <int> <chr> <chr> <chr> <int>
#> 1 Afghanistan 1997 sp m 014 0
#> 2 Afghanistan 1998 sp m 014 30
#> 3 Afghanistan 1999 sp m 014 8
#> 4 Afghanistan 2000 sp m 014 52
#> 5 Afghanistan 2001 sp m 014 129
#> 6 Afghanistan 2002 sp m 014 90
#> # ... with 7.604e+04 more rows
8.6.1 Exercises
In this case study I set na.rm = TRUE just to make it easier to check that we had the correct values. Is this reasonable? Think about how missing values are represented in this dataset. Are there implicit missing values? What’s the difference between an NA and zero? Perhaps? I would need to know more about the data generation process. There are zeros in the data, which means they may explicitly be indicating no cases.
who1 %>%
  filter(cases == 0) %>%
  nrow()
#> [1] 11080
The check below finds no (country, year) that mixes missing and non-missing values: a country either has all its values in a year non-missing, if the WHO collected data for that country, or all its values are missing. So it is okay to treat explicitly and implicitly missing values the same, and we don’t lose any information by dropping them.
gather(who, new_sp_m014:newrel_f65, key = "key", value = "cases") %>%
  group_by(country, year) %>%
  mutate(missing = is.na(cases)) %>%
  select(country, year, missing) %>%
  distinct() %>%
  group_by(country, year) %>%
  filter(n() > 1)
#> Source: local data frame [0 x 2]
#> Groups: country, year [0]
#>
#> # ... with 2 variables: country <chr>, year <int>
What happens if you neglect the mutate() step? (mutate(key = stringr::str_replace(key, "newrel", "new_rel"))) separate emits the warning “too few values”, and if we check the rows for keys beginning with "newrel_", we see that sexage is missing, and type = m014.
who3a <- who1 %>%
  separate(key, c("new", "type", "sexage"), sep = "_")
#> Warning: Too few values at 2580 locations: 73467, 73468, 73469, 73470,
#> 73471, 73472, 73473, 73474, 73475, 73476, 73477, 73478, 73479, 73480,
#> 73481, 73482, 73483, 73484, 73485, 73486, ...
filter(who3a, new == "newrel") %>% head()
#> # A tibble: 6 × 8
#> country iso2 iso3 year new type sexage cases
#> <chr> <chr> <chr> <int> <chr> <chr> <chr> <int>
#> 1 Afghanistan AF AFG 2013 newrel m014 <NA> 1705
#> 2 Albania AL ALB 2013 newrel m014 <NA> 14
#> 3 Algeria DZ DZA 2013 newrel m014 <NA> 25
#> 4 Andorra AD AND 2013 newrel m014 <NA> 0
#> 5 Angola AO AGO 2013 newrel m014 <NA> 486
#> 6 Anguilla AI AIA 2013 newrel m014 <NA> 0
I claimed that iso2 and iso3 were redundant with country. Confirm this claim.
select(who3, country, iso2, iso3) %>%
  distinct() %>%
  group_by(country) %>%
  filter(n() > 1)
#> Source: local data frame [0 x 3]
#> Groups: country [0]
#>
#> # ... with 3 variables: country <chr>, iso2 <chr>, iso3 <chr>
For each country, year, and sex compute the total number of cases of TB. Make an informative visualisation of the data.
who5 %>%
  group_by(country, year, sex) %>%
  filter(year > 1995) %>%
  summarise(cases = sum(cases)) %>%
  unite(country_sex, country, sex, remove = FALSE) %>%
  ggplot(aes(x = year, y = cases, group = country_sex, colour = sex)) +
  geom_line()
A small multiples plot faceting by country is difficult given the number of countries. Focusing on those countries with the largest changes or absolute magnitudes, after providing the context above, is another option.
8.7 Non-Tidy Data
Corpus and text data are often stored in sparse matrices: https://cran.r-project.org/web/packages/tm/tm.pdf
Graphical data has its own format: http://igraph.org/r/doc/
9 Relational Data
9.1 Prerequisites
library("tidyverse")
library("nycflights13")
Topics, functions: keys (primary key, foreign key); mutating joins (left_join, right_join, inner_join, full_join); merge vs. joins; filtering joins (semi_join, anti_join); set operations (intersect, union, setdiff). TODO: fuzzy joining
9.2 nycflights13
NOTES nycflights13 is an example of a data-only R package. R packages can contain both functions and data. Since datasets can get large, often they can be packaged as their own dataset. These sorts of data-only R packages make it convenient for R users to access your data, but it should not be the only way you provide your research data. Not everyone uses R, so the original data should be provided in a program-agnostic format (e.g. csv files). This also holds for those using Stata; they should not be distributing data in .dta format files specific to Stata (even if, as we saw earlier, other programs can read that data). Another example of a data-only R package is gapminder.
How does Hadley create his diagrams?
The four tables in the nycflights13 package:
airlines
#> # A tibble: 16 × 2
#> carrier name
#> <chr> <chr>
#> 1 9E Endeavor Air Inc.
#> 2 AA American Airlines Inc.
#> 3 AS Alaska Airlines Inc.
#> 4 B6 JetBlue Airways
#> 5 DL Delta Air Lines Inc.
#> 6 EV ExpressJet Airlines Inc.
#> # ...
with 10 more rows airports #> # A tibble: 1,458 × 8 #> faa name lat lon alt tz dst #> <chr> <chr> <dbl> <dbl> <int> <dbl> <chr> #> 1 04G Lansdowne Airport 41.1 -80.6 1044 -5 A #> 2 06A Moton Field Municipal Airport 32.5 -85.7 264 -6 A #> 3 06C Schaumburg Regional 42.0 -88.1 801 -6 A #> 4 06N Randall Airport 41.4 -74.4 523 -5 A #> 5 09J Jekyll Island Airport 31.1 -81.4 11 -5 A #> 6 0A9 Elizabethton Municipal Airport 36.4 -82.2 1593 -5 A #> # ... with 1,452 more rows, and 1 more variables: tzone <chr> planes #> # A tibble: 3,322 × 9 #> tailnum year type manufacturer model engines #> <chr> <int> <chr> <chr> <chr> <int> #> 1 N10156 2004 Fixed wing multi engine EMBRAER EMB-145XR 2 #> 2 N102UW 1998 Fixed wing multi engine AIRBUS INDUSTRIE A320-214 2 #> 3 N103US 1999 Fixed wing multi engine AIRBUS INDUSTRIE A320-214 2 #> 4 N104UW 1999 Fixed wing multi engine AIRBUS INDUSTRIE A320-214 2 #> 5 N10575 2002 Fixed wing multi engine EMBRAER EMB-145LR 2 #> 6 N105UW 1999 Fixed wing multi engine AIRBUS INDUSTRIE A320-214 2 #> # ... with 3,316 more rows, and 3 more variables: seats <int>, #> # speed <int>, engine <chr> weather #> # A tibble: 26,130 × 15 #> origin year month day hour temp dewp humid wind_dir wind_speed #> <chr> <dbl> <dbl> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl> #> 1 EWR 2013 1 1 0 37.0 21.9 54.0 230 10.4 #> 2 EWR 2013 1 1 1 37.0 21.9 54.0 230 13.8 #> 3 EWR 2013 1 1 2 37.9 21.9 52.1 230 12.7 #> 4 EWR 2013 1 1 3 37.9 23.0 54.5 230 13.8 #> 5 EWR 2013 1 1 4 37.9 24.1 57.0 240 15.0 #> 6 EWR 2013 1 1 6 39.0 26.1 59.4 270 10.4 #> # ... with 2.612e+04 more rows, and 5 more variables: wind_gust <dbl>, #> # precip <dbl>, pressure <dbl>, visib <dbl>, time_hour <dttm> 9.2.1 Exercises Imagine you wanted to draw (approximately) the route each plane flies from its origin to its destination. What variables would you need? What tables would you need to combine? flights table: origin and dest airports table: longitude and latitude variables We would merge the flights with airports twice: once to get the location of the origin airport, and once to get the location of the dest airport. I forgot to draw the relationship between weather and airports. What is the relationship and how should it appear in the diagram? The variable origin in weather is matched with faa in airports. weather only contains information for the origin (NYC) airports. If it contained weather records for all airports in the USA, what additional relation would it define with flights? year, month, day, hour, origin in weather would be matched to year, month, day, hour, dest in flight (though it should use the arrival date-time values for dest if possible). We know that some days of the year are “special”, and fewer people than usual fly on them. How might you represent that data as a data frame? What would be the primary keys of that table? How would it connect to the existing tables? I would add a table of special dates. The primary key would be date. It would match to the year, month, day columns of `flights. 9.3 Keys Add a surrogate key to flights. I add the column flight_id as a surrogate key. I sort the data prior to making the key, even though it is not strictly necessary, so the order of the rows has some meaning. flights %>% arrange(year, month, day, sched_dep_time, carrier, flight) %>% mutate(flight_id = row_number()) %>% glimpse() #> Observations: 336,776 #> Variables: 20 #> $ year <int> 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013,... #> $ month <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,... 
#> $ day <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,... #> $ dep_time <int> 517, 533, 542, 544, 554, 559, 558, 559, 558, 55... #> $ sched_dep_time <int> 515, 529, 540, 545, 558, 559, 600, 600, 600, 60... #> $ dep_delay <dbl> 2, 4, 2, -1, -4, 0, -2, -1, -2, -2, -3, NA, 1, ... #> $ arr_time <int> 830, 850, 923, 1004, 740, 702, 753, 941, 849, 8... #> $ sched_arr_time <int> 819, 830, 850, 1022, 728, 706, 745, 910, 851, 8... #> $ arr_delay <dbl> 11, 20, 33, -18, 12, -4, 8, 31, -2, -3, -8, NA,... #> $ carrier <chr> "UA", "UA", "AA", "B6", "UA", "B6", "AA", "AA",... #> $ flight <int> 1545, 1714, 1141, 725, 1696, 1806, 301, 707, 49... #> $ tailnum <chr> "N14228", "N24211", "N619AA", "N804JB", "N39463... #> $ origin <chr> "EWR", "LGA", "JFK", "JFK", "EWR", "JFK", "LGA"... #> $ dest <chr> "IAH", "IAH", "MIA", "BQN", "ORD", "BOS", "ORD"... #> $ air_time <dbl> 227, 227, 160, 183, 150, 44, 138, 257, 149, 158... #> $ distance <dbl> 1400, 1416, 1089, 1576, 719, 187, 733, 1389, 10... #> $ hour <dbl> 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,... #> $ minute <dbl> 15, 29, 40, 45, 58, 59, 0, 0, 0, 0, 0, 0, 0, 0,... #> $ time_hour <dttm> 2013-01-01 05:00:00, 2013-01-01 05:00:00, 2013... #> $ flight_id <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, ... Identify the keys in the following datasets Lahman::Batting babynames::babynames nasaweather::atmos fueleconomy::vehicles ggplot2::diamonds (You might need to install some packages and read some documentation.) The primary key for Lahman::Batting is playerID, yearID, stint. It is not simply playerID, yearID because players can have different stints in different leagues within the same year. Lahman::Batting %>% group_by(playerID, yearID, stint) %>% filter(n() > 1) %>% nrow() #> [1] 0 The primary key for babynames::babynames is year, sex, name. It is no simply year, name since names can appear for both sexes with different counts. babynames::babynames %>% group_by(year, sex, name) %>% filter(n() > 1) %>% nrow() #> [1] 0 The primary key for nasaweather::atmos is the location and time of the measurement: lat, long, year, month. nasaweather::atmos %>% group_by(lat, long, year, month) %>% filter(n() > 1) %>% nrow() #> [1] 0 The column id (unique EPA identifier) is the primary key for fueleconomy::vehicles: fueleconomy::vehicles %>% group_by(id) %>% filter(n() > 1) %>% nrow() #> [1] 0 There is no primary key for ggplot2::diamonds. Using all variables in the data frame, the number of distinct rows is less than the total number of rows, meaning no combination of variables uniquely identifies the observations. ggplot2::diamonds %>% distinct() %>% nrow() #> [1] 53794 nrow(ggplot2::diamonds) #> [1] 53940 Draw a diagram illustrating the connections between the Batting, Master, and Salaries tables in the Lahman package. Draw another diagram that shows the relationship between Master, Managers, AwardsManagers. 
Batting
primary key: playerID, yearID, stint
foreign keys: playerID -> Master.playerID
Master
primary key: playerID
Salaries
primary key: yearID, teamID, playerID
foreign keys: playerID -> Master.playerID
Managers
primary key: yearID, playerID, teamID, inseason
foreign keys: playerID -> Master.playerID
AwardsManagers
primary key: playerID, awardID, yearID (since there are ties, and while the tie variable distinguishes those awards, it has NA values)
foreign keys: playerID -> Master.playerID; (playerID, yearID, lgID) -> Managers.(playerID, yearID, lgID)
lgID and teamID appear in multiple tables, but should be primary keys for league and team tables.
How would you characterise the relationship between the Batting, Pitching, and Fielding tables?
9.4 Mutating Joins
flights2 <- flights %>%
  select(year:day, hour, origin, dest, tailnum, carrier)
flights2 %>%
  select(-origin, -dest) %>%
  left_join(airlines, by = "carrier")
#> # A tibble: 336,776 × 7
#> year month day hour tailnum carrier name
#> <int> <int> <int> <dbl> <chr> <chr> <chr>
#> 1 2013 1 1 5 N14228 UA United Air Lines Inc.
#> 2 2013 1 1 5 N24211 UA United Air Lines Inc.
#> 3 2013 1 1 5 N619AA AA American Airlines Inc.
#> 4 2013 1 1 5 N804JB B6 JetBlue Airways
#> 5 2013 1 1 6 N668DN DL Delta Air Lines Inc.
#> 6 2013 1 1 5 N39463 UA United Air Lines Inc.
#> # ... with 3.368e+05 more rows
9.4.1 Exercises
Compute the average delay by destination, then join on the airports data frame so you can show the spatial distribution of delays. Here’s an easy way to draw a map of the United States:
airports %>%
  semi_join(flights, c("faa" = "dest")) %>%
  ggplot(aes(lon, lat)) +
  borders("state") +
  geom_point() +
  coord_quickmap()
(Don’t worry if you don’t understand what semi_join() does — you’ll learn about it next.)
avg_dest_delays <- flights %>%
  group_by(dest) %>%
  # arrival delay NA's are cancelled flights
  summarise(delay = mean(arr_delay, na.rm = TRUE)) %>%
  inner_join(airports, by = c(dest = "faa"))
avg_dest_delays %>%
  ggplot(aes(lon, lat, colour = delay)) +
  borders("state") +
  geom_point() +
  coord_quickmap()
You might want to use the size or colour of the points to display the average delay for each airport.
Add the location of the origin and destination (i.e. the lat and lon) to flights.
flights %>%
  left_join(airports, by = c(dest = "faa")) %>%
  left_join(airports, by = c(origin = "faa")) %>%
  head()
#> # A tibble: 6 × 33
#> year month day dep_time sched_dep_time dep_delay arr_time
#> <int> <int> <int> <int> <int> <dbl> <int>
#> 1 2013 1 1 517 515 2 830
#> 2 2013 1 1 533 529 4 850
#> 3 2013 1 1 542 540 2 923
#> 4 2013 1 1 544 545 -1 1004
#> 5 2013 1 1 554 600 -6 812
#> 6 2013 1 1 554 558 -4 740
#> # ... with 26 more variables: sched_arr_time <int>, arr_delay <dbl>,
#> # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
#> # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
#> # time_hour <dttm>, name.x <chr>, lat.x <dbl>, lon.x <dbl>, alt.x <int>,
#> # tz.x <dbl>, dst.x <chr>, tzone.x <chr>, name.y <chr>, lat.y <dbl>,
#> # lon.y <dbl>, alt.y <int>, tz.y <dbl>, dst.y <chr>, tzone.y <chr>
Is there a relationship between the age of a plane and its delays? Surprisingly, not.
If anything, (departure) delay seems to decrease slightly with age (perhaps because of selection):
plane_ages <- planes %>%
  mutate(age = 2013 - year) %>%
  select(tailnum, age)
flights %>%
  inner_join(plane_ages, by = "tailnum") %>%
  group_by(age) %>%
  filter(!is.na(dep_delay)) %>%
  summarise(delay = mean(dep_delay)) %>%
  ggplot(aes(x = age, y = delay)) +
  geom_point() +
  geom_line()
#> Warning: Removed 1 rows containing missing values (geom_point).
#> Warning: Removed 1 rows containing missing values (geom_path).
What weather conditions make it more likely to see a delay? Almost any amount of precipitation is associated with a delay, though not as strong a trend after 0.02 in as one would expect.
flight_weather <- flights %>%
  inner_join(weather, by = c("origin" = "origin",
                             "year" = "year",
                             "month" = "month",
                             "day" = "day",
                             "hour" = "hour"))
flight_weather %>%
  group_by(precip) %>%
  summarise(delay = mean(dep_delay, na.rm = TRUE)) %>%
  ggplot(aes(x = precip, y = delay)) +
  geom_line() +
  geom_point()
What happened on June 13 2013? Display the spatial pattern of delays, and then use Google to cross-reference with the weather. There was a large series of storms (derechos) in the southeastern US (see June 12-13, 2013 derecho series). The largest delays are in Tennessee (Nashville) and the Southeast and Midwest (the location of the derechos).
library(viridis)
flights %>%
  filter(year == 2013, month == 6, day == 13) %>%
  group_by(dest) %>%
  summarise(delay = mean(arr_delay, na.rm = TRUE)) %>%
  inner_join(airports, by = c("dest" = "faa")) %>%
  ggplot(aes(y = lat, x = lon, size = delay, colour = delay)) +
  borders("state") +
  geom_point() +
  coord_quickmap() +
  scale_color_viridis()
#> Warning: Removed 3 rows containing missing values (geom_point).
9.5 Filtering Joins
semi_join: keep all obs in x with a match in y
anti_join: drop all obs in x with a match in y
9.5.1 Exercises
What does it mean for a flight to have a missing tailnum? What do the tail numbers that don’t have a matching record in planes have in common? (Hint: one variable explains ~90% of the problems.) American Airlines (AA) and Envoy Airlines (MQ) don’t report tail numbers.
flights %>%
  anti_join(planes, by = "tailnum") %>%
  count(carrier, sort = TRUE)
#> # A tibble: 10 × 2
#> carrier n
#> <chr> <int>
#> 1 MQ 25397
#> 2 AA 22558
#> 3 UA 1693
#> 4 9E 1044
#> 5 B6 830
#> 6 US 699
#> # ... with 4 more rows
Filter flights to only show flights with planes that have flown at least 100 flights.
planes_gt100 <- flights %>%
  group_by(tailnum) %>%
  count() %>%
  filter(n > 100)
flights %>%
  semi_join(planes_gt100, by = "tailnum")
#> # A tibble: 229,202 × 19
#> year month day dep_time sched_dep_time dep_delay arr_time
#> <int> <int> <int> <int> <int> <dbl> <int>
#> 1 2013 1 1 1604 1510 54 1817
#> 2 2013 1 1 2100 2100 0 2307
#> 3 2013 1 2 827 835 -8 1059
#> 4 2013 1 2 2014 2020 -6 2256
#> 5 2013 1 4 1621 1625 -4 1853
#> 6 2013 1 5 834 835 -1 1050
#> # ... with 2.292e+05 more rows, and 12 more variables:
#> # sched_arr_time <int>, arr_delay <dbl>, carrier <chr>, flight <int>,
#> # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>,
#> # distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
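As a compact contrast of the two filtering joins on the same tables (our own check, not from the text), the two row counts partition flights:
nrow(semi_join(flights, planes, by = "tailnum"))  # flights whose plane is in planes
nrow(anti_join(flights, planes, by = "tailnum"))  # flights whose plane is not
# the two counts sum to nrow(flights)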
Combine fueleconomy::vehicles and fueleconomy::common to find only the records for the most common models. The table fueleconomy::common identifies vehicles by make and model:
glimpse(fueleconomy::vehicles)
#> Observations: 33,442
#> Variables: 12
#> $ id <int> 27550, 28426, 27549, 28425, 1032, 1033, 3347, 13309, 133...
#> $ make <chr> "AM General", "AM General", "AM General", "AM General", ...
#> $ model <chr> "DJ Po Vehicle 2WD", "DJ Po Vehicle 2WD", "FJ8c Post Off...
#> $ year <int> 1984, 1984, 1984, 1984, 1985, 1985, 1987, 1997, 1997, 19...
#> $ class <chr> "Special Purpose Vehicle 2WD", "Special Purpose Vehicle ...
#> $ trans <chr> "Automatic 3-spd", "Automatic 3-spd", "Automatic 3-spd",...
#> $ drive <chr> "2-Wheel Drive", "2-Wheel Drive", "2-Wheel Drive", "2-Wh...
#> $ cyl <int> 4, 4, 6, 6, 4, 6, 6, 4, 4, 6, 4, 4, 6, 4, 4, 6, 5, 5, 6,...
#> $ displ <dbl> 2.5, 2.5, 4.2, 4.2, 2.5, 4.2, 3.8, 2.2, 2.2, 3.0, 2.3, 2...
#> $ fuel <chr> "Regular", "Regular", "Regular", "Regular", "Regular", "...
#> $ hwy <int> 17, 17, 13, 13, 17, 13, 21, 26, 28, 26, 27, 29, 26, 27, ...
#> $ cty <int> 18, 18, 13, 13, 16, 13, 14, 20, 22, 18, 19, 21, 17, 20, ...
glimpse(fueleconomy::common)
#> Observations: 347
#> Variables: 4
#> $ make <chr> "Acura", "Acura", "Acura", "Acura", "Acura", "Audi", "Au...
#> $ model <chr> "Integra", "Legend", "MDX 4WD", "NSX", "TSX", "A4", "A4 ...
#> $ n <int> 42, 28, 12, 28, 27, 49, 49, 66, 20, 12, 46, 20, 30, 29, ...
#> $ years <int> 16, 10, 12, 14, 11, 19, 15, 19, 19, 12, 20, 15, 16, 16, ...
fueleconomy::vehicles %>%
  semi_join(fueleconomy::common, by = c("make", "model"))
#> # A tibble: 14,531 × 12
#> id make model year class trans
#> <int> <chr> <chr> <int> <chr> <chr>
#> 1 1833 Acura Integra 1986 Subcompact Cars Automatic 4-spd
#> 2 1834 Acura Integra 1986 Subcompact Cars Manual 5-spd
#> 3 3037 Acura Integra 1987 Subcompact Cars Automatic 4-spd
#> 4 3038 Acura Integra 1987 Subcompact Cars Manual 5-spd
#> 5 4183 Acura Integra 1988 Subcompact Cars Automatic 4-spd
#> 6 4184 Acura Integra 1988 Subcompact Cars Manual 5-spd
#> # ... with 1.452e+04 more rows, and 6 more variables: drive <chr>,
#> # cyl <int>, displ <dbl>, fuel <chr>, hwy <int>, cty <int>
Find the 48 hours (over the course of the whole year) that have the worst delays. Cross-reference it with the weather data. Can you see any patterns? (A sketch of one approach appears at the end of this section.)
What does anti_join(flights, airports, by = c("dest" = "faa")) tell you? What does anti_join(airports, flights, by = c("faa" = "dest")) tell you? anti_join(flights, airports, by = c("dest" = "faa")) are flights that go to an airport that is not in the FAA list of destinations, likely foreign airports. anti_join(airports, flights, by = c("faa" = "dest")) are US airports that don’t have a flight in the data, meaning that there were no flights to that airport from New York in 2013.
You might expect that there’s an implicit relationship between plane and airline, because each plane is flown by a single airline. Confirm or reject this hypothesis using the tools you’ve learned above. There isn’t such a relationship, since planes can be sold or airlines can merge. And indeed, there are 18 tail numbers in the data that appear with more than one carrier:
flights %>%
  group_by(tailnum, carrier) %>%
  count() %>%
  filter(n() > 1) %>%
  select(tailnum) %>%
  distinct()
#> Source: local data frame [18 x 1]
#> Groups: tailnum [18]
#>
#> tailnum
#> <chr>
#> 1 N146PQ
#> 2 N153PQ
#> 3 N176PQ
#> 4 N181PQ
#> 5 N197PQ
#> 6 N200PQ
#> # ... with 12 more rows
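The 48-hours exercise above was left without an answer; a rough sketch of one approach, treating “48 hours” as the 48 clock hours with the worst average departure delay (a simplification of the question):
worst_hours <- flights %>%
  group_by(origin, year, month, day, hour) %>%
  summarise(dep_delay = mean(dep_delay, na.rm = TRUE)) %>%
  ungroup() %>%
  arrange(desc(dep_delay)) %>%
  slice(1:48)
# cross-reference with the weather observed in those hours
semi_join(weather, worst_hours,
          by = c("origin", "year", "month", "day", "hour"))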
9.6 Set operations
No exercises
10 Strings
10.1 Introduction
Functions and packages covered:
the stringr package
str_length
str_c
str_replace_na
str_sub
str_to_upper, str_sort, str_to_lower, str_order
str_length, str_pad, str_trim, str_sub
for regex: str_view, str_view_all
regex syntax
str_detect
str_subset
str_count
str_extract
str_match
tidyr::extract
str_split
str_locate
str_sub
the stringi package
Ideas: mention rex, a package with friendly regular expressions. Use it to match country names? Extract numbers from text? Discuss fuzzy joining and string distance, approximate matching.
library(tidyverse)
library(stringr)
10.2 String Basics
10.2.1 Exercises
In code that doesn’t use stringr, you’ll often see paste() and paste0(). What’s the difference between the two functions? What stringr function are they equivalent to? How do the functions differ in their handling of NA? The function paste separates strings by spaces by default, while paste0 does not separate strings with spaces by default.
paste("foo", "bar")
#> [1] "foo bar"
paste0("foo", "bar")
#> [1] "foobar"
Since str_c does not separate strings with spaces by default, it is closer in behavior to paste0.
str_c("foo", "bar")
#> [1] "foobar"
However, str_c and the paste functions handle NA differently. The function str_c propagates NA: if any argument is a missing value, it returns a missing value. This is in line with how the numeric R functions, e.g. sum, mean, handle missing values. However, the paste functions convert NA to the string "NA" and then treat it as any other character vector.
str_c("foo", NA)
#> [1] NA
paste("foo", NA)
#> [1] "foo NA"
paste0("foo", NA)
#> [1] "fooNA"
In your own words, describe the difference between the sep and collapse arguments to str_c(). The sep argument is the string inserted between arguments to str_c, while collapse is the string used to join the elements of a character vector when collapsing it into a character vector of length one.
Use str_length() and str_sub() to extract the middle character from a string. What will you do if the string has an even number of characters? The following function extracts the middle character. If the string has an even number of characters the choice is arbitrary. We choose to select \(\lceil n / 2 \rceil\), because that case works even if the string is only of length one. A more general method would allow the user to select either the floor or ceiling for the middle character of an even string.
x <- c("a", "abc", "abcd", "abcde", "abcdef")
L <- str_length(x)
m <- ceiling(L / 2)
str_sub(x, m, m)
#> [1] "a" "b" "b" "c" "c"
What does str_wrap() do? When might you want to use it? The function str_wrap wraps text so that it fits within a certain width. This is useful for wrapping long strings of text to be typeset.
What does str_trim() do? What’s the opposite of str_trim()? The function str_trim trims the whitespace from a string.
str_trim(" abc ")
#> [1] "abc"
str_trim(" abc ", side = "left")
#> [1] "abc "
str_trim(" abc ", side = "right")
#> [1] " abc"
The opposite of str_trim is str_pad, which adds characters to each side.
str_pad("abc", 5, side = "both")
#> [1] " abc "
str_pad("abc", 4, side = "right")
#> [1] "abc "
str_pad("abc", 4, side = "left")
#> [1] " abc"
Write a function that turns (e.g.) a vector c("a", "b", "c") into the string a, b, and c. Think carefully about what it should do if given a vector of length 0, 1, or 2.
Note: See Ch 19 for writing functions.
str_commasep <- function(x, sep = ", ", last = ", and ") {
  if (length(x) > 1) {
    str_c(str_c(x[-length(x)], collapse = sep),
          x[length(x)], sep = last)
  } else {
    x
  }
}
str_commasep("")
#> [1] ""
str_commasep("a")
#> [1] "a"
str_commasep(c("a", "b"))
#> [1] "a, and b"
str_commasep(c("a", "b", "c"))
#> [1] "a, b, and c"
10.3 Matching Patterns and Regular Expressions
10.3.1 Exercises
Explain why each of these strings don’t match a \: "\", "\\", "\\\".
"\": This will escape the next character in the R string.
"\\": This will resolve to \ in the regular expression, which will escape the next character in the regular expression.
"\\\": The first two backslashes will resolve to a literal backslash in the regular expression, the third will escape the next character. So in the regular expression, this will escape some escaped character.
How would you match the sequence "'\ ?
What patterns will the regular expression \..\..\.. match? How would you represent it as a string? It will match any pattern that is a dot followed by any character, repeated three times. As a string, it would be written "\\..\\..\\..".
10.3.1.1 Exercises
How would you match the literal string "$^$"?
str_view(c("$^$", "ab$^$sfas"), "^\\$\\^\\$$")
Given the corpus of common words in stringr::words, create regular expressions that find all words that: Start with “y”. End with “x”. Are exactly three letters long. (Don’t cheat by using str_length()!) Have seven letters or more. Since this list is long, you might want to use the match argument to str_view() to show only the matching or non-matching words.
head(stringr::words)
#> [1] "a" "able" "about" "absolute" "accept" "account"
These can be matched with the patterns ^y, x$, ^...$, and .{7,}, respectively. A simpler way, using str_detect, is shown later.
10.3.1.2 Character classes and alternatives
10.3.1.2.1 Exercises
Create regular expressions to find all words that: Start with a vowel. That only contain consonants. (Hint: thinking about matching “not”-vowels.) End with ed, but not with eed. End with ing or ise.
Words starting with vowels:
str_view(stringr::words, "^[aeiou]")
Words that contain only consonants:
str_view(stringr::words, "^[^aeiou]+$", match = TRUE)
This seems to require using the + pattern introduced later, unless one wants to be very verbose and specify words of certain lengths.
Words that end with ed but not with eed. This handles the special case of “ed”, as well as words with length > 2.
str_view(stringr::words, "^ed$|[^e]ed$", match = TRUE)
Words ending in ing or ise:
str_view(stringr::words, "i(ng|se)$", match = TRUE)
Empirically verify the rule “i before e except after c”. Using only what has been introduced thus far:
str_view(stringr::words, "(cei|[^c]ie)", match = TRUE)
str_view(stringr::words, "(cie|[^c]ei)", match = TRUE)
Using str_detect:
sum(str_detect(stringr::words, "(cei|[^c]ie)"))
#> [1] 14
sum(str_detect(stringr::words, "(cie|[^c]ei)"))
#> [1] 3
Is “q” always followed by a “u”? In the stringr::words dataset, yes. In the full English language, no.
str_view(stringr::words, "q[^u]", match = TRUE)
Write a regular expression that matches a word if it’s probably written in British English, not American English. Ummm. In the general case, this is hard.
But there are a few heuristics to consider that can get part of the way there: British English uses “ou” instead of “o”; it uses “ae” and “oe” instead of “a” and “o”; words end in ise instead of ize; words end in yse. That suggests a pattern like ou|ise$|ae|oe|yse$. There are others (see https://en.wikipedia.org/wiki/American_and_British_English_spelling_differences), but this is not best handled by a regular expression. It would require a dictionary with differences in spellings for different words. And even then, a good algorithm would be statistical, inferring the probability that a text or word is using the British spelling, rather than some deterministic algorithm.
Create a regular expression that will match telephone numbers as commonly written in your country. Using what has been covered in R4DS thus far:
x <- c("123-456-7890", "1235-2351")
str_view(x, "\\d\\d\\d-\\d\\d\\d-\\d\\d\\d\\d")
Using stuff covered in the next section:
str_view(x, "\\d{3}-\\d{3}-\\d{4}")
Note that this pattern doesn’t account for phone numbers that are invalid because of unassigned area codes, or special numbers like 911, or for extensions. See https://en.wikipedia.org/wiki/North_American_Numbering_Plan for the complexities of US phone numbers, and http://stackoverflow.com/questions/123559/a-comprehensive-regex-for-phone-number-validation for one discussion of using a regex for phone number validation.
10.3.2 Repetition
10.3.2.1 Exercises
Describe the equivalents of ?, +, * in {m,n} form. The equivalent of ? is {0,1}, matching at most 1. The equivalent of + is {1,}, matching 1 or more. The equivalent of * is {0,}, matching 0 or more, since an omitted upper bound means there is no limit on the number of matches.
Describe in words what these regular expressions match: (read carefully to see if I’m using a regular expression or a string that defines a regular expression.)
^.*$: Any string.
"\\{.+\\}": Any string with curly braces surrounding at least one character.
\d{4}-\d{2}-\d{2}: A date in “%Y-%m-%d” format: four digits followed by a dash, followed by two digits followed by a dash, followed by another two digits.
"\\\\{4}": This resolves to the regex \\{4}, which matches four backslashes.
Create regular expressions to find all words that: Start with three consonants. Have three or more vowels in a row. Have two or more vowel-consonant pairs in a row.
A regex to find all words starting with three consonants:
str_view(words, "^[^aeiou]{3}", match = TRUE)
A regex to find three or more vowels in a row:
str_view(words, "[aeiou]{3,}", match = TRUE)
Two or more vowel-consonant pairs in a row:
str_view(words, "([aeiou][^aeiou]){2,}", match = TRUE)
Solve the beginner regexp crosswords at https://regexcrossword.com/challenges/beginner Nope
10.3.3 Grouping and backreferences
str_view(fruit, "(..)\\1", match = TRUE)
10.3.3.1 Exercises
Describe, in words, what these expressions will match:
(.)\1\1: The same character appearing three times in a row. E.g. “aaa”.
"(.)(.)\\2\\1": A pair of characters followed by the same pair of characters in reversed order. E.g. “abba”.
(..)\1: Any two characters repeated. E.g. “a1a1”.
"(.).\\1.\\1": A character followed by any character, the original character, any other character, the original character again. E.g. “abaca”, “b8b.b”.
"(.)(.)(.).*\\3\\2\\1": Three characters followed by zero or more characters of any kind followed by the same three characters but in reverse order. E.g. “abcsgasgddsadgsdgcba” or “abccba” or “abc1cba”.
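A quick check of these descriptions with str_detect, pairing each pattern with an example string of our own devising (str_detect is vectorised over both arguments, so each string is tested against the pattern in the same position; every comparison should be TRUE):
str_detect(
  c("aaa", "abba", "a1a1", "abaca", "abccba"),
  c("(.)\\1\\1", "(.)(.)\\2\\1", "(..)\\1", "(.).\\1.\\1", "(.)(.)(.).*\\3\\2\\1")
)
#> [1] TRUE TRUE TRUE TRUE TRUE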
Construct regular expressions to match words that:
Start and end with the same character. Assuming the word is more than one character and all strings are considered words: ^(.).*\1$
str_view(words, "^(.).*\\1$", match = TRUE)
Contain a repeated pair of letters (e.g. “church” contains “ch” repeated twice.).
# any two characters repeated
str_view(words, "(..).*\\1", match = TRUE)
# more stringent, letters only, but also allowing for differences in capitalization
str_view(str_to_lower(words), "([a-z][a-z]).*\\1", match = TRUE)
Contain one letter repeated in at least three places (e.g. “eleven” contains three “e”s.)
str_view(words, "(.).*\\1.*\\1", match = TRUE)
10.4 Tools
10.4.1 Detect matches
x <- c("apple", "banana", "pear")
str_detect(x, "e")
#> [1] TRUE FALSE TRUE
Number of words starting with t?
sum(str_detect(words, "^t"))
#> [1] 65
Proportion of words ending with a vowel?
mean(str_detect(words, "[aeiou]$"))
#> [1] 0.277
To find all words with no vowels:
no_vowels_1 <- !str_detect(words, "[aeiou]")
no_vowels_2 <- str_detect(words, "^[^aeiou]+$")
identical(no_vowels_1, no_vowels_2)
#> [1] TRUE
words[str_detect(words, "x$")]
#> [1] "box" "sex" "six" "tax"
str_subset(words, "x$")
#> [1] "box" "sex" "six" "tax"
df <- tibble(
  word = words,
  i = seq_along(word)
)
df %>% filter(str_detect(words, "x$"))
#> # A tibble: 4 × 2
#> word i
#> <chr> <int>
#> 1 box 108
#> 2 sex 747
#> 3 six 772
#> 4 tax 841
Number of matches in each string:
x <- c("apple", "banana", "pear")
str_count(x, "a")
#> [1] 1 3 1
Average vowels per word:
mean(str_count(words, "[aeiou]"))
#> [1] 1.99
df %>% mutate(
  vowels = str_count(word, "[aeiou]"),
  consonants = str_count(word, "[^aeiou]")
)
#> # A tibble: 980 × 4
#> word i vowels consonants
#> <chr> <int> <int> <int>
#> 1 a 1 1 0
#> 2 able 2 2 2
#> 3 about 3 3 2
#> 4 absolute 4 4 4
#> 5 accept 5 2 4
#> 6 account 6 3 4
#> # ... with 974 more rows
Matches do not overlap, and they are usually greedy, except when otherwise noted. The plain functions only use the first match; the _all() functions will get all matches.
10.4.2 Exercises
For each of the following challenges, try solving it by using both a single regular expression, and a combination of multiple str_detect() calls. Find all words that start or end with x. Find all words that start with a vowel and end with a consonant. Are there any words that contain at least one of each different vowel?
Words that start or end with x?
# one regex
words[str_detect(words, "^x|x$")]
#> [1] "box" "sex" "six" "tax"
# split regex into parts
start_with_x <- str_detect(words, "^x")
end_with_x <- str_detect(words, "x$")
words[start_with_x | end_with_x]
#> [1] "box" "sex" "six" "tax"
Find all words starting with a vowel and ending with a consonant.
str_subset(words, "^[aeiou].*[^aeiou]$") %>% head()
#> [1] "about" "accept" "account" "across" "act" "actual"
start_with_vowel <- str_detect(words, "^[aeiou]")
end_with_consonant <- str_detect(words, "[^aeiou]$")
words[start_with_vowel & end_with_consonant] %>% head()
#> [1] "about" "accept" "account" "across" "act" "actual"
Words that contain at least one of each vowel. I can’t think of a good way of doing this without doing a regex of the permutations:
{ x <- as.character(unlist(list(...))) length(x) != length(unique(x)) }) %>% map_chr(~ str_c(unlist(.x), collapse = ".*")) %>% str_c(collapse = "|") str_subset(words, pattern) #> character(0) words[str_detect(words, "a") & str_detect(words, "e") & str_detect(words, "i") & str_detect(words, "o") & str_detect(words, "u")] #> character(0) There appear to be none. To check that it works, str_subset("aseiouds", pattern) #> [1] "aseiouds" What word has the highest number of vowels? What word has the highest proportion of vowels? (Hint: what is the denominator?) prop_vowels <- str_count(words, "[aeiou]") / str_length(words) words[which(prop_vowels == max(prop_vowels))] #> [1] "a" 10.4.3 Extract Matches The Harvard sentences: length(sentences) #> [1] 720 head(sentences) #> [1] "The birch canoe slid on the smooth planks." #> [2] "Glue the sheet to the dark blue background." #> [3] "It's easy to tell the depth of a well." #> [4] "These days a chicken leg is a rare dish." #> [5] "Rice is often served in round bowls." #> [6] "The juice of lemons makes fine punch." colours <- c("red", "orange", "yellow", "green", "blue", "purple") colour_match <- str_c(colours, collapse = "|") colour_match #> [1] "red|orange|yellow|green|blue|purple" has_colour <- str_subset(sentences, colour_match) matches <- str_extract(has_colour, colour_match) head(matches) #> [1] "blue" "blue" "red" "red" "red" "blue" more <- sentences[str_count(sentences, colour_match) > 1] str_view_all(more, colour_match) str_extract(more, colour_match) #> [1] "blue" "green" "orange" The _all versions of functions return lists. str_extract_all(more, colour_match) #> [[1]] #> [1] "blue" "red" #> #> [[2]] #> [1] "green" "red" #> #> [[3]] #> [1] "orange" "red" str_extract_all(more, colour_match, simplify = TRUE) #> [,1] [,2] #> [1,] "blue" "red" #> [2,] "green" "red" #> [3,] "orange" "red" x <- c("a", "a b", "a b c") str_extract_all(x, "[a-z]", simplify = TRUE) #> [,1] [,2] [,3] #> [1,] "a" "" "" #> [2,] "a" "b" "" #> [3,] "a" "b" "c" 10.4.3.1 Exercises In the previous example, you might have noticed that the regular expression matched “flickered”, which is not a colour. Modify the regex to fix the problem. Add \\b before and after the pattern: colour_match2 <- str_c("\\\\b(", str_c(colours, collapse = "|"), ")\\\\b") colour_match2 #> [1] "\\\\b(red|orange|yellow|green|blue|purple)\\\\b" more2 <- sentences[str_count(sentences, colour_match) > 1] str_view_all(more2, colour_match2, match = TRUE) From the Harvard sentences data, extract: The first word from each sentence. All words ending in ing. All plurals. Extracting the first word in each sentence requires defining what a word is; I’ll consider a word to be any contiguous run of letters. str_extract(sentences, "[a-zA-Z]+") %>% head() #> [1] "The" "Glue" "It" "These" "Rice" "The" All words ending in ing: pattern <- "\\\\b[A-Za-z]+ing\\\\b" sentences_with_ing <- str_detect(sentences, pattern) unique(unlist(str_extract_all(sentences[sentences_with_ing], pattern))) %>% head() #> [1] "spring" "evening" "morning" "winding" "living" "king" All plurals. To do this correctly requires linguistic information. But if we just want to say any word ending in an “s” is plural (requiring more than 3 characters, to exclude words like as, is, and gas)
unique(unlist(str_extract_all(sentences, "\\\\b[A-Za-z]{3,}s\\\\b"))) %>% head() #> [1] "planks" "days" "bowls" "lemons" "makes" "hogs" 10.4.4 Grouped Matches noun <- "(a|the) ([^ ]+)" has_noun <- sentences %>% str_subset(noun) %>% head(10) has_noun %>% str_extract(noun) #> [1] "the smooth" "the sheet" "the depth" "a chicken" "the parked" #> [6] "the sun" "the huge" "the ball" "the woman" "a helps" has_noun %>% str_match(noun) #> [,1] [,2] [,3] #> [1,] "the smooth" "the" "smooth" #> [2,] "the sheet" "the" "sheet" #> [3,] "the depth" "the" "depth" #> [4,] "a chicken" "a" "chicken" #> [5,] "the parked" "the" "parked" #> [6,] "the sun" "the" "sun" #> [7,] "the huge" "the" "huge" #> [8,] "the ball" "the" "ball" #> [9,] "the woman" "the" "woman" #> [10,] "a helps" "a" "helps" tibble(sentence = sentences) %>% tidyr::extract( sentence, c("article", "noun"), "(a|the) ([^ ]+)", remove = FALSE ) #> # A tibble: 720 × 3 #> sentence article noun #> * <chr> <chr> <chr> #> 1 The birch canoe slid on the smooth planks. the smooth #> 2 Glue the sheet to the dark blue background. the sheet #> 3 It's easy to tell the depth of a well. the depth #> 4 These days a chicken leg is a rare dish. a chicken #> 5 Rice is often served in round bowls. <NA> <NA> #> 6 The juice of lemons makes fine punch. <NA> <NA> #> # ... with 714 more rows 10.4.4.1 Exercises Find all words that come after a “number” like “one”, “two”, “three” etc. Pull out both the number and the word. I’ll use the same pattern for the following “word” as was used above: numword <- "(one|two|three|four|five|six|seven|eight|nine|ten) +(\\\\S+)" sentences[str_detect(sentences, numword)] %>% str_extract(numword) #> [1] "ten served" "one over" "seven books" "two met" #> [5] "two factors" "one and" "three lists" "seven is" #> [9] "two when" "one floor." "ten inches." "one with" #> [13] "one war" "one button" "six minutes." "ten years" #> [17] "one in" "ten chased" "one like" "two shares" #> [21] "two distinct" "one costs" "ten two" "five robins." #> [25] "four kinds" "one rang" "ten him." "three story" #> [29] "ten by" "one wall." "three inches" "ten your" #> [33] "six comes" "one before" "three batches" "two leaves." Find all contractions. Separate out the pieces before and after the apostrophe. contraction <- "([A-Za-z]+)'([A-Za-z]+)" sentences %>% `[`(str_detect(sentences, contraction)) %>% str_extract(contraction) #> [1] "It's" "man's" "don't" "store's" "workmen's" #> [6] "Let's" "sun's" "child's" "king's" "It's" #> [11] "don't" "queen's" "don't" "pirate's" "neighbor's" 10.4.5 Splitting 10.4.5.1 Exercises Split up a string like "apples, pears, and bananas" into individual components. x <- c("apples, pears, and bananas") str_split(x, ", +(and +)?")[[1]] #> [1] "apples" "pears" "bananas" Why is it better to split up by boundary("word") than " "? Splitting by boundary("word") splits on punctuation and not just whitespace. What does splitting with an empty string ("") do? Experiment, and then read the documentation. str_split("ab. cd|agt", "")[[1]] #> [1] "a" "b" "." " " "c" "d" "|" "a" "g" "t" It splits the string into individual characters. 10.5 Other types of patterns 10.5.1 Exercises How would you find all strings containing \\ with regex() vs. with fixed()? str_subset(c("a\\\\b", "ab"), "\\\\\\\\") #> [1] "a\\\\b" str_subset(c("a\\\\b", "ab"), fixed("\\\\")) #> [1] "a\\\\b" What are the five most common words in sentences?
str_extract_all(sentences, boundary("word")) %>% unlist() %>% str_to_lower() %>% tibble() %>% set_names("word") %>% group_by(word) %>% count(sort = TRUE) %>% head(5) #> # A tibble: 5 × 2 #> word n #> <chr> <int> #> 1 the 751 #> 2 a 202 #> 3 of 132 #> 4 to 123 #> 5 and 118 10.6 stringi 10.6.1 Exercises Find the stringi functions that: Count the number of words. stri_count_words Find duplicated strings. stri_duplicated Generate random text. There are several functions beginning with stri_rand_. stri_rand_lipsum generates lorem ipsum text, stri_rand_strings generates random strings, stri_rand_shuffle randomly shuffles the code points in the text. How do you control the language that stri_sort() uses for sorting? Use the locale argument to the opts_collator argument. "], +["factors.html", "11 Factors 11.1 Introduction 11.2 Creating Factors 11.3 General Social Survey 11.4 Modifying factor order 11.5 Modifying factor levels", " 11 Factors 11.1 Introduction Functions and packages: forcats factor fct_inorder levels readr::parse_factor fct_reorder fct_relevel fct_reorder2 fct_infreq fct_rev fct_recode fct_lump fct_collapse library("tidyverse") library("forcats") 11.2 Creating Factors No exercises 11.3 General Social Survey 11.3.1 Exercises Explore the distribution of rincome (reported income). What makes the default bar chart hard to understand? How could you improve the plot? rincome_plot <- gss_cat %>% ggplot(aes(rincome)) + geom_bar() rincome_plot The default bar chart labels are too squished to read. One solution is to change the angle of the labels, rincome_plot + theme(axis.text.x = element_text(angle = 90)) But that’s not natural either, because text is vertical, and we read horizontally. So with long labels, it is better to flip it. rincome_plot + coord_flip() This is better, but it unituively goes from low to high. It would help if the scale is reversed. Also, if all the missing factors were differentiated. What is the most common relig in this survey? What’s the most common partyid? The most common relig is “Protestant” gss_cat %>% count(relig) %>% arrange(-n) %>% head(1) #> # A tibble: 1 × 2 #> relig n #> <fctr> <int> #> 1 Protestant 10846 The most common partyid is “Independent” gss_cat %>% count(partyid) %>% arrange(-n) %>% head(1) #> # A tibble: 1 × 2 #> partyid n #> <fctr> <int> #> 1 Independent 4119 Which relig does denom (denomination) apply to? How can you find out with a table? How can you find out with a visualisation? levels(gss_cat$denom) #> [1] "No answer" "Don't know" "No denomination" #> [4] "Other" "Episcopal" "Presbyterian-dk wh" #> [7] "Presbyterian, merged" "Other presbyterian" "United pres ch in us" #> [10] "Presbyterian c in us" "Lutheran-dk which" "Evangelical luth" #> [13] "Other lutheran" "Wi evan luth synod" "Lutheran-mo synod" #> [16] "Luth ch in america" "Am lutheran" "Methodist-dk which" #> [19] "Other methodist" "United methodist" "Afr meth ep zion" #> [22] "Afr meth episcopal" "Baptist-dk which" "Other baptists" #> [25] "Southern baptist" "Nat bapt conv usa" "Nat bapt conv of am" #> [28] "Am bapt ch in usa" "Am baptist asso" "Not applicable" From the context it is clear that denom refers to “Protestant” (and unsurprising given that it is the largest category in freq). Let’s filter out the non-responses, no answers, others, not-applicable, or no denomination, to leave only answers to denominations. After doing that, the only remaining responses are “Protestant”. 
gss_cat %>% filter(!denom %in% c("No answer", "Other", "Don't know", "Not applicable", "No denomination")) %>% count(relig) #> # A tibble: 1 × 2 #> relig n #> <fctr> <int> #> 1 Protestant 7025 This is also clear in a scatter plot of relig vs. denom where the points are proportional to the size of the number of answers (since otherwise there would be overplotting). gss_cat %>% count(relig, denom) %>% ggplot(aes(x = relig, y = denom, size = n)) + geom_point() + theme(axis.text.x = element_text(angle = 90)) 11.4 Modifying factor order 11.4.1 Exercises There are some suspiciously high numbers in tvhours. Is the mean a good summary? summary(gss_cat[["tvhours"]]) #> Min. 1st Qu. Median Mean 3rd Qu. Max. NA's #> 0 1 2 3 4 24 10146 gss_cat %>% filter(!is.na(tvhours)) %>% ggplot(aes(x = tvhours)) + geom_histogram(binwidth = 1) Whether the mean is the best summary epends on what you are using it for :-), i.e. your objective. But probably the median would be what most people prefer. And the hours of tv doesn’t look that surprising to me. For each factor in gss_cat identify whether the order of the levels is arbitrary or principled. The following piece of code uses functions covered in Ch 21, to print out the names of only the factors. keep(gss_cat, is.factor) %>% names() #> [1] "marital" "race" "rincome" "partyid" "relig" "denom" There are five six categorical variables: marital, race, rincome, partyid, relig, denom. The ordering of marital is “somewhat principled”. There is some sort of logic in that the levels are grouped “never married”, married at some point (separated, divorced, widowed), and “married”; though it would seem that “Never Married”, “Divorced”, “Widowed”, “Separated”, “Married” might be more natural. I find that the question of ordering can be determined by the level of aggregation in a categorical variable, and there can be more “partially ordered” factors than one would expect. levels(gss_cat[["marital"]]) #> [1] "No answer" "Never married" "Separated" "Divorced" #> [5] "Widowed" "Married" gss_cat %>% ggplot(aes(x = marital)) + geom_bar() The ordering of race is principled in that the categories are ordered by count of observations in the data. levels(gss_cat$race) #> [1] "Other" "Black" "White" "Not applicable" gss_cat %>% ggplot(aes(race)) + geom_bar(drop = FALSE) #> Warning: Ignoring unknown parameters: drop The levels of rincome are ordered in decreasing order of the income; however the placement of “No answer”, “Don’t know”, and “Refused” before, and “Not applicable” after the income levels is arbitrary. It would be better to place all the missing income level categories either before or after all the known values. levels(gss_cat$rincome) #> [1] "No answer" "Don't know" "Refused" "$25000 or more" #> [5] "$20000 - 24999" "$15000 - 19999" "$10000 - 14999" "$8000 to 9999" #> [9] "$7000 to 7999" "$6000 to 6999" "$5000 to 5999" "$4000 to 4999" #> [13] "$3000 to 3999" "$1000 to 2999" "Lt $1000" "Not applicable" The levels of relig is arbitrary: there is no natural ordering, and they don’t appear to be ordered by stats within the dataset. levels(gss_cat$relig) #> [1] "No answer" "Don't know" #> [3] "Inter-nondenominational" "Native american" #> [5] "Christian" "Orthodox-christian" #> [7] "Moslem/islam" "Other eastern" #> [9] "Hinduism" "Buddhism" #> [11] "Other" "None" #> [13] "Jewish" "Catholic" #> [15] "Protestant" "Not applicable" gss_cat %>% ggplot(aes(relig)) + geom_bar() + coord_flip() The same goes for denom. 
levels(gss_cat$denom) #> [1] "No answer" "Don't know" "No denomination" #> [4] "Other" "Episcopal" "Presbyterian-dk wh" #> [7] "Presbyterian, merged" "Other presbyterian" "United pres ch in us" #> [10] "Presbyterian c in us" "Lutheran-dk which" "Evangelical luth" #> [13] "Other lutheran" "Wi evan luth synod" "Lutheran-mo synod" #> [16] "Luth ch in america" "Am lutheran" "Methodist-dk which" #> [19] "Other methodist" "United methodist" "Afr meth ep zion" #> [22] "Afr meth episcopal" "Baptist-dk which" "Other baptists" #> [25] "Southern baptist" "Nat bapt conv usa" "Nat bapt conv of am" #> [28] "Am bapt ch in usa" "Am baptist asso" "Not applicable" Ignoring “No answer”, “Don’t know”, and “Other party”, the levels of partyid are ordered from “Strong republican” to “Strong democrat”. levels(gss_cat$partyid) #> [1] "No answer" "Don't know" "Other party" #> [4] "Strong republican" "Not str republican" "Ind,near rep" #> [7] "Independent" "Ind,near dem" "Not str democrat" #> [10] "Strong democrat" Why did moving “Not applicable” to the front of the levels move it to the bottom of the plot? Because that gives the level “Not applicable” an integer value of 1, and ggplot2 places the first level at the origin of the axis, which is the bottom. 11.5 Modifying factor levels 11.5.1 Exercises How have the proportions of people identifying as Democrat, Republican, and Independent changed over time? To answer that, we need to combine the multiple levels into Democrat, Republican, and Independent: levels(gss_cat$partyid) #> [1] "No answer" "Don't know" "Other party" #> [4] "Strong republican" "Not str republican" "Ind,near rep" #> [7] "Independent" "Ind,near dem" "Not str democrat" #> [10] "Strong democrat" gss_cat %>% mutate(partyid = fct_collapse(partyid, other = c("No answer", "Don't know", "Other party"), rep = c("Strong republican", "Not str republican"), ind = c("Ind,near rep", "Independent", "Ind,near dem"), dem = c("Not str democrat", "Strong democrat"))) %>% count(year, partyid) %>% group_by(year) %>% mutate(p = n / sum(n)) %>% ggplot(aes(x = year, y = p, colour = fct_reorder2(partyid, year, p))) + geom_point() + geom_line() + labs(colour = "Party ID.") How could you collapse rincome into a small set of categories? Group all the non-responses into one category, and then group the other categories into a smaller number. Since there is a clear ordering, we wouldn’t want to use something like fct_lump. levels(gss_cat$rincome) #> [1] "No answer" "Don't know" "Refused" "$25000 or more" #> [5] "$20000 - 24999" "$15000 - 19999" "$10000 - 14999" "$8000 to 9999" #> [9] "$7000 to 7999" "$6000 to 6999" "$5000 to 5999" "$4000 to 4999" #> [13] "$3000 to 3999" "$1000 to 2999" "Lt $1000" "Not applicable" library("stringr") gss_cat %>% mutate(rincome = fct_collapse( rincome, `Unknown` = c("No answer", "Don't know", "Refused", "Not applicable"), `Lt $5000` = c("Lt $1000", str_c("$", c("1000", "3000", "4000"), " to ", c("2999", "3999", "4999"))), `$5000 to 10000` = str_c("$", c("5000", "6000", "7000", "8000"), " to ", c("5999", "6999", "7999", "9999")) )) %>% ggplot(aes(x = rincome)) + geom_bar() + coord_flip() "], +["dates-and-times.html", "12 Dates and Times 12.1 Prerequisite 12.2 Creating date/times 12.3 Date-Time Components 12.4 Time Spans", " 12 Dates and Times lubridate today, now ymd etc., ymd_hms etc.
make_datetime, make_date as_datetime, as_date year, month, mday, yday, wday and year<- floor_date, round_date, ceiling_date update as.duration, duration functions (ddays, etc) period functions (days, months, etc) interval creation with %--% with_tz, force_tz hms package has times Ideas for applications: CDB90 data, COW war start end and duration Read more on time-zones: https://en.wikipedia.org/wiki/Time_zone Computerphile: The Problem with Time & Timezones The history of the tz database is itself interesting: https://en.wikipedia.org/wiki/Tz_database A literary appreciation of the Olson/Zoneinfo/tz database I think time-zones are likely a point for social science research in and of themselves. Policy choices. Coordination. Regression discontinuity designs. Just sayin… 12.1 Prerequisite library(tidyverse) library(lubridate) library(nycflights13) 12.2 Creating date/times NOTE %/% is integer division: divide and throw away the remainder. %% calculates the modulus (the remainder of division). For example, to test for an even number: x %% 2 == 0; or odd: x %% 2 == 1. To get the thousands value of a number: x %/% 1000. make_datetime_100 <- function(year, month, day, time) { make_datetime(year, month, day, time %/% 100, time %% 100) } flights_dt <- flights %>% filter(!is.na(dep_time), !is.na(arr_time)) %>% mutate( dep_time = make_datetime_100(year, month, day, dep_time), arr_time = make_datetime_100(year, month, day, arr_time), sched_dep_time = make_datetime_100(year, month, day, sched_dep_time), sched_arr_time = make_datetime_100(year, month, day, sched_arr_time) ) %>% select(origin, dest, ends_with("delay"), ends_with("time")) flights_dt %>% head #> # A tibble: 6 × 9 #> origin dest dep_delay arr_delay dep_time sched_dep_time #> <chr> <chr> <dbl> <dbl> <dttm> <dttm> #> 1 EWR IAH 2 11 2013-01-01 05:17:00 2013-01-01 05:15:00 #> 2 LGA IAH 4 20 2013-01-01 05:33:00 2013-01-01 05:29:00 #> 3 JFK MIA 2 33 2013-01-01 05:42:00 2013-01-01 05:40:00 #> 4 JFK BQN -1 -18 2013-01-01 05:44:00 2013-01-01 05:45:00 #> 5 LGA ATL -6 -25 2013-01-01 05:54:00 2013-01-01 06:00:00 #> 6 EWR ORD -4 12 2013-01-01 05:54:00 2013-01-01 05:58:00 #> # ... with 3 more variables: arr_time <dttm>, sched_arr_time <dttm>, #> # air_time <dbl> Times are often stored as integer counts since a reference time, called an epoch. The most common epoch is the UNIX (or POSIX) epoch of January 1st, 1970 00:00:00. So, internally, times are stored as the number of days, seconds, or milliseconds, etc. since 1970-01-01 00:00:00.000. Calculate dates and datetimes from number of seconds (as_datetime) or days (as_date) from the Unix epoch. as_datetime(60 * 60 * 10) #> [1] "1970-01-01 10:00:00 UTC" as_date(365 * 10 + 2) #> [1] "1980-01-01" 12.2.1 Exercises What happens if you parse a string that contains invalid dates? ret <- ymd(c("2010-10-10", "bananas")) #> Warning: 1 failed to parse. print(class(ret)) #> [1] "Date" ret #> [1] "2010-10-10" NA It produces an NA and a warning message. What does the tzone argument to today() do? Why is it important? It determines the time-zone of the date. Since different time-zones can have different dates, the value of today() can vary depending on the time-zone specified.
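As a quick illustration (a sketch, not from the original text; the dates printed depend on when you run it), two zones on opposite sides of the date line can disagree about what “today” is for a large part of each day:
# near midnight UTC these will often return different dates
today(tzone = "US/Hawaii")
today(tzone = "Pacific/Auckland")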
Use the appropriate lubridate function to parse each of the following dates: d1 <- "January 1, 2010" mdy(d1) #> [1] "2010-01-01" d2 <- "2015-Mar-07" ymd(d2) #> [1] "2015-03-07" d3 <- "06-Jun-2017" dmy(d3) #> [1] "2017-06-06" d4 <- c("August 19 (2015)", "July 1 (2015)") mdy(d4) #> [1] "2015-08-19" "2015-07-01" d5 <- "12/30/14" # Dec 30, 2014 mdy(d5) #> [1] "2014-12-30" 12.3 Date-Time Components sched_dep <- flights_dt %>% mutate(minute = minute(sched_dep_time)) %>% group_by(minute) %>% summarise( avg_delay = mean(arr_delay, na.rm = TRUE), n = n()) Note The difference between rounded and unrounded dates provides the within-period time. (datetime <- ymd_hms("2016-07-08 12:34:56")) #> [1] "2016-07-08 12:34:56 UTC" year(datetime) <- 2020 datetime #> [1] "2020-07-08 12:34:56 UTC" month(datetime) <- 01 datetime #> [1] "2020-01-08 12:34:56 UTC" hour(datetime) <- hour(datetime) + 1 datetime #> [1] "2020-01-08 13:34:56 UTC" 12.3.1 Exercises How does the distribution of flight times within a day change over the course of the year? Let’s try plotting this by month: flights_dt %>% mutate(time = hour(dep_time) * 100 + minute(dep_time), mon = as.factor(month(dep_time))) %>% ggplot(aes(x = time, group = mon, color = mon)) + geom_freqpoly(binwidth = 100) This will look better if everything is normalized within groups. The reason that February is lower is that there are fewer days and thus fewer flights. flights_dt %>% mutate(time = hour(dep_time) * 100 + minute(dep_time), mon = as.factor(month(dep_time))) %>% ggplot(aes(x = time, y = ..density.., group = mon, color = mon)) + geom_freqpoly(binwidth = 100) At least to me there doesn’t appear to be much difference in the within-day distribution over the year, but I may be thinking about it incorrectly. Compare dep_time, sched_dep_time and dep_delay. Are they consistent? Explain your findings. If they are consistent, then dep_time = sched_dep_time + dep_delay. flights_dt %>% mutate(dep_time_ = sched_dep_time + dep_delay * 60) %>% filter(dep_time_ != dep_time) %>% select(dep_time_, dep_time, sched_dep_time, dep_delay) #> # A tibble: 1,205 × 4 #> dep_time_ dep_time sched_dep_time dep_delay #> <dttm> <dttm> <dttm> <dbl> #> 1 2013-01-02 08:48:00 2013-01-01 08:48:00 2013-01-01 18:35:00 853 #> 2 2013-01-03 00:42:00 2013-01-02 00:42:00 2013-01-02 23:59:00 43 #> 3 2013-01-03 01:26:00 2013-01-02 01:26:00 2013-01-02 22:50:00 156 #> 4 2013-01-04 00:32:00 2013-01-03 00:32:00 2013-01-03 23:59:00 33 #> 5 2013-01-04 00:50:00 2013-01-03 00:50:00 2013-01-03 21:45:00 185 #> 6 2013-01-04 02:35:00 2013-01-03 02:35:00 2013-01-03 23:59:00 156 #> # ... with 1,199 more rows There exist discrepancies. It looks like there are mistakes in the dates. These are flights in which the actual departure time is on the next day relative to the scheduled departure time. We forgot to account for this when creating the date-times. The code would have had to check whether the departure time is less than the scheduled departure time. Alternatively, simply adding the delay time is more robust because it will automatically account for crossing into the next day. Compare air_time with the duration between the departure and arrival. Explain your findings.
flights_dt %>% mutate(flight_duration = as.numeric(arr_time - dep_time), air_time_mins = air_time, diff = flight_duration - air_time_mins) %>% select(origin, dest, flight_duration, air_time_mins, diff) #> # A tibble: 328,063 × 5 #> origin dest flight_duration air_time_mins diff #> <chr> <chr> <dbl> <dbl> <dbl> #> 1 EWR IAH 193 227 -34 #> 2 LGA IAH 197 227 -30 #> 3 JFK MIA 221 160 61 #> 4 JFK BQN 260 183 77 #> 5 LGA ATL 138 116 22 #> 6 EWR ORD 106 150 -44 #> # ... with 3.281e+05 more rows How does the average delay time change over the course of a day? Should you use dep_time or sched_dep_time? Why? Use sched_dep_time because that is the relevant metric for someone scheduling a flight. Also, using dep_time will always bias delays to later in the day since delays will push flights later. flights_dt %>% mutate(sched_dep_hour = hour(sched_dep_time)) %>% group_by(sched_dep_hour) %>% summarise(dep_delay = mean(dep_delay)) %>% ggplot(aes(y = dep_delay, x = sched_dep_hour)) + geom_point() + geom_smooth() #> `geom_smooth()` using method = 'loess' On what day of the week should you leave if you want to minimise the chance of a delay? Sunday has the lowest average departure delay time and the lowest average arrival delay time. flights_dt %>% mutate(dow = wday(sched_dep_time)) %>% group_by(dow) %>% summarise(dep_delay = mean(dep_delay), arr_delay = mean(arr_delay, na.rm = TRUE)) #> # A tibble: 7 × 3 #> dow dep_delay arr_delay #> <dbl> <dbl> <dbl> #> 1 1 11.5 4.82 #> 2 2 14.7 9.65 #> 3 3 10.6 5.39 #> 4 4 11.7 7.05 #> 5 5 16.1 11.74 #> 6 6 14.7 9.07 #> # ... with 1 more rows What makes the distribution of diamonds$carat and flights$sched_dep_time similar? ggplot(diamonds, aes(x = carat)) + geom_density() In both carat and sched_dep_time there are abnormally large numbers of values are at nice “human” numbers. In sched_dep_time it is at 00 and 30 minutes. In carats, it is at 0, 1/3, 1/2, 2/3, ggplot(diamonds, aes(x = carat %% 1 * 100)) + geom_histogram(binwidth = 1) In scheduled departure times it is 00 and 30 minutes, and minutes ending in 0 and 5. ggplot(flights_dt, aes(x = minute(sched_dep_time))) + geom_histogram(binwidth = 1) Confirm my hypothesis that the early departures of flights in minutes 20-30 and 50-60 are caused by scheduled flights that leave early. Hint: create a binary variable that tells you whether or not a flight was delayed. At the minute level, there doesn’t appear to be anything: flights_dt %>% mutate(early = dep_delay < 0, minute = minute(sched_dep_time)) %>% group_by(minute) %>% summarise(early = mean(early)) %>% ggplot(aes(x = minute, y = early)) + geom_point() But if grouped in 10 minute intervals, there is a higher proportion of early flights during those minutes. flights_dt %>% mutate(early = dep_delay < 0, minute = minute(sched_dep_time) %% 10) %>% group_by(minute) %>% summarise(early = mean(early)) %>% ggplot(aes(x = minute, y = early)) + geom_point() 12.4 Time Spans duration: exact number of seconds period: human time periods - e.g. weeks, months interval: start and end points 12.4.1 Durations No exercises 12.4.2 Periods Define overnight when arr_time < dep_time (no flights > 24 hours): flights_dt <- flights_dt %>% mutate( overnight = arr_time < dep_time, arr_time = arr_time + days(overnight * 1), sched_arr_time = sched_arr_time + days(overnight * 1) ) 12.4.3 Intervals NOTE This section seems less complete than the others. Refer to the lubridate vignette for more information. 12.4.4 Exercises Why is there months() but no dmonths()? 
There is no direct unambiguous value of months in seconds: 31 days: Jan, Mar, May, Jul, Aug, Oct, Dec; 30 days: Apr, Jun, Sep, Nov; 28 or 29 days: Feb. Though in the past, in the pre-computer era, for arithmetic convenience, bankers adopted a 360-day year with 30-day months. Explain days(overnight * 1) to someone who has just started learning R. How does it work? overnight is equal to TRUE (1) or FALSE (0). So if it is an overnight flight, this becomes 1 day, and if not, then overnight = 0, and no days are added to the date. Create a vector of dates giving the first day of every month in 2015. Create a vector of dates giving the first day of every month in the current year. A vector of the first day of the month for every month in 2015: ymd("2015-01-01") + months(0:11) #> [1] "2015-01-01" "2015-02-01" "2015-03-01" "2015-04-01" "2015-05-01" #> [6] "2015-06-01" "2015-07-01" "2015-08-01" "2015-09-01" "2015-10-01" #> [11] "2015-11-01" "2015-12-01" To get the vector of the first day of the month for this year, we first need to figure out what this year is, and get January 1st of it. I can do that by taking today() and truncating it to the year using floor_date: floor_date(today(), unit = "year") + months(0:11) #> [1] "2017-01-01" "2017-02-01" "2017-03-01" "2017-04-01" "2017-05-01" #> [6] "2017-06-01" "2017-07-01" "2017-08-01" "2017-09-01" "2017-10-01" #> [11] "2017-11-01" "2017-12-01" Write a function that given your birthday (as a date), returns how old you are in years. age <- function(bday) { (bday %--% today()) %/% years(1) } age(ymd("1990-10-12")) #> Note: method with signature 'Timespan#Timespan' chosen for function '%/%', #> target signature 'Interval#Period'. #> "Interval#ANY", "ANY#Period" would also be valid #> [1] 26 Why can’t (today() %--% (today() + years(1))) / months(1) work? It appears to work. Today is a date. Today + 1 year is a valid endpoint for an interval. And months(1) is a period that is well-defined within this interval. (today() %--% (today() + years(1))) %/% months(1) #> [1] 12 (today() %--% (today() + years(1))) / months(1) #> [1] 12 12.4.5 Time Zones No exercises. But time-zones are hell. Be happy you aren’t dealing with financial data. "], +["program-intro.html", "13 Introduction", " 13 Introduction "], +["pipes.html", "14 Pipes", " 14 Pipes No exercises in this chapter. "], +["vectors.html", "15 Vectors 15.1 Introduction 15.2 Important types of Atomic Vector 15.3 Using atomic vectors 15.4 Recursive Vectors (lists) 15.5 Augmented Vectors", " 15 Vectors 15.1 Introduction Functions mentioned typeof dplyr::near is.finite, is.nan, is.na attributes library("tidyverse") #> Loading tidyverse: ggplot2 #> Loading tidyverse: tibble #> Loading tidyverse: tidyr #> Loading tidyverse: readr #> Loading tidyverse: purrr #> Loading tidyverse: dplyr #> Conflicts with tidy packages ---------------------------------------------- #> filter(): dplyr, stats #> lag(): dplyr, stats 15.2 Important types of Atomic Vector Why does this matter? 99% of the time in the work you do, it won’t. Someone else has written the numerical methods and (hopefully) accounted for these issues. And the types of problems you encounter in social science generally are not dealing with these issues. However, if you aren’t even aware that “floating point numbers” are a “thing”, then if something goes wrong, it will seem like magic. Also, at least being aware of these problems will help you understand error messages from optimization routines that complain of “numerical precision”.
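A minimal illustration of the kind of surprise floating-point arithmetic can produce (a sketch, not from the original text; dplyr::near() comes up again in the exercises below):
x <- 0.1 + 0.2
x == 0.3               # exact comparison fails
#> [1] FALSE
print(x, digits = 17)  # the stored value is not exactly 0.3
#> [1] 0.30000000000000004
dplyr::near(x, 0.3)    # comparison within a tolerance succeeds
#> [1] TRUE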
15.2.1 Exercises Describe the difference between is.finite(x) and !is.infinite(x). To find out, try the functions on a numeric vector that includes a number and the five special values (NA, NaN, Inf, -Inf). x <- c(0, NA, NaN, Inf, -Inf) is.finite(x) #> [1] TRUE FALSE FALSE FALSE FALSE !is.infinite(x) #> [1] TRUE TRUE TRUE FALSE FALSE is.finite considers only actual numbers to be finite, and considers missing (NA), not a number (NaN), and positive and negative infinity to be not finite. However, since is.infinite only considers Inf and -Inf to be infinite, !is.infinite considers 0 as well as missing and not-a-number to be not infinite. So NA and NaN are neither finite nor infinite. Mind blown. Read the source code for dplyr::near() (Hint: to see the source code, drop the ()). How does it work? The source for dplyr::near is: dplyr::near #> function (x, y, tol = .Machine$double.eps^0.5) #> { #> abs(x - y) < tol #> } #> <environment: namespace:dplyr> Instead of checking for exact equality, it checks that two numbers are within a certain tolerance, tol. By default the tolerance is set to the square root of .Machine$double.eps, the machine epsilon: the smallest positive floating-point number x such that 1 + x != 1. A logical vector can take 3 possible values. How many possible values can an integer vector take? How many possible values can a double take? Use google to do some research. The help for .Machine describes some of this: As all current implementations of R use 32-bit integers and use IEC 60559 floating-point (double precision) arithmetic. A 32-bit integer has 2^32 bit patterns; R reserves one of them (-2^31) for NA_integer_, so an integer can take 2^32 - 1 distinct values plus NA. The IEC 60559 (IEEE 754) double format uses 64 bits, so a double can take vastly more values, though it can represent integers exactly only up to 2^53, because only 53 bits effectively encode the significand. Brainstorm at least four functions that allow you to convert a double to an integer. How do they differ? Be precise. Broadly, one could convert a double to an integer by truncating or rounding to the nearest integer. For truncating or for handling ties (doubles ending in 0.5), there are multiple methods for determining which integer value to go to:
method                            0.5   -0.5   1.5   -1.5
towards zero                      0      0     1     -1
away from zero                    1     -1     2     -2
largest (towards \\(+\\infty\\))  1      0     2     -1
smallest (towards \\(-\\infty\\)) 0     -1     1     -2
ties to even                      0      0     2     -2
ties to odd                       1     -1     1     -1
See the Wikipedia article IEEE floating point for rounding rules. For rounding, R and many programming languages use the IEEE standard. This is “round to nearest, ties to even”, which is not the same as the “round half up” rule most of us learned in school. It is also worth looking at the value of .Machine$double.rounding and its documentation. For comparison, round2 always rounds ties up (towards \\(+\\infty\\)): x <- seq(-10, 10, by = 0.5) round2 <- function(x) { q <- x %/% 1 r <- x %% 1 q + (r >= 0.5) } x <- c(-12.5, -11.5, 11.5, 12.5) round(x) #> [1] -12 -12 12 12 round2(x) #> [1] -12 -11 12 13 The problem with the always-round-0.5-up rule is that it is biased upwards. Rounding to nearest with ties towards even is not. Consider the sequence \\(-100.5, -99.5, \\dots, 0, \\dots, 99.5, 100.5\\). Its sum is 0. It would be nice if rounding preserved that sum. Using “ties towards even”, the sum is still zero. However, “ties towards \\(+\\infty\\)” produces a non-zero number. x <- seq(-100.5, 100.5, by = 1) sum(x) #> [1] 0 sum(round(x)) #> [1] 0 sum(round2(x)) #> [1] 101 Here’s a real-world non-engineering example of rounding going terribly wrong. In 1983, the Vancouver stock exchange adjusted its index from 524.811 to 1098.892 to correct for accumulated error due to rounding to three decimal points (see Vancouver Stock Exchange). Here’s a list of a few more.
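The exercise asks for four functions; for reference, here is a minimal sketch (not from the original text) of four base R ways to go from double to integer, differing in how they treat negative values and ties:
x <- c(1.5, -1.5, 2.5)
as.integer(trunc(x))    # towards zero
#> [1]  1 -1  2
as.integer(floor(x))    # towards -Inf
#> [1]  1 -2  2
as.integer(ceiling(x))  # towards +Inf
#> [1]  2 -1  3
as.integer(round(x))    # nearest, ties to even
#> [1]  2 -2  2
Note that trunc(), floor(), ceiling(), and round() all return doubles, hence the as.integer() wrapper to actually change the type.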
What functions from the readr package allow you to turn a string into logical, integer, and double vector? The functions parse_logical, parse_integer, and parse_number. parse_logical(c("TRUE", "FALSE", "1", "0", "true", "t", "NA")) #> [1] TRUE FALSE TRUE FALSE TRUE TRUE NA parse_integer(c("1235", "0134", "NA")) #> [1] 1235 134 NA parse_number(c("1.0", "3.5", "1,000", "NA")) #> [1] 1.0 3.5 1000.0 NA Read the documentation of parse_number. It uses a heuristic to ignore non-numeric characters in number strings, such as currency symbols and comma separators. 15.3 Using atomic vectors What does mean(is.na(x)) tell you about a vector x? What about sum(!is.finite(x))? The expression mean(is.na(x)) calculates the proportion of missing values in a vector x <- c(1:10, NA, NaN, Inf, -Inf) mean(is.na(x)) #> [1] 0.143 The expression sum(!is.finite(x)) counts the values that are NA, NaN, or infinite; equivalently, mean(!is.finite(x)) calculates their proportion. mean(!is.finite(x)) #> [1] 0.286 Carefully read the documentation of is.vector(). What does it actually test for? Why does is.atomic() not agree with the definition of atomic vectors above? The function is.vector only checks whether the object has no attributes other than names. Thus a list is a vector: is.vector(list(a = 1, b = 2)) #> [1] TRUE But any object that has an attribute (other than names) is not: x <- 1:10 attr(x, "something") <- TRUE is.vector(x) #> [1] FALSE The idea behind this is that object-oriented classes will include attributes, including, but not limited to "class". The function is.atomic explicitly checks whether an object is one of the atomic types (“logical”, “integer”, “numeric”, “complex”, “character”, and “raw”) or NULL. is.atomic(1:10) #> [1] TRUE is.atomic(list(a = 1)) #> [1] FALSE The function is.atomic will consider objects to be atomic even if they have extra attributes. is.atomic(x) #> [1] TRUE Compare and contrast setNames() with purrr::set_names(). These are simple functions, so we can simply print out their source code: setNames #> function (object = nm, nm) #> { #> names(object) <- nm #> object #> } #> <bytecode: 0x7fb54902f638> #> <environment: namespace:stats> purrr::set_names #> function (x, nm = x) #> { #> if (!is_vector(x)) { #> stop("`x` must be a vector", call. = FALSE) #> } #> if (length(x) != length(nm)) { #> stop("`x` and `nm` must be the same length", call. = FALSE) #> } #> names(x) <- nm #> x #> } #> <environment: namespace:purrr> From the code we can see that set_names adds a few sanity checks: x has to be a vector, and the lengths of the object and the names have to be the same. Create functions that take a vector as input and return: The last value. Should you use [ or [[? The elements at even numbered positions. Every element except the last value. Only even numbers (and no missing values).
last_value <- function(x) { # check for case with no length if (length(x)) { # Use [[ as suggested because it returns one element x[[length(x)]] } else { x } } last_value(numeric()) #> numeric(0) last_value(1) #> [1] 1 last_value(1:10) #> [1] 10 even_indices <- function(x) { if (length(x)) { x[seq_along(x) %% 2 == 0] } else { x } } even_indices(numeric()) #> numeric(0) even_indices(1) #> numeric(0) even_indices(1:10) #> [1] 2 4 6 8 10 # test using case to ensure that values not indices # are being returned even_indices(letters) #> [1] "b" "d" "f" "h" "j" "l" "n" "p" "r" "t" "v" "x" "z" not_last <- function(x) { if (length(x)) { x[-length(x)] } else { x } } not_last(1:5) #> [1] 1 2 3 4 even_numbers <- function(x) { x[!is.na(x) & (x %% 2 == 0)] } even_numbers(-10:10) #> [1] -10 -8 -6 -4 -2 0 2 4 6 8 10 Why is x[-which(x > 0)] not the same as x[x <= 0]? They will treat missing values differently. x <- c(-5:5, Inf, -Inf, NaN, NA) x[-which(x > 0)] #> [1] -5 -4 -3 -2 -1 0 -Inf NaN NA -which(x > 0) #> [1] -7 -8 -9 -10 -11 -12 x[x <= 0] #> [1] -5 -4 -3 -2 -1 0 -Inf NA NA x <= 0 #> [1] TRUE TRUE TRUE TRUE TRUE TRUE FALSE FALSE FALSE FALSE FALSE #> [12] FALSE TRUE NA NA -which(x > 0) which calculates the indexes for any value that is TRUE and ignores NA. Thus is keeps NA and NaN because the comparison is not TRUE. x <= 0 works slightly differently. If x <= 0 returns TRUE or FALSE it works the same way. Hoewver, if the comparison generates a NA, then it will always keep that entry, but set it to NA. This is why the last two values of x[x <= 0] are NA rather than c(NaN, NA). What happens when you subset with a positive integer that’s bigger than the length of the vector? What happens when you subset with a name that doesn’t exist? When you subset with positive integers that are larger than the length of the vector, NA values are returned for those integers larger than the length of the vector. (1:10)[11:12] #> [1] NA NA When a vector is subset with a name that doesn’t exist, an error is generated. c(a = 1, 2)[["b"]] #> Error in c(a = 1, 2)[["b"]]: subscript out of bounds 15.4 Recursive Vectors (lists) 15.4.1 Exercises Draw the following lists as nested sets: list(a, b, list(c, d), list(e, f)) list(list(list(list(list(list(a)))))) What happens if you subset a tibble as if you’re subsetting a list? What are the key differences between a list and a tibble? Subsetting a tibble works the same way as a list; a data frame can be thought of as a list of columns. The key different between a list and a tibble is that a tibble (data frame) has the restriction that all its elements (columns) must have the same length. x <- tibble(a = 1:2, b = 3:4) x[["a"]] #> [1] 1 2 x["a"] #> # A tibble: 2 × 1 #> a #> <int> #> 1 1 #> 2 2 x[1] #> # A tibble: 2 × 1 #> a #> <int> #> 1 1 #> 2 2 x[1, ] #> # A tibble: 1 × 2 #> a b #> <int> <int> #> 1 1 3 15.5 Augmented Vectors 15.5.1 Exercises What does hms::hms(3600) return? How does it print? What primitive type is the augmented vector built on top of? What attributes does it use? x <- hms::hms(3600) class(x) #> [1] "hms" "difftime" x #> 01:00:00 hms::hms returns an object of class, and prints the time in “%H:%M:%S” format. The primitive type is a double typeof(x) #> [1] "double" The atttributes is uses are "units" and "class". attributes(x) #> $units #> [1] "secs" #> #> $class #> [1] "hms" "difftime" Try and make a tibble that has columns with different lengths. What happens? 
If I try to create a tibble with a scalar and a column of a different length there are no issues, and the scalar is repeated to the length of the longer vector. tibble(x = 1, y = 1:5) #> # A tibble: 5 × 2 #> x y #> <dbl> <int> #> 1 1 1 #> 2 1 2 #> 3 1 3 #> 4 1 4 #> 5 1 5 However, if I try to create a tibble with two vectors of different lengths (other than one), the tibble function throws an error. tibble(x = 1:3, y = 1:4) #> Error: Variables must be length 1 or 4. #> Problem variables: 'x' Based on the definition above, is it ok to have a list as a column of a tibble? If I didn’t already know the answer, what I would do is try it out. From the above, the error message was about vectors having different lengths. But there is nothing that prevents a tibble from having vectors of different types: doubles, character, integers, logical, factor, date. The latter are still atomic, but they have additional attributes. So, maybe there won’t be an issue with a list vector as long as it is the same length. tibble(x = 1:3, y = list("a", 1, list(1:3))) #> # A tibble: 3 × 2 #> x y #> <int> <list> #> 1 1 <chr [1]> #> 2 2 <dbl [1]> #> 3 3 <list [1]> It works! I even used a list with heterogeneous types and there wasn’t an issue. In the following chapters we’ll see that list vectors can be very useful: for example, when processing many different models. "], +["iteration.html", "16 Iteration 16.1 Introduction 16.2 For Loops 16.3 For loop variations 16.4 For loops vs. functionals 16.5 The map functions 16.6 Dealing with Failure 16.7 Mapping over multiple arguments 16.8 Walk 16.9 Other patterns of for loops", " 16 Iteration 16.1 Introduction purrr package for loop while seq_len, seq_along unlist bind_rows, bind_cols, purrr::flatten_dbl Map functions in purrr: map and type-specific variants map_lgl, map_chr, map_int, map_dbl. col_summary apply function in base R: lapply, sapply, vapply safely, quietly, possibly walk and variants keep, discard, some, every, head_while, tail_while, detect, detect_index reduce library("tidyverse") library("stringr") The package microbenchmark is used for timing code library("microbenchmark") 16.2 For Loops 16.2.1 Exercises Write for loops to: Compute the mean of every column in mtcars. Determine the type of each column in nycflights13::flights. Compute the number of unique values in each column of iris. Generate 10 random normals for each of \\(\\mu = -10\\), 0, 10, and 100. Think about the output, sequence, and body before you start writing the loop. To compute the mean of every column in mtcars. output <- vector("double", ncol(mtcars)) names(output) <- names(mtcars) for (i in names(mtcars)) { output[i] <- mean(mtcars[[i]]) } output #> mpg cyl disp hp drat wt qsec vs am #> 20.091 6.188 230.722 146.688 3.597 3.217 17.849 0.438 0.406 #> gear carb #> 3.688 2.812 Determine the type of each column in nycflights13::flights. Note that we need to use a list, not a character vector, since the class can have multiple values.
data("flights", package = "nycflights13") output <- vector("list", ncol(flights)) names(output) <- names(flights) for (i in names(flights)) { output[[i]] <- class(flights[[i]]) } output #> $year #> [1] "integer" #> #> $month #> [1] "integer" #> #> $day #> [1] "integer" #> #> $dep_time #> [1] "integer" #> #> $sched_dep_time #> [1] "integer" #> #> $dep_delay #> [1] "numeric" #> #> $arr_time #> [1] "integer" #> #> $sched_arr_time #> [1] "integer" #> #> $arr_delay #> [1] "numeric" #> #> $carrier #> [1] "character" #> #> $flight #> [1] "integer" #> #> $tailnum #> [1] "character" #> #> $origin #> [1] "character" #> #> $dest #> [1] "character" #> #> $air_time #> [1] "numeric" #> #> $distance #> [1] "numeric" #> #> $hour #> [1] "numeric" #> #> $minute #> [1] "numeric" #> #> $time_hour #> [1] "POSIXct" "POSIXt" data(iris) iris_uniq <- vector("double", ncol(iris)) names(iris_uniq) <- names(iris) for (i in names(iris)) { iris_uniq[i] <- length(unique(iris[[i]])) } iris_uniq #> Sepal.Length Sepal.Width Petal.Length Petal.Width Species #> 35 23 43 22 3 # number to draw n <- 10 # values of the mean mu <- c(-10, 0, 10, 100) normals <- vector("list", length(mu)) for (i in seq_along(normals)) { normals[[i]] <- rnorm(n, mean = mu[i]) } normals #> [[1]] #> [1] -11.40 -9.74 -12.44 -10.01 -9.38 -8.85 -11.82 -10.25 -10.24 -10.28 #> #> [[2]] #> [1] -0.5537 0.6290 2.0650 -1.6310 0.5124 -1.8630 -0.5220 -0.0526 #> [9] 0.5430 -0.9141 #> #> [[3]] #> [1] 10.47 10.36 8.70 10.74 11.89 9.90 9.06 9.98 9.17 8.49 #> #> [[4]] #> [1] 100.9 100.2 100.2 101.6 100.1 99.9 98.1 99.7 99.7 101.1 However, we don’t need a for loop for this since rnorm recycles means. matrix(rnorm(n * length(mu), mean = mu), ncol = n) #> [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] #> [1,] -9.930 -9.56 -9.88 -10.2061 -12.27 -8.926 -11.178 -9.51 -8.663 #> [2,] -0.639 2.76 -1.91 0.0192 2.68 -0.665 -0.976 -1.70 0.237 #> [3,] 9.950 10.05 10.86 10.0296 9.64 11.114 11.065 8.53 11.318 #> [4,] 99.749 100.58 99.76 100.5498 100.21 99.754 100.132 100.28 100.524 #> [,10] #> [1,] -9.39 #> [2,] -0.11 #> [3,] 10.17 #> [4,] 99.91 Eliminate the for loop in each of the following examples by taking advantage of an existing function that works with vectors: out <- "" for (x in letters) { out <- stringr::str_c(out, x) } out #> [1] "abcdefghijklmnopqrstuvwxyz" str_c already works with vectors, so simply use str_c with the collapse argument to return a single string. stringr::str_c(letters, collapse = "") #> [1] "abcdefghijklmnopqrstuvwxyz" For this I’m going to rename the variable sd to something different because sd is the name of the function we want to use. x <- sample(100) sd. <- 0 for (i in seq_along(x)) { sd. <- sd. + (x[i] - mean(x)) ^ 2 } sd. <- sqrt(sd. / (length(x) - 1)) sd. #> [1] 29 We could simply use the sd function. sd(x) #> [1] 29 Or if there was a need to use the equation (e.g. 
for pedagogical reasons), then the functions mean and sum already work with vectors: sqrt(sum((x - mean(x)) ^ 2) / (length(x) - 1)) #> [1] 29 x <- runif(100) out <- vector("numeric", length(x)) out[1] <- x[1] for (i in 2:length(x)) { out[i] <- out[i - 1] + x[i] } out #> [1] 0.126 1.064 1.865 2.623 3.156 3.703 3.799 4.187 4.359 5.050 #> [11] 5.725 6.672 6.868 7.836 8.224 8.874 9.688 9.759 10.286 11.050 #> [21] 11.485 12.038 12.242 12.273 13.242 13.421 14.199 15.085 15.921 16.527 #> [31] 17.434 17.470 17.601 17.695 18.392 18.797 18.863 18.989 19.927 20.143 #> [41] 20.809 21.013 21.562 22.389 22.517 22.778 23.066 23.081 23.935 24.349 #> [51] 25.100 25.819 26.334 27.309 27.670 27.840 28.623 28.654 29.444 29.610 #> [61] 29.639 30.425 31.250 32.216 32.594 32.769 33.372 34.178 34.215 34.947 #> [71] 35.163 35.179 35.307 35.993 36.635 36.963 37.350 38.058 38.755 39.681 #> [81] 40.140 40.736 40.901 41.468 42.366 42.960 43.792 44.386 45.165 45.562 #> [91] 46.412 47.154 47.472 47.583 47.685 48.485 48.865 48.917 49.904 50.508 The code above is calculating a cumulative sum. Use the function cumsum all.equal(cumsum(x),out) #> [1] TRUE Ex. 21.2.1.3 Combine your function writing and for loop skills: 1. Write a for loop that `prints()` the lyrics to the children's song "Alice the camel". 2. Convert the nursery rhyme "ten in the bed" to a function. Generalise it to any number of people in any sleeping structure. 3. Convert the song "99 bottles of beer on the wall" to a function. Generalise to any number of any vessel containing any liquid on any surface. I don’t know what the deal is with Hadley and nursery rhymes. Here are the lyrics for Alice the Camel. We’ll count down from five to no humps, and print out a different last line if there are no humps. This uses cat instead of print, so it looks nicer. humps <- c("five", "four", "three", "two", "one", "no") for (i in humps) { cat(str_c("Alice the camel has ", rep(i, 3), " humps.", collapse = "\\n"), "\\n") if (i == "no") { cat("Now Alice is a horse.\\n") } else { cat("So go, Alice, go.\\n") } cat("\\n") } #> Alice the camel has five humps. #> Alice the camel has five humps. #> Alice the camel has five humps. #> So go, Alice, go. #> #> Alice the camel has four humps. #> Alice the camel has four humps. #> Alice the camel has four humps. #> So go, Alice, go. #> #> Alice the camel has three humps. #> Alice the camel has three humps. #> Alice the camel has three humps. #> So go, Alice, go. #> #> Alice the camel has two humps. #> Alice the camel has two humps. #> Alice the camel has two humps. #> So go, Alice, go. #> #> Alice the camel has one humps. #> Alice the camel has one humps. #> Alice the camel has one humps. #> So go, Alice, go. #> #> Alice the camel has no humps. #> Alice the camel has no humps. #> Alice the camel has no humps. #> Now Alice is a horse. (Note that this prints “one humps”; handling the singular correctly is left as an exercise.) The lyrics for Ten in the Bed: numbers <- c("ten", "nine", "eight", "seven", "six", "five", "four", "three", "two", "one") for (i in numbers) { cat(str_c("There were ", i, " in the bed\\n")) cat("and the little one said\\n") if (i == "one") { cat("I'm lonely...") } else { cat("Roll over, roll over\\n") cat("So they all rolled over and one fell out.\\n") } cat("\\n") } #> There were ten in the bed #> and the little one said #> Roll over, roll over #> So they all rolled over and one fell out. #> #> There were nine in the bed #> and the little one said #> Roll over, roll over #> So they all rolled over and one fell out.
#> #> There were eight in the bed #> and the little one said #> Roll over, roll over #> So they all rolled over and one fell out. #> #> There were seven in the bed #> and the little one said #> Roll over, roll over #> So they all rolled over and one fell out. #> #> There were six in the bed #> and the little one said #> Roll over, roll over #> So they all rolled over and one fell out. #> #> There were five in the bed #> and the little one said #> Roll over, roll over #> So they all rolled over and one fell out. #> #> There were four in the bed #> and the little one said #> Roll over, roll over #> So they all rolled over and one fell out. #> #> There were three in the bed #> and the little one said #> Roll over, roll over #> So they all rolled over and one fell out. #> #> There were two in the bed #> and the little one said #> Roll over, roll over #> So they all rolled over and one fell out. #> #> There were one in the bed #> and the little one said #> I'm lonely... For the bottles of beer, I define a helper function to correctly print the number of bottles. bottles <- function(i) { if (i > 2) { bottles <- str_c(i - 1, " bottles") } else if (i == 2) { bottles <- "1 bottle" } else { bottles <- "no more bottles" } bottles } beer_bottles <- function(n) { # should test whether n >= 1. for (i in seq(n, 1)) { cat(str_c(bottles(i), " of beer on the wall, ", bottles(i), " of beer.\\n")) cat(str_c("Take one down and pass it around, ", bottles(i - 1), " of beer on the wall.\\n\\n")) } cat("No more bottles of beer on the wall, no more bottles of beer.\\n") cat(str_c("Go to the store and buy some more, ", bottles(n), " of beer on the wall.\\n")) } beer_bottles(3) #> 2 bottles of beer on the wall, 2 bottles of beer. #> Take one down and pass it around, 1 bottle of beer on the wall. #> #> 1 bottle of beer on the wall, 1 bottle of beer. #> Take one down and pass it around, no more bottles of beer on the wall. #> #> no more bottles of beer on the wall, no more bottles of beer. #> Take one down and pass it around, no more bottles of beer on the wall. #> #> No more bottles of beer on the wall, no more bottles of beer. #> Go to the store and buy some more, 2 bottles of beer on the wall. Ex 21.2.1.4 It’s common to see for loops that don’t preallocate the output and instead increase the length of a vector at each step: output <- vector("integer", 0) for (i in seq_along(x)) { output <- c(output, lengths(x[[i]])) } output I’ll use the package microbenchmark to time this. Microbenchmark will run an R expression a number of times and time it. Define a function that appends to an integer vector. add_to_vector <- function(n) { output <- vector("integer", 0) for (i in seq_len(n)) { output <- c(output, i) } output } microbenchmark(add_to_vector(10000), times = 3) #> Unit: milliseconds #> expr min lq mean median uq max neval #> add_to_vector(10000) 185 196 201 206 209 211 3 And one that pre-allocates it. add_to_vector_2 <- function(n) { output <- vector("integer", n) for (i in seq_len(n)) { output[[i]] <- i } output } microbenchmark(add_to_vector_2(10000), times = 3) #> Unit: milliseconds #> expr min lq mean median uq max neval #> add_to_vector_2(10000) 7.05 7.14 8.02 7.23 8.5 9.77 3 The pre-allocated vector is about 100 times faster! YMMV, but the longer the vector and the bigger the objects, the more that pre-allocation will outperform appending. 16.3 For loop variations 16.3.1 Ex Imagine you have a directory full of CSV files that you want to read in. 
You have their paths in a vector, files <- dir("data/", pattern = "\\\\.csv$", full.names = TRUE), and now want to read each one with read_csv(). Write the for loop that will load them into a single data frame. I pre-allocate a list and read each file as a data frame into an element of that list. This creates a list of data frames. I then use bind_rows to create a single data frame from the list of data frames. df <- vector("list", length(files)) for (i in seq_along(files)) { df[[i]] <- read_csv(files[[i]]) } df <- bind_rows(df) Ex What happens if you use for (nm in names(x)) and x has no names? What if only some of the elements are named? What if the names are not unique? Let’s try it out and see what happens. When there are no names for the vector, it does not run the code in the loop (it runs zero iterations of the loop): x <- 1:3 print(names(x)) #> NULL for (nm in names(x)) { print(nm) print(x[[nm]]) } Note that the length of NULL is zero: length(NULL) #> [1] 0 If only some elements are named, then we get an error if we try to access an element without a name. Oddly, nm == "" for the unnamed elements. x <- c(a = 1, 2, c = 3) names(x) #> [1] "a" "" "c" for (nm in names(x)) { print(nm) print(x[[nm]]) } #> [1] "a" #> [1] 1 #> [1] "" #> Error in x[[nm]]: subscript out of bounds Finally, if there are duplicate names, then x[[nm]] will give the first element with that name. There is no way to access elements with duplicated names by name. x <- c(a = 1, a = 2, c = 3) names(x) #> [1] "a" "a" "c" for (nm in names(x)) { print(nm) print(x[[nm]]) } #> [1] "a" #> [1] 1 #> [1] "a" #> [1] 1 #> [1] "c" #> [1] 3 Ex Write a function that prints the mean of each numeric column in a data frame, along with its name. For example, show_mean(iris) would print: show_mean(iris) #> Sepal.Length: 5.84 #> Sepal.Width: 3.06 #> Petal.Length: 3.76 #> Petal.Width: 1.20 (Extra challenge: what function did I use to make sure that the numbers lined up nicely, even though the variable names had different lengths?) There may be other functions to do this, but I’ll use str_pad and str_length to ensure that the space given to the variable names is the same. I messed around with the options to format until I got two digits. show_mean <- function(df, digits = 2) { # Get max length of any variable in the dataset maxstr <- max(str_length(names(df))) for (nm in names(df)) { if (is.numeric(df[[nm]])) { cat(str_c(str_pad(str_c(nm, ":"), maxstr + 1L, side = "right"), format(mean(df[[nm]]), digits = digits, nsmall = digits), sep = " "), "\\n") } } } show_mean(iris) #> Sepal.Length: 5.84 #> Sepal.Width: 3.06 #> Petal.Length: 3.76 #> Petal.Width: 1.20 Ex What does this code do? How does it work? trans <- list( disp = function(x) x * 0.0163871, am = function(x) { factor(x, labels = c("auto", "manual")) } ) for (var in names(trans)) { mtcars[[var]] <- trans[[var]](mtcars[[var]]) } This code mutates the disp and am columns: disp is multiplied by 0.0163871 (converting cubic inches to litres), and am is replaced by a factor variable. The code works by looping over a named list of functions. It calls the named function in the list on the column of mtcars with the same name, and replaces the values of that column. E.g. this is a function: trans[["disp"]] This applies the function to the column of mtcars with the same name: trans[["disp"]](mtcars[["disp"]]) 16.4 For loops vs. functionals col_summary <- function(df, fun) { out <- vector("double", length(df)) for (i in seq_along(df)) { out[i] <- fun(df[[i]]) } out } 16.4.1 Exercises Ex. 21.4.1.1 Read the documentation for apply().
In the 2d case, what two for loops does it generalise? It generalises looping over the rows or columns of a matrix or data-frame. Ex. 21.4.1.2 Adapt col_summary() so that it only applies to numeric columns. You might want to start with an is_numeric() function that returns a logical vector that has a TRUE corresponding to each numeric column. col_summary2 <- function(df, fun) { # test whether each column is numeric numeric_cols <- vector("logical", length(df)) for (i in seq_along(df)) { numeric_cols[[i]] <- is.numeric(df[[i]]) } # indexes of numeric columns idxs <- seq_along(df)[numeric_cols] out <- vector("double", length(idxs)) # fill out by position, not by the original column index for (i in seq_along(idxs)) { out[[i]] <- fun(df[[idxs[[i]]]]) } out } Let’s test that it works: df <- tibble( a = rnorm(10), b = rnorm(10), c = letters[1:10], d = rnorm(10) ) col_summary2(df, mean) This returns one mean for each of the three numeric columns (a, b, and d). 16.5 The map functions 16.5.1 Shortcuts Notes The lm() function runs a linear regression. It is covered in the Model Basics chapter. 16.5.2 Exercises Ex Write code that uses one of the map functions to: 1. Compute the mean of every column in `mtcars`. 2. Determine the type of each column in `nycflights13::flights`. 3. Compute the number of unique values in each column of `iris`. 4. Generate 10 random normals for each of $\\mu = -10$, $0$, $10$, and $100$. The mean of every column in mtcars: map_dbl(mtcars, mean) #> mpg cyl disp hp drat wt qsec vs am #> 20.091 6.188 230.722 146.688 3.597 3.217 17.849 0.438 0.406 #> gear carb #> 3.688 2.812 The type of every column in nycflights13::flights. map(nycflights13::flights, class) #> $year #> [1] "integer" #> #> $month #> [1] "integer" #> #> $day #> [1] "integer" #> #> $dep_time #> [1] "integer" #> #> $sched_dep_time #> [1] "integer" #> #> $dep_delay #> [1] "numeric" #> #> $arr_time #> [1] "integer" #> #> $sched_arr_time #> [1] "integer" #> #> $arr_delay #> [1] "numeric" #> #> $carrier #> [1] "character" #> #> $flight #> [1] "integer" #> #> $tailnum #> [1] "character" #> #> $origin #> [1] "character" #> #> $dest #> [1] "character" #> #> $air_time #> [1] "numeric" #> #> $distance #> [1] "numeric" #> #> $hour #> [1] "numeric" #> #> $minute #> [1] "numeric" #> #> $time_hour #> [1] "POSIXct" "POSIXt" I had to use map rather than map_chr since the class of some columns, such as time_hour, has more than one element. Though if “type” means typeof: map_chr(nycflights13::flights, typeof) #> year month day dep_time sched_dep_time #> "integer" "integer" "integer" "integer" "integer" #> dep_delay arr_time sched_arr_time arr_delay carrier #> "double" "integer" "integer" "double" "character" #> flight tailnum origin dest air_time #> "integer" "character" "character" "character" "double" #> distance hour minute time_hour #> "double" "double" "double" "double" The number of unique values in each column of iris: map_int(iris, ~ length(unique(.))) #> Sepal.Length Sepal.Width Petal.Length Petal.Width Species #> 35 23 43 22 3 Generate 10 random normals for each of \\(\\mu = -10\\), \\(0\\), \\(10\\), and \\(100\\): map(c(-10, 0, 10, 100), rnorm, n = 10) #> [[1]] #> [1] -11.27 -9.46 -9.92 -9.44 -9.58 -11.45 -9.06 -10.34 -10.08 -9.96 #> #> [[2]] #> [1] 0.124 -0.998 1.233 0.340 -0.473 0.709 -1.529 0.237 -1.313 0.747 #> #> [[3]] #> [1] 8.44 10.07 9.36 9.15 10.68 11.15 8.31 9.10 11.32 11.10 #> #> [[4]] #> [1] 101.2 98.6 101.4 100.0 99.9 100.4 100.1 99.2 99.5 98.8 Ex How can you create a single vector that for each column in a data frame indicates whether or not it’s a factor?
Ex How can you create a single vector that for each column in a data frame indicates whether or not it's a factor?

Use map_lgl() with the function is.factor:

map_lgl(mtcars, is.factor)
#> mpg cyl disp hp drat wt qsec vs am gear carb #> FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE

Ex What happens when you use the map functions on vectors that aren't lists? What does map(1:5, runif) do? Why?

The map functions loop over any vector, not just lists. Since the first argument of runif() is n, the number of draws, map(1:5, runif) calls runif(1), runif(2), ..., runif(5), and returns a list of numeric vectors with lengths one through five.

map(1:5, runif)
#> [[1]] #> [1] 0.226 #> #> [[2]] #> [1] 0.133 0.927 #> #> [[3]] #> [1] 0.894 0.204 0.257 #> #> [[4]] #> [1] 0.614 0.441 0.316 0.101 #> #> [[5]] #> [1] 0.2726 0.6537 0.9279 0.0266 0.5595

Ex What does map(-2:2, rnorm, n = 5) do? Why? What does map_dbl(-2:2, rnorm, n = 5) do? Why?

This takes samples of size n = 5 from normal distributions with means -2, -1, 0, 1, and 2, and returns a list in which each element is a numeric vector of length 5.

map(-2:2, rnorm, n = 5)
#> [[1]] #> [1] -0.945 -2.821 -2.638 -2.153 -3.416 #> #> [[2]] #> [1] -0.393 -0.912 -2.570 -0.687 -0.347 #> #> [[3]] #> [1] -0.00796 1.72703 2.08647 -0.35835 -1.44212 #> #> [[4]] #> [1] 1.38 1.09 1.16 1.36 0.64 #> #> [[5]] #> [1] 1.8914 3.8278 0.0381 2.9460 2.5490

However, map_dbl() throws an error, because map_dbl() expects the function to return a numeric vector of length one.

map_dbl(-2:2, rnorm, n = 5)
#> Error: Result 1 is not a length 1 atomic vector

If we wanted a single numeric vector, we could use map() followed by flatten_dbl():

flatten_dbl(map(-2:2, rnorm, n = 5))
#> [1] -1.402 -1.872 -3.717 -1.964 -0.993 -0.287 -2.110 -0.851 -1.386 -1.230 #> [11] 0.392 0.470 0.989 -0.714 1.270 1.709 2.047 -0.210 1.380 0.933 #> [21] 2.280 2.330 2.285 2.429 1.879

Ex Rewrite map(x, function(df) lm(mpg ~ wt, data = df)) to eliminate the anonymous function.

map(list(mtcars), ~ lm(mpg ~ wt, data = .))
#> [[1]] #> #> Call: #> lm(formula = mpg ~ wt, data = .) #> #> Coefficients: #> (Intercept) wt #> 37.29 -5.34

16.6 Dealing with Failure

16.7 Mapping over multiple arguments

16.8 Walk

16.9 Other patterns of for loops

16.9.1 Exercises

Ex Implement your own version of every() using a for loop. Compare it with purrr::every(). What does purrr's version do that your version doesn't?

# use ... to pass arguments to the predicate
every2 <- function(.x, .p, ...) {
  for (i in .x) {
    if (!.p(i, ...)) {
      # if any element is FALSE, then not all of them were TRUE
      return(FALSE)
    }
  }
  # if nothing was FALSE, then everything was TRUE
  TRUE
}
every2(1:3, function(x) {x > 1})
#> [1] FALSE
every2(1:3, function(x) {x > 0})
#> [1] TRUE

The function purrr::every() does fancier things with .p: for example, it also accepts shortcuts such as a formula or a string in place of a function.

Ex Create an enhanced col_sum() that applies a summary function to every numeric column in a data frame.

Note this question has a typo: it is referring to col_summary(). I will use map() to apply the function to all the columns, and keep() to select only the numeric columns.

col_sum2 <- function(df, f, ...) {
  map(keep(df, is.numeric), f, ...)
}
col_sum2(iris, mean)
#> $Sepal.Length #> [1] 5.84 #> #> $Sepal.Width #> [1] 3.06 #> #> $Petal.Length #> [1] 3.76 #> #> $Petal.Width #> [1] 1.2
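Because col_sum2() forwards ... to the summary function, extra arguments work as expected; a small sketch, using mean()'s trim argument as one option:

col_sum2(iris, mean, trim = 0.1)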
Ex A possible base R equivalent of col_sum() is:

col_sum3 <- function(df, f) {
  is_num <- sapply(df, is.numeric)
  df_num <- df[, is_num]
  sapply(df_num, f)
}

But it has a number of bugs as illustrated with the following inputs:

df <- tibble(
  x = 1:3,
  y = 3:1,
  z = c("a", "b", "c")
)
# OK
col_sum3(df, mean)
# Has problems: doesn't always return a numeric vector
col_sum3(df[1:2], mean)
col_sum3(df[1], mean)
col_sum3(df[0], mean)

What causes the bugs?

The problem is that sapply() doesn't always return numeric vectors. If no columns are selected, instead of gracefully exiting, it returns an empty list. This causes an error, since we can't use a list with [.

sapply(df[0], is.numeric)
#> named list()
sapply(df[1], is.numeric)
#> x #> TRUE
sapply(df[1:2], is.numeric)
#> x y #> TRUE TRUE

"], +["model-intro.html", "17 Introduction", " 17 Introduction Some of the discussion of models differs in content and emphasis from most social science research. This is largely because this book is speaking to data scientists, for whom the primary goal is prediction rather than theory testing (that I don't view these as so different is another story). The discussion about hypothesis generation vs. confirmation is interesting: too little emphasis is placed on hypothesis generation in social science. The importance of out-of-sample testing also receives too little emphasis in political science. And from this discussion it should be clear that many papers in social science are hypothesis generation masquerading as hypothesis confirmation. "], +["model-basics.html", "18 Model Basics 18.1 Prerequisites 18.2 A simple model 18.3 Visualizing Models 18.4 Formulas and Model Families 18.5 Missing values 18.6 Other model families", " 18 Model Basics

The distinction between a family of models and a fitted model is a useful way to think about models, especially as some families of models can themselves be viewed as fitted models from a more flexible family. For example, linear regression is a special case of GLMs, Gaussian processes, etc.

18.1 Prerequisites

library(tidyverse)
library(modelr)
options(na.action = na.warn)

The option na.action determines how missing values are handled; it is a function. na.warn sets it so that there is a warning if there are any missing values (by default, R just silently drops them).

18.2 A simple model

ggplot(sim1, aes(x, y)) +
  geom_point()
models <- tibble(
  a1 = runif(250, -20, 40),
  a2 = runif(250, -5, 5)
)
ggplot(sim1, aes(x, y)) +
  geom_abline(aes(intercept = a1, slope = a2), data = models, alpha = 1/4) +
  geom_point()
model1 <- function(a, data) {
  a[1] + data$x * a[2]
}
model1(c(7, 1.5), sim1)
#> [1] 8.5 8.5 8.5 10.0 10.0 10.0 11.5 11.5 11.5 13.0 13.0 13.0 14.5 14.5 #> [15] 14.5 16.0 16.0 16.0 17.5 17.5 17.5 19.0 19.0 19.0 20.5 20.5 20.5 22.0 #> [29] 22.0 22.0
measure_distance <- function(mod, data) {
  diff <- data$y - model1(mod, data)
  sqrt(mean(diff ^ 2))
}
measure_distance(c(7, 1.5), sim1)
#> [1] 2.67
sim1_dist <- function(a1, a2) {
  measure_distance(c(a1, a2), sim1)
}
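In other words, measure_distance() computes the root-mean-squared deviation between the data and the model's predictions, \\(\\sqrt{\\frac{1}{n} \\sum_{i=1}^{n} (y_i - \\hat{y}_i)^2}\\), and sim1_dist() fixes the data so that the distance is a function of the parameters alone.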
models <- models %>%
  mutate(dist = purrr::map2_dbl(a1, a2, sim1_dist))
models
#> # A tibble: 250 × 3 #> a1 a2 dist #> <dbl> <dbl> <dbl> #> 1 -15.15 0.0889 30.8 #> 2 30.06 -0.8274 13.2 #> 3 16.05 2.2695 13.2 #> 4 -10.57 1.3769 18.7 #> 5 -19.56 -1.0359 41.8 #> 6 7.98 4.5948 19.3 #> # ... with 244 more rows
ggplot(sim1, aes(x, y)) +
  geom_point(size = 2, colour = "grey30") +
  geom_abline(
    aes(intercept = a1, slope = a2, colour = -dist),
    data = filter(models, rank(dist) <= 10)
  )
grid <- expand.grid(
  a1 = seq(-5, 20, length = 25),
  a2 = seq(1, 3, length = 25)
) %>%
  mutate(dist = purrr::map2_dbl(a1, a2, sim1_dist))
grid %>%
  ggplot(aes(a1, a2)) +
  geom_point(data = filter(grid, rank(dist) <= 10), size = 4, colour = "red") +
  geom_point(aes(colour = -dist))
ggplot(sim1, aes(x, y)) +
  geom_point(size = 2, colour = "grey30") +
  geom_abline(
    aes(intercept = a1, slope = a2, colour = -dist),
    data = filter(grid, rank(dist) <= 10)
  )
best <- optim(c(0, 0), measure_distance, data = sim1)
best$par
#> [1] 4.22 2.05
ggplot(sim1, aes(x, y)) +
  geom_point(size = 2, colour = "grey30") +
  geom_abline(intercept = best$par[1], slope = best$par[2])
sim1_mod <- lm(y ~ x, data = sim1)
coef(sim1_mod)
#> (Intercept) x #> 4.22 2.05

18.2.1 Exercises

Ex Fit a linear model to data simulated with heavy-tailed noise and visualise the results:

sim1a <- tibble(
  x = rep(1:10, each = 3),
  y = x * 1.5 + 6 + rt(length(x), df = 2)
)
lm(y ~ x, data = sim1a)
#> #> Call: #> lm(formula = y ~ x, data = sim1a) #> #> Coefficients: #> (Intercept) x #> 6.05 1.53
ggplot(sim1a, aes(x = x, y = y)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE)

To re-run this a few times using purrr:

simt <- function(i) {
  tibble(
    x = rep(1:10, each = 3),
    y = x * 1.5 + 6 + rt(length(x), df = 2),
    .id = i
  )
}
lm_df <- function(.data) {
  mod <- lm(y ~ x, data = .data)
  beta <- coef(mod)
  tibble(intercept = beta[1], slope = beta[2])
}
sims <- map(1:100, simt) %>%
  map_df(lm_df)
ggplot(sims, aes(x = intercept, y = slope)) +
  geom_point()

NOTE It's not entirely clear what is meant by "visualize the results". The data are generated from a t-distribution with a low degrees of freedom, so there will be outliers, and linear regression is sensitive to them because the squared distance gives points far from the line a large weight.

One way to make linear models more robust is to use a different distance measure. For example, instead of root-mean-squared distance, you could use mean-absolute distance (make_prediction() here stands for the model's prediction function, model1() above):

measure_distance <- function(mod, data) {
  diff <- data$y - make_prediction(mod, data)
  mean(abs(diff))
}

One challenge with performing numerical optimisation is that it's only guaranteed to find a local optimum. What's the problem with optimising a three-parameter model like this?

model1 <- function(a, data) {
  a[1] + data$x * a[2] + a[3]
}

The problem is that the model is not identified: for any values a[1] = a1 and a[3] = a3, every other pair of values with the same sum, a[1] + a[3] == a1 + a3, produces exactly the same fit. Only the sum of the two intercept terms matters.
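To see the identifiability problem concretely, a minimal sketch using the three-parameter model1() from the question: shifting weight between a[1] and a[3] leaves the predictions unchanged.

# (0 + 1) and (1 + 0) sum to the same intercept, so the fits are identical
identical(model1(c(0, 2, 1), sim1), model1(c(1, 2, 0), sim1))
#> [1] TRUE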
18.3 Visualizing Models

More complicated models can be visualized with their predictions and their residuals.

Notes:
tidyr::complete(), tidyr::expand(), and modelr::data_grid() build grids of values.
modelr::add_residuals() and modelr::add_predictions() add residuals or predictions to the original data.
geom_ref_line() adds a reference line.

grid <- sim1 %>%
  data_grid(x)
grid
#> # A tibble: 10 × 1 #> x #> <int> #> 1 1 #> 2 2 #> 3 3 #> 4 4 #> 5 5 #> 6 6 #> # ... with 4 more rows
grid <- grid %>%
  add_predictions(sim1_mod)
grid
#> # A tibble: 10 × 2 #> x pred #> <int> <dbl> #> 1 1 6.27 #> 2 2 8.32 #> 3 3 10.38 #> 4 4 12.43 #> 5 5 14.48 #> 6 6 16.53 #> # ... with 4 more rows
ggplot(sim1, aes(x)) +
  geom_point(aes(y = y)) +
  geom_line(aes(y = pred), data = grid, colour = "red", size = 1)
sim1 <- sim1 %>%
  add_residuals(sim1_mod)
sim1
#> # A tibble: 30 × 3 #> x y resid #> <int> <dbl> <dbl> #> 1 1 4.20 -2.072 #> 2 1 7.51 1.238 #> 3 1 2.13 -4.147 #> 4 2 8.99 0.665 #> 5 2 10.24 1.919 #> 6 2 11.30 2.973 #> # ... with 24 more rows
ggplot(sim1, aes(resid)) +
  geom_freqpoly(binwidth = 0.5)
ggplot(sim1, aes(x, resid)) +
  geom_ref_line(h = 0) +
  geom_point()

18.3.1 Exercises

Instead of using lm() to fit a straight line, you can use loess() to fit a smooth curve. Repeat the process of model fitting, grid generation, predictions, and visualisation on sim1 using loess() instead of lm(). How does the result compare to geom_smooth()?

I'll use add_predictions() and add_residuals() to add the predictions and residuals from a loess regression to the sim1 data.

sim1_loess <- loess(y ~ x, data = sim1)
grid_loess <- sim1 %>%
  add_predictions(sim1_loess)
sim1 <- sim1 %>%
  add_residuals(sim1_loess, var = "resid_loess") %>%
  add_predictions(sim1_loess, var = "pred_loess")

This plots the loess predictions. The loess fit produces a smooth, nonlinear line through the data.

plot_sim1_loess <- ggplot(sim1, aes(x = x, y = y)) +
  geom_point() +
  geom_line(aes(x = x, y = pred), data = grid_loess, colour = "red")
plot_sim1_loess

The predictions of loess are the same as the default method for geom_smooth() because geom_smooth() uses loess() by default; the message even tells us that.

plot_sim1_loess +
  geom_smooth(colour = "blue", se = FALSE, alpha = 0.20)
#> `geom_smooth()` using method = 'loess'

We can plot the loess residuals (red), and compare them to the residuals from lm (black). In general, the loess model has smaller residuals within the sample (out of sample is a different issue, and we haven't considered the uncertainty of these estimates).

ggplot(sim1, aes(x = x)) +
  geom_ref_line(h = 0) +
  geom_point(aes(y = resid)) +
  geom_point(aes(y = resid_loess), colour = "red")

add_predictions() is paired with gather_predictions() and spread_predictions(). How do these three functions differ?

The functions gather_predictions() and spread_predictions() add predictions from multiple models at once: gather_predictions() stacks the rows for each model and adds a model column, while spread_predictions() adds one prediction column per model.

What does geom_ref_line() do? What package does it come from? Why is displaying a reference line in plots showing residuals useful and important?

The geom geom_ref_line() adds a reference line to a plot. Even though it alters a ggplot2 plot, it is in the modelr package. Putting a reference line at zero for residuals is important because good models (generally) should have residuals centered at zero, with approximately the same variance (or distribution) over the support of x, and no correlation. A zero reference line makes it easier to judge these characteristics visually.

Why might you want to look at a frequency polygon of absolute residuals? What are the pros and cons compared to looking at the raw residuals?

The frequency polygon makes it easier to judge whether the variance and/or absolute size of the residuals varies with respect to x. This is called heteroskedasticity, and it results in incorrect standard errors in inference. In prediction, it provides insight into where the model is working well and where it is not. What is lost is the sign: since absolute values are shown, you can no longer tell whether the model is over-predicting, under-predicting, or on average correctly predicting in different regions of x.
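A minimal sketch of such a plot, assuming the resid column added to sim1 above (the binwidth is an arbitrary choice):

ggplot(sim1, aes(abs(resid))) +
  geom_freqpoly(binwidth = 0.5)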
18.4 Formulas and Model Families

df <- tribble(
  ~y, ~x1, ~x2,
  4, 2, 5,
  5, 1, 6
)
model_matrix(df, y ~ x1)
#> # A tibble: 2 × 2 #> `(Intercept)` x1 #> <dbl> <dbl> #> 1 1 2 #> 2 1 1
model_matrix(df, y ~ x1 - 1)
#> # A tibble: 2 × 1 #> x1 #> <dbl> #> 1 2 #> 2 1

18.4.1 Categorical Variables

df <- tribble(
  ~ sex, ~ response,
  "male", 1,
  "female", 2,
  "male", 1
)
model_matrix(df, response ~ sex)
#> # A tibble: 3 × 2 #> `(Intercept)` sexmale #> <dbl> <dbl> #> 1 1 1 #> 2 1 0 #> 3 1 1
ggplot(sim2) +
  geom_point(aes(x, y))
mod2 <- lm(y ~ x, data = sim2)
grid <- sim2 %>%
  data_grid(x) %>%
  add_predictions(mod2)
grid
#> # A tibble: 4 × 2 #> x pred #> <chr> <dbl> #> 1 a 1.15 #> 2 b 8.12 #> 3 c 6.13 #> 4 d 1.91
ggplot(sim3, aes(x1, y)) +
  geom_point(aes(colour = x2))
mod1 <- lm(y ~ x1 + x2, data = sim3)
mod2 <- lm(y ~ x1 * x2, data = sim3)
grid <- sim3 %>%
  data_grid(x1, x2) %>%
  gather_predictions(mod1, mod2)
grid
#> # A tibble: 80 × 4 #> model x1 x2 pred #> <chr> <int> <fctr> <dbl> #> 1 mod1 1 a 1.67 #> 2 mod1 1 b 4.56 #> 3 mod1 1 c 6.48 #> 4 mod1 1 d 4.03 #> 5 mod1 2 a 1.48 #> 6 mod1 2 b 4.37 #> # ... with 74 more rows
ggplot(sim3, aes(x1, y, colour = x2)) +
  geom_point() +
  geom_line(data = grid, aes(y = pred)) +
  facet_wrap(~ model)
sim3 <- sim3 %>%
  gather_residuals(mod1, mod2)
ggplot(sim3, aes(x1, resid, colour = x2)) +
  geom_point() +
  facet_grid(model ~ x2)
mod1 <- lm(y ~ x1 + x2, data = sim4)
mod2 <- lm(y ~ x1 * x2, data = sim4)
grid <- sim4 %>%
  data_grid(
    x1 = seq_range(x1, 5),
    x2 = seq_range(x2, 5)
  ) %>%
  gather_predictions(mod1, mod2)
grid
#> # A tibble: 50 × 4 #> model x1 x2 pred #> <chr> <dbl> <dbl> <dbl> #> 1 mod1 -1.0 -1.0 0.996 #> 2 mod1 -1.0 -0.5 -0.395 #> 3 mod1 -1.0 0.0 -1.786 #> 4 mod1 -1.0 0.5 -3.177 #> 5 mod1 -1.0 1.0 -4.569 #> 6 mod1 -0.5 -1.0 1.907 #> # ... with 44 more rows

The function seq_range() is useful.

ggplot(grid, aes(x1, x2)) +
  geom_tile(aes(fill = pred)) +
  facet_wrap(~ model)
ggplot(grid, aes(x1, pred, colour = x2, group = x2)) +
  geom_line() +
  facet_wrap(~ model)
ggplot(grid, aes(x2, pred, colour = x1, group = x1)) +
  geom_line() +
  facet_wrap(~ model)

TODO We should visualize interactions with plotly.
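The * in these interaction formulas expands into main effects plus a product column in the design matrix; a small sketch, re-creating the toy data frame from the start of this section (the values follow directly from the data):

df <- tribble(
  ~y, ~x1, ~x2,
  4, 2, 5,
  5, 1, 6
)
model_matrix(df, y ~ x1 * x2)
#> # A tibble: 2 × 4 #> `(Intercept)` x1 x2 `x1:x2` #> <dbl> <dbl> <dbl> <dbl> #> 1 1 2 5 10 #> 2 1 1 6 6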
18.4.2 Exercises

18.5 Missing values

TODO Need to write a tidyverse-compliant na.omit function.

18.6 Other model families

NOTE It's worth mentioning these as more general models, though they don't appear as much in social science work; I should try to explain why. I can think of several reasons: a preference for easy-to-explain models (though I think that's wrong: most people can't visualize high-dimensional space well, and they interpret results marginally even though the results are conditional); and status-quo bias and path dependence, combined with a lack of knowledge of work outside the field and a median lack of technical ability to understand or use these models. The most principled reason is that those more complicated models really excel at prediction. If we take an agnostic approach to regression, as in the Angrist and Pischke books, then regression isn't being used to fit \\(f(y | x)\\), it's being used to fit \\(E(f(y | x))\\), and more specifically to get some sort of average effect for a change in a specific variable. "], +["communicate-intro.html", "19 Introduction", " 19 Introduction "], +["r-markdown.html", "20 R Markdown 20.1 R Markdown Basics 20.2 Text formatting with R Markdown", " 20 R Markdown

20.1 R Markdown Basics

The chapter doesn't describe what YAML is. Some references:

https://en.wikipedia.org/wiki/YAML
The Ansible guide to YAML is pretty simple; you don't need to know what Ansible is: http://docs.ansible.com/ansible/YAMLSyntax.html
https://learnxinyminutes.com/docs/yaml/
http://codebeautify.org/yaml-validator
https://docs.saltstack.com/en/latest/topics/yaml/

20.1.1 Exercise

Create a new notebook using File > New File > R Notebook. Read the instructions. Practice running the chunks. Verify that you can modify the code, re-run it, and see modified output.

Nothing to show.

Create a new R Markdown document with File > New File > R Markdown… Knit it by clicking the appropriate button. Knit it by using the appropriate keyboard shortcut. Verify that you can modify the input and see the output update.

Compare and contrast the R notebook and R markdown files you created above. How are the outputs similar? How are they different? How are the inputs similar? How are they different? What happens if you copy the YAML header from one to the other?

R notebook files show the output inside the editor, while hiding the console. R Markdown files show the output inside the console, and do not show output inside the editor. They differ in the value of output in their YAML headers: the YAML header for the R notebook has output: html_notebook, while the header for the R Markdown file has output: html_document.
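For reference, a complete minimal YAML header is only a few lines (a sketch; the title is arbitrary):

---
title: "My document"
output: html_document
---

Swapping html_document for html_notebook is the only change needed to switch between the two document types.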
Create one new R Markdown document for each of the three built-in formats: HTML, PDF and Word. Knit each of the three documents. How does the output differ? How does the input differ? (You may need to install LaTeX in order to build the PDF output — RStudio will prompt you if this is necessary.)

They produce different outputs, both in the final documents and in the intermediate files (notably the type of plots produced). The only difference in the inputs is the value of output in the YAML header: word_document for Word documents, pdf_document for PDF documents, and html_document for HTML documents.

20.2 Text formatting with R Markdown

Continue "], +["r-markdown-formats.html", "21 R Markdown Formats", " 21 R Markdown Formats No exercises. This document was built with bookdown. You can see the source at https://github.com/jrnold/e4qf. "], +["r-markdown-workflow.html", "22 R Markdown Workflow", " 22 R Markdown Workflow Notes Find reproducible research articles Need good documentation on YAML and what it is No exercises "]
]
diff --git a/docs/strings.html b/docs/strings.html
index 6693dba4..29cada8b 100644
--- a/docs/strings.html
+++ b/docs/strings.html
[Diff hunks of regenerated HTML, garbled by extraction. The recoverable change is that the table of contents and section headings in docs/strings.html are renumbered from 9.x to 10.x: 9.4.2 Exercises → 10.4.2 Exercises, 9.4.3 Extract Matches → 10.4.3, 9.4.3.1 Exercises → 10.4.3.1, 9.4.4 Grouped Matches → 10.4.4, 9.4.4.1 Exercises → 10.4.4.1, 9.4.5 Splitting → 10.4.5, 9.4.5.1 Exercises → 10.4.5.1, 9.5 Other types of patterns → 10.5, 9.5.1 Exercises → 10.5.1, 9.6 stringi → 10.6, 9.6.1 Exercises → 10.6.1.]
diff --git a/docs/tibbles.html b/docs/tibbles.html
index eb981748..4978f74c 100644
--- a/docs/tibbles.html
+++ b/docs/tibbles.html
[Diff hunks of regenerated HTML, garbled by extraction; the same kind of table-of-contents renumbering; truncated here.]