# Logical vectors

## Comparisions

In [1]:
library(tidyverse)
library(nycflights13)

── [1mAttaching core tidyverse packages[22m ────────────────────────────────────────────────── tidyverse 2.0.0 ──
[32m✔[39m [34mdplyr    [39m 1.1.4     [32m✔[39m [34mreadr    [39m 2.1.5
[32m✔[39m [34mforcats  [39m 1.0.0     [32m✔[39m [34mstringr  [39m 1.5.1
[32m✔[39m [34mggplot2  [39m 3.5.1     [32m✔[39m [34mtibble   [39m 3.2.1
[32m✔[39m [34mlubridate[39m 1.9.4     [32m✔[39m [34mtidyr    [39m 1.3.1
[32m✔[39m [34mpurrr    [39m 1.0.2     
── [1mConflicts[22m ──────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()
[36mℹ[39m Use the ]8;;http://conflicted.r-lib.org/conflicted package]8;; to force all conflicts to become errors


## Summaries

In [2]:
flights |> 
  group_by(year, month, day) |> 
  summarize(
    proportion_delayed = mean(dep_delay <= 60, na.rm = TRUE),
    count_long_delay = sum(arr_delay >= 300, na.rm = TRUE),
    .groups = "drop"
  )

[38;5;246m# A tibble: 365 × 5[39m
    year month   day proportion_delayed count_long_delay
   [3m[38;5;246m<int>[39m[23m [3m[38;5;246m<int>[39m[23m [3m[38;5;246m<int>[39m[23m              [3m[38;5;246m<dbl>[39m[23m            [3m[38;5;246m<int>[39m[23m
[38;5;250m 1[39m  [4m2[24m013     1     1              0.939                3
[38;5;250m 2[39m  [4m2[24m013     1     2              0.914                3
[38;5;250m 3[39m  [4m2[24m013     1     3              0.941                0
[38;5;250m 4[39m  [4m2[24m013     1     4              0.953                0
[38;5;250m 5[39m  [4m2[24m013     1     5              0.964                1
[38;5;250m 6[39m  [4m2[24m013     1     6              0.959                0
[38;5;250m 7[39m  [4m2[24m013     1     7              0.956                1
[38;5;250m 8[39m  [4m2[24m013     1     8              0.975                0
[38;5;250m 9[39m  [4m2[24m013     1     9              0.986     

## Conditional transformations

In [3]:
x <- c(-3:3, NA)

In [4]:
case_when(
  x == 0 ~ "0",
  x < 0  ~ "-ve",
  x > 0  ~ "+ve",
  is.na(x) ~ "???"
)

[1] "-ve" "-ve" "-ve" "0"   "+ve" "+ve" "+ve" "???"

# Numbers

## Making numbers

In [5]:
x <- c("1.2", "5.6", "1e3")
parse_double(x)

[1]    1.2    5.6 1000.0

In [None]:
x <- c("$1,234", "USD 3,513", "59%")
parse_double(x)

警告: 3 parsing failures.
row col               expected    actual
  1  -- a double               $1,234   
  2  -- a double               USD 3,513
  3  -- no trailing characters 59%      



[1] NA NA NA
attr(,"problems")
[38;5;246m# A tibble: 3 × 4[39m
    row   col expected               actual   
  [3m[38;5;246m<int>[39m[23m [3m[38;5;246m<int>[39m[23m [3m[38;5;246m<chr>[39m[23m                  [3m[38;5;246m<chr>[39m[23m    
[38;5;250m1[39m     1    [31mNA[39m a double               $1,234   
[38;5;250m2[39m     2    [31mNA[39m a double               USD 3,513
[38;5;250m3[39m     3    [31mNA[39m no trailing characters 59%      

In [7]:
parse_number(x)

[1] 1234 3513   59

## Counts

In [9]:
flights |> 
  count(dest)

[38;5;246m# A tibble: 105 × 2[39m
   dest      n
   [3m[38;5;246m<chr>[39m[23m [3m[38;5;246m<int>[39m[23m
[38;5;250m 1[39m ABQ     254
[38;5;250m 2[39m ACK     265
[38;5;250m 3[39m ALB     439
[38;5;250m 4[39m ANC       8
[38;5;250m 5[39m ATL   [4m1[24m[4m7[24m215
[38;5;250m 6[39m AUS    [4m2[24m439
[38;5;250m 7[39m AVL     275
[38;5;250m 8[39m BDL     443
[38;5;250m 9[39m BGR     375
[38;5;250m10[39m BHM     297
[38;5;246m# ℹ 95 more rows[39m
[38;5;246m# ℹ Use `print(n = ...)` to see more rows[39m

In [10]:
flights |> 
  group_by(dest) |> 
  summarize(
    n = n(),
    delay = mean(arr_delay, na.rm = TRUE)
  )

[38;5;246m# A tibble: 105 × 3[39m
   dest      n delay
   [3m[38;5;246m<chr>[39m[23m [3m[38;5;246m<int>[39m[23m [3m[38;5;246m<dbl>[39m[23m
[38;5;250m 1[39m ABQ     254  4.38
[38;5;250m 2[39m ACK     265  4.85
[38;5;250m 3[39m ALB     439 14.4 
[38;5;250m 4[39m ANC       8 -[31m2[39m[31m.[39m[31m5[39m 
[38;5;250m 5[39m ATL   [4m1[24m[4m7[24m215 11.3 
[38;5;250m 6[39m AUS    [4m2[24m439  6.02
[38;5;250m 7[39m AVL     275  8.00
[38;5;250m 8[39m BDL     443  7.05
[38;5;250m 9[39m BGR     375  8.03
[38;5;250m10[39m BHM     297 16.9 
[38;5;246m# ℹ 95 more rows[39m
[38;5;246m# ℹ Use `print(n = ...)` to see more rows[39m

In [11]:
flights |> 
  group_by(tailnum) |> 
  summarize(miles = sum(distance))

[38;5;246m# A tibble: 4,044 × 2[39m
   tailnum  miles
   [3m[38;5;246m<chr>[39m[23m    [3m[38;5;246m<dbl>[39m[23m
[38;5;250m 1[39m D942DN    [4m3[24m418
[38;5;250m 2[39m N0EGMQ  [4m2[24m[4m5[24m[4m0[24m866
[38;5;250m 3[39m N10156  [4m1[24m[4m1[24m[4m5[24m966
[38;5;250m 4[39m N102UW   [4m2[24m[4m5[24m722
[38;5;250m 5[39m N103US   [4m2[24m[4m4[24m619
[38;5;250m 6[39m N104UW   [4m2[24m[4m5[24m157
[38;5;250m 7[39m N10575  [4m1[24m[4m5[24m[4m0[24m194
[38;5;250m 8[39m N105UW   [4m2[24m[4m3[24m618
[38;5;250m 9[39m N107US   [4m2[24m[4m1[24m677
[38;5;250m10[39m N108UW   [4m3[24m[4m2[24m070
[38;5;246m# ℹ 4,034 more rows[39m
[38;5;246m# ℹ Use `print(n = ...)` to see more rows[39m

In [12]:
flights |> count(tailnum, wt = distance)

[38;5;246m# A tibble: 4,044 × 2[39m
   tailnum      n
   [3m[38;5;246m<chr>[39m[23m    [3m[38;5;246m<dbl>[39m[23m
[38;5;250m 1[39m D942DN    [4m3[24m418
[38;5;250m 2[39m N0EGMQ  [4m2[24m[4m5[24m[4m0[24m866
[38;5;250m 3[39m N10156  [4m1[24m[4m1[24m[4m5[24m966
[38;5;250m 4[39m N102UW   [4m2[24m[4m5[24m722
[38;5;250m 5[39m N103US   [4m2[24m[4m4[24m619
[38;5;250m 6[39m N104UW   [4m2[24m[4m5[24m157
[38;5;250m 7[39m N10575  [4m1[24m[4m5[24m[4m0[24m194
[38;5;250m 8[39m N105UW   [4m2[24m[4m3[24m618
[38;5;250m 9[39m N107US   [4m2[24m[4m1[24m677
[38;5;250m10[39m N108UW   [4m3[24m[4m2[24m070
[38;5;246m# ℹ 4,034 more rows[39m
[38;5;246m# ℹ Use `print(n = ...)` to see more rows[39m

## Numeric transformations

In [2]:
library(tidyverse)
df <- tribble(
  ~x, ~y,
  1,  3,
  5,  2,
  7, NA,
  )

── [1mAttaching core tidyverse packages[22m ────────────────────────────────────────────────── tidyverse 2.0.0 ──
[32m✔[39m [34mdplyr    [39m 1.1.4     [32m✔[39m [34mreadr    [39m 2.1.5
[32m✔[39m [34mforcats  [39m 1.0.0     [32m✔[39m [34mstringr  [39m 1.5.1
[32m✔[39m [34mggplot2  [39m 3.5.1     [32m✔[39m [34mtibble   [39m 3.2.1
[32m✔[39m [34mlubridate[39m 1.9.4     [32m✔[39m [34mtidyr    [39m 1.3.1
[32m✔[39m [34mpurrr    [39m 1.0.2     
── [1mConflicts[22m ──────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()
[36mℹ[39m Use the ]8;;http://conflicted.r-lib.org/conflicted package]8;; to force all conflicts to become errors


In [3]:
df

[38;5;246m# A tibble: 3 × 2[39m
      x     y
  [3m[38;5;246m<dbl>[39m[23m [3m[38;5;246m<dbl>[39m[23m
[38;5;250m1[39m     1     3
[38;5;250m2[39m     5     2
[38;5;250m3[39m     7    [31mNA[39m

In [4]:
df |> 
  mutate(
    min = pmin(x, y, na.rm = TRUE),
    max = pmax(x, y, na.rm = TRUE)
  )

[38;5;246m# A tibble: 3 × 4[39m
      x     y   min   max
  [3m[38;5;246m<dbl>[39m[23m [3m[38;5;246m<dbl>[39m[23m [3m[38;5;246m<dbl>[39m[23m [3m[38;5;246m<dbl>[39m[23m
[38;5;250m1[39m     1     3     1     3
[38;5;250m2[39m     5     2     2     5
[38;5;250m3[39m     7    [31mNA[39m     7     7

In [5]:
x <- c(1, 2, 5, 10, 15, 20)
cut(x, breaks = c(0, 5, 10, 15, 20))

[1] (0,5]   (0,5]   (0,5]   (5,10]  (10,15] (15,20]
Levels: (0,5] (5,10] (10,15] (15,20]

In [7]:
cut(
  x,
  breaks = c(0, 5, 10, 15, 20),
  labels = c("q1", "q2", "q3", "q4")
)

[1] q1 q1 q1 q2 q3 q4
Levels: q1 q2 q3 q4

In [8]:
y <- c(NA, -10, 5, 10, 100)
cut(
  y,
  breaks = c(0, 5, 10, 15, 20)
)

[1] <NA>   <NA>   (0,5]  (5,10] <NA>  
Levels: (0,5] (5,10] (10,15] (15,20]

In [9]:
x <- 1:10
cumsum(x)

 [1]  1  3  6 10 15 21 28 36 45 55

In [10]:
x <- c(1, 2, 2, 3 ,4, NA)
min_rank(x)

[1]  1  2  2  4  5 NA

In [12]:
desc(x)

[1] -1 -2 -2 -3 -4 NA

In [11]:
min_rank(desc(x))

[1]  5  3  3  2  1 NA

In [13]:
x <- c(2, 5, 11, 11, 19, 35)

In [14]:
lag(x)

[1] NA  2  5 11 11 19

In [15]:
lead(x)

[1]  5 11 11 19 35 NA

## Numeric summaries

In [16]:
library(nycflights13)

In [17]:
flights |> 
  group_by(origin, dest) |> 
summarize(
  distance_iqr = IQR(distance),
  n = n(),
  .group = "drop"
)

[1m[22m`summarise()` has grouped output by 'origin'. You can override using the `.groups` argument.


[38;5;246m# A tibble: 224 × 5[39m
[38;5;246m# Groups:   origin [3][39m
   origin dest  distance_iqr     n .group
   [3m[38;5;246m<chr>[39m[23m  [3m[38;5;246m<chr>[39m[23m        [3m[38;5;246m<dbl>[39m[23m [3m[38;5;246m<int>[39m[23m [3m[38;5;246m<chr>[39m[23m 
[38;5;250m 1[39m EWR    ALB              0   439 drop  
[38;5;250m 2[39m EWR    ANC              0     8 drop  
[38;5;250m 3[39m EWR    ATL              0  [4m5[24m022 drop  
[38;5;250m 4[39m EWR    AUS              0   968 drop  
[38;5;250m 5[39m EWR    AVL              0   265 drop  
[38;5;250m 6[39m EWR    BDL              0   443 drop  
[38;5;250m 7[39m EWR    BNA              0  [4m2[24m336 drop  
[38;5;250m 8[39m EWR    BOS              0  [4m5[24m327 drop  
[38;5;250m 9[39m EWR    BQN              0   297 drop  
[38;5;250m10[39m EWR    BTV              0   931 drop  
[38;5;246m# ℹ 214 more rows[39m
[38;5;246m# ℹ Use `print(n = ...)` to see more rows[39m

# Strings

In [18]:
install.packages("babynames")

试开URL’https://cran.rstudio.com/bin/macosx/big-sur-arm64/contrib/4.4/babynames_1.0.1.tgz'
Content type 'application/x-gzip' length 5533286 bytes (5.3 MB)
downloaded 5.3 MB




下载的二进制程序包在
	/var/folders/fv/c0dh9y6s7638rh5fdr3lgcmr0000gn/T//RtmpiKiA0a/downloaded_packages里


In [19]:
library(tidyverse)
library(babynames)

## Creating a string

In [22]:
tricky <- r"(double_quote <- "\"" # or '"'
single_quote <- '\'' # or "'")"
str_view(tricky)

[90m[1] │[39m double_quote <- "\"" # or '"'
    [90m│[39m single_quote <- '\'' # or "'"

## Creating many strings from data

In [23]:
str_c("x", "y", "1")

[1] "xy1"

In [24]:
x <- c("x", "y", "z")
str_c(x)

[1] "x" "y" "z"

## Extracting data from strings

In [25]:
df1 <- tibble(x = c("a,b,c", "d,e", "f"))

In [26]:
df1

[38;5;246m# A tibble: 3 × 1[39m
  x    
  [3m[38;5;246m<chr>[39m[23m
[38;5;250m1[39m a,b,c
[38;5;250m2[39m d,e  
[38;5;250m3[39m f    

In [27]:
df1 |> 
  separate_longer_delim(x, delim = ",")

[38;5;246m# A tibble: 6 × 1[39m
  x    
  [3m[38;5;246m<chr>[39m[23m
[38;5;250m1[39m a    
[38;5;250m2[39m b    
[38;5;250m3[39m c    
[38;5;250m4[39m d    
[38;5;250m5[39m e    
[38;5;250m6[39m f    

In [28]:
df1 |> 
  mutate(y = c("q1", "q2", "q3")) |> 
  separate_longer_delim(x, delim = ",")

[38;5;246m# A tibble: 6 × 2[39m
  x     y    
  [3m[38;5;246m<chr>[39m[23m [3m[38;5;246m<chr>[39m[23m
[38;5;250m1[39m a     q1   
[38;5;250m2[39m b     q1   
[38;5;250m3[39m c     q1   
[38;5;250m4[39m d     q2   
[38;5;250m5[39m e     q2   
[38;5;250m6[39m f     q3   

In [31]:
df3 <- tibble(x = c("a10.1.2022", "b10.2.2011", "e15.1.2015"))
df3

[38;5;246m# A tibble: 3 × 1[39m
  x         
  [3m[38;5;246m<chr>[39m[23m     
[38;5;250m1[39m a10.1.2022
[38;5;250m2[39m b10.2.2011
[38;5;250m3[39m e15.1.2015

In [32]:
df3 |> 
  separate_wider_delim(
    x,
    delim = ".",
    names = c("code", "edition", "year")
  )

[38;5;246m# A tibble: 3 × 3[39m
  code  edition year 
  [3m[38;5;246m<chr>[39m[23m [3m[38;5;246m<chr>[39m[23m   [3m[38;5;246m<chr>[39m[23m
[38;5;250m1[39m a10   1       2022 
[38;5;250m2[39m b10   2       2011 
[38;5;250m3[39m e15   1       2015 

In [33]:
df3 |> 
  separate_wider_delim(
    x,
    delim = ".",
    names = c("code", NA, "year")
  )

[38;5;246m# A tibble: 3 × 2[39m
  code  year 
  [3m[38;5;246m<chr>[39m[23m [3m[38;5;246m<chr>[39m[23m
[38;5;250m1[39m a10   2022 
[38;5;250m2[39m b10   2011 
[38;5;250m3[39m e15   2015 

In [34]:
df4 <- tibble(x = c("202215TX", "202122LA", "202325CA")) 
df4

[38;5;246m# A tibble: 3 × 1[39m
  x       
  [3m[38;5;246m<chr>[39m[23m   
[38;5;250m1[39m 202215TX
[38;5;250m2[39m 202122LA
[38;5;250m3[39m 202325CA

In [35]:
df4 |> 
  separate_wider_position(
    x,
    widths = c(year = 4, age = 2, state = 2)
  )

[38;5;246m# A tibble: 3 × 3[39m
  year  age   state
  [3m[38;5;246m<chr>[39m[23m [3m[38;5;246m<chr>[39m[23m [3m[38;5;246m<chr>[39m[23m
[38;5;250m1[39m 2022  15    TX   
[38;5;250m2[39m 2021  22    LA   
[38;5;250m3[39m 2023  25    CA   

In [39]:
df4 |> 
  separate_wider_position(
    x,
    widths = c(year = 4, 2, state = 2)
  )

[38;5;246m# A tibble: 3 × 2[39m
  year  state
  [3m[38;5;246m<chr>[39m[23m [3m[38;5;246m<chr>[39m[23m
[38;5;250m1[39m 2022  TX   
[38;5;250m2[39m 2021  LA   
[38;5;250m3[39m 2023  CA   

## Letters

In [40]:
babynames

[38;5;246m# A tibble: 1,924,665 × 5[39m
    year sex   name          n   prop
   [3m[38;5;246m<dbl>[39m[23m [3m[38;5;246m<chr>[39m[23m [3m[38;5;246m<chr>[39m[23m     [3m[38;5;246m<int>[39m[23m  [3m[38;5;246m<dbl>[39m[23m
[38;5;250m 1[39m  [4m1[24m880 F     Mary       [4m7[24m065 0.072[4m4[24m
[38;5;250m 2[39m  [4m1[24m880 F     Anna       [4m2[24m604 0.026[4m7[24m
[38;5;250m 3[39m  [4m1[24m880 F     Emma       [4m2[24m003 0.020[4m5[24m
[38;5;250m 4[39m  [4m1[24m880 F     Elizabeth  [4m1[24m939 0.019[4m9[24m
[38;5;250m 5[39m  [4m1[24m880 F     Minnie     [4m1[24m746 0.017[4m9[24m
[38;5;250m 6[39m  [4m1[24m880 F     Margaret   [4m1[24m578 0.016[4m2[24m
[38;5;250m 7[39m  [4m1[24m880 F     Ida        [4m1[24m472 0.015[4m1[24m
[38;5;250m 8[39m  [4m1[24m880 F     Alice      [4m1[24m414 0.014[4m5[24m
[38;5;250m 9[39m  [4m1[24m880 F     Bertha     [4m1[24m320 0.013[4m5[24m
[38;5;250m10[39m  [4m1[2

In [41]:
x <- c("Apple", "Banana", "Pear")

In [43]:
str_sub(x, start = 1, end = 3)

[1] "App" "Ban" "Pea"

In [44]:
charToRaw("Hadley")

[1] 48 61 64 6c 65 79

# Locale-dependent functions

## Regular expressions

In [45]:
fruit

 [1] "apple"             "apricot"           "avocado"           "banana"            "bell pepper"      
 [6] "bilberry"          "blackberry"        "blackcurrant"      "blood orange"      "blueberry"        
[11] "boysenberry"       "breadfruit"        "canary melon"      "cantaloupe"        "cherimoya"        
[16] "cherry"            "chili pepper"      "clementine"        "cloudberry"        "coconut"          
[21] "cranberry"         "cucumber"          "currant"           "damson"            "date"             
[26] "dragonfruit"       "durian"            "eggplant"          "elderberry"        "feijoa"           
[31] "fig"               "goji berry"        "gooseberry"        "grape"             "grapefruit"       
[36] "guava"             "honeydew"          "huckleberry"       "jackfruit"         "jambul"           
[41] "jujube"            "kiwi fruit"        "kumquat"           "lemon"             "lime"             
[46] "loquat"            "lychee"            "mandarine

In [46]:
str_view(fruit, "berry")

[90m [6] │[39m bil[36m<berry>[39m
[90m [7] │[39m black[36m<berry>[39m
[90m[10] │[39m blue[36m<berry>[39m
[90m[11] │[39m boysen[36m<berry>[39m
[90m[19] │[39m cloud[36m<berry>[39m
[90m[21] │[39m cran[36m<berry>[39m
[90m[29] │[39m elder[36m<berry>[39m
[90m[32] │[39m goji [36m<berry>[39m
[90m[33] │[39m goose[36m<berry>[39m
[90m[38] │[39m huckle[36m<berry>[39m
[90m[50] │[39m mul[36m<berry>[39m
[90m[70] │[39m rasp[36m<berry>[39m
[90m[73] │[39m salal [36m<berry>[39m
[90m[76] │[39m straw[36m<berry>[39m

## Extract variables

In [47]:
df <- tribble(
  ~str,
  "<Sheryl>-F_34",
  "<Kisha>-F_45", 
  "<Brandon>-N_33",
  "<Sharon>-F_38", 
  "<Penny>-F_58",
  "<Justin>-M_41", 
  "<Patricia>-F_84", 
)

In [48]:
df

[38;5;246m# A tibble: 7 × 1[39m
  str            
  [3m[38;5;246m<chr>[39m[23m          
[38;5;250m1[39m <Sheryl>-F_34  
[38;5;250m2[39m <Kisha>-F_45   
[38;5;250m3[39m <Brandon>-N_33 
[38;5;250m4[39m <Sharon>-F_38  
[38;5;250m5[39m <Penny>-F_58   
[38;5;250m6[39m <Justin>-M_41  
[38;5;250m7[39m <Patricia>-F_84

In [None]:
dot <- "\\."

In [50]:
str_view(c("abc", "a.c", "bef"), "a\\.c")

[90m[2] │[39m [36m<a.c>[39m

In [51]:
str_view(c("abc", "a.c", "bef"), r"{a\.c}")

[90m[2] │[39m [36m<a.c>[39m

In [52]:
x <- "a\\b"
str_view(x)

[90m[1] │[39m a\b

In [53]:
str_view(x, "\\\\")

[90m[1] │[39m a[36m<\>[39mb

In [54]:
str_view(x, r"{\\}")

[90m[1] │[39m a[36m<\>[39mb

In [55]:
x <- c("abb", "abab")
str_view(x, "ab+")

[90m[1] │[39m [36m<abb>[39m
[90m[2] │[39m [36m<ab>[39m[36m<ab>[39m

## Pattern control

# Factors

In [57]:
gss_cat

[38;5;246m# A tibble: 21,483 × 9[39m
    year marital         age race  rincome        partyid            relig              denom      tvhours
   [3m[38;5;246m<int>[39m[23m [3m[38;5;246m<fct>[39m[23m         [3m[38;5;246m<int>[39m[23m [3m[38;5;246m<fct>[39m[23m [3m[38;5;246m<fct>[39m[23m          [3m[38;5;246m<fct>[39m[23m              [3m[38;5;246m<fct>[39m[23m              [3m[38;5;246m<fct>[39m[23m        [3m[38;5;246m<int>[39m[23m
[38;5;250m 1[39m  [4m2[24m000 Never married    26 White $8000 to 9999  Ind,near rep       Protestant         Southern …      12
[38;5;250m 2[39m  [4m2[24m000 Divorced         48 White $8000 to 9999  Not str republican Protestant         Baptist-d…      [31mNA[39m
[38;5;250m 3[39m  [4m2[24m000 Widowed          67 White Not applicable Independent        Protestant         No denomi…       2
[38;5;250m 4[39m  [4m2[24m000 Never married    39 White Not applicable Ind,near rep       Orthodox-christian Not

In [58]:
gss_cat$marital

   [1] Never married Divorced      Widowed       Never married Divorced      Married       Never married
   [8] Divorced      Married       Married       Married       Married       Married       Married      
  [15] Divorced      Married       Widowed       Never married Married       Married       Married      
  [22] Married       Never married Widowed       Widowed       Widowed       Widowed       Widowed      
  [29] Divorced      Widowed       Widowed       Married       Married       Never married Married      
  [36] Never married Never married Never married Never married Never married Married       Married      
  [43] Divorced      Never married Never married Never married Married       Married       Married      
  [50] Married       Never married Married       Married       Married       Married       Divorced     
  [57] Divorced      Divorced      Never married Never married Married       Married       Never married
  [64] Divorced      Never married Widowed       Divorc

In [59]:
class(gss_cat$marital)

[1] "factor"

In [61]:
levels(gss_cat$marital)

[1] "No answer"     "Never married" "Separated"     "Divorced"      "Widowed"       "Married"      

## Modifying factor order

In [63]:
relig_summary <- gss_cat |>
group_by(relig) |>
summarize(
tvhours = mean(tvhours, na.rm = TRUE),
n = n()
)

In [64]:
relig_summary

[38;5;246m# A tibble: 15 × 3[39m
   relig                   tvhours     n
   [3m[38;5;246m<fct>[39m[23m                     [3m[38;5;246m<dbl>[39m[23m [3m[38;5;246m<int>[39m[23m
[38;5;250m 1[39m No answer                  2.72    93
[38;5;250m 2[39m Don't know                 4.62    15
[38;5;250m 3[39m Inter-nondenominational    2.87   109
[38;5;250m 4[39m Native american            3.46    23
[38;5;250m 5[39m Christian                  2.79   689
[38;5;250m 6[39m Orthodox-christian         2.42    95
[38;5;250m 7[39m Moslem/islam               2.44   104
[38;5;250m 8[39m Other eastern              1.67    32
[38;5;250m 9[39m Hinduism                   1.89    71
[38;5;250m10[39m Buddhism                   2.38   147
[38;5;250m11[39m Other                      2.73   224
[38;5;250m12[39m None                       2.71  [4m3[24m523
[38;5;250m13[39m Jewish                     2.52   388
[38;5;250m14[39m Catholic                   2.96  [4m5

## Modifying factor levels

In [66]:
ordered(c("a", "b", "c"))

[1] a b c
Levels: a < b < c

In [67]:
fct_relevel(c("a", "b", "c"), c("b", "c", "a"))

[1] a b c
Levels: b c a

# Dates and times

In [1]:
library(tidyverse)
library(nycflights13)

── [1mAttaching core tidyverse packages[22m ────────────────────────────────────────────────── tidyverse 2.0.0 ──
[32m✔[39m [34mdplyr    [39m 1.1.4     [32m✔[39m [34mreadr    [39m 2.1.5
[32m✔[39m [34mforcats  [39m 1.0.0     [32m✔[39m [34mstringr  [39m 1.5.1
[32m✔[39m [34mggplot2  [39m 3.5.1     [32m✔[39m [34mtibble   [39m 3.2.1
[32m✔[39m [34mlubridate[39m 1.9.4     [32m✔[39m [34mtidyr    [39m 1.3.1
[32m✔[39m [34mpurrr    [39m 1.0.2     
── [1mConflicts[22m ──────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()
[36mℹ[39m Use the ]8;;http://conflicted.r-lib.org/conflicted package]8;; to force all conflicts to become errors


## Creating date/times

In [2]:
today()

[1] "2025-02-12"

In [3]:
now()

[1] "2025-02-12 09:15:17 CST"

In [4]:
dmy("1-09-24")

[1] "2024-09-01"

In [5]:
dmy("1 Sep 24")

[1] "2024-09-01"

In [6]:
date1 <- mdy_hm("01/31/2017 08:01")

In [7]:
date1

[1] "2017-01-31 08:01:00 UTC"

In [14]:
floor_date(date1, "year")

[1] "2017-01-01 UTC"

In [9]:
round_date(date1)

[1] "2017-01-31 08:01:00 UTC"

In [10]:
ceiling_date(date1)

[1] "2017-01-31 08:01:00 UTC"

## Time spans

In [15]:
dseconds(18)

[1] "18s"

In [16]:
dminutes(10)

[1] "600s (~10 minutes)"

In [17]:
dhours(c(12, 24))

[1] "43200s (~12 hours)" "86400s (~1 days)"  

In [18]:
ddays(2)

[1] "172800s (~2 days)"

In [19]:
dweeks(3)

[1] "1814400s (~3 weeks)"

In [20]:
dyears(1)

[1] "31557600s (~1 years)"

## Time zones

In [21]:
Sys.timezone()

[1] "Asia/Shanghai"

In [23]:
x1 <- ymd_hms("2024-06-01 12:00:00", tz = "America/New_York")
x1

[1] "2024-06-01 12:00:00 EDT"

In [24]:
x1 <- ymd_hms("2024-06-01 12:00:00", tz = "Asia/Shanghai")
x1

[1] "2024-06-01 12:00:00 CST"

# Missing values

## Explicit missing values

In [25]:
treatment <- tribble(
  ~person,           ~treatment, ~response,
  "Derrick Whitmore", 1,         7,
  NA,                 2,         10,
  NA,                 3,         NA,
  "Katherine Burke",  1,         4
)

In [26]:
treatment

[38;5;246m# A tibble: 4 × 3[39m
  person           treatment response
  [3m[38;5;246m<chr>[39m[23m                [3m[38;5;246m<dbl>[39m[23m    [3m[38;5;246m<dbl>[39m[23m
[38;5;250m1[39m Derrick Whitmore         1        7
[38;5;250m2[39m [31mNA[39m                       2       10
[38;5;250m3[39m [31mNA[39m                       3       [31mNA[39m
[38;5;250m4[39m Katherine Burke          1        4

In [27]:
treatment |> 
  fill(everything())

[38;5;246m# A tibble: 4 × 3[39m
  person           treatment response
  [3m[38;5;246m<chr>[39m[23m                [3m[38;5;246m<dbl>[39m[23m    [3m[38;5;246m<dbl>[39m[23m
[38;5;250m1[39m Derrick Whitmore         1        7
[38;5;250m2[39m Derrick Whitmore         2       10
[38;5;250m3[39m Derrick Whitmore         3       10
[38;5;250m4[39m Katherine Burke          1        4

In [28]:
x <- c(1, 4, 5, 7, NA)

In [29]:
coalesce(x, 0)

[1] 1 4 5 7 0

In [30]:
x <- c(1, 4, 5, 7, -99)

In [32]:
na_if(x, -99)

[1]  1  4  5  7 NA

## Implicit missing values

In [33]:
stocks <- tibble(
  year  = c(2020, 2020, 2020, 2020, 2021, 2021, 2021),
  qtr   = c(   1,    2,    3,    4,    2,    3,    4),
  price = c(1.88, 0.59, 0.35,   NA, 0.92, 0.17, 2.66)
)

In [34]:
stocks

[38;5;246m# A tibble: 7 × 3[39m
   year   qtr price
  [3m[38;5;246m<dbl>[39m[23m [3m[38;5;246m<dbl>[39m[23m [3m[38;5;246m<dbl>[39m[23m
[38;5;250m1[39m  [4m2[24m020     1  1.88
[38;5;250m2[39m  [4m2[24m020     2  0.59
[38;5;250m3[39m  [4m2[24m020     3  0.35
[38;5;250m4[39m  [4m2[24m020     4 [31mNA[39m   
[38;5;250m5[39m  [4m2[24m021     2  0.92
[38;5;250m6[39m  [4m2[24m021     3  0.17
[38;5;250m7[39m  [4m2[24m021     4  2.66

In [35]:
stocks |> 
  pivot_wider(
    names_from = qtr,
    values_from = price
  )

[38;5;246m# A tibble: 2 × 5[39m
   year   `1`   `2`   `3`   `4`
  [3m[38;5;246m<dbl>[39m[23m [3m[38;5;246m<dbl>[39m[23m [3m[38;5;246m<dbl>[39m[23m [3m[38;5;246m<dbl>[39m[23m [3m[38;5;246m<dbl>[39m[23m
[38;5;250m1[39m  [4m2[24m020  1.88  0.59  0.35 [31mNA[39m   
[38;5;250m2[39m  [4m2[24m021 [31mNA[39m     0.92  0.17  2.66

In [36]:
stocks |> 
  complete(year, qtr)

[38;5;246m# A tibble: 8 × 3[39m
   year   qtr price
  [3m[38;5;246m<dbl>[39m[23m [3m[38;5;246m<dbl>[39m[23m [3m[38;5;246m<dbl>[39m[23m
[38;5;250m1[39m  [4m2[24m020     1  1.88
[38;5;250m2[39m  [4m2[24m020     2  0.59
[38;5;250m3[39m  [4m2[24m020     3  0.35
[38;5;250m4[39m  [4m2[24m020     4 [31mNA[39m   
[38;5;250m5[39m  [4m2[24m021     1 [31mNA[39m   
[38;5;250m6[39m  [4m2[24m021     2  0.92
[38;5;250m7[39m  [4m2[24m021     3  0.17
[38;5;250m8[39m  [4m2[24m021     4  2.66

## Factors and empty groups

In [38]:
health <- tibble(
  name   = c("Ikaia", "Oletta", "Leriah", "Dashay", "Tresaun"),
  smoker = factor(c("no", "no", "no", "no", "no"), levels = c("yes", "no")),
  age    = c(34, 88, 75, 47, 56),
)

In [39]:
health

[38;5;246m# A tibble: 5 × 3[39m
  name    smoker   age
  [3m[38;5;246m<chr>[39m[23m   [3m[38;5;246m<fct>[39m[23m  [3m[38;5;246m<dbl>[39m[23m
[38;5;250m1[39m Ikaia   no        34
[38;5;250m2[39m Oletta  no        88
[38;5;250m3[39m Leriah  no        75
[38;5;250m4[39m Dashay  no        47
[38;5;250m5[39m Tresaun no        56

In [40]:
health |> count(smoker)

[38;5;246m# A tibble: 1 × 2[39m
  smoker     n
  [3m[38;5;246m<fct>[39m[23m  [3m[38;5;246m<int>[39m[23m
[38;5;250m1[39m no         5

# Joins

## Keys

In [41]:
planes

[38;5;246m# A tibble: 3,322 × 9[39m
   tailnum  year type                    manufacturer     model     engines seats speed engine   
   [3m[38;5;246m<chr>[39m[23m   [3m[38;5;246m<int>[39m[23m [3m[38;5;246m<chr>[39m[23m                   [3m[38;5;246m<chr>[39m[23m            [3m[38;5;246m<chr>[39m[23m       [3m[38;5;246m<int>[39m[23m [3m[38;5;246m<int>[39m[23m [3m[38;5;246m<int>[39m[23m [3m[38;5;246m<chr>[39m[23m    
[38;5;250m 1[39m N10156   [4m2[24m004 Fixed wing multi engine EMBRAER          EMB-145XR       2    55    [31mNA[39m Turbo-fan
[38;5;250m 2[39m N102UW   [4m1[24m998 Fixed wing multi engine AIRBUS INDUSTRIE A320-214        2   182    [31mNA[39m Turbo-fan
[38;5;250m 3[39m N103US   [4m1[24m999 Fixed wing multi engine AIRBUS INDUSTRIE A320-214        2   182    [31mNA[39m Turbo-fan
[38;5;250m 4[39m N104UW   [4m1[24m999 Fixed wing multi engine AIRBUS INDUSTRIE A320-214        2   182    [31mNA[39m Turbo-fan
[38;5;25

In [42]:
planes |> 
  count(tailnum)

[38;5;246m# A tibble: 3,322 × 2[39m
   tailnum     n
   [3m[38;5;246m<chr>[39m[23m   [3m[38;5;246m<int>[39m[23m
[38;5;250m 1[39m N10156      1
[38;5;250m 2[39m N102UW      1
[38;5;250m 3[39m N103US      1
[38;5;250m 4[39m N104UW      1
[38;5;250m 5[39m N10575      1
[38;5;250m 6[39m N105UW      1
[38;5;250m 7[39m N107US      1
[38;5;250m 8[39m N108UW      1
[38;5;250m 9[39m N109UW      1
[38;5;250m10[39m N110UW      1
[38;5;246m# ℹ 3,312 more rows[39m
[38;5;246m# ℹ Use `print(n = ...)` to see more rows[39m

In [43]:
planes |> 
  count(tailnum) |> 
  filter(n > 1)

[38;5;246m# A tibble: 0 × 2[39m
[38;5;246m# ℹ 2 variables: tailnum <chr>, n <int>[39m

## Basic joins

In [44]:
flights2 <- flights |> 
  select(year, time_hour, origin, dest, tailnum, carrier)
flights2

[38;5;246m# A tibble: 336,776 × 6[39m
    year time_hour           origin dest  tailnum carrier
   [3m[38;5;246m<int>[39m[23m [3m[38;5;246m<dttm>[39m[23m              [3m[38;5;246m<chr>[39m[23m  [3m[38;5;246m<chr>[39m[23m [3m[38;5;246m<chr>[39m[23m   [3m[38;5;246m<chr>[39m[23m  
[38;5;250m 1[39m  [4m2[24m013 2013-01-01 [38;5;246m05:00:00[39m EWR    IAH   N14228  UA     
[38;5;250m 2[39m  [4m2[24m013 2013-01-01 [38;5;246m05:00:00[39m LGA    IAH   N24211  UA     
[38;5;250m 3[39m  [4m2[24m013 2013-01-01 [38;5;246m05:00:00[39m JFK    MIA   N619AA  AA     
[38;5;250m 4[39m  [4m2[24m013 2013-01-01 [38;5;246m05:00:00[39m JFK    BQN   N804JB  B6     
[38;5;250m 5[39m  [4m2[24m013 2013-01-01 [38;5;246m06:00:00[39m LGA    ATL   N668DN  DL     
[38;5;250m 6[39m  [4m2[24m013 2013-01-01 [38;5;246m05:00:00[39m EWR    ORD   N39463  UA     
[38;5;250m 7[39m  [4m2[24m013 2013-01-01 [38;5;246m06:00:00[39m EWR    FLL   N516JB  B6     


In [46]:
airlines

[38;5;246m# A tibble: 16 × 2[39m
   carrier name                       
   [3m[38;5;246m<chr>[39m[23m   [3m[38;5;246m<chr>[39m[23m                      
[38;5;250m 1[39m 9E      Endeavor Air Inc.          
[38;5;250m 2[39m AA      American Airlines Inc.     
[38;5;250m 3[39m AS      Alaska Airlines Inc.       
[38;5;250m 4[39m B6      JetBlue Airways            
[38;5;250m 5[39m DL      Delta Air Lines Inc.       
[38;5;250m 6[39m EV      ExpressJet Airlines Inc.   
[38;5;250m 7[39m F9      Frontier Airlines Inc.     
[38;5;250m 8[39m FL      AirTran Airways Corporation
[38;5;250m 9[39m HA      Hawaiian Airlines Inc.     
[38;5;250m10[39m MQ      Envoy Air                  
[38;5;250m11[39m OO      SkyWest Airlines Inc.      
[38;5;250m12[39m UA      United Air Lines Inc.      
[38;5;250m13[39m US      US Airways Inc.            
[38;5;250m14[39m VX      Virgin America             
[38;5;250m15[39m WN      Southwest Airlines Co.     
[38;5;250m1

In [45]:
flights2 |> 
  left_join(airlines)

[1m[22mJoining with `by = join_by(carrier)`


[38;5;246m# A tibble: 336,776 × 7[39m
    year time_hour           origin dest  tailnum carrier name                    
   [3m[38;5;246m<int>[39m[23m [3m[38;5;246m<dttm>[39m[23m              [3m[38;5;246m<chr>[39m[23m  [3m[38;5;246m<chr>[39m[23m [3m[38;5;246m<chr>[39m[23m   [3m[38;5;246m<chr>[39m[23m   [3m[38;5;246m<chr>[39m[23m                   
[38;5;250m 1[39m  [4m2[24m013 2013-01-01 [38;5;246m05:00:00[39m EWR    IAH   N14228  UA      United Air Lines Inc.   
[38;5;250m 2[39m  [4m2[24m013 2013-01-01 [38;5;246m05:00:00[39m LGA    IAH   N24211  UA      United Air Lines Inc.   
[38;5;250m 3[39m  [4m2[24m013 2013-01-01 [38;5;246m05:00:00[39m JFK    MIA   N619AA  AA      American Airlines Inc.  
[38;5;250m 4[39m  [4m2[24m013 2013-01-01 [38;5;246m05:00:00[39m JFK    BQN   N804JB  B6      JetBlue Airways         
[38;5;250m 5[39m  [4m2[24m013 2013-01-01 [38;5;246m06:00:00[39m LGA    ATL   N668DN  DL      Delta Air Lines Inc.    

In [47]:
summary6 <- function(data, var) {
  data |> summarize(
  min = min({{ var }}, na.rm = TRUE),
  mean = mean({{ var }}, na.rm = TRUE),
  median = median({{ var }}, na.rm = TRUE),
  max = max({{ var }}, na.rm = TRUE),
  n = n(),
  n_miss = sum(is.na({{ var }})),
  .groups = "drop"
  )
  }
  diamonds |> summary6(carat)

[38;5;246m# A tibble: 1 × 6[39m
    min  mean median   max     n n_miss
  [3m[38;5;246m<dbl>[39m[23m [3m[38;5;246m<dbl>[39m[23m  [3m[38;5;246m<dbl>[39m[23m [3m[38;5;246m<dbl>[39m[23m [3m[38;5;246m<int>[39m[23m  [3m[38;5;246m<int>[39m[23m
[38;5;250m1[39m   0.2 0.798    0.7  5.01 [4m5[24m[4m3[24m940      0

In [48]:
summary6 <- function(data, var) {
  data |> summarize(
  min = min(var, na.rm = TRUE),
  mean = mean({{ var }}, na.rm = TRUE),
  median = median({{ var }}, na.rm = TRUE),
  max = max({{ var }}, na.rm = TRUE),
  n = n(),
  n_miss = sum(is.na({{ var }})),
  .groups = "drop"
  )
  }
  diamonds |> summary6(carat)

: [1m[33mError[39m in `summarize()`:[22m
[1m[22m[36mℹ[39m In argument: `min = min(var, na.rm = TRUE)`.
[1mCaused by error:[22m
[33m![39m 找不到对象'carat'