# Маніпулювання даними за допомогою `dplyr`

## 0. Інсталяція та завантаження пакету

In [1]:
# install.packages("dplyr")

In [2]:
library(dplyr)


Attaching package: 'dplyr'


The following objects are masked from 'package:stats':

    filter, lag


The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union




## 1. Оператор `pipe`: `|>`, `%>%`

In [3]:
# Build a small demo data frame; the fixed seed makes the sampled ages reproducible.
set.seed(2025)
data <- data.frame(
    Gender = c("M", "M", "F", "F", "M", "F"),
    # Six distinct ages drawn without replacement from 12..40.
    Age = sample(12:40, size = 6),
    # Spell out TRUE/FALSE: T and F are ordinary variables that can be reassigned.
    X = c(TRUE, FALSE, FALSE, FALSE, TRUE, TRUE))
data

Gender,Age,X
<chr>,<int>,<lgl>
M,24,True
M,23,False
F,15,False
F,37,False
M,12,True
F,34,True


In [4]:
data |> select(Gender, Age)

Gender,Age
<chr>,<int>
M,24
M,23
F,15
F,37
M,12
F,34


In [5]:
data |> 
    select(Gender, Age) |> 
    filter(Age > 18)

Gender,Age
<chr>,<int>
M,24
M,23
F,37
F,34


In [6]:
# Mean age per gender among the rows with Age > 18.
data |> 
    select(Gender, Age) |> 
    filter(Age > 18) |>
    # One output row per Gender value.
    group_by(Gender) |>
    summarise(avg_age = mean(Age))

Gender,avg_age
<chr>,<dbl>
F,35.5
M,23.5


## 2. Огляд набору даних

In [7]:
# install.packages("gapminder")

In [8]:
library(gapminder)

In [9]:
?gapminder

0,1
gapminder {gapminder},R Documentation


In [10]:
str(gapminder)

tibble [1,704 x 6] (S3: tbl_df/tbl/data.frame)
 $ country  : Factor w/ 142 levels "Afghanistan",..: 1 1 1 1 1 1 1 1 1 1 ...
 $ continent: Factor w/ 5 levels "Africa","Americas",..: 3 3 3 3 3 3 3 3 3 3 ...
 $ year     : int [1:1704] 1952 1957 1962 1967 1972 1977 1982 1987 1992 1997 ...
 $ lifeExp  : num [1:1704] 28.8 30.3 32 34 36.1 ...
 $ pop      : int [1:1704] 8425333 9240934 10267083 11537966 13079460 14880372 12881816 13867957 16317921 22227415 ...
 $ gdpPercap: num [1:1704] 779 821 853 836 740 ...


In [11]:
glimpse(gapminder)

Rows: 1,704
Columns: 6
$ country   <fct> "Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan", ~
$ continent <fct> Asia, Asia, Asia, Asia, Asia, Asia, Asia, Asia, Asia, Asia, ~
$ year      <int> 1952, 1957, 1962, 1967, 1972, 1977, 1982, 1987, 1992, 1997, ~
$ lifeExp   <dbl> 28.801, 30.332, 31.997, 34.020, 36.088, 38.438, 39.854, 40.8~
$ pop       <int> 8425333, 9240934, 10267083, 11537966, 13079460, 14880372, 12~
$ gdpPercap <dbl> 779.4453, 820.8530, 853.1007, 836.1971, 739.9811, 786.1134, ~


In [12]:
head(gapminder)

country,continent,year,lifeExp,pop,gdpPercap
<fct>,<fct>,<int>,<dbl>,<int>,<dbl>
Afghanistan,Asia,1952,28.801,8425333,779.4453
Afghanistan,Asia,1957,30.332,9240934,820.853
Afghanistan,Asia,1962,31.997,10267083,853.1007
Afghanistan,Asia,1967,34.02,11537966,836.1971
Afghanistan,Asia,1972,36.088,13079460,739.9811
Afghanistan,Asia,1977,38.438,14880372,786.1134


In [13]:
tail(gapminder)

country,continent,year,lifeExp,pop,gdpPercap
<fct>,<fct>,<int>,<dbl>,<int>,<dbl>
Zimbabwe,Africa,1982,60.363,7636524,788.855
Zimbabwe,Africa,1987,62.351,9216418,706.1573
Zimbabwe,Africa,1992,60.377,10704340,693.4208
Zimbabwe,Africa,1997,46.809,11404948,792.45
Zimbabwe,Africa,2002,39.989,11926563,672.0386
Zimbabwe,Africa,2007,43.487,12311143,469.7093


In [14]:
summary(gapminder)

        country        continent        year         lifeExp     
 Afghanistan:  12   Africa  :624   Min.   :1952   Min.   :23.60  
 Albania    :  12   Americas:300   1st Qu.:1966   1st Qu.:48.20  
 Algeria    :  12   Asia    :396   Median :1980   Median :60.71  
 Angola     :  12   Europe  :360   Mean   :1980   Mean   :59.47  
 Argentina  :  12   Oceania : 24   3rd Qu.:1993   3rd Qu.:70.85  
 Australia  :  12                  Max.   :2007   Max.   :82.60  
 (Other)    :1632                                                
      pop              gdpPercap       
 Min.   :6.001e+04   Min.   :   241.2  
 1st Qu.:2.794e+06   1st Qu.:  1202.1  
 Median :7.024e+06   Median :  3531.8  
 Mean   :2.960e+07   Mean   :  7215.3  
 3rd Qu.:1.959e+07   3rd Qu.:  9325.5  
 Max.   :1.319e+09   Max.   :113523.1  
                                       

## `filter`

In [15]:
data <- gapminder

In [16]:
canada <- filter(data, country == "Canada")

In [17]:
canada <- data |> filter(country == "Canada")

In [18]:
# The native pipe `|>` is available starting with R 4.1.
# On older R versions use the magrittr pipe `%>%` instead.

In [19]:
canada

country,continent,year,lifeExp,pop,gdpPercap
<fct>,<fct>,<int>,<dbl>,<int>,<dbl>
Canada,Americas,1952,68.75,14785584,11367.16
Canada,Americas,1957,69.96,17010154,12489.95
Canada,Americas,1962,71.3,18985849,13462.49
Canada,Americas,1967,72.13,20819767,16076.59
Canada,Americas,1972,72.88,22284500,18970.57
Canada,Americas,1977,74.21,23796400,22090.88
Canada,Americas,1982,75.76,25201900,22898.79
Canada,Americas,1987,76.86,26549700,26626.52
Canada,Americas,1992,77.95,28523502,26342.88
Canada,Americas,1997,78.61,30305843,28954.93


In [20]:
filter(data, lifeExp < 31)

country,continent,year,lifeExp,pop,gdpPercap
<fct>,<fct>,<int>,<dbl>,<int>,<dbl>
Afghanistan,Asia,1952,28.801,8425333,779.4453
Afghanistan,Asia,1957,30.332,9240934,820.853
Angola,Africa,1952,30.015,4232095,3520.6103
Gambia,Africa,1952,30.0,284320,485.2307
Rwanda,Africa,1992,23.599,7290203,737.0686
Sierra Leone,Africa,1952,30.331,2143249,879.7877


In [21]:
filter(data, country == "Austria", year > 1990)

country,continent,year,lifeExp,pop,gdpPercap
<fct>,<fct>,<int>,<dbl>,<int>,<dbl>
Austria,Europe,1992,76.04,7914969,27042.02
Austria,Europe,1997,77.51,8069876,29095.92
Austria,Europe,2002,78.98,8148312,32417.61
Austria,Europe,2007,79.829,8199783,36126.49


In [22]:
# Comma-separated conditions must all hold; %in% tests membership in a set
# of country names.
filter(data, country %in% c("Austria", "Canada"), year > 1990)

country,continent,year,lifeExp,pop,gdpPercap
<fct>,<fct>,<int>,<dbl>,<int>,<dbl>
Austria,Europe,1992,76.04,7914969,27042.02
Austria,Europe,1997,77.51,8069876,29095.92
Austria,Europe,2002,78.98,8148312,32417.61
Austria,Europe,2007,79.829,8199783,36126.49
Canada,Americas,1992,77.95,28523502,26342.88
Canada,Americas,1997,78.61,30305843,28954.93
Canada,Americas,2002,79.77,31902268,33328.97
Canada,Americas,2007,80.653,33390141,36319.24


In [23]:
head(data)

country,continent,year,lifeExp,pop,gdpPercap
<fct>,<fct>,<int>,<dbl>,<int>,<dbl>
Afghanistan,Asia,1952,28.801,8425333,779.4453
Afghanistan,Asia,1957,30.332,9240934,820.853
Afghanistan,Asia,1962,31.997,10267083,853.1007
Afghanistan,Asia,1967,34.02,11537966,836.1971
Afghanistan,Asia,1972,36.088,13079460,739.9811
Afghanistan,Asia,1977,38.438,14880372,786.1134


In [24]:
data |> head()

country,continent,year,lifeExp,pop,gdpPercap
<fct>,<fct>,<int>,<dbl>,<int>,<dbl>
Afghanistan,Asia,1952,28.801,8425333,779.4453
Afghanistan,Asia,1957,30.332,9240934,820.853
Afghanistan,Asia,1962,31.997,10267083,853.1007
Afghanistan,Asia,1967,34.02,11537966,836.1971
Afghanistan,Asia,1972,36.088,13079460,739.9811
Afghanistan,Asia,1977,38.438,14880372,786.1134


## `select`

In [25]:
data |> 
    select(year, country, pop) |>
    slice(1:12)

year,country,pop
<int>,<fct>,<int>
1952,Afghanistan,8425333
1957,Afghanistan,9240934
1962,Afghanistan,10267083
1967,Afghanistan,11537966
1972,Afghanistan,13079460
1977,Afghanistan,14880372
1982,Afghanistan,12881816
1987,Afghanistan,13867957
1992,Afghanistan,16317921
1997,Afghanistan,22227415


In [26]:
data |> 
    filter(country == "Canada") |>
    select(year, country, pop) 

year,country,pop
<int>,<fct>,<int>
1952,Canada,14785584
1957,Canada,17010154
1962,Canada,18985849
1967,Canada,20819767
1972,Canada,22284500
1977,Canada,23796400
1982,Canada,25201900
1987,Canada,26549700
1992,Canada,28523502
1997,Canada,30305843


In [27]:
# Base R counterpart of a filter() + select() pipeline: a logical row index
# plus a character vector of column names.
data[data$country == "Canada", c("year", "country", "pop")]

year,country,pop
<int>,<fct>,<int>
1952,Canada,14785584
1957,Canada,17010154
1962,Canada,18985849
1967,Canada,20819767
1972,Canada,22284500
1977,Canada,23796400
1982,Canada,25201900
1987,Canada,26549700
1992,Canada,28523502
1997,Canada,30305843


In [28]:
data |> 
    filter(country == "Canada") |>
    select(-gdpPercap, -pop) 

country,continent,year,lifeExp
<fct>,<fct>,<int>,<dbl>
Canada,Americas,1952,68.75
Canada,Americas,1957,69.96
Canada,Americas,1962,71.3
Canada,Americas,1967,72.13
Canada,Americas,1972,72.88
Canada,Americas,1977,74.21
Canada,Americas,1982,75.76
Canada,Americas,1987,76.86
Canada,Americas,1992,77.95
Canada,Americas,1997,78.61


In [29]:
# Keep only the non-numeric columns, then list their unique combinations.
data |>
    select(!where(is.numeric)) |>
    distinct() |>
    head()

country,continent
<fct>,<fct>
Afghanistan,Asia
Albania,Europe
Algeria,Africa
Angola,Africa
Argentina,Americas
Australia,Oceania


## `sample_n`

In [30]:
# Draw 10 random rows from the data.
# NOTE(review): dplyr documents sample_n() as superseded by slice_sample(n = ...).
data_10 <- data |>
    sample_n(10)

In [31]:
data_10

country,continent,year,lifeExp,pop,gdpPercap
<fct>,<fct>,<int>,<dbl>,<int>,<dbl>
Thailand,Asia,1997,67.521,60216677,5852.6255
Bulgaria,Europe,1982,71.08,8892098,8224.1916
Rwanda,Africa,1977,45.0,4657072,670.0806
Egypt,Africa,1972,51.137,34807417,2024.0081
Mauritania,Africa,2007,64.164,3270065,1803.1515
Tunisia,Africa,1962,49.579,4286552,1660.3032
Liberia,Africa,1962,40.502,1112796,634.1952
Somalia,Africa,1972,40.973,3840161,1254.5761
Sri Lanka,Asia,1962,62.192,10421936,1074.472
Germany,Europe,1982,73.8,78335266,22031.5327


In [32]:
# Draw a random 10% of the rows.
# NOTE(review): dplyr documents sample_frac() as superseded by slice_sample(prop = ...).
data_10_perc <- data |>
    sample_frac(0.1)

In [33]:
nrow(data_10_perc)

In [34]:
data_10_perc |> head()

country,continent,year,lifeExp,pop,gdpPercap
<fct>,<fct>,<int>,<dbl>,<int>,<dbl>
Libya,Africa,1982,62.155,3344074,17364.275
Botswana,Africa,1962,51.52,512764,983.654
Bolivia,Americas,1997,62.05,7693188,3326.143
Gabon,Africa,1992,61.366,985739,13522.158
United Kingdom,Europe,1962,70.76,53292000,12477.177
Japan,Asia,1972,73.42,107188273,14778.786


In [36]:
str(data)

tibble [1,704 x 6] (S3: tbl_df/tbl/data.frame)
 $ country  : Factor w/ 142 levels "Afghanistan",..: 1 1 1 1 1 1 1 1 1 1 ...
 $ continent: Factor w/ 5 levels "Africa","Americas",..: 3 3 3 3 3 3 3 3 3 3 ...
 $ year     : int [1:1704] 1952 1957 1962 1967 1972 1977 1982 1987 1992 1997 ...
 $ lifeExp  : num [1:1704] 28.8 30.3 32 34 36.1 ...
 $ pop      : int [1:1704] 8425333 9240934 10267083 11537966 13079460 14880372 12881816 13867957 16317921 22227415 ...
 $ gdpPercap: num [1:1704] 779 821 853 836 740 ...


## `slice_`

In [42]:
# seq(1, 10, 5) is c(1, 6), so this keeps rows 1 and 6.
data |> slice(seq(1, 10, 5))

country,continent,year,lifeExp,pop,gdpPercap
<fct>,<fct>,<int>,<dbl>,<int>,<dbl>
Afghanistan,Asia,1952,28.801,8425333,779.4453
Afghanistan,Asia,1977,38.438,14880372,786.1134


In [45]:
data |> slice_head(n = 8)

country,continent,year,lifeExp,pop,gdpPercap
<fct>,<fct>,<int>,<dbl>,<int>,<dbl>
Afghanistan,Asia,1952,28.801,8425333,779.4453
Afghanistan,Asia,1957,30.332,9240934,820.853
Afghanistan,Asia,1962,31.997,10267083,853.1007
Afghanistan,Asia,1967,34.02,11537966,836.1971
Afghanistan,Asia,1972,36.088,13079460,739.9811
Afghanistan,Asia,1977,38.438,14880372,786.1134
Afghanistan,Asia,1982,39.854,12881816,978.0114
Afghanistan,Asia,1987,40.822,13867957,852.3959


In [47]:
data |> slice_tail(n = 8)

country,continent,year,lifeExp,pop,gdpPercap
<fct>,<fct>,<int>,<dbl>,<int>,<dbl>
Zimbabwe,Africa,1972,55.635,5861135,799.3622
Zimbabwe,Africa,1977,57.674,6642107,685.5877
Zimbabwe,Africa,1982,60.363,7636524,788.855
Zimbabwe,Africa,1987,62.351,9216418,706.1573
Zimbabwe,Africa,1992,60.377,10704340,693.4208
Zimbabwe,Africa,1997,46.809,11404948,792.45
Zimbabwe,Africa,2002,39.989,11926563,672.0386
Zimbabwe,Africa,2007,43.487,12311143,469.7093


In [50]:
data |> slice_sample(n = 3)

country,continent,year,lifeExp,pop,gdpPercap
<fct>,<fct>,<int>,<dbl>,<int>,<dbl>
"Congo, Rep.",Africa,2007,55.322,3800610,3632.558
Oman,Asia,1957,40.08,561977,2242.747
Lebanon,Asia,1987,67.926,3089353,5377.091


In [52]:
data |> slice_min(lifeExp, n = 5)

country,continent,year,lifeExp,pop,gdpPercap
<fct>,<fct>,<int>,<dbl>,<int>,<dbl>
Rwanda,Africa,1992,23.599,7290203,737.0686
Afghanistan,Asia,1952,28.801,8425333,779.4453
Gambia,Africa,1952,30.0,284320,485.2307
Angola,Africa,1952,30.015,4232095,3520.6103
Sierra Leone,Africa,1952,30.331,2143249,879.7877


In [55]:
data |> slice_max(lifeExp, n = 10)

country,continent,year,lifeExp,pop,gdpPercap
<fct>,<fct>,<int>,<dbl>,<int>,<dbl>
Japan,Asia,2007,82.603,127467972,31656.07
"Hong Kong, China",Asia,2007,82.208,6980412,39724.98
Japan,Asia,2002,82.0,127065841,28604.59
Iceland,Europe,2007,81.757,301931,36180.79
Switzerland,Europe,2007,81.701,7554661,37506.42
"Hong Kong, China",Asia,2002,81.495,6762476,30209.02
Australia,Oceania,2007,81.235,20434176,34435.37
Spain,Europe,2007,80.941,40448191,28821.06
Sweden,Europe,2007,80.884,9031088,33859.75
Israel,Asia,2007,80.745,6426679,25523.28


## `arrange()`

In [57]:
tmp <- data |> filter(year == 2007)

In [59]:
tmp |> head()

country,continent,year,lifeExp,pop,gdpPercap
<fct>,<fct>,<int>,<dbl>,<int>,<dbl>
Afghanistan,Asia,2007,43.828,31889923,974.5803
Albania,Europe,2007,76.423,3600523,5937.0295
Algeria,Africa,2007,72.301,33333216,6223.3675
Angola,Africa,2007,42.731,12420476,4797.2313
Argentina,Americas,2007,75.32,40301927,12779.3796
Australia,Oceania,2007,81.235,20434176,34435.3674


In [63]:
tmp |> arrange(pop)|> tail(10)

country,continent,year,lifeExp,pop,gdpPercap
<fct>,<fct>,<int>,<dbl>,<int>,<dbl>
Mexico,Americas,2007,76.195,108700891,11977.575
Japan,Asia,2007,82.603,127467972,31656.068
Nigeria,Africa,2007,46.859,135031164,2013.977
Bangladesh,Asia,2007,64.062,150448339,1391.254
Pakistan,Asia,2007,65.483,169270617,2605.948
Brazil,Americas,2007,72.39,190010647,9065.801
Indonesia,Asia,2007,70.65,223547000,3540.652
United States,Americas,2007,78.242,301139947,42951.653
India,Asia,2007,64.698,1110396331,2452.21
China,Asia,2007,72.961,1318683096,4959.115


In [66]:
tmp |> arrange(desc(pop))|> head(10)

country,continent,year,lifeExp,pop,gdpPercap
<fct>,<fct>,<int>,<dbl>,<int>,<dbl>
China,Asia,2007,72.961,1318683096,4959.115
India,Asia,2007,64.698,1110396331,2452.21
United States,Americas,2007,78.242,301139947,42951.653
Indonesia,Asia,2007,70.65,223547000,3540.652
Brazil,Americas,2007,72.39,190010647,9065.801
Pakistan,Asia,2007,65.483,169270617,2605.948
Bangladesh,Asia,2007,64.062,150448339,1391.254
Nigeria,Africa,2007,46.859,135031164,2013.977
Japan,Asia,2007,82.603,127467972,31656.068
Mexico,Americas,2007,76.195,108700891,11977.575


In [71]:
tmp |> arrange(continent, country)|> tail(10)

country,continent,year,lifeExp,pop,gdpPercap
<fct>,<fct>,<int>,<dbl>,<int>,<dbl>
Serbia,Europe,2007,74.002,10150265,9786.535
Slovak Republic,Europe,2007,74.663,5447502,18678.314
Slovenia,Europe,2007,77.926,2009245,25768.258
Spain,Europe,2007,80.941,40448191,28821.064
Sweden,Europe,2007,80.884,9031088,33859.748
Switzerland,Europe,2007,81.701,7554661,37506.419
Turkey,Europe,2007,71.777,71158647,8458.276
United Kingdom,Europe,2007,79.425,60776238,33203.261
Australia,Oceania,2007,81.235,20434176,34435.367
New Zealand,Oceania,2007,80.204,4115771,25185.009


## `mutate`

In [84]:
data_2007 <- data |>
    filter(year == 2007)

In [85]:
data_2007 <- data_2007 |>
    mutate(gdpTotal = pop * gdpPercap)


In [76]:
data_2007 |> head()

country,continent,year,lifeExp,pop,gdpPercap,gdpTotal
<fct>,<fct>,<int>,<dbl>,<int>,<dbl>,<dbl>
Afghanistan,Asia,2007,43.828,31889923,974.5803,31079291949
Albania,Europe,2007,76.423,3600523,5937.0295,21376411360
Algeria,Africa,2007,72.301,33333216,6223.3675,207444851958
Angola,Africa,2007,42.731,12420476,4797.2313,59583895818
Argentina,Americas,2007,75.32,40301927,12779.3796,515033625357
Australia,Oceania,2007,81.235,20434176,34435.3674,703658358894


In [86]:
# Several columns can be created or overwritten in a single mutate() call.
data_2007 <- data_2007 |>
    mutate(gdpTotal = pop * gdpPercap,
          # toupper() returns a character vector, so country stops being a factor.
          country = toupper(country),
          # Round life expectancy to whole years.
          lifeExp = round(lifeExp))
data_2007 |> head()

country,continent,year,lifeExp,pop,gdpPercap,gdpTotal
<chr>,<fct>,<int>,<dbl>,<int>,<dbl>,<dbl>
AFGHANISTAN,Asia,2007,44,31889923,974.5803,31079291949
ALBANIA,Europe,2007,76,3600523,5937.0295,21376411360
ALGERIA,Africa,2007,72,33333216,6223.3675,207444851958
ANGOLA,Africa,2007,43,12420476,4797.2313,59583895818
ARGENTINA,Americas,2007,75,40301927,12779.3796,515033625357
AUSTRALIA,Oceania,2007,81,20434176,34435.3674,703658358894


Замінимо значення `Europe` на `EU` у стовпці `continent`:

In [87]:
# Recode the continent label "Europe" to "EU".
# The factor is converted to character so the comparison and the replacement
# operate on plain strings; the result is turned back into a factor.
data_2007 <- mutate(
    data_2007,
    continent = as.factor(
        ifelse(
            as.character(continent) == "Europe",
            "EU",
            as.character(continent)
        )
    )
)

In [89]:
data_2007 |> head()

country,continent,year,lifeExp,pop,gdpPercap,gdpTotal
<chr>,<fct>,<int>,<dbl>,<int>,<dbl>,<dbl>
AFGHANISTAN,Asia,2007,44,31889923,974.5803,31079291949
ALBANIA,EU,2007,76,3600523,5937.0295,21376411360
ALGERIA,Africa,2007,72,33333216,6223.3675,207444851958
ANGOLA,Africa,2007,43,12420476,4797.2313,59583895818
ARGENTINA,Americas,2007,75,40301927,12779.3796,515033625357
AUSTRALIA,Oceania,2007,81,20434176,34435.3674,703658358894


In [93]:
# transmute() computes the new column and keeps only it (see the one-column output).
data_2007 |>
    transmute(gdpTotal = pop * gdpPercap) |>
    head()

gdpTotal
<dbl>
31079291949
21376411360
207444851958
59583895818
515033625357
703658358894


## `rename`

In [102]:
data <- gapminder
data |> head()

country,continent,year,lifeExp,pop,gdpPercap
<fct>,<fct>,<int>,<dbl>,<int>,<dbl>
Afghanistan,Asia,1952,28.801,8425333,779.4453
Afghanistan,Asia,1957,30.332,9240934,820.853
Afghanistan,Asia,1962,31.997,10267083,853.1007
Afghanistan,Asia,1967,34.02,11537966,836.1971
Afghanistan,Asia,1972,36.088,13079460,739.9811
Afghanistan,Asia,1977,38.438,14880372,786.1134


In [103]:
data <- data |>
    rename(population = pop)

In [104]:
data |> head()

country,continent,year,lifeExp,population,gdpPercap
<fct>,<fct>,<int>,<dbl>,<int>,<dbl>
Afghanistan,Asia,1952,28.801,8425333,779.4453
Afghanistan,Asia,1957,30.332,9240934,820.853
Afghanistan,Asia,1962,31.997,10267083,853.1007
Afghanistan,Asia,1967,34.02,11537966,836.1971
Afghanistan,Asia,1972,36.088,13079460,739.9811
Afghanistan,Asia,1977,38.438,14880372,786.1134


In [105]:
# rename_if

In [107]:
# rename_at

## `group_by` + `summarise`

In [109]:
# Mean life expectancy and number of rows per continent for 2007.
data |> 
    filter(year == 2007) |>
    group_by(continent) |>
    summarise(
        lifeExp2007 = mean(lifeExp),
        # n() counts the rows in each group.
        countriesCount = n()
    )

continent,lifeExp2007,countriesCount
<fct>,<dbl>,<int>
Africa,54.80604,52
Americas,73.60812,25
Asia,70.72848,33
Europe,77.6486,30
Oceania,80.7195,2


In [112]:
# Per-continent, per-year summary for the survey years after 2000.
data |> 
    filter(year > 2000) |>
    group_by(continent, year) |>
    summarise(
        # NOTE(review): despite its name, this is the mean for each year in the
        # result (2002 and 2007), not only for 2007.
        lifeExp2007 = mean(lifeExp),
        countriesCount = n(),
        population = sum(population)
    )
# The result stays grouped by continent (see the message printed below);
# the `.groups` argument of summarise() controls this.

[1m[22m`summarise()` has grouped output by 'continent'. You can override using the
`.groups` argument.


continent,year,lifeExp2007,countriesCount,population
<fct>,<int>,<dbl>,<int>,<dbl>
Africa,2002,53.32523,52,833723916
Africa,2007,54.80604,52,929539692
Americas,2002,72.42204,25,849772762
Americas,2007,73.60812,25,898871184
Asia,2002,69.23388,33,3601802203
Asia,2007,70.72848,33,3811953827
Europe,2002,76.7006,30,578223869
Europe,2007,77.6486,30,586098529
Oceania,2002,79.74,2,23454829
Oceania,2007,80.7195,2,24549947


## `bind_`

In [117]:
data07 <- data |> 
    filter(year == 2007) |>
    group_by(continent) |>
    summarise(
        lifeExp = mean(lifeExp),
        countriesCount = n()
    )
data07 |> head()

continent,lifeExp,countriesCount
<fct>,<dbl>,<int>
Africa,54.80604,52
Americas,73.60812,25
Asia,70.72848,33
Europe,77.6486,30
Oceania,80.7195,2


In [119]:
data02 <- data |> 
    filter(year == 2002) |>
    group_by(continent) |>
    summarise(
        lifeExp = mean(lifeExp),
        countriesCount = n()
    )
data02 |> head()

continent,lifeExp,countriesCount
<fct>,<dbl>,<int>
Africa,53.32523,52
Americas,72.42204,25
Asia,69.23388,33
Europe,76.7006,30
Oceania,79.74,2


In [124]:
data_all <- data02 |> bind_rows(data07)
data_all |> head(10)

continent,lifeExp,countriesCount
<fct>,<dbl>,<int>
Africa,53.32523,52
Americas,72.42204,25
Asia,69.23388,33
Europe,76.7006,30
Oceania,79.74,2
Africa,54.80604,52
Americas,73.60812,25
Asia,70.72848,33
Europe,77.6486,30
Oceania,80.7195,2


In [125]:
data07 <- data |> 
    filter(year == 2007) |>
    group_by(continent) |>
    summarise(
        lifeExp = mean(lifeExp),
        countriesCount = n()
    )
data07 |> head()

continent,lifeExp,countriesCount
<fct>,<dbl>,<int>
Africa,54.80604,52
Americas,73.60812,25
Asia,70.72848,33
Europe,77.6486,30
Oceania,80.7195,2


In [127]:
# Total population per continent in 2007.
data07_pop <- data |> 
    filter(year == 2007) |>
    group_by(continent) |>
    summarise(
        # NOTE(review): `populatiom` looks like a typo for `population`; later
        # cells refer to the column by this exact spelling, so rename them together.
        populatiom = sum(population)
    )
data07_pop |> head()

continent,populatiom
<fct>,<dbl>
Africa,929539692
Americas,898871184
Asia,3811953827
Europe,586098529
Oceania,24549947


In [130]:
# Glue the two per-continent summaries side by side. Both inputs carry a
# `continent` column, so bind_cols() renames them to `continent...1` and
# `continent...4`; drop the duplicate and restore the original name.
data_all <- bind_cols(data07, data07_pop) |>
    select(-`continent...4`) |>
    rename(continent = `continent...1`)
head(data_all)

[1m[22mNew names:
[36m*[39m `continent` -> `continent...1`
[36m*[39m `continent` -> `continent...4`


continent,lifeExp,countriesCount,populatiom
<fct>,<dbl>,<int>,<dbl>
Africa,54.80604,52,929539692
Americas,73.60812,25,898871184
Asia,70.72848,33,3811953827
Europe,77.6486,30,586098529
Oceania,80.7195,2,24549947


In [134]:
data07_pop <- data07_pop |> arrange(populatiom)
data07_pop |> head()

continent,populatiom
<fct>,<dbl>
Oceania,24549947
Europe,586098529
Americas,898871184
Africa,929539692
Asia,3811953827


In [136]:
# bind_cols() pairs rows purely by position. data07_pop has just been re-sorted
# by population, so each continent's summary ends up next to a different
# continent's population (compare continent...1 with continent...4 below).
data_all <- data07 |> 
    bind_cols(data07_pop) 
data_all |> head()

[1m[22mNew names:
[36m*[39m `continent` -> `continent...1`
[36m*[39m `continent` -> `continent...4`


continent...1,lifeExp,countriesCount,continent...4,populatiom
<fct>,<dbl>,<int>,<fct>,<dbl>
Africa,54.80604,52,Oceania,24549947
Americas,73.60812,25,Europe,586098529
Asia,70.72848,33,Americas,898871184
Europe,77.6486,30,Africa,929539692
Oceania,80.7195,2,Asia,3811953827


## `join`

In [137]:
data07 <- data |> 
    filter(year == 2007) |>
    group_by(continent) |>
    summarise(
        lifeExp = mean(lifeExp),
        countriesCount = n()
    )
data07 |> head()

continent,lifeExp,countriesCount
<fct>,<dbl>,<int>
Africa,54.80604,52
Americas,73.60812,25
Asia,70.72848,33
Europe,77.6486,30
Oceania,80.7195,2


In [139]:
data07_pop <- data07_pop |> arrange(populatiom)
data07_pop |> head()

continent,populatiom
<fct>,<dbl>
Oceania,24549947
Europe,586098529
Americas,898871184
Africa,929539692
Asia,3811953827


In [141]:
# Match rows on the shared `continent` key rather than on row position, so the
# different row order of data07_pop does not matter.
data_all <- left_join(data07, data07_pop, by = "continent")
data_all

continent,lifeExp,countriesCount,populatiom
<fct>,<dbl>,<int>,<dbl>
Africa,54.80604,52,929539692
Americas,73.60812,25,898871184
Asia,70.72848,33,3811953827
Europe,77.6486,30,586098529
Oceania,80.7195,2,24549947


In [143]:
# Two small tables sharing the key column `Letter`:
# "E" occurs only in the first one and "F" only in the second.
first_df <- data.frame(
    Letter = LETTERS[1:5],
    Value = 1:5
)

second_df <- data.frame(
    Letter = c(LETTERS[1:4], "F"),
    Value = c(12, 7, 4, 1, 5)
)

first_df
second_df

Letter,Value
<chr>,<int>
A,1
B,2
C,3
D,4
E,5


Letter,Value
<chr>,<dbl>
A,12
B,7
C,4
D,1
F,5


In [145]:
# Keeps every row of first_df; a letter missing from second_df ("E") gets NA in Value.y.
first_df |> left_join(second_df, by = "Letter")

Letter,Value.x,Value.y
<chr>,<int>,<dbl>
A,1,12.0
B,2,7.0
C,3,4.0
D,4,1.0
E,5,


In [147]:
# Keeps every row of second_df; a letter missing from first_df ("F") gets NA in Value.x.
first_df |> right_join(second_df, by = "Letter")

Letter,Value.x,Value.y
<chr>,<int>,<dbl>
A,1.0,12
B,2.0,7
C,3.0,4
D,4.0,1
F,,5


In [149]:
# Keeps only the letters present in both tables.
first_df |> inner_join(second_df, by = "Letter")

Letter,Value.x,Value.y
<chr>,<int>,<dbl>
A,1,12
B,2,7
C,3,4
D,4,1


In [151]:
# Keeps the letters from either table, with NA where a table has no matching row.
first_df |> full_join(second_df, by = "Letter")

Letter,Value.x,Value.y
<chr>,<int>,<dbl>
A,1.0,12.0
B,2.0,7.0
C,3.0,4.0
D,4.0,1.0
E,5.0,
F,,5.0


In [155]:
# Wide-format sample: one row per person and one salary column per quarter of 2021.
not_good_data <- data.frame(
    Name    = c("Nick", "Jake", "Anna", "Jane", "Dina"),
    q1_2021 = c(12442, 22131, 21343, 22111, 14123),
    q2_2021 = c(13442, 22871, 20343, 22222, 14456),
    q3_2021 = c(15482, 22031, 22456, 22444, 14533),
    q4_2021 = c(14511, 20031, 21741, 22333, 14511)
)
not_good_data

Name,q1_2021,q2_2021,q3_2021,q4_2021
<chr>,<dbl>,<dbl>,<dbl>,<dbl>
Nick,12442,13442,15482,14511
Jake,22131,22871,22031,20031
Anna,21343,20343,22456,21741
Jane,22111,22222,22444,22333
Dina,14123,14456,14533,14511


In [156]:
library(tidyr)

In [158]:
# Wide -> long: columns 2..5 are stacked into a label column (`quater`) and a
# value column (`salary`).
# NOTE(review): gather() is superseded in tidyr; pivot_longer() is the current
# equivalent. `quater` is presumably a misspelling of `quarter`; later cells
# use the same spelling.
better_data <- not_good_data |>
    gather(quater, salary, 2:5)
better_data

Name,quater,salary
<chr>,<chr>,<dbl>
Nick,q1_2021,12442
Jake,q1_2021,22131
Anna,q1_2021,21343
Jane,q1_2021,22111
Dina,q1_2021,14123
Nick,q2_2021,13442
Jake,q2_2021,22871
Anna,q2_2021,20343
Jane,q2_2021,22222
Dina,q2_2021,14456


In [161]:
# Same reshaping, with the columns to stack given as a name range instead of
# positions.
better_data <- not_good_data |>
    gather(quater, salary, q1_2021:q4_2021)
better_data |> head()

Unnamed: 0_level_0,Name,quater,salary
Unnamed: 0_level_1,<chr>,<chr>,<dbl>
1,Nick,q1_2021,12442
2,Jake,q1_2021,22131
3,Anna,q1_2021,21343
4,Jane,q1_2021,22111
5,Dina,q1_2021,14123
6,Nick,q2_2021,13442


In [164]:
# Split the "q<N>_<year>" labels into numeric quarter and year columns.
best_data <- better_data |>
    # "q1_2021" -> quater = "q1", year = "2021"
    separate(quater, c("quater", "year"), sep = "_") |>
    mutate(year = as.numeric(year),
          # Use the piped `quater` column instead of reaching back into
          # `better_data$quater`: the outer reference only lines up while both
          # data frames keep the same row order and row count.
          quater = as.numeric(substr(quater, 2, 2)))
best_data

Name,quater,year,salary
<chr>,<dbl>,<dbl>,<dbl>
Nick,1,2021,12442
Jake,1,2021,22131
Anna,1,2021,21343
Jane,1,2021,22111
Dina,1,2021,14123
Nick,2,2021,13442
Jake,2,2021,22871
Anna,2,2021,20343
Jane,2,2021,22222
Dina,2,2021,14456


In [166]:
best_data |> filter(Name == "Jane")

Name,quater,year,salary
<chr>,<dbl>,<dbl>,<dbl>
Jane,1,2021,22111
Jane,2,2021,22222
Jane,3,2021,22444
Jane,4,2021,22333


In [169]:
# Combine quarter and year into a single "<quarter>#<year>" label column.
united_data <- best_data |>
    # remove = FALSE keeps the source columns next to the new Qt_Y column.
    # (TRUE/FALSE are spelled out because T and F can be reassigned.)
    unite(Qt_Y, quater, year, sep = "#", remove = FALSE)
united_data

Name,Qt_Y,quater,year,salary
<chr>,<chr>,<dbl>,<dbl>,<dbl>
Nick,1#2021,1,2021,12442
Jake,1#2021,1,2021,22131
Anna,1#2021,1,2021,21343
Jane,1#2021,1,2021,22111
Dina,1#2021,1,2021,14123
Nick,2#2021,2,2021,13442
Jake,2#2021,2,2021,22871
Anna,2#2021,2,2021,20343
Jane,2#2021,2,2021,22222
Dina,2#2021,2,2021,14456


In [171]:
# Long -> wide: each distinct `quater` label becomes its own salary column.
# NOTE(review): spread() is superseded in tidyr; pivot_wider() is the current
# equivalent.
final_data <- better_data |>
    spread(quater, salary)
final_data

Name,q1_2021,q2_2021,q3_2021,q4_2021
<chr>,<dbl>,<dbl>,<dbl>,<dbl>
Anna,21343,20343,22456,21741
Dina,14123,14456,14533,14511
Jake,22131,22871,22031,20031
Jane,22111,22222,22444,22333
Nick,12442,13442,15482,14511


In [172]:
# One row per country and one life-expectancy column per survey year.
gapminder |>
    select(country, lifeExp, year) |>
    spread(year, lifeExp) |>
    head()

country,1952,1957,1962,1967,1972,1977,1982,1987,1992,1997,2002,2007
<fct>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
Afghanistan,28.801,30.332,31.997,34.02,36.088,38.438,39.854,40.822,41.674,41.763,42.129,43.828
Albania,55.23,59.28,64.82,66.22,67.69,68.93,70.42,72.0,71.581,72.95,75.651,76.423
Algeria,43.077,45.685,48.303,51.407,54.518,58.014,61.368,65.799,67.744,69.152,70.994,72.301
Angola,30.015,31.999,34.0,35.985,37.928,39.483,39.942,39.906,40.647,40.963,41.003,42.731
Argentina,62.485,64.399,65.142,65.634,67.065,68.481,69.942,70.774,71.868,73.275,74.34,75.32
Australia,69.12,70.33,70.93,71.1,71.93,73.49,74.74,76.32,77.56,78.83,80.37,81.235
