In [1]:
library(tidyverse)
fname = file.path("ME315", "voos", "flights_small.csv.zip")
file.exists(fname)

── [1mAttaching packages[22m ─────────────────────────────────────── tidyverse 1.2.1 ──
[32m✔[39m [34mggplot2[39m 3.2.0     [32m✔[39m [34mpurrr  [39m 0.3.2
[32m✔[39m [34mtibble [39m 2.1.3     [32m✔[39m [34mdplyr  [39m 0.8.3
[32m✔[39m [34mtidyr  [39m 0.8.3     [32m✔[39m [34mstringr[39m 1.4.0
[32m✔[39m [34mreadr  [39m 1.3.1     [32m✔[39m [34mforcats[39m 0.4.0
── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()


In [2]:
#Exemplo 1: Leitura em partições

input = read_csv(fname, n_max = 1000)
input %>% head

Parsed with column specification:
cols(
  .default = col_double(),
  AIRLINE = [31mcol_character()[39m,
  TAIL_NUMBER = [31mcol_character()[39m,
  ORIGIN_AIRPORT = [31mcol_character()[39m,
  DESTINATION_AIRPORT = [31mcol_character()[39m,
  SCHEDULED_DEPARTURE = [31mcol_character()[39m,
  DEPARTURE_TIME = [31mcol_character()[39m,
  WHEELS_OFF = [31mcol_character()[39m,
  WHEELS_ON = [31mcol_character()[39m,
  SCHEDULED_ARRIVAL = [31mcol_character()[39m,
  ARRIVAL_TIME = [31mcol_character()[39m,
  CANCELLATION_REASON = [31mcol_character()[39m
)
See spec(...) for full column specifications.


YEAR,MONTH,DAY,DAY_OF_WEEK,AIRLINE,FLIGHT_NUMBER,TAIL_NUMBER,ORIGIN_AIRPORT,DESTINATION_AIRPORT,SCHEDULED_DEPARTURE,⋯,ARRIVAL_TIME,ARRIVAL_DELAY,DIVERTED,CANCELLED,CANCELLATION_REASON,AIR_SYSTEM_DELAY,SECURITY_DELAY,AIRLINE_DELAY,LATE_AIRCRAFT_DELAY,WEATHER_DELAY
<dbl>,<dbl>,<dbl>,<dbl>,<chr>,<dbl>,<chr>,<chr>,<chr>,<chr>,⋯,<chr>,<dbl>,<dbl>,<dbl>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
2015,7,8,3,WN,2414,N8634A,DEN,MCO,650,⋯,1225,0,0,0,,,,,,
2015,3,11,3,US,1850,N187US,PHL,MCO,650,⋯,913,-17,0,0,,,,,,
2015,8,4,2,MQ,3429,N663MQ,CMI,ORD,710,⋯,752,-31,0,0,,,,,,
2015,6,4,4,AS,658,N409AS,SEA,DFW,645,⋯,1225,-15,0,0,,,,,,
2015,4,7,2,DL,1061,N3758Y,LAX,MCO,1025,⋯,1821,18,0,0,,0.0,0.0,2.0,16.0,0.0
2015,7,17,5,DL,913,N317US,TVC,MSP,1627,⋯,1641,-8,0,0,,,,,,


In [8]:
# Aqui estou querendo calcular a quantidade de voos mensais por companhia aérea
# Para tanto: 1) agrupamos por MONTH e AIRLINE e 2) somamos a quantidade de voos
# Para o item 2) podemos utilizar summarize, count ou tally

#Aqui estamos utilizando apenas as 1000 primeiras linhas do banco de dados

res1 = input %>% drop_na(MONTH, AIRLINE) %>%
    group_by(MONTH, AIRLINE) %>%
    tally()

In [5]:
#Agora, vamos criar uma função para iniciar a leitura por partições.
#Anteriormente estudamos as 1000 linhas. Agora, iremos ler essas mil linhas usando o príncipio de partições.

#Criando função getStats
#Observe que o código DENTRO DA FUNÇÃO é o mesmo que criamos anteriormente!!!
getStats = function(inout, pos){
    input %>% drop_na(MONTH, AIRLINE) %>%
        group_by(MONTH, AIRLINE) %>%
        tally()
}

suficientes = read_csv_chunked(fname, chunk_size = 1000, callback = DataFrameCallback$new(getStats))

Parsed with column specification:
cols(
  .default = col_double(),
  AIRLINE = [31mcol_character()[39m,
  TAIL_NUMBER = [31mcol_character()[39m,
  ORIGIN_AIRPORT = [31mcol_character()[39m,
  DESTINATION_AIRPORT = [31mcol_character()[39m,
  SCHEDULED_DEPARTURE = [31mcol_character()[39m,
  DEPARTURE_TIME = [31mcol_character()[39m,
  WHEELS_OFF = [31mcol_character()[39m,
  WHEELS_ON = [31mcol_character()[39m,
  SCHEDULED_ARRIVAL = [31mcol_character()[39m,
  ARRIVAL_TIME = [31mcol_character()[39m,
  CANCELLATION_REASON = [31mcol_character()[39m
)
See spec(...) for full column specifications.


In [6]:
suficientes %>% head

MONTH,AIRLINE,n
<dbl>,<chr>,<int>
1,AA,9
1,AS,2
1,B6,3
1,DL,19
1,EV,5
1,F9,1


In [10]:
head(res1)

MONTH,AIRLINE,n
<dbl>,<chr>,<int>
1,AA,9
1,AS,2
1,B6,3
1,DL,19
1,EV,5
1,F9,1


In [38]:
#Agora, iremos ver os os voos mensais por companhia aérea:

computeStats = function(input)
    input %>% group_by(AIRLINE, MONTH) %>%
        summarise(n=sum(n))
suficientes %>% computeStats

AIRLINE,MONTH,n
<chr>,<dbl>,<int>
AA,1,90
AA,2,50
AA,3,100
AA,4,100
AA,5,90
AA,6,60
AA,7,180
AA,8,110
AA,9,170
AA,10,160


In [14]:
#Exemplo 2: Tipos de join

x = data.frame(c1 = c(1,2,3), x1 = rnorm(3))
y = data.frame(c1 = c(0,2,3), y2 = rnorm(3))
x
y

c1,x1
<dbl>,<dbl>
1,-1.207334
2,2.618244
3,-1.163824


c1,y2
<dbl>,<dbl>
0,1.5711702
2,-2.3410132
3,0.7706199


In [15]:
#semi_join: todas as colinas da tabela 1; todos os registros da interseção

x %>% semi_join(y, by = "c1")

c1,x1
<dbl>,<dbl>
2,2.618244
3,-1.163824


In [16]:
#anti_join: todas as colinas da tabela 1; todos os registros que não existirem da tabela 2

x %>% anti_join(y, by = "c1")

c1,x1
<dbl>,<dbl>
1,-1.207334


In [17]:
#left_join: todas as colunas de ambas as tabelas; todos os registros da esquerda

x %>% left_join(y, by = "c1")

c1,x1,y2
<dbl>,<dbl>,<dbl>
1,-1.207334,
2,2.618244,-2.3410132
3,-1.163824,0.7706199


In [18]:
#right_join: todas as colunas de ambas as tabelas; todos os registros da direita

x %>% right_join(y, by = "c1")

c1,x1,y2
<dbl>,<dbl>,<dbl>
0,,1.5711702
2,2.618244,-2.3410132
3,-1.163824,0.7706199


In [19]:
#inner_join: todas as colunas de ambas as tabelas; todos os registros da interseção

x %>% inner_join(y, by = "c1")

c1,x1,y2
<dbl>,<dbl>,<dbl>
2,2.618244,-2.3410132
3,-1.163824,0.7706199


In [20]:
#full_join: todas as colunas de ambas as tabelas; todos os registros de ambas as tabelas

x %>% full_join(y, by = "c1")

c1,x1,y2
<dbl>,<dbl>,<dbl>
1,-1.207334,
2,2.618244,-2.3410132
3,-1.163824,0.7706199
0,,1.5711702


In [None]:
#Primeiramente devemos identificar a chave para fazer a ligação entre as tabelas.
#Se as tabelas tiverem nomes diferentes, fazemos:
#Tabela 1: chama RG; tabela 2: chama RegistroGeral
#x %>% left_join(y, by = "RG"="RegistroGeral")

In [29]:
#Exemplo 3: Gather

taxas = tibble(Pais = c("Brasil", "Argentina"), 
                  "1999" = c(10, 20),
                  "2000" = c(20, 30))
taxas

Pais,1999,2000
<chr>,<dbl>,<dbl>
Brasil,10,20
Argentina,20,30


In [31]:
#A ideia do gather é que o nome das colunas são os valores de uma variável
#Aqui estamos dizendo o nome das colunas explicitamente

taxas %>% gather(Ano, Valor, "1999":"2000")

Pais,Ano,Valor
<chr>,<chr>,<dbl>
Brasil,1999,10
Argentina,1999,20
Brasil,2000,20
Argentina,2000,30


In [32]:
#Ou então podemos selecionar todas a colunas MENOS A COLUNA PAÍS
taxas %>% gather(Ano, Valor, -Pais)

Pais,Ano,Valor
<chr>,<chr>,<dbl>
Brasil,1999,10
Argentina,1999,20
Brasil,2000,20
Argentina,2000,30


In [36]:
#Exemplo 4: Spread

dat2 = tibble(Pais=c("Brasil", "Brasil", 
                     "Argentina", "Argentina"),
             Obs=c("Casos", "Populacao", "Casos", "Populacao"),
             Qtde = c (10, 100000000, 1, 10000000))
dat2

Pais,Obs,Qtde
<chr>,<chr>,<dbl>
Brasil,Casos,10.0
Brasil,Populacao,100000000.0
Argentina,Casos,1.0
Argentina,Populacao,10000000.0


In [37]:
#Não está no formato tidy. Para consertá-lo, utilizamos o comando spread!

dat2 %>% spread(Obs, Qtde)

Pais,Casos,Populacao
<chr>,<dbl>,<dbl>
Argentina,1,10000000.0
Brasil,10,100000000.0
