We will work with the `flights` dataset for these exercises.


* Select all flights on January 1st

```r
# Keep only the rows whose month and day are both 1.
jan1 <- flights %>%
  filter(month == 1 & day == 1)
```

* Find flights that weren’t delayed (on arrival or departure) by more than two hours
```r
# Two formulations of the same condition: negate "either delay exceeds
# 120 minutes", or require both delays to be at most 120 minutes.
flights %>%
  filter(!(arr_delay > 120 | dep_delay > 120))
flights %>%
  filter(arr_delay <= 120 & dep_delay <= 120)
```



* How many flights have a missing `dep_time`?  What might these rows represent?

```r
# Count the rows with no recorded departure time. This answers "how many"
# directly instead of relying on the row count in the printed tibble header.
sum(is.na(flights$dep_time))

# Inspect those rows; they most likely represent cancelled flights.
filter(flights, is.na(dep_time))
```



```r
# Column-wise overview of the whole table.
flights %>%
  summary()
# Flights with a recorded departure time but no recorded arrival time.
flights %>%
  filter(!is.na(dep_time) & is.na(arr_time))
```


* How does `arrange()` work with missing values? How could you use `arrange()` to sort all missing values to the start?

```r
# Ascending sort; look at the last rows of the result.
flights %>%
  arrange(dep_time) %>%
  tail()
# Descending sort on the same column.
flights %>%
  arrange(desc(dep_time))
# Sort on the missingness indicator first (TRUE before FALSE), then on
# dep_time, so rows with a missing dep_time come first.
flights %>%
  arrange(desc(is.na(dep_time)), dep_time)

```


* Sort flights to find the most delayed flights. 

```r
# Largest departure delay first.
flights %>%
  arrange(desc(dep_delay))
```


* Sort flights to find the fastest flights (in terms of flight time and speed).
```r
# Shortest time in the air first.
flights %>%
  arrange(air_time)
# Highest distance covered per unit of air time first.
flights %>%
  arrange(desc(distance / air_time))
# Same ranking with the speed shown as a column; the delay and time columns
# are dropped from the result.
flights %>%
  mutate(speed = distance / air_time) %>%
  arrange(desc(speed)) %>%
  select(-ends_with("delay"), -contains("time"))
```

* Currently `dep_time` and `sched_dep_time` are convenient to look at, but hard to compute with because they’re not really continuous numbers. Convert them to a more convenient representation of number of minutes since midnight.

```r
 # Split each clock value into its hundreds part (hours) and remainder
 # (minutes), combine them into minutes, and wrap with %% 1440 so that 2400
 # maps to 0.
 flights %>%
   mutate(
     dep_time_mins = (60 * (dep_time %/% 100) + dep_time %% 100) %% 1440,
     sched_dep_time_mins =
       (60 * (sched_dep_time %/% 100) + sched_dep_time %% 100) %% 1440
   ) %>%
   select(dep_time, dep_time_mins, sched_dep_time, sched_dep_time_mins)
```

* Compare `air_time` with `arr_time - dep_time`. What do you expect to see? What do you see? What do you need to do to fix it?

```r
# Convert both clock times to minutes since midnight, then take the gap
# between the reported air_time and the elapsed clock time.
flights_airtime <- flights %>%
  mutate(
    dep_time = (60 * (dep_time %/% 100) + dep_time %% 100) %% 1440,
    arr_time = (60 * (arr_time %/% 100) + arr_time %% 100) %% 1440,
    air_time_diff = air_time - (arr_time - dep_time)
  )

# Number of flights where the two measures disagree.
flights_airtime %>%
  filter(air_time_diff != 0) %>%
  nrow()
```

<!-- The flight passes midnight, so arr_time < dep_time. In these cases, the computed difference will be off by 24 hours (1,440 minutes). -->

<!-- The flight crosses time zones, and the total air time will be off by hours (multiples of 60).  -->

```{r}
# Distribution of the air-time discrepancy, one bin per unit.
ggplot(flights_airtime) +
  geom_histogram(aes(x = air_time_diff), binwidth = 1)

```

```{r}
# Same histogram restricted to flights whose destination is LAX.
flights_airtime %>%
  filter(dest == "LAX") %>%
  ggplot(aes(x = air_time_diff)) +
  geom_histogram(binwidth = 1)

```

* Compare `dep_time`, `sched_dep_time`, and `dep_delay`. How would you expect those three numbers to be related?

```r
# Express actual and scheduled departure as minutes since midnight, then
# measure how far dep_delay is from their difference.
flights_deptime <- flights %>%
  mutate(
    dep_time_min = (60 * (dep_time %/% 100) + dep_time %% 100) %% 1440,
    sched_dep_time_min =
      (60 * (sched_dep_time %/% 100) + sched_dep_time %% 100) %% 1440,
    dep_delay_diff = dep_delay - (dep_time_min - sched_dep_time_min)
  )

# Rows where dep_delay does not equal actual minus scheduled departure.
flights_deptime %>%
  filter(dep_delay_diff != 0)

```

```r
# Scheduled departure (in minutes) against the positive discrepancies only.
flights_deptime %>%
  filter(dep_delay_diff > 0) %>%
  ggplot(aes(x = dep_delay_diff, y = sched_dep_time_min)) +
  geom_point()
```

* Use the `summarize()` function to get the mean of `dep_delay` for each day.

```r
# One row per calendar day; missing delays are ignored when averaging.
group_by(flights, year, month, day) %>%
  summarize(
    mean = mean(dep_delay, na.rm = TRUE)
  )
```


* Obtain the average delays of the planes (identified by their tail number). Which one has the highest delay? Plot a histogram for the average delays.
```r
# Average arrival delay and number of flights for each tail number.
delays <- group_by(flights, tailnum) %>%
  summarize(
    delay = mean(arr_delay, na.rm = TRUE),
    n = n()
  )

# Planes with the highest average delay first.
delays %>%
  arrange(desc(delay))

# Histogram of the average delays over all planes.
ggplot(delays, aes(x = delay)) +
  geom_histogram(binwidth = 10)

# The other end of the ranking.
delays %>%
  arrange(desc(delay)) %>%
  tail()
# Look up the individual flights of one tail number.
filter(flights, tailnum == "N768SK")

# Histogram again, this time dropping planes with no computable average.
delays %>%
  filter(!is.na(delay)) %>%
  ggplot(aes(x = delay)) +
  geom_histogram(binwidth = 10)
```

*   When do the first and last flights leave each day?

```{r}
# Drop every flight that is missing either its arrival or departure delay.
not_cancelled <- filter(
  flights,
  !(is.na(arr_delay) | is.na(dep_delay))
)
```


```{r}
# Smallest and largest actual departure time on each day.
group_by(not_cancelled, year, month, day) %>%
  summarize(first = min(dep_time), last = max(dep_time))
```



```{r}
# Distribution of actual departure times.
ggplot(not_cancelled, aes(x = dep_time)) +
  geom_histogram(binwidth = 10)

# Smallest and largest scheduled departure time on each day.
group_by(not_cancelled, year, month, day) %>%
  summarize(first = min(sched_dep_time), last = max(sched_dep_time))
```


* Which destinations have the most carriers?
```r
# Number of distinct carriers per destination, largest first.
group_by(not_cancelled, dest) %>%
  summarize(carriers = n_distinct(carrier)) %>%
  arrange(desc(carriers))

# Look up three airport codes in the airports table.
airports %>%
  filter(faa %in% c("ATL", "BOS", "CLT"))
```


*  What proportion of flights are delayed by more than an hour each day?
```r
# The mean of a logical vector is the share of TRUE values, i.e. the daily
# share of flights arriving more than 60 minutes late. Worst days first.
group_by(not_cancelled, year, month, day) %>%
  summarize(
    hour_prop = mean(arr_delay > 60)
  ) %>%
  arrange(desc(hour_prop))
```

```r
# Weather records for March 8, without the origin and year columns,
# printing up to 100 rows.
weather %>%
  filter(month == 3 & day == 8) %>%
  select(-origin, -year) %>%
  print(n = 100)

# Counts of each visibility value.
ggplot(weather, aes(x = visib)) +
  geom_bar()

# Wind speed distribution, with the x axis limited to 0-50.
ggplot(weather, aes(x = wind_speed)) +
  geom_histogram() +
  scale_x_continuous(limits = c(0, 50))

```

*  Which carrier has the worst delays? 
```r
# Average arrival delay per carrier, worst first.
group_by(not_cancelled, carrier) %>%
  summarize(
    arr_delay = mean(arr_delay, na.rm = TRUE)
  ) %>%
  arrange(desc(arr_delay))

# Look up the carrier code F9 in the airlines table.
filter(airlines, carrier == "F9")
```


* What time of day should you fly if you want to avoid delays as much as possible?

```r
# Average arrival delay for each value of hour, smallest delay first.
group_by(not_cancelled, hour) %>%
  summarize(
    arr_delay = mean(arr_delay, na.rm = TRUE)
  ) %>%
  arrange(arr_delay)

```


*  Look at the number of cancelled flights per day. Is there a pattern? Is the proportion of cancelled flights related to the average delay?

```r
# A flight is flagged as cancelled only when both delays are missing.
# Per day: number of flagged flights and total number of flights.
cancelled_per_day <- flights %>%
  mutate(cancelled = is.na(arr_delay) & is.na(dep_delay)) %>%
  group_by(year, month, day) %>%
  summarize(
    cancelled_num = sum(cancelled),
    flights_num = n()
  )

# Daily cancellations against daily flight volume.
ggplot(cancelled_per_day, aes(x = flights_num, y = cancelled_num)) +
  geom_point()
```

```r
# Per day: share of flights with both delays missing, plus the average
# departure and arrival delays over the flights that have them.
cancelled_and_delays <- flights %>%
  mutate(cancelled = is.na(arr_delay) & is.na(dep_delay)) %>%
  group_by(year, month, day) %>%
  summarize(
    cancelled_prop = mean(cancelled),
    avg_dep_delay = mean(dep_delay, na.rm = TRUE),
    avg_arr_delay = mean(arr_delay, na.rm = TRUE)
  )

# Daily cancellation share against the daily average departure delay.
ggplot(cancelled_and_delays, aes(x = avg_dep_delay, y = cancelled_prop)) +
  geom_point()
```


* Find all destinations that are flown by at least two carriers. Use that information to rank the carriers.

```r
# Total number of distinct destinations in the data.
summarize(flights, n_distinct(dest))

# Keep only destinations served by more than one carrier, then rank the
# carriers by the number of such destinations they serve.
flights %>%
  group_by(dest) %>%
  filter(n_distinct(carrier) > 1) %>%
  group_by(carrier) %>%
  summarize(n_dest = n_distinct(dest)) %>%
  arrange(desc(n_dest))

# Rank carriers by number of destinations, without the restriction above.
group_by(flights, carrier) %>%
  summarize(n_dest = n_distinct(dest)) %>%
  arrange(desc(n_dest))

# Look up three carrier codes in the airlines table.
filter(airlines, carrier %in% c("EV", "9E", "UA"))

```