# Flight delays

https://www.kaggle.com/usdot/flight-delays

### Description of data 

- YEAR, MONTH, DAY, DAY_OF_WEEK: dates of the flight 
- AIRLINE: airline identifier (the carrier code, matched against `IATA_CODE` in airlines.csv) 
- ORIGIN_AIRPORT and DESTINATION_AIRPORT: code attributed by IATA to identify the airports 
- SCHEDULED_DEPARTURE and SCHEDULED_ARRIVAL : scheduled times of take-off and landing 
- DEPARTURE_TIME and ARRIVAL_TIME: real times at which take-off and landing took place 
- DEPARTURE_DELAY and ARRIVAL_DELAY: difference (in minutes) between planned and real times 
- DISTANCE: distance (in miles) 

### Python DataViz

- MATPLOTLIB: Available directly in pandas
- SEABORN: A layer on top of matplotlib. Makes it pretty. Use it with pandas.
- BOKEH: To build complex interactive applications.
- PLOTLY / DASH: Recent competitor of bokeh.
- ALTAIR: Nice alternative to matplotlib. Resulting charts are cleaner.


In [2]:
import os

import altair as alt    #https://altair-viz.github.io/
import numpy as np
import pandas as pd


In [1]:
## Data 

# Local Load: the three CSV files are expected inside `folder`,
# relative to the notebook's working directory.
folder = 'flight_data'
airlines = pd.read_csv(os.path.join(folder, 'airlines.csv'))
airports = pd.read_csv(os.path.join(folder, 'airports.csv'))
flights = pd.read_csv(os.path.join(folder, 'flights.csv'))

# Load from internet
#airlines = pd.read_csv("https://srv-file20.gofile.io/download/8Awos9/airlines.csv")
#airports = pd.read_csv("https://srv-file20.gofile.io/download/8Awos9/airports.csv")
#flights = pd.read_csv("https://srv-file20.gofile.io/download/8Awos9/flights.csv")


## 3. Merge the airlines and the flights dataset
# Rename the flights key column so both tables share the join key name
# IATA_CODE. Re-running this cell is safe: once renamed, the rename is a no-op.
flights = flights.rename(columns={'AIRLINE': 'IATA_CODE'})

# Inner join (the pandas default, made explicit here): flights whose code has
# no match in `airlines` are dropped from `merged`.
merged = flights.merge(airlines, on='IATA_CODE', how='inner')


NameError: name 'pd' is not defined

## Graphs With Altair

### Let's plot ORIGIN_AIRPORT vs DESTINATION_AIRPORT

Objective: let's explore whether flights between some origin / destination pairs are particularly prone to delays

In [5]:
# First look at the raw data: one point per (destination, origin) pair found in
# the first 500 flights, sized by how many of those flights share the pair.
# The title describes what is actually encoded (a flight count per airport
# pair); neither distance nor delay is plotted here.
basic_scatter = alt.Chart(flights.head(500), title='Number of flights by airport pair').mark_point().encode(
    alt.X('DESTINATION_AIRPORT', title="Destination"),
    alt.Y('ORIGIN_AIRPORT', title="Origin"),
    size='count()',
)
basic_scatter

### Let's keep only the top 20 airports

In [6]:
# top 20 airports, ranked by the number of flights departing from them
top_airports = (
    flights.groupby(['ORIGIN_AIRPORT'])['YEAR']
    .count()
    .sort_values(ascending=False)
    .head(20)
    .index
)

# keep only the flights that both leave from and land at a top-20 airport
is_top_pair = (
    flights['ORIGIN_AIRPORT'].isin(top_airports)
    & flights['DESTINATION_AIRPORT'].isin(top_airports)
)

# computing a few metrics on each airport pair
# - 'median' is passed as a string rather than np.median: recent pandas
#   deprecates numpy callables in agg, and the string keeps the NaN-skipping
#   groupby median.
# - nb_flights counts non-null DEPARTURE_DELAY values, so flights without a
#   recorded departure delay are not counted.
airport_pairs = (
    flights.loc[is_top_pair]
    .groupby(['ORIGIN_AIRPORT', 'DESTINATION_AIRPORT'])
    .agg(
        median_arrival_delay=('ARRIVAL_DELAY', 'median'),
        distance=('DISTANCE', 'mean'),
        nb_flights=('DEPARTURE_DELAY', 'count'),
    )
    .reset_index()
)
airport_pairs.head()

Unnamed: 0,ORIGIN_AIRPORT,DESTINATION_AIRPORT,median_arrival_delay,distance,nb_flights
0,ATL,BOS,-5.0,946,85
1,ATL,CLT,-6.0,226,107
2,ATL,DEN,-1.0,1199,92
3,ATL,DFW,-2.5,731,132
4,ATL,DTW,-3.0,594,88


### Now we get a simple scatter plot that fits on the screen

In [7]:
# Origin/destination grid built from the per-pair summary table:
# one point per airport pair, with the point size driven by `nb_flights`.
basic_scatter = (
    alt.Chart(airport_pairs, title='Arrival Delay by airport Pair')
    .mark_point()
    .encode(
        x=alt.X('DESTINATION_AIRPORT', title="Destination"),
        y=alt.Y('ORIGIN_AIRPORT', title="Origin"),
        size='nb_flights',
    )
)

basic_scatter

## Adding a color palette

In [8]:
# Same grid as the previous chart; the only addition is the colour channel,
# which maps `median_arrival_delay` onto the 'inferno' colour scheme.
basic_scatter = (
    alt.Chart(airport_pairs, title='Arrival Delay by airport Pair')
    .mark_point()
    .encode(
        x=alt.X('DESTINATION_AIRPORT', title="Destination"),
        y=alt.Y('ORIGIN_AIRPORT', title="Origin"),
        size='nb_flights',
        ## New
        color=alt.Color(
            'median_arrival_delay',
            scale=alt.Scale(scheme='inferno'),
            legend=alt.Legend(title="Median Arrival Delay"),
        ),
    )
)

basic_scatter



### Adding a Tooltip

In [27]:
# Coloured bubble grid plus a hover tooltip listing, for the point under the
# cursor, both airports, the flight count and the median arrival delay.
less_basic_scatter = (
    alt.Chart(airport_pairs, title='Arrival Delay by airport Pair')
    .mark_point()
    .encode(
        x=alt.X('DESTINATION_AIRPORT', title="Destination"),
        y=alt.Y('ORIGIN_AIRPORT', title="Origin"),
        size='nb_flights',
        color=alt.Color(
            'median_arrival_delay',
            scale=alt.Scale(scheme='inferno'),
            legend=alt.Legend(title="Median Arrival Delay"),
        ),
        # one Tooltip entry per field, in display order
        tooltip=[
            alt.Tooltip(field)
            for field in (
                'ORIGIN_AIRPORT',
                'DESTINATION_AIRPORT',
                'nb_flights',
                'median_arrival_delay',
            )
        ],
    )
)

less_basic_scatter

### Interactive Charts: selection v1

In [28]:
## SELECTION
# Bucket each pair's distance into 4 equal-width intervals and store the
# interval as text in a `distance_bins` column (added to airport_pairs in place).
airport_pairs['distance_bins'] = pd.cut(airport_pairs['distance'], bins=4).astype(str)
# Multi-selection keyed on the bin label and bound to the chart legend.
selection = alt.selection_multi(fields=['distance_bins'], bind='legend')

## CHART
scatter_w_selection = (
    alt.Chart(airport_pairs, title='Arrival Delay by airport Pair')
    .mark_point()
    .encode(
        x=alt.X('DESTINATION_AIRPORT', title="Destination"),
        y=alt.Y('ORIGIN_AIRPORT', title="Origin"),
        ### NEW: point size now follows the distance bin
        size='distance_bins',
        ### NEW: points matching the selection are opaque, the others are faded
        opacity=alt.condition(selection, alt.value(1), alt.value(0.2)),
        color=alt.Color(
            'median_arrival_delay',
            scale=alt.Scale(scheme='inferno'),
            legend=alt.Legend(title="Median Arrival Delay"),
        ),
        tooltip=[
            alt.Tooltip(field)
            for field in (
                'ORIGIN_AIRPORT',
                'DESTINATION_AIRPORT',
                'nb_flights',
                'median_arrival_delay',
            )
        ],
    )
    ### NEW: register the selection on the chart
    .add_selection(selection)
)

scatter_w_selection

### Interactive Charts: selection but nice

In [31]:
## SELECTION
# Split the distance range into equal-width bins and build each legend label
# from the bin's real upper edge, so the labels stay correct if the data changes.
# DISTANCE is expressed in miles in this dataset, hence the "mi" unit.
# The numeric prefixes keep the labels in distance order when sorted as text.
size_names = ['1. short', '2. medium', '3. long', '4. v.long']
binned, bin_edges = pd.cut(airport_pairs['distance'], len(size_names), retbins=True)
bin_labels = [
    f'{name} (up to {upper:.0f} mi)'
    for name, upper in zip(size_names, bin_edges[1:])
]
bins = binned.cat.rename_categories(bin_labels)
airport_pairs['distance_bins'] = bins
selection = alt.selection_multi(fields=['distance_bins'], bind='legend')

## CHART (NO CHANGE)
scatter_w_selection_nice = alt.Chart(airport_pairs, title='Arrival Delay by airport Pair').mark_point().encode(
    alt.X('DESTINATION_AIRPORT', title="Destination"),
    alt.Y('ORIGIN_AIRPORT', title="Origin"),
    size='distance_bins',
    # points matching the legend selection are opaque, the others are faded
    opacity=alt.condition(selection, alt.value(1), alt.value(0.2)),
    color=alt.Color('median_arrival_delay',
                    scale=alt.Scale(scheme='inferno'),
                    legend=alt.Legend(title="Median Arrival Delay")
                    ),
    tooltip=[  alt.Tooltip('ORIGIN_AIRPORT')
             , alt.Tooltip('DESTINATION_AIRPORT')
             , alt.Tooltip('nb_flights')
             , alt.Tooltip('median_arrival_delay')
            ],
).add_selection(
    selection
)

scatter_w_selection_nice

In [1]:
## ML ?
