Jupyter notebook for testing out Plotly graphs. (Once I have the graph code working on this page, I can then add it to my app.py page so that it will get included in my Dash app.)

In [66]:
import pandas as pd
import plotly.express as px
import nbformat
import numpy as np
import plotly.graph_objects as go

In [55]:
airline_color_map = {
    "DL":"#E3132C",
    "UA":"#005DAA",
    "AA":"gray",
    "WN":"#F9B612",
    "AS":"green",
    "B6":"#003876",
    "Other":"#804000"
}
# See https://plotly.com/python/discrete-color/

# RGB color sources:
# Delta: Schemecolor at https://www.schemecolor.com/delta-airlines-logo-colors.php
# United: Keshav Naidu at https://www.schemecolor.com/united-airlines-logo-blue-color.php
# JetBlue: Schemecolor at https://www.schemecolor.com/jetblue-airways-logo-color.php
# AA: I had initially used the gray color provided at 
# https://coloropedia.com/american-airlines-group-colors-logo-codes/ ,
# but that proved to be too light, so I chose a generic gray instead.
# Southwest (WN): https://www.schemecolor.com/southwest-airlines-logo-colors.php
# Color for 'other': https://en.wikipedia.org/wiki/Brown 

## Creating a bar chart showing the top 20 airports by passenger arrivals in 2018:

In [3]:
df_top_20_airports = pd.read_csv('top_20_airports_by_pax_arrivals_2018.csv')
df_top_20_airports

Unnamed: 0,Airport,Route_Type,Passengers,Rank
0,ATL,Domestic,41606158.0,1
1,ATL,International,5782717.0,1
2,LAX,Domestic,29643615.0,2
3,LAX,International,11859213.0,2
4,ORD,Domestic,30105889.0,3
5,ORD,International,6466318.0,3
6,DFW,Domestic,26907771.0,4
7,DFW,International,3915980.0,4
8,JFK,Domestic,14100771.0,5
9,JFK,International,15644170.0,5


In [4]:
top_20_airports_list = list(df_top_20_airports['Airport'].unique())
top_20_airports_list

['ATL',
 'LAX',
 'ORD',
 'DFW',
 'JFK',
 'DEN',
 'SFO',
 'LAS',
 'MCO',
 'SEA',
 'EWR',
 'PHX',
 'IAH',
 'MIA',
 'CLT',
 'BOS',
 'FLL',
 'MSP',
 'DTW',
 'PHL']

In [5]:
fig_top_20_airports_2018 = px.bar(df_top_20_airports, x="Airport", y="Passengers")

In [6]:
fig_top_20_airports_2018

In [7]:
df_aaa = pd.read_csv('local_copy_of_airports_airlines_aircraft_2018.csv')
df_aaa

Unnamed: 0,Airline,Origin_Dest,Plane_Type_Text,Passengers,Airport 1,Airport 2
0,HA,HNL_OGG,Boeing 717-200,1954139.0,HNL,OGG
1,DL,ATL_MCO,Boeing 757-200,1417832.0,ATL,MCO
2,HA,HNL_KOA,Boeing 717-200,1281221.0,HNL,KOA
3,WN,DAL_HOU,Boeing 737-700/700LR/Max 7,1260362.0,DAL,HOU
4,AA,DFW_LAX,Airbus Industrie A321/Lr,1257512.0,DFW,LAX
...,...,...,...,...,...,...
2799,NK,FLL_LGA,Airbus Industrie A320-100/200,100381.0,FLL,LGA
2800,EI,DUB_MCO,Airbus Industrie A330-200,100329.0,DUB,MCO
2801,YX,DCA_MCI,Embraer ERJ-175,100303.0,DCA,MCI
2802,WN,BNA_CLT,Boeing 737-700/700LR/Max 7,100282.0,BNA,CLT


In [8]:
# Consider building a graph that lets you pivot by airline name, airport, and plane type and then display the output in both chart form and table form.

In [9]:
# You could also try creating a chart that lets you compare the presence of a given set of airlines (maybe up to 5) for a given set of airports (maybe up to 20. The charts could be either grouped bar charts or stacked bar charts (to show the airline/airport relationship).

In [10]:
## Top 20 airlines in 2018:

df_top_20_airlines = pd.read_csv('top_20_airlines_by_passengers_2018.csv')
df_top_20_airlines

Unnamed: 0,Airline,Route_Type,Passengers,Rank
0,WN,Domestic,160578652.0,1
1,WN,International,4560615.0,1
2,AA,Domestic,119081636.0,2
3,AA,International,27186303.0,2
4,DL,Domestic,119611221.0,3
5,DL,International,23437824.0,3
6,UA,Domestic,84532551.0,4
7,UA,International,25853375.0,4
8,B6,Domestic,33547937.0,5
9,B6,International,8087960.0,5


Creating a list of the top 4 airlines (which will be useful for a later graph):

In [11]:
top_airlines_list = list(df_top_20_airlines['Airline'][0:4])
top_airlines_list

['WN', 'WN', 'AA', 'AA']

In [12]:
fig_top_20_airlines_2018 = px.bar(df_top_20_airlines, x="Airline", y="Passengers")
fig_top_20_airlines_2018

In [13]:
df_airline_airport_pairs = pd.read_csv('airport_airline_pairs_2018.csv')
df_airline_airport_pairs

Unnamed: 0,Airline,Airport,Route_Type,Destination_Region,Passengers
0,DL,ATL,Domestic,Domestic,30464398.0
1,AA,DFW,Domestic,Domestic,19283327.0
2,AA,CLT,Domestic,Domestic,12510203.0
3,UA,ORD,Domestic,Domestic,10421303.0
4,WN,MDW,Domestic,Domestic,9834058.0
...,...,...,...,...,...
7933,VJT,NRT,International,International,1.0
7934,0WQ,MIA,Domestic,Domestic,1.0
7935,LF,CVG,Domestic,Domestic,1.0
7936,04Q,ORL,Domestic,Domestic,1.0


In [14]:
top_airline_list_as_string = ("|".join(top_airlines_list)) # Converts the airlines in the list to a string value that the following np.where statement can use to create an 'Other' category of airlines
top_airline_list_as_string

'WN|WN|AA|AA'

In [15]:
df_airline_airport_pairs

Unnamed: 0,Airline,Airport,Route_Type,Destination_Region,Passengers
0,DL,ATL,Domestic,Domestic,30464398.0
1,AA,DFW,Domestic,Domestic,19283327.0
2,AA,CLT,Domestic,Domestic,12510203.0
3,UA,ORD,Domestic,Domestic,10421303.0
4,WN,MDW,Domestic,Domestic,9834058.0
...,...,...,...,...,...
7933,VJT,NRT,International,International,1.0
7934,0WQ,MIA,Domestic,Domestic,1.0
7935,LF,CVG,Domestic,Domestic,1.0
7936,04Q,ORL,Domestic,Domestic,1.0


In [16]:
df_top_airlines_and_airports = df_airline_airport_pairs.query("Airport in @top_20_airports_list").copy().reset_index(drop=True)
df_top_airlines_and_airports['Airline'] = np.where(df_top_airlines_and_airports['Airline'].str.contains(top_airline_list_as_string) == False, 'Other', df_top_airlines_and_airports['Airline'])
df_top_airlines_and_airports

Unnamed: 0,Airline,Airport,Route_Type,Destination_Region,Passengers
0,Other,ATL,Domestic,Domestic,30464398.0
1,AA,DFW,Domestic,Domestic,19283327.0
2,AA,CLT,Domestic,Domestic,12510203.0
3,Other,ORD,Domestic,Domestic,10421303.0
4,WN,DEN,Domestic,Domestic,9189061.0
...,...,...,...,...,...
1672,Other,MSP,Domestic,Domestic,1.0
1673,Other,LAX,Domestic,Domestic,1.0
1674,Other,MIA,International,Domestic,1.0
1675,Other,MIA,Domestic,Domestic,1.0


In [17]:
df_top_airlines_and_airports = df_top_airlines_and_airports.pivot_table(index = ["Airline", "Airport"], values = "Passengers", aggfunc = "sum").reset_index()


In [18]:
df_top_airlines_and_airports['Airline'].value_counts()

AA       20
Other    20
WN       18
Name: Airline, dtype: int64

In [19]:
airport_ranks = df_top_20_airports[['Airport', 'Rank']]
airport_ranks

Unnamed: 0,Airport,Rank
0,ATL,1
1,ATL,1
2,LAX,2
3,LAX,2
4,ORD,3
5,ORD,3
6,DFW,4
7,DFW,4
8,JFK,5
9,JFK,5


In [20]:
df_top_airlines_and_airports = df_top_airlines_and_airports.merge(airport_ranks, left_on = "Airport", right_on = "Airport")


In [21]:
df_top_airlines_and_airports.sort_values("Rank", inplace = True)
df_top_airlines_and_airports

Unnamed: 0,Airline,Airport,Passengers,Rank
0,AA,ATL,1272128.0,1
1,AA,ATL,1272128.0,1
2,Other,ATL,41023409.0,1
3,Other,ATL,41023409.0,1
4,WN,ATL,5093338.0,1
...,...,...,...,...
95,Other,PHL,6140048.0,20
94,Other,PHL,6140048.0,20
93,AA,PHL,7463574.0,20
92,AA,PHL,7463574.0,20


## Top 20 US Airports by Airline Share

In [22]:
df_top_20_airports

Unnamed: 0,Airport,Route_Type,Passengers,Rank
0,ATL,Domestic,41606158.0,1
1,ATL,International,5782717.0,1
2,LAX,Domestic,29643615.0,2
3,LAX,International,11859213.0,2
4,ORD,Domestic,30105889.0,3
5,ORD,International,6466318.0,3
6,DFW,Domestic,26907771.0,4
7,DFW,International,3915980.0,4
8,JFK,Domestic,14100771.0,5
9,JFK,International,15644170.0,5


In [23]:
# Top 20 US Airports by Airline Share:

top_airlines_list = list(df_top_20_airlines['Airline'].unique()[0:5])
top_airline_list_as_string = ("|".join(top_airlines_list)) # Converts the airlines in the list to a string value that the following np.where statement can use to create an 'Other' category of airlines
# unique() tags are needed to remove duplicate entries for each airport and
# airline. (These duplicates were created through the addition of 
# domestic/international travel breakdowns for each of the top airports
# and airlines.)

top_20_airports_list = list(df_top_20_airports['Airport'].unique())
print(top_airlines_list, top_20_airports_list)

['WN', 'AA', 'DL', 'UA', 'B6'] ['ATL', 'LAX', 'ORD', 'DFW', 'JFK', 'DEN', 'SFO', 'LAS', 'MCO', 'SEA', 'EWR', 'PHX', 'IAH', 'MIA', 'CLT', 'BOS', 'FLL', 'MSP', 'DTW', 'PHL']


In [24]:
df_airline_airport_pairs = pd.read_csv("airport_airline_pairs_2018.csv")


df_top_airlines_and_airports = df_airline_airport_pairs.query("Airport in @top_20_airports_list").copy().reset_index(drop=True)
df_top_airlines_and_airports['Airline'] = np.where(df_top_airlines_and_airports['Airline'].str.contains(top_airline_list_as_string) == False, 'Other', df_top_airlines_and_airports['Airline'])
df_top_airlines_and_airports = df_top_airlines_and_airports.pivot_table(index = ["Airline", "Airport"], values = "Passengers", aggfunc = "sum").reset_index()

In [25]:
airport_ranks = df_top_20_airports[['Airport', 'Rank']].drop_duplicates()
airport_ranks

Unnamed: 0,Airport,Rank
0,ATL,1
2,LAX,2
4,ORD,3
6,DFW,4
8,JFK,5
10,DEN,6
12,SFO,7
14,LAS,8
16,MCO,9
18,SEA,10


In [None]:
df_top_airlines_and_airports = df_top_airlines_and_airports.merge(airport_ranks, left_on = "Airport", right_on = "Airport")

In [48]:



df_top_airlines_and_airports.sort_values("Rank", inplace = True)

fig_t4_airline_presence_at_t20_airports = px.bar(df_top_airlines_and_airports, x="Airport", y="Passengers", color="Airline", color_discrete_map=airline_color_map, title="Top 20 US Airports by Airline Share in 2018")
fig_t4_airline_presence_at_t20_airports

In [27]:
df_top_hubs = df_airline_airport_pairs.pivot_table(index = ['Airline', 'Airport'], values = 'Passengers', aggfunc = 'sum').reset_index().sort_values('Passengers', ascending = False)
df_top_hubs.head(20)
df_top_hubs['Hub'] = df_top_hubs['Airline'] + ' ' + df_top_hubs['Airport']
df_top_hubs

Unnamed: 0,Airline,Airport,Passengers,Hub
3035,DL,ATL,35049728.0,DL ATL
1910,AA,DFW,21814244.0,AA DFW
1894,AA,CLT,13853081.0,AA CLT
1981,AA,MIA,12595599.0,AA MIA
5826,UA,ORD,12153983.0,UA ORD
...,...,...,...,...
1088,27Q,CAE,1.0,27Q CAE
1087,27Q,BYH,1.0,27Q BYH
4507,LXQ,BOS,1.0,LXQ BOS
1081,27Q,BRO,1.0,27Q BRO


In [57]:
fig_top_hubs = px.bar(df_top_hubs.iloc[0:20, :], x = 'Hub', y = 'Passengers', color = 'Airline', color_discrete_map=airline_color_map)
fig_top_hubs.update_xaxes(categoryorder = 'total descending', title = "Top 20 US Airport Hubs in 2018 by Arriving Passengers") # See https://plotly.com/python/categorical-axes/

## Determining the top international hubs:

In [29]:
df_top_intl_hubs = df_airline_airport_pairs.query("Route_Type == 'International' & Destination_Region == 'Domestic'").pivot_table(index = ['Airline', 'Airport'], values = 'Passengers', aggfunc = 'sum').reset_index().sort_values('Passengers', ascending = False)
# I'd only like to show US airports within this chart, so I chose to filter it to include only domestic airports.
df_top_intl_hubs['Hub'] = df_top_intl_hubs['Airline'] + ' ' + df_top_intl_hubs['Airport']
df_top_intl_hubs.head(20)

Unnamed: 0,Airline,Airport,Passengers,Hub
542,AA,MIA,5324448.0,AA MIA
890,DL,ATL,4585330.0,DL ATL
1695,UA,EWR,3549291.0,UA EWR
1703,UA,IAH,2962868.0,UA IAH
525,AA,DFW,2530917.0,AA DFW
924,DL,JFK,2392013.0,DL JFK
732,B6,JFK,1850171.0,B6 JFK
1731,UA,SFO,1742889.0,UA SFO
1721,UA,ORD,1732680.0,UA ORD
524,AA,CLT,1342878.0,AA CLT


In [56]:
fig_top_intl_hubs = px.bar(df_top_intl_hubs.iloc[0:20, :], x = 'Hub', y = 'Passengers', color = 'Airline', color_discrete_map=airline_color_map)
# fig_top_intl_hubs.update_traces(marker_line=dict(width=1,color='black'))
# See https://plotly.com/python/marker-style/
fig_top_intl_hubs.update_xaxes(categoryorder = 'total descending', title = "Top 20 US Airport Hubs in 2018 by Arriving International Passengers") # See https://plotly.com/python/categorical-axes/

# Creating a table of departures for a given airport:

In [62]:
df_dest_by_origin = pd.read_csv('dest_to_origin_2018.csv')

In [81]:
def create_departures_table(dest_airport):
    df_airport_by_origin = df_dest_by_origin.query("Airport == @dest_airport").sort_values('Passengers', ascending = False).reset_index(drop=True)
    if len(df_airport_by_origin) > 20:
        other_row = [dest_airport, 'Other', sum(df_airport_by_origin.iloc[20:,]['Passengers'])]
        df_airport_by_origin = df_airport_by_origin.iloc[0:20]
        df_airport_by_origin.loc[len(df_airport_by_origin)] = other_row
    pax_sum = sum(df_airport_by_origin['Passengers'])    
    df_airport_by_origin['Share'] = 100*df_airport_by_origin['Passengers'] / pax_sum  
    return (df_airport_by_origin)


In [96]:
df_origins_for_airport = create_departures_table('ABQ')
df_origins_for_airport

Unnamed: 0,Airport,Origin,Passengers,Share
0,ABQ,PHX,327011.0,12.045691
1,ABQ,DEN,294131.0,10.834532
2,ABQ,DFW,287698.0,10.597568
3,ABQ,DAL,190543.0,7.018792
4,ABQ,LAX,185873.0,6.846769
5,ABQ,LAS,166267.0,6.124567
6,ABQ,ATL,133990.0,4.93562
7,ABQ,ORD,115181.0,4.242777
8,ABQ,HOU,115137.0,4.241156
9,ABQ,SAN,95127.0,3.504073


In [97]:
sum(df_origins_for_airport['Share'])

100.0

In [98]:
df = df_origins_for_airport
go.Figure(data=go.Table(
    header = dict(
        values = list(df.columns)
        ), 
    cells = dict(
        values = [df['Airport'], df['Origin'], df['Passengers'], df['Share']] )))

In [99]:
fig_top_origins_for_airport = px.bar(df_origins_for_airport, x = 'Origin', y = 'Passengers')
fig_top_origins_for_airport