Jupyter notebook for testing out Plotly graphs. (Once I have the graph code working on this page, I can then add it to my app.py page so that it will get included in my Dash app.)

In [1]:
import pandas as pd
import plotly.express as px
import nbformat
import numpy as np
import plotly.graph_objects as go

In [2]:
airline_color_map = {
    "DL":"#E3132C",
    "UA":"#005DAA",
    "AA":"gray",
    "WN":"#F9B612",
    "AS":"green",
    "B6":"#003876",
    "Other":"#804000"
}
# See https://plotly.com/python/discrete-color/

# RGB color sources:
# Delta: Schemecolor at https://www.schemecolor.com/delta-airlines-logo-colors.php
# United: Keshav Naidu at https://www.schemecolor.com/united-airlines-logo-blue-color.php
# JetBlue: Schemecolor at https://www.schemecolor.com/jetblue-airways-logo-color.php
# AA: I had initially used the gray color provided at 
# https://coloropedia.com/american-airlines-group-colors-logo-codes/ ,
# but that proved to be too light, so I chose a generic gray instead.
# Southwest (WN): https://www.schemecolor.com/southwest-airlines-logo-colors.php
# Color for 'other': https://en.wikipedia.org/wiki/Brown 

## Interactive air traffic graph:

In [3]:
df_airline_airport_pairs = pd.read_csv('airport_airline_pairs_2018.csv')
all_data_value = 'All_Traffic'
df_airline_airport_pairs[all_data_value] = all_data_value

In [68]:
def update_chart(pivot_values, color_value, route_types_to_show, airports_to_graph, airlines_to_graph):
    # These arguments correspond to the input values
    # listed (in the same order).
    # return f'You have selected {pivot_values}'

    # The following code creates a pivot table version of the DataFrame that 
    # can be used for creating bar charts. It takes the specified pivot values
    # and color values as inputs, and then uses those values to group the
    # data accordingly. The code works with different numbers of pivot values,
    # including zero pivot values.
    # In order to represent all of the specified values, the code creates a 
    # column describing all (or almost all) of the pivot index variables
    # in the other columns, which then gets fed 
    # into the x axis parameter of the bar chart. However, if a color value is
    # also specified, this item does not get added into this column, since this
    # data will already get represented in the bar chart (by means of the color
    # legend). Removing this value helps
    # simplify the final chart output.

    data_source = df_airline_airport_pairs.copy()
    data_source_filtered = data_source.query("Route_Type in @route_types_to_show & Airport in @airports_to_graph & Airline in @airlines_to_graph").copy()

    # The following lines convert dropdown text to 
    # DataFrame column variables where discrepancies
    # exist between the two.
    pivot_values = ['Route_Type' if entry == 'Route Type' else entry for entry in pivot_values]

    if color_value == 'Route Type':
        color_value = 'Route_Type'
    
    color_value = color_value # This color value must also be present
    # within the pivot_values table.
    # group_value = 'Airline'
    if len(pivot_values) == 0:
        data_source_pivot = data_source_filtered.pivot_table(index = 'All_Traffic', values = 'Passengers', aggfunc = 'sum').reset_index()
    else:
        data_source_pivot = data_source_filtered.pivot_table(index = pivot_values, values = 'Passengers', aggfunc = 'sum').reset_index()

    # The following lines create a column containing the values of each of the
    # columns (other than the 'Passengers') column present in the bar chart. A
    # for loop is used so that this column can adapt to different variable
    # choices and different numbers of columns.
    if len(pivot_values) == 0:
        data_descriptor = all_data_value
    else:
        data_descriptor_values = pivot_values.copy()
        print("data_descriptor_values:", data_descriptor_values)
        if (color_value != 'None') & (len(data_descriptor_values) > 1):
            data_descriptor_values.remove(color_value) # If a value will be assigned a
            # color component in the graph, it doesn't need to be assigned a 
            # group component, since it will show up in the graph regardless. Removing 
            # it here helps simplify the graph.
        print("data_descriptor_values:", data_descriptor_values)
        data_descriptor = data_source_pivot[data_descriptor_values[0]].copy() # This copy() statement
        # is needed in order to avoid  modifying this column when the group column
        # gets chosen.
        for i in range(1, len(data_descriptor_values)):
            data_descriptor += ' ' + data_source_pivot[data_descriptor_values[i]]

    data_source_pivot['Group'] = data_descriptor

    data_source_pivot.head(5)

    output_histogram = px.histogram(data_source_pivot, x = 'Group', y = 'Passengers', color = None if color_value == 'None' else color_value, barmode = 'group', color_discrete_map=airline_color_map)

    return output_histogram

update_chart(pivot_values = ['Airport', 'Airline'], color_value = 'Airline', route_types_to_show = ['Domestic', 'International'], airports_to_graph = ['ATL', 'LAX', 'ORD', 'DFW', 'JFK'], airlines_to_graph = ['WN', 'AA', 'DL', 'UA'])

data_descriptor_values: ['Airport', 'Airline']
data_descriptor_values: ['Airport']


## Creating a more interactive version of the top airports pivot table:

In [5]:
df_airline_airport_pairs

Unnamed: 0,Airline,Airport,Route_Type,Destination_Region,Passengers,All_Traffic
0,DL,ATL,Domestic,Domestic,30464398.0,All_Traffic
1,AA,DFW,Domestic,Domestic,19283327.0,All_Traffic
2,AA,CLT,Domestic,Domestic,12510203.0,All_Traffic
3,UA,ORD,Domestic,Domestic,10421303.0,All_Traffic
4,WN,MDW,Domestic,Domestic,9834058.0,All_Traffic
...,...,...,...,...,...,...
7933,VJT,NRT,International,International,1.0,All_Traffic
7934,0WQ,MIA,Domestic,Domestic,1.0,All_Traffic
7935,LF,CVG,Domestic,Domestic,1.0,All_Traffic
7936,04Q,ORL,Domestic,Domestic,1.0,All_Traffic


In [6]:
def create_top_airports_list(route_types_to_show, airports_graph_airports_limit):
    data_source = df_airline_airport_pairs.query("Destination_Region == 'Domestic'").copy()
    data_source = data_source.query("Route_Type in @route_types_to_show").copy()

    if airports_graph_airports_limit == None:
        airports_graph_airports_limit = 100
    if ((1 <= airports_graph_airports_limit <= 100) == False):
        airports_graph_airports_limit = 100

    df_airline_airport_pivot = data_source.pivot_table(index = 'Airport', values = 'Passengers', aggfunc = 'sum').sort_values('Passengers', ascending = False).reset_index()
    airports_to_keep = list(df_airline_airport_pivot['Airport'][0:airports_graph_airports_limit].copy())
    return airports_to_keep

In [7]:
create_top_airports_list(route_types_to_show = ['Domestic', 'International'], 
airports_graph_airports_limit = 10)

['ATL', 'LAX', 'ORD', 'DFW', 'JFK', 'DEN', 'SFO', 'LAS', 'MCO', 'SEA']

In [8]:
airline_color_map = {
    "DL":"#E3132C",
    "UA":"#005DAA",
    "AA":"gray",
    "WN":"#F9B612",
    "AS":"green",
    "B6":"#003876",
    "Other":"#804000"
}

In [9]:
# top_airports_graph_options = ['show_airline_comparison', 'show_route_type']
# top_airports_graph_options = ['show_route_type']
top_airports_graph_options = ['show_airline_comparison', 'show_route_type']
# top_airports_graph_options = []


top_airports_graph_options
airports_graph_airports_limit = 20

In [10]:
data_source = df_airline_airport_pairs.copy() # Using a new copy
if airports_graph_airports_limit == None:
    airports_graph_airports_limit = 100
if ((1 <= airports_graph_airports_limit <= 100) == False):
    airports_graph_airports_limit = 100
print(f"Calling create_top_20_airports_graph with the following graph options: {top_airports_graph_options} and the following airports limit: {airports_graph_airports_limit}")

Calling create_top_20_airports_graph with the following graph options: ['show_airline_comparison', 'show_route_type'] and the following airports limit: 20


In [11]:
top_airlines = list(data_source.pivot_table(index = 'Airline', values = 'Passengers', aggfunc = 'sum').sort_values('Passengers', ascending = False).index[0:4])
top_airlines_as_string = ("|".join(top_airlines)) # Converts the airlines in the list to a string value that the following np.where statement can use to create an 'Other' category of airlines. # See
# https://docs.python.org/3/library/stdtypes.html#str.join
print(top_airlines_as_string)

WN|AA|DL|UA


In [12]:
top_airlines

['WN', 'AA', 'DL', 'UA']

In [13]:
data_source['Airline'] = np.where(data_source['Airline'].isin(top_airlines) == False, 'Other', data_source['Airline'])
# See https://pandas.pydata.org/docs/reference/api/pandas.Series.str.contains.html
# regarding the use of the pipe operator here.
data_source

Unnamed: 0,Airline,Airport,Route_Type,Destination_Region,Passengers,All_Traffic
0,DL,ATL,Domestic,Domestic,30464398.0,All_Traffic
1,AA,DFW,Domestic,Domestic,19283327.0,All_Traffic
2,AA,CLT,Domestic,Domestic,12510203.0,All_Traffic
3,UA,ORD,Domestic,Domestic,10421303.0,All_Traffic
4,WN,MDW,Domestic,Domestic,9834058.0,All_Traffic
...,...,...,...,...,...,...
7933,Other,NRT,International,International,1.0,All_Traffic
7934,Other,MIA,Domestic,Domestic,1.0,All_Traffic
7935,Other,CVG,Domestic,Domestic,1.0,All_Traffic
7936,Other,ORL,Domestic,Domestic,1.0,All_Traffic


In [14]:
data_source['Airline'].value_counts()

Other    6884
DL        309
UA        302
AA        288
WN        155
Name: Airline, dtype: int64

In [15]:
df_airline_airport_pivot = data_source.pivot_table(index = 'Airport', values = 'Passengers', aggfunc = 'sum').sort_values('Passengers', ascending = False).reset_index()
df_airline_airport_pivot['airport_rank'] = df_airline_airport_pivot['Passengers'].rank(ascending=False)
df_airline_airport_pivot

Unnamed: 0,Airport,Passengers,airport_rank
0,ATL,47388875.0,1.0
1,LAX,41502828.0,2.0
2,ORD,36572207.0,3.0
3,DFW,30823751.0,4.0
4,JFK,29744941.0,5.0
...,...,...,...
764,LIL,1.0,766.0
765,SIR,1.0,766.0
766,ERF,1.0,766.0
767,RSD,1.0,766.0


In [16]:
airports_to_keep = list(df_airline_airport_pivot['Airport'][0:airports_graph_airports_limit].copy())
airports_to_keep
data_source_filtered = data_source.query("Airport in @airports_to_keep").copy()
data_source_filtered = data_source_filtered.merge(df_airline_airport_pivot[['Airport', 'airport_rank']], on = 'Airport')
data_source_filtered

Unnamed: 0,Airline,Airport,Route_Type,Destination_Region,Passengers,All_Traffic,airport_rank
0,DL,ATL,Domestic,Domestic,30464398.0,All_Traffic,1.0
1,WN,ATL,Domestic,Domestic,5003515.0,All_Traffic,1.0
2,DL,ATL,International,Domestic,4585330.0,All_Traffic,1.0
3,AA,ATL,Domestic,Domestic,1271729.0,All_Traffic,1.0
4,Other,ATL,Domestic,Domestic,1188868.0,All_Traffic,1.0
...,...,...,...,...,...,...,...
1672,Other,FLL,Domestic,Domestic,8.0,All_Traffic,17.0
1673,Other,FLL,Domestic,Domestic,8.0,All_Traffic,17.0
1674,Other,FLL,International,Domestic,6.0,All_Traffic,17.0
1675,Other,FLL,International,Domestic,2.0,All_Traffic,17.0


In [17]:
# The following code creates a pivot table based on the parameters specified above.

pivot_values = ['Airport', 'airport_rank']

if 'show_airline_comparison' in top_airports_graph_options:
    pivot_values.append('Airline')

if 'show_route_type' in top_airports_graph_options:
    pivot_values.append('Route_Type')

data_source_filtered_pivot = data_source_filtered.pivot_table(index = pivot_values,
values = 'Passengers', aggfunc = 'sum').reset_index()
data_source_filtered_pivot

if ('show_airline_comparison' in top_airports_graph_options) and ('show_route_type' in top_airports_graph_options):
    data_source_filtered_pivot['Airport_Route_Pair'] = data_source_filtered_pivot['Airport'] + ' ' + data_source_filtered_pivot['Route_Type']


if 'show_route_type' in top_airports_graph_options:
    data_source_filtered_pivot.sort_values(by = ['airport_rank', 'Airport', 'Route_Type'], inplace = True, ascending = [True, True, True])
# See https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.sort_values.html 
# regarding the list passed to the 'ascending' option here.

else:
    data_source_filtered_pivot.sort_values(by = ['airport_rank', 'Airport'], inplace = True, ascending = [True, True])


data_source_filtered_pivot

Unnamed: 0,Airport,airport_rank,Airline,Route_Type,Passengers,Airport_Route_Pair
0,ATL,1.0,AA,Domestic,1271729.0,ATL Domestic
2,ATL,1.0,DL,Domestic,30464398.0,ATL Domestic
4,ATL,1.0,Other,Domestic,4299765.0,ATL Domestic
6,ATL,1.0,UA,Domestic,566751.0,ATL Domestic
7,ATL,1.0,WN,Domestic,5003515.0,ATL Domestic
...,...,...,...,...,...,...
143,PHL,20.0,AA,International,1196002.0,PHL International
145,PHL,20.0,DL,International,6612.0,PHL International
147,PHL,20.0,Other,International,659751.0,PHL International
149,PHL,20.0,UA,International,1579.0,PHL International


In [18]:
# Since there are two different top_airports_graph_options items that
    # can be chosen, there are in turn four possible graphs that can be created. 
    # Thus, the following code creates four separate bar charts.

if ('show_airline_comparison' in top_airports_graph_options) and ('show_route_type' in top_airports_graph_options):

    x_val = 'Airport_Route_Pair'
    color_val = 'Airline'

if top_airports_graph_options == ['show_airline_comparison']:
    color_val = 'Airline'
    x_val = 'Airport'

if top_airports_graph_options == ['show_route_type']:
    color_val = 'Route_Type'
    x_val = 'Airport'

if top_airports_graph_options == []:
    color_val = 'Airport'
    x_val = 'Airport'

top_airports_graph = px.histogram(data_source_filtered_pivot, x = x_val, y = 'Passengers', color = color_val)
top_airports_graph.update_xaxes(categoryorder = 'array', 
categoryarray = data_source_filtered_pivot[x_val])


top_airports_graph

In [19]:
df_airline_airport_pairs.query("Destination_Region == 'Domestic'")

Unnamed: 0,Airline,Airport,Route_Type,Destination_Region,Passengers,All_Traffic
0,DL,ATL,Domestic,Domestic,30464398.0,All_Traffic
1,AA,DFW,Domestic,Domestic,19283327.0,All_Traffic
2,AA,CLT,Domestic,Domestic,12510203.0,All_Traffic
3,UA,ORD,Domestic,Domestic,10421303.0,All_Traffic
4,WN,MDW,Domestic,Domestic,9834058.0,All_Traffic
...,...,...,...,...,...,...
7932,26Q,MIA,International,Domestic,1.0,All_Traffic
7934,0WQ,MIA,Domestic,Domestic,1.0,All_Traffic
7935,LF,CVG,Domestic,Domestic,1.0,All_Traffic
7936,04Q,ORL,Domestic,Domestic,1.0,All_Traffic


## Creating a pivot table that shows all grouped rows:

In [20]:
sample_list = ['Airline', 'Route_Type']


In [21]:
sample_list = ['Route_Type' if entry == 'Route_Type' else entry for entry in sample_list]
sample_list

['Airline', 'Route_Type']

In [22]:
all_data_value = 'All_Traffic'
df_t5_t4_2018 = pd.read_csv('t5_airports_t4_airlines_2018.csv')
df_t5_t4_2018[all_data_value] = all_data_value # This column will allow
# the code to show all values when no pivot value is selected.
df_t5_t4_2018.head(5)

Unnamed: 0,Airport,Airline,Route_Type,Passengers,All_Traffic
0,ATL,AA,Domestic,1271729.0,All_Traffic
1,ATL,AA,International,399.0,All_Traffic
2,ATL,DL,Domestic,30464398.0,All_Traffic
3,ATL,DL,International,4585330.0,All_Traffic
4,ATL,UA,Domestic,566751.0,All_Traffic


## Method that creates a group for every column:

In [23]:
pivot_values = ['Airline', 'Airport', 'Route_Type']
color_value = ['Airport']
group_value = ['Airline']
if len(pivot_values) == 0:
    df_t5_t4_2018_pivot = df_t5_t4_2018.pivot_table(index = 'All_Traffic', values = 'Passengers', aggfunc = 'sum').reset_index()
else:
    df_t5_t4_2018_pivot = df_t5_t4_2018.pivot_table(index = pivot_values, values = 'Passengers', aggfunc = 'sum').reset_index()

# The following lines create a column containing the values of each of the
# columns (other than the 'Passengers') column present in the bar chart. A
# for loop is used so that this column can adapt to different variable
# choices and different numbers of columns.
data_descriptor = df_t5_t4_2018_pivot.iloc[:,0].copy() # This copy() statement
# is needed in order to avoid  modifying this column when the group column
# gets chosen.
for i in range(1, len(df_t5_t4_2018_pivot.columns) - 1):
    data_descriptor += ' ' + df_t5_t4_2018_pivot.iloc[:, i]

df_t5_t4_2018_pivot['Group'] = data_descriptor

df_t5_t4_2018_pivot.head(5)

Unnamed: 0,Airline,Airport,Route_Type,Passengers,Group
0,AA,ATL,Domestic,1271729.0,AA ATL Domestic
1,AA,ATL,International,399.0,AA ATL International
2,AA,DFW,Domestic,19283327.0,AA DFW Domestic
3,AA,DFW,International,2530917.0,AA DFW International
4,AA,JFK,Domestic,2040369.0,AA JFK Domestic


In [24]:
# Group/color example:
px.histogram(df_t5_t4_2018_pivot, x = 'Airport', y = 'Passengers', color = 'Airline', barmode = 'group')

In [25]:
px.histogram(df_t5_t4_2018_pivot, x = 'Group', y = 'Passengers', color = 'Airline')

In [26]:
px.histogram(df_t5_t4_2018_pivot, x = 'Airport', y = 'Passengers', color = 'Airline', barmode = 'group')

## Method that only creates a group for columns not present in the color section:

I think I'l still need to plot the group column, but I can simplify it by removing the color column value from it.

In [27]:
# The following code creates a pivot table version of the DataFrame that 
# can be used for creating bar charts. It takes the specified pivot values
# and color values as inputs, and then uses those values to group the
# data accordingly. The code works with different numbers of pivot values,
# including zero pivot values.
# In order to represent all of the specified values, the code creates a 
# column describing all (or almost all) of the pivot index variables
# in the other columns, which then gets fed 
# into the x axis parameter of the bar chart. However, if a color value is
# also specified, this item does not get added into this column, since this
# data will already get represented in the bar chart (by means of the color
# legend). Removing this value helps
# simplify the final chart output.

pivot_values = ['Airline', 'Airport', 'Route_Type']
color_value = 'None' # This color value must also be present
# within the pivot_values table.
# group_value = 'Airline'
if len(pivot_values) == 0:
    df_t5_t4_2018_pivot = df_t5_t4_2018.pivot_table(index = 'All_Traffic', values = 'Passengers', aggfunc = 'sum').reset_index()
else:
    df_t5_t4_2018_pivot = df_t5_t4_2018.pivot_table(index = pivot_values, values = 'Passengers', aggfunc = 'sum').reset_index()

# The following lines create a column containing the values of each of the
# columns (other than the 'Passengers') column present in the bar chart. A
# for loop is used so that this column can adapt to different variable
# choices and different numbers of columns.
if len(pivot_values) == 0:
    data_descriptor = all_data_value
else:
    data_descriptor_values = pivot_values.copy()
    if color_value != 'None':
        data_descriptor_values.remove(color_value) # If a value will be assigned a
        # color component in the graph, it doesn't need to be assigned a 
        # group component, since it will show up in the graph regardless. Removing 
        # it here helps simplify the graph.
    print(data_descriptor_values)   
    data_descriptor = df_t5_t4_2018_pivot[data_descriptor_values[0]].copy() # This copy() statement
    # is needed in order to avoid  modifying this column when the group column
    # gets chosen.
    for i in range(1, len(data_descriptor_values)):
        data_descriptor += ' ' + df_t5_t4_2018_pivot[data_descriptor_values[i]]

df_t5_t4_2018_pivot['Group'] = data_descriptor

df_t5_t4_2018_pivot.head(5)

['Airline', 'Airport', 'Route_Type']


Unnamed: 0,Airline,Airport,Route_Type,Passengers,Group
0,AA,ATL,Domestic,1271729.0,AA ATL Domestic
1,AA,ATL,International,399.0,AA ATL International
2,AA,DFW,Domestic,19283327.0,AA DFW Domestic
3,AA,DFW,International,2530917.0,AA DFW International
4,AA,JFK,Domestic,2040369.0,AA JFK Domestic


In [28]:
px.histogram(df_t5_t4_2018_pivot, x = 'Group', y = 'Passengers', color = None if color_value == 'None' else color_value, barmode = 'group')

## Creating a bar chart showing the top 20 airports by passenger arrivals in 2018:

In [29]:
df_top_20_airports = pd.read_csv('top_20_airports_by_pax_arrivals_2018.csv')
df_top_20_airports

Unnamed: 0,Airport,Route_Type,Passengers,Rank
0,ATL,Domestic,41606158.0,1
1,ATL,International,5782717.0,1
2,LAX,Domestic,29643615.0,2
3,LAX,International,11859213.0,2
4,ORD,Domestic,30105889.0,3
5,ORD,International,6466318.0,3
6,DFW,Domestic,26907771.0,4
7,DFW,International,3915980.0,4
8,JFK,Domestic,14100771.0,5
9,JFK,International,15644170.0,5


In [30]:
top_20_airports_list = list(df_top_20_airports['Airport'].unique())
top_20_airports_list

['ATL',
 'LAX',
 'ORD',
 'DFW',
 'JFK',
 'DEN',
 'SFO',
 'LAS',
 'MCO',
 'SEA',
 'EWR',
 'PHX',
 'IAH',
 'MIA',
 'CLT',
 'BOS',
 'FLL',
 'MSP',
 'DTW',
 'PHL']

In [31]:
fig_top_20_airports_2018 = px.bar(df_top_20_airports, x="Airport", y="Passengers")

In [32]:
fig_top_20_airports_2018

In [33]:
df_aaa = pd.read_csv('local_copy_of_airports_airlines_aircraft_2018.csv')
df_aaa

Unnamed: 0,Airline,Origin_Dest,Plane_Type_Text,Passengers,Airport 1,Airport 2
0,HA,HNL_OGG,Boeing 717-200,1954139.0,HNL,OGG
1,DL,ATL_MCO,Boeing 757-200,1417832.0,ATL,MCO
2,HA,HNL_KOA,Boeing 717-200,1281221.0,HNL,KOA
3,WN,DAL_HOU,Boeing 737-700/700LR/Max 7,1260362.0,DAL,HOU
4,AA,DFW_LAX,Airbus Industrie A321/Lr,1257512.0,DFW,LAX
...,...,...,...,...,...,...
2799,NK,FLL_LGA,Airbus Industrie A320-100/200,100381.0,FLL,LGA
2800,EI,DUB_MCO,Airbus Industrie A330-200,100329.0,DUB,MCO
2801,YX,DCA_MCI,Embraer ERJ-175,100303.0,DCA,MCI
2802,WN,BNA_CLT,Boeing 737-700/700LR/Max 7,100282.0,BNA,CLT


In [34]:
# Consider building a graph that lets you pivot by airline name, airport, and plane type and then display the output in both chart form and table form.

In [35]:
# You could also try creating a chart that lets you compare the presence of a given set of airlines (maybe up to 5) for a given set of airports (maybe up to 20. The charts could be either grouped bar charts or stacked bar charts (to show the airline/airport relationship).

In [36]:
## Top 20 airlines in 2018:

df_top_20_airlines = pd.read_csv('top_20_airlines_by_passengers_2018.csv')
df_top_20_airlines

Unnamed: 0,Airline,Route_Type,Passengers,Rank
0,WN,Domestic,160578652.0,1
1,WN,International,4560615.0,1
2,AA,Domestic,119081636.0,2
3,AA,International,27186303.0,2
4,DL,Domestic,119611221.0,3
5,DL,International,23437824.0,3
6,UA,Domestic,84532551.0,4
7,UA,International,25853375.0,4
8,B6,Domestic,33547937.0,5
9,B6,International,8087960.0,5


Creating a list of the top 4 airlines (which will be useful for a later graph):

In [37]:
top_airlines_list = list(df_top_20_airlines['Airline'][0:4])
top_airlines_list

['WN', 'WN', 'AA', 'AA']

In [38]:
fig_top_20_airlines_2018 = px.bar(df_top_20_airlines, x="Airline", y="Passengers")
fig_top_20_airlines_2018

In [39]:
df_airline_airport_pairs = pd.read_csv('airport_airline_pairs_2018.csv')
df_airline_airport_pairs

Unnamed: 0,Airline,Airport,Route_Type,Destination_Region,Passengers
0,DL,ATL,Domestic,Domestic,30464398.0
1,AA,DFW,Domestic,Domestic,19283327.0
2,AA,CLT,Domestic,Domestic,12510203.0
3,UA,ORD,Domestic,Domestic,10421303.0
4,WN,MDW,Domestic,Domestic,9834058.0
...,...,...,...,...,...
7933,VJT,NRT,International,International,1.0
7934,0WQ,MIA,Domestic,Domestic,1.0
7935,LF,CVG,Domestic,Domestic,1.0
7936,04Q,ORL,Domestic,Domestic,1.0


In [40]:
top_airline_list_as_string = ("|".join(top_airlines_list)) # Converts the airlines in the list to a string value that the following np.where statement can use to create an 'Other' category of airlines
top_airline_list_as_string

'WN|WN|AA|AA'

In [41]:
df_airline_airport_pairs

Unnamed: 0,Airline,Airport,Route_Type,Destination_Region,Passengers
0,DL,ATL,Domestic,Domestic,30464398.0
1,AA,DFW,Domestic,Domestic,19283327.0
2,AA,CLT,Domestic,Domestic,12510203.0
3,UA,ORD,Domestic,Domestic,10421303.0
4,WN,MDW,Domestic,Domestic,9834058.0
...,...,...,...,...,...
7933,VJT,NRT,International,International,1.0
7934,0WQ,MIA,Domestic,Domestic,1.0
7935,LF,CVG,Domestic,Domestic,1.0
7936,04Q,ORL,Domestic,Domestic,1.0


In [42]:
df_top_airlines_and_airports = df_airline_airport_pairs.query("Airport in @top_20_airports_list").copy().reset_index(drop=True)
df_top_airlines_and_airports['Airline'] = np.where(df_top_airlines_and_airports['Airline'].str.contains(top_airline_list_as_string) == False, 'Other', df_top_airlines_and_airports['Airline'])
df_top_airlines_and_airports

Unnamed: 0,Airline,Airport,Route_Type,Destination_Region,Passengers
0,Other,ATL,Domestic,Domestic,30464398.0
1,AA,DFW,Domestic,Domestic,19283327.0
2,AA,CLT,Domestic,Domestic,12510203.0
3,Other,ORD,Domestic,Domestic,10421303.0
4,WN,DEN,Domestic,Domestic,9189061.0
...,...,...,...,...,...
1672,Other,MSP,Domestic,Domestic,1.0
1673,Other,LAX,Domestic,Domestic,1.0
1674,Other,MIA,International,Domestic,1.0
1675,Other,MIA,Domestic,Domestic,1.0


In [43]:
df_top_airlines_and_airports = df_top_airlines_and_airports.pivot_table(index = ["Airline", "Airport"], values = "Passengers", aggfunc = "sum").reset_index()


In [44]:
df_top_airlines_and_airports['Airline'].value_counts()

AA       20
Other    20
WN       18
Name: Airline, dtype: int64

In [45]:
airport_ranks = df_top_20_airports[['Airport', 'Rank']]
airport_ranks

Unnamed: 0,Airport,Rank
0,ATL,1
1,ATL,1
2,LAX,2
3,LAX,2
4,ORD,3
5,ORD,3
6,DFW,4
7,DFW,4
8,JFK,5
9,JFK,5


In [46]:
df_top_airlines_and_airports = df_top_airlines_and_airports.merge(airport_ranks, left_on = "Airport", right_on = "Airport")


In [47]:
df_top_airlines_and_airports.sort_values("Rank", inplace = True)
df_top_airlines_and_airports

Unnamed: 0,Airline,Airport,Passengers,Rank
0,AA,ATL,1272128.0,1
1,AA,ATL,1272128.0,1
2,Other,ATL,41023409.0,1
3,Other,ATL,41023409.0,1
4,WN,ATL,5093338.0,1
...,...,...,...,...
95,Other,PHL,6140048.0,20
94,Other,PHL,6140048.0,20
93,AA,PHL,7463574.0,20
92,AA,PHL,7463574.0,20


## Top 20 US Airports by Airline Share

In [48]:
df_top_20_airports

Unnamed: 0,Airport,Route_Type,Passengers,Rank
0,ATL,Domestic,41606158.0,1
1,ATL,International,5782717.0,1
2,LAX,Domestic,29643615.0,2
3,LAX,International,11859213.0,2
4,ORD,Domestic,30105889.0,3
5,ORD,International,6466318.0,3
6,DFW,Domestic,26907771.0,4
7,DFW,International,3915980.0,4
8,JFK,Domestic,14100771.0,5
9,JFK,International,15644170.0,5


In [49]:
# Top 20 US Airports by Airline Share:

top_airlines_list = list(df_top_20_airlines['Airline'].unique()[0:5])
top_airline_list_as_string = ("|".join(top_airlines_list)) # Converts the airlines in the list to a string value that the following np.where statement can use to create an 'Other' category of airlines
# unique() tags are needed to remove duplicate entries for each airport and
# airline. (These duplicates were created through the addition of 
# domestic/international travel breakdowns for each of the top airports
# and airlines.)

top_20_airports_list = list(df_top_20_airports['Airport'].unique())
print(top_airlines_list, top_20_airports_list)

['WN', 'AA', 'DL', 'UA', 'B6'] ['ATL', 'LAX', 'ORD', 'DFW', 'JFK', 'DEN', 'SFO', 'LAS', 'MCO', 'SEA', 'EWR', 'PHX', 'IAH', 'MIA', 'CLT', 'BOS', 'FLL', 'MSP', 'DTW', 'PHL']


In [50]:
df_airline_airport_pairs = pd.read_csv("airport_airline_pairs_2018.csv")


df_top_airlines_and_airports = df_airline_airport_pairs.query("Airport in @top_20_airports_list").copy().reset_index(drop=True)
df_top_airlines_and_airports['Airline'] = np.where(df_top_airlines_and_airports['Airline'].str.contains(top_airline_list_as_string) == False, 'Other', df_top_airlines_and_airports['Airline'])
df_top_airlines_and_airports = df_top_airlines_and_airports.pivot_table(index = ["Airline", "Airport"], values = "Passengers", aggfunc = "sum").reset_index()

In [51]:
list(df_airline_airport_pairs.pivot_table(index = 'Airline', values = 'Passengers', aggfunc = 'sum').sort_values('Passengers', ascending = False).index[0:10])

['WN', 'AA', 'DL', 'UA', 'B6', 'AS', 'NK', 'OO', 'F9', 'YX']

In [52]:
df_airline_airport_pairs

Unnamed: 0,Airline,Airport,Route_Type,Destination_Region,Passengers
0,DL,ATL,Domestic,Domestic,30464398.0
1,AA,DFW,Domestic,Domestic,19283327.0
2,AA,CLT,Domestic,Domestic,12510203.0
3,UA,ORD,Domestic,Domestic,10421303.0
4,WN,MDW,Domestic,Domestic,9834058.0
...,...,...,...,...,...
7933,VJT,NRT,International,International,1.0
7934,0WQ,MIA,Domestic,Domestic,1.0
7935,LF,CVG,Domestic,Domestic,1.0
7936,04Q,ORL,Domestic,Domestic,1.0


In [53]:
airlines_limit = 5
airlines_to_keep = list(df_airline_airport_pairs.pivot_table(index = 'Airline', values = 'Passengers', aggfunc = 'sum').sort_values('Passengers', ascending = False).index[0:airlines_limit])
airlines_to_keep

['WN', 'AA', 'DL', 'UA', 'B6']

In [54]:
new_df_airline_airport_pairs = df_airline_airport_pairs.query("Airline in @airlines_to_keep").copy()
new_df_airline_airport_pairs

Unnamed: 0,Airline,Airport,Route_Type,Destination_Region,Passengers
0,DL,ATL,Domestic,Domestic,30464398.0
1,AA,DFW,Domestic,Domestic,19283327.0
2,AA,CLT,Domestic,Domestic,12510203.0
3,UA,ORD,Domestic,Domestic,10421303.0
4,WN,MDW,Domestic,Domestic,9834058.0
...,...,...,...,...,...
5946,AA,TPA,International,Domestic,48.0
5951,DL,ORD,International,Domestic,47.0
5973,DL,YQB,International,International,46.0
5976,DL,BED,International,Domestic,46.0


In [55]:
airport_ranks = df_top_20_airports[['Airport', 'Rank']].drop_duplicates()
airport_ranks

Unnamed: 0,Airport,Rank
0,ATL,1
2,LAX,2
4,ORD,3
6,DFW,4
8,JFK,5
10,DEN,6
12,SFO,7
14,LAS,8
16,MCO,9
18,SEA,10


In [56]:
df_top_airlines_and_airports = df_top_airlines_and_airports.merge(airport_ranks, left_on = "Airport", right_on = "Airport")

In [57]:



df_top_airlines_and_airports.sort_values("Rank", inplace = True)

fig_t4_airline_presence_at_t20_airports = px.bar(df_top_airlines_and_airports, x="Airport", y="Passengers", color="Airline", color_discrete_map=airline_color_map, title="Top 20 US Airports by Airline Share in 2018")
fig_t4_airline_presence_at_t20_airports

In [58]:
df_top_hubs = df_airline_airport_pairs.pivot_table(index = ['Airline', 'Airport'], values = 'Passengers', aggfunc = 'sum').reset_index().sort_values('Passengers', ascending = False)
df_top_hubs.head(20)
df_top_hubs['Hub'] = df_top_hubs['Airline'] + ' ' + df_top_hubs['Airport']
df_top_hubs

Unnamed: 0,Airline,Airport,Passengers,Hub
3035,DL,ATL,35049728.0,DL ATL
1910,AA,DFW,21814244.0,AA DFW
1894,AA,CLT,13853081.0,AA CLT
1981,AA,MIA,12595599.0,AA MIA
5826,UA,ORD,12153983.0,UA ORD
...,...,...,...,...
1088,27Q,CAE,1.0,27Q CAE
1087,27Q,BYH,1.0,27Q BYH
4507,LXQ,BOS,1.0,LXQ BOS
1081,27Q,BRO,1.0,27Q BRO


In [59]:
fig_top_hubs = px.bar(df_top_hubs.iloc[0:20, :], x = 'Hub', y = 'Passengers', color = 'Airline', color_discrete_map=airline_color_map)
fig_top_hubs.update_xaxes(categoryorder = 'total descending', title = "Top 20 US Airport Hubs in 2018 by Arriving Passengers") # See https://plotly.com/python/categorical-axes/

## Determining the top international hubs:

In [60]:
df_top_intl_hubs = df_airline_airport_pairs.query("Route_Type == 'International' & Destination_Region == 'Domestic'").pivot_table(index = ['Airline', 'Airport'], values = 'Passengers', aggfunc = 'sum').reset_index().sort_values('Passengers', ascending = False)
# I'd only like to show US airports within this chart, so I chose to filter it to include only domestic airports.
df_top_intl_hubs['Hub'] = df_top_intl_hubs['Airline'] + ' ' + df_top_intl_hubs['Airport']
df_top_intl_hubs.head(20)

Unnamed: 0,Airline,Airport,Passengers,Hub
542,AA,MIA,5324448.0,AA MIA
890,DL,ATL,4585330.0,DL ATL
1695,UA,EWR,3549291.0,UA EWR
1703,UA,IAH,2962868.0,UA IAH
525,AA,DFW,2530917.0,AA DFW
924,DL,JFK,2392013.0,DL JFK
732,B6,JFK,1850171.0,B6 JFK
1731,UA,SFO,1742889.0,UA SFO
1721,UA,ORD,1732680.0,UA ORD
524,AA,CLT,1342878.0,AA CLT


In [61]:
fig_top_intl_hubs = px.bar(df_top_intl_hubs.iloc[0:20, :], x = 'Hub', y = 'Passengers', color = 'Airline', color_discrete_map=airline_color_map)
# fig_top_intl_hubs.update_traces(marker_line=dict(width=1,color='black'))
# See https://plotly.com/python/marker-style/
fig_top_intl_hubs.update_xaxes(categoryorder = 'total descending', title = "Top 20 US Airport Hubs in 2018 by Arriving International Passengers") # See https://plotly.com/python/categorical-axes/

# Creating a table of departures for a given airport:

In [62]:
df_dest_by_origin = pd.read_csv('dest_to_origin_2018.csv')

In [63]:
def create_departures_table(dest_airport):
    df_airport_by_origin = df_dest_by_origin.query("Airport == @dest_airport").sort_values('Passengers', ascending = False).reset_index(drop=True)
    if len(df_airport_by_origin) > 20:
        other_row = [dest_airport, 'Other', sum(df_airport_by_origin.iloc[20:,]['Passengers'])]
        df_airport_by_origin = df_airport_by_origin.iloc[0:20]
        df_airport_by_origin.loc[len(df_airport_by_origin)] = other_row
    pax_sum = sum(df_airport_by_origin['Passengers'])    
    df_airport_by_origin['Share'] = 100*df_airport_by_origin['Passengers'] / pax_sum  
    return (df_airport_by_origin)


In [64]:
df_origins_for_airport = create_departures_table('ABQ')
df_origins_for_airport

Unnamed: 0,Airport,Origin,Passengers,Share
0,ABQ,PHX,327011.0,12.045691
1,ABQ,DEN,294131.0,10.834532
2,ABQ,DFW,287698.0,10.597568
3,ABQ,DAL,190543.0,7.018792
4,ABQ,LAX,185873.0,6.846769
5,ABQ,LAS,166267.0,6.124567
6,ABQ,ATL,133990.0,4.93562
7,ABQ,ORD,115181.0,4.242777
8,ABQ,HOU,115137.0,4.241156
9,ABQ,SAN,95127.0,3.504073


In [65]:
sum(df_origins_for_airport['Share'])

100.0

In [66]:
df = df_origins_for_airport
go.Figure(data=go.Table(
    header = dict(
        values = list(df.columns)
        ), 
    cells = dict(
        values = [df['Airport'], df['Origin'], df['Passengers'], df['Share']] )))

In [67]:
fig_top_origins_for_airport = px.bar(df_origins_for_airport, x = 'Origin', y = 'Passengers')
fig_top_origins_for_airport