Jupyter notebook for testing out Plotly graphs. (Once I have the graph code working on this page, I can then add it to my app.py page so that it will get included in my Dash app.)

In [1]:
import pandas as pd
import plotly.express as px
import nbformat
import numpy as np
import plotly.graph_objects as go

In [2]:
# Maps each airline code (plus the catch-all 'Other' label) to the color used
# for it in charts that are given color_discrete_map=airline_color_map.
airline_color_map = {
    "DL":"#E3132C",  # Delta
    "UA":"#005DAA",  # United
    "AA":"gray",  # American; a generic gray chosen for contrast
    "WN":"#F9B612",  # Southwest
    "AS":"green",  # NOTE(review): presumably Alaska; no color source is listed for it
    "B6":"#003876",  # JetBlue
    "Other":"#804000"  # Brown, for every airline grouped into 'Other'
}
# See https://plotly.com/python/discrete-color/

# RGB color sources:
# Delta: Schemecolor at https://www.schemecolor.com/delta-airlines-logo-colors.php
# United: Keshav Naidu at https://www.schemecolor.com/united-airlines-logo-blue-color.php
# JetBlue: Schemecolor at https://www.schemecolor.com/jetblue-airways-logo-color.php
# AA: I had initially used the gray color provided at 
# https://coloropedia.com/american-airlines-group-colors-logo-codes/ ,
# but that proved to be too light, so I chose a generic gray instead.
# Southwest (WN): https://www.schemecolor.com/southwest-airlines-logo-colors.php
# Color for 'other': https://en.wikipedia.org/wiki/Brown 

## Creating a more interactive version of the top airports pivot table:

In [3]:
# Name of a constant-valued helper column; every row receives the same label.
all_data_value = 'All_Traffic'

# Read the airline/airport passenger data and append the constant column in
# one step.
df_airline_airport_pairs = pd.read_csv('airport_airline_pairs_2018.csv').assign(
    **{all_data_value: all_data_value}
)

In [4]:
# NOTE(review): this re-declares airline_color_map with exactly the same
# contents as the definition in cell In [2]. It is harmless, but the two copies
# can drift apart if only one is edited; consider keeping a single definition.
airline_color_map = {
    "DL":"#E3132C",
    "UA":"#005DAA",
    "AA":"gray",
    "WN":"#F9B612",
    "AS":"green",
    "B6":"#003876",
    "Other":"#804000"
}

In [5]:
# Settings for the top-airports chart.
#
# top_airports_graph_options controls how the bars are broken down. The
# combinations handled by the cells below are:
#   ['show_airline_comparison', 'show_route_type']
#   ['show_route_type']
#   ['show_airline_comparison']
#   []
number_of_airports_to_show = 20
top_airports_graph_options = ['show_airline_comparison']

In [6]:
# Total passengers per airline, largest first.
passengers_by_airline = df_airline_airport_pairs.pivot_table(
    index = 'Airline', values = 'Passengers', aggfunc = 'sum')
airlines_ranked = passengers_by_airline.sort_values('Passengers', ascending = False)

# The four busiest airlines, plus a pipe-delimited string form of the same
# list (usable as a regex alternation). See
# https://docs.python.org/3/library/stdtypes.html#str.join
top_airlines = list(airlines_ranked.index[0:4])
top_airlines_as_string = "|".join(top_airlines)
top_airlines_as_string

'WN|AA|DL|UA'

In [7]:
# Relabel every airline that is not one of the top airlines as 'Other'.
# Exact membership (isin) is used rather than a regex/substring match so that
# a code which merely *contains* a top airline's code (e.g. a three-character
# code that includes 'AA') is not mistaken for that airline.
# Rows whose airline is missing are left as-is, matching the earlier behavior.
airline_codes = df_airline_airport_pairs['Airline']
keep_original_code = airline_codes.isin(top_airlines) | airline_codes.isna()
df_airline_airport_pairs['Airline'] = np.where(keep_original_code, airline_codes, 'Other')
df_airline_airport_pairs

Unnamed: 0,Airline,Airport,Route_Type,Destination_Region,Passengers,All_Traffic
0,DL,ATL,Domestic,Domestic,30464398.0,All_Traffic
1,AA,DFW,Domestic,Domestic,19283327.0,All_Traffic
2,AA,CLT,Domestic,Domestic,12510203.0,All_Traffic
3,UA,ORD,Domestic,Domestic,10421303.0,All_Traffic
4,WN,MDW,Domestic,Domestic,9834058.0,All_Traffic
...,...,...,...,...,...,...
7933,Other,NRT,International,International,1.0,All_Traffic
7934,Other,MIA,Domestic,Domestic,1.0,All_Traffic
7935,Other,CVG,Domestic,Domestic,1.0,All_Traffic
7936,Other,ORL,Domestic,Domestic,1.0,All_Traffic


In [8]:
# Rank airports by total passengers and keep the busiest ones.
passengers_by_airport = df_airline_airport_pairs.pivot_table(
    index = 'Airport', values = 'Passengers', aggfunc = 'sum')
airports_ranked = passengers_by_airport.sort_values('Passengers', ascending = False).index
airports_to_keep = list(airports_ranked[0:number_of_airports_to_show])

# Work on an independent copy limited to those airports.
rows_for_kept_airports = df_airline_airport_pairs.query("Airport in @airports_to_keep")
df_airline_airport_pairs_filtered = rows_for_kept_airports.copy()
print(airports_to_keep)

['ATL', 'LAX', 'ORD', 'DFW', 'JFK', 'DEN', 'SFO', 'LAS', 'MCO', 'SEA', 'EWR', 'PHX', 'IAH', 'MIA', 'CLT', 'BOS', 'FLL', 'MSP', 'DTW', 'PHL']


In [9]:
df_airline_airport_pairs_filtered

Unnamed: 0,Airline,Airport,Route_Type,Destination_Region,Passengers,All_Traffic
0,DL,ATL,Domestic,Domestic,30464398.0,All_Traffic
1,AA,DFW,Domestic,Domestic,19283327.0,All_Traffic
2,AA,CLT,Domestic,Domestic,12510203.0,All_Traffic
3,UA,ORD,Domestic,Domestic,10421303.0,All_Traffic
5,WN,DEN,Domestic,Domestic,9189061.0,All_Traffic
...,...,...,...,...,...,...
7928,Other,MSP,Domestic,Domestic,1.0,All_Traffic
7931,Other,LAX,Domestic,Domestic,1.0,All_Traffic
7932,Other,MIA,International,Domestic,1.0,All_Traffic
7934,Other,MIA,Domestic,Domestic,1.0,All_Traffic


In [10]:
# Build the pivot table that feeds the top-airports chart, grouping by Airport
# and, depending on the selected options, by Airline and/or Route_Type.
compare_airlines = 'show_airline_comparison' in top_airports_graph_options
split_route_types = 'show_route_type' in top_airports_graph_options

pivot_values = ['Airport']
pivot_values += ['Airline'] if compare_airlines else []
pivot_values += ['Route_Type'] if split_route_types else []

df_airline_airport_pairs_filtered_pivot = (
    df_airline_airport_pairs_filtered
    .pivot_table(index = pivot_values, values = 'Passengers', aggfunc = 'sum')
    .reset_index()
)

# When both breakdowns are active, add a combined "<airport> <route type>"
# label column.
if compare_airlines and split_route_types:
    pivot_frame = df_airline_airport_pairs_filtered_pivot
    pivot_frame['Airport_Route_Pair'] = pivot_frame['Airport'] + ' ' + pivot_frame['Route_Type']

df_airline_airport_pairs_filtered_pivot


Unnamed: 0,Airport,Airline,Passengers
0,ATL,AA,1272128.0
1,ATL,DL,35049728.0
2,ATL,Other,5406930.0
3,ATL,UA,566751.0
4,ATL,WN,5093338.0
...,...,...,...
93,SFO,AA,1889883.0
94,SFO,DL,2030302.0
95,SFO,Other,10808991.0
96,SFO,UA,10382658.0


In [11]:
# Two independent options can be switched on or off, so there are four possible
# charts. This cell picks the x-axis and color columns for the ONE chart that
# matches the current options and then draws it.
#
# Membership tests are used for every branch (rather than comparing against
# exact lists) so that option order, duplicates or unrelated extra entries can
# never leave x_val / color_val undefined or carrying stale values from a
# previous run.
compare_airlines = 'show_airline_comparison' in top_airports_graph_options
split_route_types = 'show_route_type' in top_airports_graph_options

if compare_airlines and split_route_types:
    # One bar per airport/route-type pair, colored by airline.
    x_val, color_val = 'Airport_Route_Pair', 'Airline'
elif compare_airlines:
    x_val, color_val = 'Airport', 'Airline'
elif split_route_types:
    x_val, color_val = 'Airport', 'Route_Type'
else:
    # No breakdown requested: one bar (and one color) per airport.
    x_val, color_val = 'Airport', 'Airport'

top_airports_graph = px.histogram(df_airline_airport_pairs_filtered_pivot, x = x_val, y = 'Passengers', color = color_val)

top_airports_graph

## Creating a pivot table that shows all grouped rows:

In [None]:
sample_list = ['Airline', 'Route_Type']


In [None]:
# NOTE(review): both outcomes of this conditional produce the entry unchanged
# ('Route_Type' is replaced with 'Route_Type'; everything else is kept), so the
# list is rebuilt with identical contents. Presumably a scratch test of the
# replace-one-entry pattern; confirm whether a different replacement was meant.
sample_list = ['Route_Type' if entry == 'Route_Type' else entry for entry in sample_list]
sample_list

In [None]:
all_data_value = 'All_Traffic'

# Load the data and append a constant column. Pivoting on that column lets the
# code show all values when no pivot value is selected.
df_t5_t4_2018 = pd.read_csv('t5_airports_t4_airlines_2018.csv').assign(
    **{all_data_value: all_data_value}
)
df_t5_t4_2018.head(5)

## Method that creates a group for every column:

In [None]:
pivot_values = ['Airline', 'Airport', 'Route_Type']
color_value = ['Airport']
group_value = ['Airline']

# With no pivot values selected, fall back to the constant 'All_Traffic'
# column so that the pivot still yields a single overall total.
pivot_index = pivot_values if pivot_values else 'All_Traffic'
df_t5_t4_2018_pivot = (
    df_t5_t4_2018
    .pivot_table(index = pivot_index, values = 'Passengers', aggfunc = 'sum')
    .reset_index()
)

# Build a 'Group' label by joining, with spaces, the values of every column
# except the last one ('Passengers'). Iterating over the column names lets the
# label adapt to however many pivot columns were chosen.
label_columns = df_t5_t4_2018_pivot.columns[:-1]
# copy() keeps the first label column itself from being modified.
data_descriptor = df_t5_t4_2018_pivot[label_columns[0]].copy()
for column_name in label_columns[1:]:
    data_descriptor = data_descriptor + ' ' + df_t5_t4_2018_pivot[column_name]

df_t5_t4_2018_pivot['Group'] = data_descriptor

df_t5_t4_2018_pivot.head(5)

In [None]:
# Group/color example:
px.histogram(df_t5_t4_2018_pivot, x = 'Airport', y = 'Passengers', color = 'Airline', barmode = 'group')

In [None]:
px.histogram(df_t5_t4_2018_pivot, x = 'Group', y = 'Passengers', color = 'Airline')

In [None]:
px.histogram(df_t5_t4_2018_pivot, x = 'Airport', y = 'Passengers', color = 'Airline', barmode = 'group')

## Method that only creates a group for columns not present in the color section:

I think I'll still need to plot the group column, but I can simplify it by removing the color column value from it.

In [None]:
# The following code creates a pivot table version of the DataFrame that
# can be used for creating bar charts. It takes the specified pivot values
# and color value as inputs and groups the data accordingly. It works with
# any number of pivot values, including zero.
#
# To represent all of the specified values, the code creates a 'Group' column
# that joins the pivot columns into one label, which is then used as the
# x axis of the bar chart. If a color value is specified, that column is left
# out of the label: it is already represented in the chart through the color
# legend, and omitting it keeps the x axis labels shorter.

pivot_values = ['Airline', 'Airport', 'Route_Type']
color_value = 'None' # The string 'None' means "no color column". Any other
# value must be one of the entries in pivot_values.

if color_value != 'None' and color_value not in pivot_values:
    # Fail early with a clear message instead of an opaque list.remove() error.
    raise ValueError(
        f"color_value {color_value!r} must be 'None' or one of pivot_values {pivot_values}")

# With no pivot values, pivot on the constant column so that a single overall
# total is produced.
pivot_index = pivot_values if pivot_values else all_data_value
df_t5_t4_2018_pivot = df_t5_t4_2018.pivot_table(
    index = pivot_index, values = 'Passengers', aggfunc = 'sum').reset_index()

# Columns that make up the group label: every pivot value except the one
# already shown through color.
data_descriptor_values = [value for value in pivot_values if value != color_value]

if data_descriptor_values:
    # copy() keeps the first label column itself from being modified.
    data_descriptor = df_t5_t4_2018_pivot[data_descriptor_values[0]].copy()
    for column_name in data_descriptor_values[1:]:
        data_descriptor = data_descriptor + ' ' + df_t5_t4_2018_pivot[column_name]
else:
    # Either no pivot values were chosen, or the only pivot value is the color
    # column. In both cases every bar belongs to one overall group.
    data_descriptor = all_data_value

df_t5_t4_2018_pivot['Group'] = data_descriptor

df_t5_t4_2018_pivot.head(5)

In [None]:
px.histogram(df_t5_t4_2018_pivot, x = 'Group', y = 'Passengers', color = None if color_value == 'None' else color_value, barmode = 'group')

## Creating a bar chart showing the top 20 airports by passenger arrivals in 2018:

In [None]:
df_top_20_airports = pd.read_csv('top_20_airports_by_pax_arrivals_2018.csv')
df_top_20_airports

In [None]:
top_20_airports_list = list(df_top_20_airports['Airport'].unique())
top_20_airports_list

In [None]:
fig_top_20_airports_2018 = px.bar(df_top_20_airports, x="Airport", y="Passengers")

In [None]:
fig_top_20_airports_2018

In [None]:
df_aaa = pd.read_csv('local_copy_of_airports_airlines_aircraft_2018.csv')
df_aaa

In [None]:
# Consider building a graph that lets you pivot by airline name, airport, and plane type and then display the output in both chart form and table form.

In [None]:
# You could also try creating a chart that lets you compare the presence of a given set of airlines (maybe up to 5) for a given set of airports (maybe up to 20). The charts could be either grouped bar charts or stacked bar charts (to show the airline/airport relationship).

In [None]:
## Top 20 airlines in 2018:

df_top_20_airlines = pd.read_csv('top_20_airlines_by_passengers_2018.csv')
df_top_20_airlines

Creating a list of the top 4 airlines (which will be useful for a later graph):

In [None]:
# First four entries of the 'Airline' column, as a plain Python list.
first_four_airlines = df_top_20_airlines['Airline'].iloc[:4]
top_airlines_list = list(first_four_airlines)
top_airlines_list

In [None]:
fig_top_20_airlines_2018 = px.bar(df_top_20_airlines, x="Airline", y="Passengers")
fig_top_20_airlines_2018

In [None]:
df_airline_airport_pairs = pd.read_csv('airport_airline_pairs_2018.csv')
df_airline_airport_pairs

In [None]:
top_airline_list_as_string = ("|".join(top_airlines_list)) # Converts the airlines in the list to a string value that the following np.where statement can use to create an 'Other' category of airlines
top_airline_list_as_string

In [None]:
df_airline_airport_pairs

In [None]:
# Keep only rows for the top-20 airports, on an independent, re-indexed copy.
df_top_airlines_and_airports = df_airline_airport_pairs.query("Airport in @top_20_airports_list").copy().reset_index(drop=True)

# Relabel every airline outside top_airlines_list as 'Other'. Exact membership
# (isin) is used instead of a regex/substring match so that a code which merely
# contains a top airline's code is not mistaken for that airline. Missing
# airline values are left untouched, matching the earlier behavior.
airline_codes = df_top_airlines_and_airports['Airline']
df_top_airlines_and_airports['Airline'] = airline_codes.where(
    airline_codes.isin(top_airlines_list) | airline_codes.isna(), 'Other')
df_top_airlines_and_airports

In [None]:
df_top_airlines_and_airports = df_top_airlines_and_airports.pivot_table(index = ["Airline", "Airport"], values = "Passengers", aggfunc = "sum").reset_index()


In [None]:
df_top_airlines_and_airports['Airline'].value_counts()

In [None]:
# One (Airport, Rank) row per airport. drop_duplicates() matters here: if
# df_top_20_airports lists an airport more than once, merging on 'Airport'
# would otherwise repeat every matching row and inflate passenger totals.
airport_ranks = df_top_20_airports[['Airport', 'Rank']].drop_duplicates()
airport_ranks

In [None]:
# Attach each airport's rank; both frames share the 'Airport' key column.
df_top_airlines_and_airports = df_top_airlines_and_airports.merge(airport_ranks, on = "Airport")


In [None]:
df_top_airlines_and_airports.sort_values("Rank", inplace = True)
df_top_airlines_and_airports

## Top 20 US Airports by Airline Share

In [None]:
df_top_20_airports

In [None]:
# Top 20 US Airports by Airline Share:

top_airlines_list = list(df_top_20_airlines['Airline'].unique()[0:5])
top_airline_list_as_string = ("|".join(top_airlines_list)) # Converts the airlines in the list to a string value that the following np.where statement can use to create an 'Other' category of airlines
# unique() tags are needed to remove duplicate entries for each airport and
# airline. (These duplicates were created through the addition of 
# domestic/international travel breakdowns for each of the top airports
# and airlines.)

top_20_airports_list = list(df_top_20_airports['Airport'].unique())
print(top_airlines_list, top_20_airports_list)

In [None]:
# Reload the raw airline/airport pairs so this section starts from unmodified
# airline codes.
df_airline_airport_pairs = pd.read_csv("airport_airline_pairs_2018.csv")

# Keep only rows for the top-20 airports, on an independent, re-indexed copy.
df_top_airlines_and_airports = df_airline_airport_pairs.query("Airport in @top_20_airports_list").copy().reset_index(drop=True)

# Relabel every airline outside top_airlines_list as 'Other'. Exact membership
# (isin) is used instead of a regex/substring match so that a code which merely
# contains a top airline's code is not mistaken for that airline. Missing
# airline values are left untouched, matching the earlier behavior.
airline_codes = df_top_airlines_and_airports['Airline']
df_top_airlines_and_airports['Airline'] = airline_codes.where(
    airline_codes.isin(top_airlines_list) | airline_codes.isna(), 'Other')

# Total passengers for each (Airline, Airport) combination.
df_top_airlines_and_airports = df_top_airlines_and_airports.pivot_table(index = ["Airline", "Airport"], values = "Passengers", aggfunc = "sum").reset_index()

In [None]:
list(df_airline_airport_pairs.pivot_table(index = 'Airline', values = 'Passengers', aggfunc = 'sum').sort_values('Passengers', ascending = False).index[0:10])

In [None]:
df_airline_airport_pairs

In [None]:
airlines_limit = 5

# Rank airlines by total passengers and keep the first `airlines_limit`.
passengers_by_airline = df_airline_airport_pairs.pivot_table(
    index = 'Airline', values = 'Passengers', aggfunc = 'sum')
airlines_ranked = passengers_by_airline.sort_values('Passengers', ascending = False).index
airlines_to_keep = list(airlines_ranked[0:airlines_limit])
airlines_to_keep

In [None]:
new_df_airline_airport_pairs = df_airline_airport_pairs.query("Airline in @airlines_to_keep").copy()
new_df_airline_airport_pairs

In [None]:
airport_ranks = df_top_20_airports[['Airport', 'Rank']].drop_duplicates()
airport_ranks

In [None]:
# Attach each airport's rank; both frames share the 'Airport' key column.
df_top_airlines_and_airports = df_top_airlines_and_airports.merge(airport_ranks, on = "Airport")

In [None]:



# Order rows by airport rank so the bars appear in rank order.
df_top_airlines_and_airports = df_top_airlines_and_airports.sort_values("Rank")

# Stacked bars: one bar per airport, split and colored by airline.
fig_t4_airline_presence_at_t20_airports = px.bar(
    df_top_airlines_and_airports,
    x="Airport",
    y="Passengers",
    color="Airline",
    color_discrete_map=airline_color_map,
    title="Top 20 US Airports by Airline Share in 2018",
)
fig_t4_airline_presence_at_t20_airports

In [None]:
# Total passengers for every airline/airport combination, largest first.
passengers_by_airline_airport = df_airline_airport_pairs.pivot_table(
    index = ['Airline', 'Airport'], values = 'Passengers', aggfunc = 'sum').reset_index()
df_top_hubs = passengers_by_airline_airport.sort_values('Passengers', ascending = False)

# 'Hub' is a combined "<airline> <airport>" label used as the chart's x value.
df_top_hubs['Hub'] = df_top_hubs['Airline'] + ' ' + df_top_hubs['Airport']
df_top_hubs

In [None]:
# Chart only the first 20 rows of the (already sorted) hub table.
top_20_hubs = df_top_hubs.iloc[0:20, :]
fig_top_hubs = px.bar(
    top_20_hubs, x = 'Hub', y = 'Passengers', color = 'Airline',
    color_discrete_map=airline_color_map)
# Order the bars by total height. See https://plotly.com/python/categorical-axes/
fig_top_hubs.update_xaxes(
    categoryorder = 'total descending',
    title = "Top 20 US Airport Hubs in 2018 by Arriving Passengers")

## Determining the top international hubs:

In [None]:
# I'd only like to show US airports within this chart, so the rows are filtered
# to international routes whose Destination_Region is 'Domestic'.
intl_rows_at_us_airports = df_airline_airport_pairs.query(
    "Route_Type == 'International' & Destination_Region == 'Domestic'")

# Total passengers for every airline/airport combination, largest first.
intl_passengers_by_hub = intl_rows_at_us_airports.pivot_table(
    index = ['Airline', 'Airport'], values = 'Passengers', aggfunc = 'sum').reset_index()
df_top_intl_hubs = intl_passengers_by_hub.sort_values('Passengers', ascending = False)

# 'Hub' is a combined "<airline> <airport>" label used as the chart's x value.
df_top_intl_hubs['Hub'] = df_top_intl_hubs['Airline'] + ' ' + df_top_intl_hubs['Airport']
df_top_intl_hubs.head(20)

In [None]:
# Chart only the first 20 rows of the (already sorted) international hub table.
top_20_intl_hubs = df_top_intl_hubs.iloc[0:20, :]
fig_top_intl_hubs = px.bar(
    top_20_intl_hubs, x = 'Hub', y = 'Passengers', color = 'Airline',
    color_discrete_map=airline_color_map)
# Optional bar outline, currently disabled:
# fig_top_intl_hubs.update_traces(marker_line=dict(width=1,color='black'))
# See https://plotly.com/python/marker-style/
# Order the bars by total height. See https://plotly.com/python/categorical-axes/
fig_top_intl_hubs.update_xaxes(
    categoryorder = 'total descending',
    title = "Top 20 US Airport Hubs in 2018 by Arriving International Passengers")

# Creating a table of departures for a given airport:

In [None]:
df_dest_by_origin = pd.read_csv('dest_to_origin_2018.csv')

In [None]:
def create_departures_table(dest_airport, df_source=None, top_n=20):
    """Build a table of the largest origins for one airport, with shares.

    Rows whose 'Airport' equals ``dest_airport`` are selected and sorted by
    'Passengers' in descending order. If more than ``top_n`` rows remain, the
    rows beyond the first ``top_n`` are collapsed into a single row whose
    'Origin' is 'Other' and whose 'Passengers' is their combined total. A
    'Share' column holding each row's percentage of the table's total
    passengers is then added.

    Parameters
    ----------
    dest_airport : str
        Value to match against the 'Airport' column.
    df_source : pandas.DataFrame, optional
        Frame with 'Airport', 'Origin' and 'Passengers' columns. Defaults to
        the notebook-level ``df_dest_by_origin``.
    top_n : int, default 20
        Number of individual origins to keep before grouping the rest.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``df_source`` is not modified.
    """
    if df_source is None:
        df_source = df_dest_by_origin

    # Boolean indexing returns a new frame, so later column assignments never
    # touch (or warn about) the source data.
    df_airport_by_origin = (
        df_source[df_source['Airport'] == dest_airport]
        .sort_values('Passengers', ascending = False)
        .reset_index(drop=True)
    )

    if len(df_airport_by_origin) > top_n:
        # Build the 'Other' row by column name so that it does not depend on
        # the number or order of columns in the source frame.
        other_row = pd.DataFrame([{
            'Airport': dest_airport,
            'Origin': 'Other',
            'Passengers': df_airport_by_origin['Passengers'].iloc[top_n:].sum(),
        }])
        df_airport_by_origin = pd.concat(
            [df_airport_by_origin.iloc[:top_n], other_row], ignore_index=True)

    pax_sum = df_airport_by_origin['Passengers'].sum()
    df_airport_by_origin['Share'] = 100*df_airport_by_origin['Passengers'] / pax_sum
    return df_airport_by_origin


In [None]:
df_origins_for_airport = create_departures_table('ABQ')
df_origins_for_airport

In [None]:
sum(df_origins_for_airport['Share'])

In [None]:
# Render the origins table as a Plotly table figure.
df = df_origins_for_airport
# Header labels and cell columns are both derived from the same column list so
# that they can never fall out of step if the frame's columns change.
table_columns = list(df.columns)
go.Figure(data=go.Table(
    header = dict(
        values = table_columns
        ),
    cells = dict(
        values = [df[column_name] for column_name in table_columns] )))

In [None]:
fig_top_origins_for_airport = px.bar(df_origins_for_airport, x = 'Origin', y = 'Passengers')
fig_top_origins_for_airport