Jupyter notebook for testing out Plotly graphs. (Once I have the graph code working on this page, I can then add it to my app.py page so that it will get included in my Dash app.)

In [2]:
import pandas as pd
import plotly.express as px
import nbformat
import numpy as np
import plotly.graph_objects as go

In [3]:
# Maps airline codes (plus an 'Other' bucket) to the colors used for those
# airlines in the charts below. It is passed to Plotly Express via the
# color_discrete_map argument so that a given airline keeps the same color
# in every chart. "AA" and "AS" use generic named colors rather than hex codes.
airline_color_map = {
    "DL":"#E3132C",
    "UA":"#005DAA",
    "AA":"gray",
    "WN":"#F9B612",
    "AS":"green",
    "B6":"#003876",
    "Other":"#804000"
}
# See https://plotly.com/python/discrete-color/

# RGB color sources:
# Delta: Schemecolor at https://www.schemecolor.com/delta-airlines-logo-colors.php
# United: Keshav Naidu at https://www.schemecolor.com/united-airlines-logo-blue-color.php
# JetBlue: Schemecolor at https://www.schemecolor.com/jetblue-airways-logo-color.php
# AA: I had initially used the gray color provided at 
# https://coloropedia.com/american-airlines-group-colors-logo-codes/ ,
# but that proved to be too light, so I chose a generic gray instead.
# Southwest (WN): https://www.schemecolor.com/southwest-airlines-logo-colors.php
# Color for 'other': https://en.wikipedia.org/wiki/Brown 

# Airline traffic by month graph:

In [6]:
df_airline_traffic_by_month = pd.read_csv('airline_traffic_by_month_2018.csv')
df_airline_traffic_by_month

Unnamed: 0,Airline,Month,Passengers
0,02Q,3,25.0
1,02Q,4,119.0
2,02Q,6,86.0
3,04Q,1,3834.0
4,04Q,2,3740.0
...,...,...,...
3435,ZX,8,112928.0
3436,ZX,9,98887.0
3437,ZX,10,97276.0
3438,ZX,11,73359.0


In [36]:
df_airport_traffic_by_month = pd.read_csv('airport_traffic_by_month_2018.csv')
df_airport_traffic_by_month

Unnamed: 0,Airport,Month,Passengers
0,05A,2,10.0
1,05A,3,44.0
2,05A,4,18.0
3,05A,5,42.0
4,05A,6,29.0
...,...,...,...
14650,ZXU,8,28.0
14651,ZXU,9,10.0
14652,ZXU,10,5.0
14653,ZXU,11,3.0


In [45]:
def line_chart_1_comp_option(df, x_value, y_value, time_parameter, 
max_number_to_include, number_to_include = None, compare_by_x_value = True):
    '''Creates a line chart of y_value over time_parameter with a single
    comparison option.

    df: the DataFrame to chart. It must contain the x_value, y_value, and
    time_parameter columns. It is not modified.

    x_value: the name of the column whose entries can be compared against
    one another (e.g. 'Airline' or 'Airport').

    y_value: the name of the numeric column that gets summed and plotted.

    time_parameter: the name of the column placed on the chart's x axis
    (e.g. 'Month').

    max_number_to_include: the largest number of lines that may be drawn.

    number_to_include: the number of x_value entries to include, ranked
    by their total y_value. For instance, if creating a line chart of
    airport traffic by month, setting number_to_include to 5 means
    that only the top 5 airports by passenger traffic will be included.
    If this argument is None, or falls outside the range from 1 to
    max_number_to_include, max_number_to_include is used instead.

    compare_by_x_value: if True, a separate line is drawn for each of the
    top x_value entries. Otherwise, all rows are summed into a single line
    and number_to_include/max_number_to_include are ignored.

    Returns the figure created by px.line().'''

    if compare_by_x_value:
        # n lines will be plotted, with n equaling the lesser of
        # number_to_include and max_number_to_include. (The number of lines
        # is restricted so as not to create an overly detailed graph.)
        if (number_to_include is None) or not (
            1 <= number_to_include <= max_number_to_include):
            number_to_include = max_number_to_include

        # The following lines determine the x values with the highest y_value
        # totals within df.
        df_variable_ranks = df.pivot_table(index = x_value, 
        values = y_value, aggfunc = 'sum').sort_values(
            y_value, ascending = False)
        x_values_to_include = list(
            df_variable_ranks.index[0:number_to_include])

        # A boolean mask is used here instead of a string-built query() call
        # so that column names containing spaces or other characters that
        # are not valid in a Python identifier still work.
        df_for_charting = df[df[x_value].isin(x_values_to_include)]

        # The x value becomes both a pivot index level and the 'color'
        # argument, which produces one line per x value.
        index_vals = [time_parameter, x_value]
        color_val = x_value
    else:
        # A single line is drawn, so there is no need to restrict the data.
        df_for_charting = df
        index_vals = [time_parameter]
        color_val = None

    pivot_for_graphing = df_for_charting.pivot_table(
        index = index_vals, 
        values = y_value, aggfunc = 'sum').reset_index()

    line_chart = px.line(pivot_for_graphing, x = time_parameter, 
    y = y_value, color = color_val)

    return line_chart

line_chart_1_comp_option(df = df_airline_traffic_by_month, y_value = 'Passengers',
x_value = 'Airline', time_parameter = 'Month', max_number_to_include = 100, 
number_to_include = 10, compare_by_x_value = True)

In [46]:
line_chart_1_comp_option(df = df_airport_traffic_by_month, y_value = 'Passengers',
x_value = 'Airport', time_parameter = 'Month', max_number_to_include = 100, 
number_to_include = 10, compare_by_x_value = True)

## Interactive air traffic graph:

In [3]:
df_airline_airport_pairs = pd.read_csv('airport_airline_pairs_2018.csv')
all_data_value = 'All_Traffic'
df_airline_airport_pairs[all_data_value] = all_data_value

In [4]:
dest_airport = 'dca'
dest_airport.upper()

'DCA'

In [12]:
def generate_top_hubs_graph(hubs_limit, route_types):
    '''Creates a bar chart of the busiest airline/airport pairs ('hubs').

    hubs_limit: the number of hubs to show. If it is None or outside the
    range 1-100, 100 hubs are shown.

    route_types: the Route_Type values to keep (e.g. ['Domestic',
    'International']).

    Only rows of df_airline_airport_pairs whose Destination_Region is
    'Domestic' are considered.

    Returns a tuple of (the bar chart, the full ranked hubs DataFrame). The
    DataFrame contains every hub, not just the ones charted.'''
    max_hubs_limit = 100
    print("hubs_limit:",hubs_limit,"route_types:",route_types)

    limit_is_valid = (hubs_limit is not None) and (
        1 <= hubs_limit <= max_hubs_limit)
    if not limit_is_valid:
        hubs_limit = max_hubs_limit

    print("hubs_limit:",hubs_limit)

    domestic_traffic = df_airline_airport_pairs.query("Destination_Region == 'Domestic' & Route_Type in @route_types").copy()

    # Passenger totals for each airline/airport pair, largest first:
    hub_totals = domestic_traffic.pivot_table(
        index = ['Airline', 'Airport'], values = 'Passengers', aggfunc = 'sum')
    df_top_hubs = hub_totals.reset_index().sort_values(
        'Passengers', ascending = False)
    df_top_hubs['Hub'] = df_top_hubs['Airline'] + ' ' + df_top_hubs['Airport']
    # Renumbering the rows so that the index reflects each hub's rank:
    df_top_hubs = df_top_hubs.reset_index(drop = True)

    busiest_hubs = df_top_hubs.iloc[0:hubs_limit, :]
    fig_top_hubs = px.bar(busiest_hubs, x = 'Hub', y = 'Passengers', color = 'Airline', color_discrete_map=airline_color_map)
    fig_top_hubs.update_xaxes(categoryorder = 'total descending') # See https://plotly.com/python/categorical-axes/

    return fig_top_hubs, df_top_hubs


fig_top_hubs, df_top_hubs = generate_top_hubs_graph(hubs_limit = 20, route_types = ['Domestic', 'International'])

fig_top_hubs

hubs_limit: 20 route_types: ['Domestic', 'International']
hubs_limit: 20


In [13]:
df_top_hubs

Unnamed: 0,Airline,Airport,Passengers,Hub
0,DL,ATL,38537319.0,DL ATL
1,AA,DFW,22835074.0,AA DFW
2,AA,CLT,14084794.0,AA CLT
3,AA,MIA,13215249.0,AA MIA
4,UA,ORD,12579390.0,UA ORD
...,...,...,...,...
8515,04Q,MLB,1.0,04Q MLB
8516,AN,SHR,1.0,AN SHR
8517,04Q,MIA,1.0,04Q MIA
8518,AN,SEE,1.0,AN SEE


In [18]:
df_for_table = df_top_hubs

column_list = list(df_for_table.columns)
test_table = go.Figure(data = [go.Table(header = dict(values = column_list), cells = dict(values = [df_for_table[column] for column in column_list]))]) 
# Based on https://plotly.com/python/table/#use-a-pandas-dataframe


test_table

In [None]:
def generate_top_hubs_graph(hubs_limit, route_types):
    '''Creates a bar chart of the busiest airline/airport pairs ('hubs').

    NOTE: this definition replaces the earlier generate_top_hubs_graph()
    in this notebook. Unlike that version, it returns only the figure.

    hubs_limit: the number of hubs to show. If it is None or outside the
    range 1-100, 100 hubs are shown.

    route_types: the Route_Type values to keep.

    Only rows of df_airline_airport_pairs whose Destination_Region is
    'Domestic' are considered.'''
    max_hubs_limit = 100

    if hubs_limit is None or not (1 <= hubs_limit <= max_hubs_limit):
        hubs_limit = max_hubs_limit

    data_source = df_airline_airport_pairs.query("Destination_Region == 'Domestic' & Route_Type in @route_types").copy()

    # Total passengers per airline/airport pair, sorted from busiest to
    # least busy, with a combined 'Hub' label for the x axis:
    df_top_hubs = (
        data_source
        .pivot_table(index = ['Airline', 'Airport'], values = 'Passengers',
            aggfunc = 'sum')
        .reset_index()
        .sort_values('Passengers', ascending = False)
        .assign(Hub = lambda hubs: hubs['Airline'] + ' ' + hubs['Airport'])
    )

    fig_top_hubs = px.bar(
        df_top_hubs.head(hubs_limit), x = 'Hub', y = 'Passengers',
        color = 'Airline', color_discrete_map=airline_color_map)
    fig_top_hubs.update_xaxes(categoryorder = 'total descending') # See https://plotly.com/python/categorical-axes/

    return fig_top_hubs


In [None]:
df_airline_airport_pairs

In [None]:
def create_top_airlines_chart(show_route_types, airline_filter, route_types):
    '''Creates a bar chart of passenger totals by airline.

    show_route_types: a list of options. If it contains 'show_route_type',
    a separate bar is drawn for each airline/route type pair; otherwise,
    one bar is drawn per airline.

    airline_filter: the airlines to include in the chart.

    route_types: the Route_Type values to include in the chart.

    Returns the figure created by px.histogram().'''
    split_by_route_type = 'show_route_type' in show_route_types

    data_source = df_airline_airport_pairs.copy()
    data_source = data_source.query("Airline in @airline_filter & Route_Type in @route_types")

    # Airline ranks are used to order the bars. Note that they are based on
    # passenger traffic within the filtered data rather than on all
    # passenger traffic.
    airline_totals = data_source.pivot_table(
        index = 'Airline', values = 'Passengers', aggfunc = 'sum')
    df_airline_pivot = airline_totals.sort_values(
        'Passengers', ascending = False).reset_index()
    df_airline_pivot['Airline_Rank'] = df_airline_pivot['Passengers'].rank(ascending=False)
    # The Passengers column would get in the way when merging the ranks into
    # the pivot table on which the graph is based, so it is dropped here.
    df_airline_pivot = df_airline_pivot.drop('Passengers', axis = 1)

    pivot_index = ['Airline', 'Route_Type'] if split_by_route_type else ['Airline']
    data_pivot = data_source.pivot_table(
        index = pivot_index, values = 'Passengers', aggfunc = 'sum').reset_index()
    data_pivot = data_pivot.merge(df_airline_pivot, on = 'Airline')

    if split_by_route_type:
        data_pivot = data_pivot.sort_values(['Airline_Rank', 'Route_Type'])
        data_pivot['Airline_Route_Pair'] = data_pivot['Airline'] + ' ' + data_pivot['Route_Type']
        x_val = 'Airline_Route_Pair'
    else:
        data_pivot = data_pivot.sort_values('Airline_Rank')
        x_val = 'Airline'

    fig_top_airlines = px.histogram(data_pivot, x = x_val, y = 'Passengers', color = 'Airline', color_discrete_map=airline_color_map)
    if split_by_route_type:
        # Pins the bar order to the sorted table's order (airline rank
        # first, then route type within each airline).
        fig_top_airlines.update_xaxes(
            categoryorder = 'array',
            categoryarray = data_pivot['Airline_Route_Pair'])

    return fig_top_airlines

create_top_airlines_chart(show_route_types = ['show_route_type'], airline_filter = ['WN', 'AA', 'DL', 'UA', 'B6', 'AS', 'NK', 'OO', 'F9', 'YX'], route_types=['Domestic', 'International'])

In [None]:
def update_chart(pivot_values, color_value, route_types_to_show, airports_to_graph, airlines_to_graph):
    '''Creates a grouped bar chart of passenger totals for a flexible set
    of pivot values.

    pivot_values: a list of the columns to group the data by (e.g.
    ['Airport', 'Airline']). The dropdown text 'Route Type' is accepted in
    place of the 'Route_Type' column name. If the list is empty, a single
    bar showing all (filtered) traffic is created.

    color_value: the column that determines bar colors. Pass the string
    'None' (or None) for no color grouping. 'Route Type' is accepted in
    place of 'Route_Type'. The color value must also be present within
    pivot_values; if it is not, the chart is drawn without a color grouping.

    route_types_to_show, airports_to_graph, airlines_to_graph: lists of
    the Route_Type, Airport, and Airline values to keep.

    Returns the figure created by px.histogram().

    This function reads df_airline_airport_pairs, all_data_value, and
    airline_color_map from the notebook's global scope.
    df_airline_airport_pairs must contain a column named all_data_value.'''

    # pivot_table() and query() both return new objects, so the source
    # DataFrame does not need to be copied before filtering.
    data_source_filtered = df_airline_airport_pairs.query("Route_Type in @route_types_to_show & Airport in @airports_to_graph & Airline in @airlines_to_graph")

    # The following lines convert dropdown text to 
    # DataFrame column variables where discrepancies
    # exist between the two.
    pivot_values = ['Route_Type' if entry == 'Route Type' else entry for entry in pivot_values]

    if color_value == 'Route Type':
        color_value = 'Route_Type'

    # A color column that is not part of the pivot table won't exist in the
    # charted data, so it is treated the same as 'no color' rather than
    # being passed through to Plotly.
    if (color_value == 'None') or (color_value not in pivot_values):
        color_value = None

    # The pivot table groups the data by the specified pivot values. When no
    # pivot values were specified, the constant all_data_value column is
    # used instead so that all rows fall into a single group.
    if len(pivot_values) == 0:
        pivot_index = all_data_value
    else:
        pivot_index = pivot_values
    data_source_pivot = data_source_filtered.pivot_table(
        index = pivot_index, values = 'Passengers', aggfunc = 'sum').reset_index()

    # The following lines create a 'Group' column describing the pivot
    # columns present in each row, which then gets fed into the x axis
    # parameter of the bar chart. A loop is used so that this column can
    # adapt to different variable choices and different numbers of columns.
    if len(pivot_values) == 0:
        data_descriptor = all_data_value
    else:
        data_descriptor_values = pivot_values.copy()
        if (color_value is not None) and (len(data_descriptor_values) > 1):
            # If a value will be assigned a color component in the graph, it
            # doesn't need to be assigned a group component, since it will
            # show up in the legend regardless. Removing it here helps
            # simplify the graph. (It is kept when it is the only pivot
            # value, since the x axis still needs something to show.)
            data_descriptor_values.remove(color_value)
        # This copy() statement is needed in order to avoid modifying the
        # pivot table's own column when the other columns get appended.
        data_descriptor = data_source_pivot[data_descriptor_values[0]].copy()
        for column_name in data_descriptor_values[1:]:
            data_descriptor += ' ' + data_source_pivot[column_name]

    data_source_pivot['Group'] = data_descriptor

    output_histogram = px.histogram(data_source_pivot, x = 'Group', y = 'Passengers', color = color_value, barmode = 'group', color_discrete_map=airline_color_map)

    return output_histogram

update_chart(pivot_values = ['Airport', 'Airline'], color_value = 'Airline', route_types_to_show = ['Domestic', 'International'], airports_to_graph = ['ATL', 'LAX', 'ORD', 'DFW', 'JFK'], airlines_to_graph = ['WN', 'AA', 'DL', 'UA'])

## Creating a more interactive version of the top airports pivot table:

In [None]:
df_airline_airport_pairs

In [None]:
def create_top_airports_list(route_types_to_show, airports_graph_airports_limit):
    '''Returns a list of the airports with the most passengers, ordered
    from most to fewest.

    route_types_to_show: the Route_Type values to count.

    airports_graph_airports_limit: the number of airports to return. If it
    is None or outside the range 1-100, up to 100 airports are returned.

    Only rows of df_airline_airport_pairs whose Destination_Region is
    'Domestic' are considered.'''
    domestic_rows = df_airline_airport_pairs.query("Destination_Region == 'Domestic'").copy()
    data_source = domestic_rows.query("Route_Type in @route_types_to_show").copy()

    limit_is_valid = (airports_graph_airports_limit is not None) and (
        1 <= airports_graph_airports_limit <= 100)
    if not limit_is_valid:
        airports_graph_airports_limit = 100

    # Passenger totals per airport, busiest first:
    airport_totals = data_source.pivot_table(
        index = 'Airport', values = 'Passengers', aggfunc = 'sum')
    df_airline_airport_pivot = airport_totals.sort_values(
        'Passengers', ascending = False).reset_index()

    top_airports = df_airline_airport_pivot['Airport'][0:airports_graph_airports_limit]
    return list(top_airports)

In [None]:
create_top_airports_list(route_types_to_show = ['Domestic', 'International'], 
airports_graph_airports_limit = 10)

In [None]:
# NOTE: this cell repeats the airline_color_map definition that appears near
# the top of this notebook. Keep the two definitions in sync (or rely on the
# first one) so that airline colors stay consistent across charts.
airline_color_map = {
    "DL":"#E3132C",
    "UA":"#005DAA",
    "AA":"gray",
    "WN":"#F9B612",
    "AS":"green",
    "B6":"#003876",
    "Other":"#804000"
}

In [None]:
# top_airports_graph_options = ['show_airline_comparison', 'show_route_type']
# top_airports_graph_options = ['show_route_type']
top_airports_graph_options = ['show_airline_comparison', 'show_route_type']
# top_airports_graph_options = []


top_airports_graph_options
airports_graph_airports_limit = 20

In [None]:
# A fresh copy is used so that the changes made to data_source in the
# following cells don't affect df_airline_airport_pairs.
data_source = df_airline_airport_pairs.copy()

# The limit falls back to 100 airports when it is missing or outside the
# range 1-100.
limit_is_valid = (airports_graph_airports_limit is not None) and (
    1 <= airports_graph_airports_limit <= 100)
if not limit_is_valid:
    airports_graph_airports_limit = 100

print(f"Calling create_top_20_airports_graph with the following graph options: {top_airports_graph_options} and the following airports limit: {airports_graph_airports_limit}")

In [None]:
top_airlines = list(data_source.pivot_table(index = 'Airline', values = 'Passengers', aggfunc = 'sum').sort_values('Passengers', ascending = False).index[0:4])
top_airlines_as_string = ("|".join(top_airlines)) # Converts the airlines in the list to a string value that the following np.where statement can use to create an 'Other' category of airlines. # See
# https://docs.python.org/3/library/stdtypes.html#str.join
print(top_airlines_as_string)

In [None]:
top_airlines

In [None]:
# Airlines that are not in the top_airlines list are collapsed into a single
# 'Other' category. isin() performs exact matches against the list, so the
# pipe-joined string created earlier is not needed for this step.
is_top_airline = data_source['Airline'].isin(top_airlines)
data_source['Airline'] = np.where(is_top_airline, data_source['Airline'], 'Other')
data_source

In [None]:
data_source['Airline'].value_counts()

In [None]:
df_airline_airport_pivot = data_source.pivot_table(index = 'Airport', values = 'Passengers', aggfunc = 'sum').sort_values('Passengers', ascending = False).reset_index()
df_airline_airport_pivot['airport_rank'] = df_airline_airport_pivot['Passengers'].rank(ascending=False)
df_airline_airport_pivot

In [None]:
airports_to_keep = list(df_airline_airport_pivot['Airport'][0:airports_graph_airports_limit].copy())
airports_to_keep
data_source_filtered = data_source.query("Airport in @airports_to_keep").copy()
data_source_filtered = data_source_filtered.merge(df_airline_airport_pivot[['Airport', 'airport_rank']], on = 'Airport')
data_source_filtered

In [None]:
# The following code creates a pivot table based on the graph options
# specified above. Airport and airport_rank are always part of the pivot
# index; Airline and Route_Type are added only when requested.
include_airlines = 'show_airline_comparison' in top_airports_graph_options
include_route_types = 'show_route_type' in top_airports_graph_options

pivot_values = ['Airport', 'airport_rank']
pivot_values += ['Airline'] if include_airlines else []
pivot_values += ['Route_Type'] if include_route_types else []

data_source_filtered_pivot = data_source_filtered.pivot_table(
    index = pivot_values, values = 'Passengers', aggfunc = 'sum').reset_index()

# When both options are active, the x axis shows one bar per airport/route
# type pair, so a column combining those two values is needed.
if include_airlines and include_route_types:
    data_source_filtered_pivot['Airport_Route_Pair'] = (
        data_source_filtered_pivot['Airport'] + ' '
        + data_source_filtered_pivot['Route_Type'])

# Rows are ordered by airport rank (busiest first), then by airport, and
# then (if present) by route type. All columns are sorted in ascending
# order. See
# https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.sort_values.html
sort_columns = ['airport_rank', 'Airport']
if include_route_types:
    sort_columns.append('Route_Type')
data_source_filtered_pivot = data_source_filtered_pivot.sort_values(
    by = sort_columns)

data_source_filtered_pivot

In [None]:
# Since there are two different top_airports_graph_options items that
# can be chosen, there are in turn four possible graphs that can be created.
# The following if/elif chain picks the x axis and color columns for each
# of those four cases. Membership checks are used (rather than comparing the
# options list against specific lists) so that exactly one branch always
# runs, regardless of the order or number of entries in the options list.
# This also prevents x_val and color_val from keeping stale values from an
# earlier run of this cell.
compare_airlines = 'show_airline_comparison' in top_airports_graph_options
split_route_types = 'show_route_type' in top_airports_graph_options

if compare_airlines and split_route_types:
    x_val = 'Airport_Route_Pair'
    color_val = 'Airline'
elif compare_airlines:
    x_val = 'Airport'
    color_val = 'Airline'
elif split_route_types:
    x_val = 'Airport'
    color_val = 'Route_Type'
else:
    x_val = 'Airport'
    color_val = 'Airport'

# airline_color_map keeps airline colors (including 'Other') consistent with
# the other charts in this notebook. Color values that don't appear in the
# map (e.g. route types or airports) are unaffected by it.
top_airports_graph = px.histogram(data_source_filtered_pivot, x = x_val, y = 'Passengers', color = color_val, color_discrete_map = airline_color_map)
# The bars are shown in the same order as the rows of the sorted pivot table.
top_airports_graph.update_xaxes(categoryorder = 'array', 
categoryarray = data_source_filtered_pivot[x_val])


top_airports_graph

In [None]:
df_airline_airport_pairs.query("Destination_Region == 'Domestic'")

## Creating a pivot table that shows all grouped rows:

In [None]:
sample_list = ['Airline', 'Route_Type']


In [None]:
# Converts the dropdown text 'Route Type' into the 'Route_Type' column name
# (the same conversion that update_chart() applies to its pivot values).
# Comparing against 'Route_Type' here would leave the list unchanged.
sample_list = ['Route_Type' if entry == 'Route Type' else entry for entry in sample_list]
sample_list

In [None]:
all_data_value = 'All_Traffic'
df_t5_t4_2018 = pd.read_csv('t5_airports_t4_airlines_2018.csv')
df_t5_t4_2018[all_data_value] = all_data_value # This column will allow
# the code to show all values when no pivot value is selected.
df_t5_t4_2018.head(5)

## Method that creates a group for every column:

In [None]:
pivot_values = ['Airline', 'Airport', 'Route_Type']
color_value = ['Airport']
group_value = ['Airline']

# When no pivot values are specified, the constant 'All_Traffic' column is
# used as the index so that all rows fall into a single group.
pivot_index = pivot_values if len(pivot_values) > 0 else 'All_Traffic'
df_t5_t4_2018_pivot = df_t5_t4_2018.pivot_table(
    index = pivot_index, values = 'Passengers', aggfunc = 'sum').reset_index()

# The following lines create a column containing the values of every column
# other than the last one (the 'Passengers' column). Looping over the
# columns lets this code adapt to different variable choices and different
# numbers of columns.
descriptor_columns = df_t5_t4_2018_pivot.columns[:-1]
# A copy is taken so that the first column itself isn't modified.
data_descriptor = df_t5_t4_2018_pivot[descriptor_columns[0]].copy()
for column_name in descriptor_columns[1:]:
    data_descriptor = data_descriptor + ' ' + df_t5_t4_2018_pivot[column_name]

df_t5_t4_2018_pivot['Group'] = data_descriptor

df_t5_t4_2018_pivot.head(5)

In [None]:
# Group/color example:
px.histogram(df_t5_t4_2018_pivot, x = 'Airport', y = 'Passengers', color = 'Airline', barmode = 'group')

In [None]:
px.histogram(df_t5_t4_2018_pivot, x = 'Group', y = 'Passengers', color = 'Airline')

In [None]:
px.histogram(df_t5_t4_2018_pivot, x = 'Airport', y = 'Passengers', color = 'Airline', barmode = 'group')

## Method that only creates a group for columns not present in the color section:

I think I'll still need to plot the group column, but I can simplify it by removing the color column value from it.

In [None]:
# The following code creates a pivot table version of the DataFrame that 
# can be used for creating bar charts. It takes the specified pivot values
# and color values as inputs, and then uses those values to group the
# data accordingly. The code works with different numbers of pivot values,
# including zero pivot values.
# In order to represent all of the specified values, the code creates a 
# column describing all (or almost all) of the pivot index variables
# in the other columns, which then gets fed 
# into the x axis parameter of the bar chart. However, if a color value is
# also specified, this item does not get added into this column, since this
# data will already get represented in the bar chart (by means of the color
# legend). Removing this value helps
# simplify the final chart output.

pivot_values = ['Airline', 'Airport', 'Route_Type']
color_value = 'None' # Use the string 'None' for no color grouping. Any
# other color value must also be present within the pivot_values list.
# group_value = 'Airline'
if len(pivot_values) == 0:
    df_t5_t4_2018_pivot = df_t5_t4_2018.pivot_table(index = 'All_Traffic', values = 'Passengers', aggfunc = 'sum').reset_index()
else:
    df_t5_t4_2018_pivot = df_t5_t4_2018.pivot_table(index = pivot_values, values = 'Passengers', aggfunc = 'sum').reset_index()

# The following lines create a column containing the values of each of the
# columns (other than the 'Passengers') column present in the bar chart. A
# for loop is used so that this column can adapt to different variable
# choices and different numbers of columns.
if len(pivot_values) == 0:
    data_descriptor = all_data_value
else:
    data_descriptor_values = pivot_values.copy()
    # The color value is only removed when it is actually one of the pivot
    # values and at least one other pivot value would remain. (This matches
    # the guard used within update_chart(). Without it, remove() raises a
    # ValueError for an absent color value, and a pivot list containing only
    # the color value leaves nothing to build the group column from.)
    if (color_value != 'None') and (color_value in data_descriptor_values) and (len(data_descriptor_values) > 1):
        data_descriptor_values.remove(color_value) # If a value will be assigned a
        # color component in the graph, it doesn't need to be assigned a 
        # group component, since it will show up in the graph regardless. Removing 
        # it here helps simplify the graph.
    print(data_descriptor_values)   
    data_descriptor = df_t5_t4_2018_pivot[data_descriptor_values[0]].copy() # This copy() statement
    # is needed in order to avoid  modifying this column when the group column
    # gets chosen.
    for i in range(1, len(data_descriptor_values)):
        data_descriptor += ' ' + df_t5_t4_2018_pivot[data_descriptor_values[i]]

df_t5_t4_2018_pivot['Group'] = data_descriptor

df_t5_t4_2018_pivot.head(5)

In [None]:
px.histogram(df_t5_t4_2018_pivot, x = 'Group', y = 'Passengers', color = None if color_value == 'None' else color_value, barmode = 'group')

## Creating a bar chart showing the top 20 airports by passenger arrivals in 2018:

In [None]:
df_top_20_airports = pd.read_csv('top_20_airports_by_pax_arrivals_2018.csv')
df_top_20_airports

In [None]:
top_20_airports_list = list(df_top_20_airports['Airport'].unique())
top_20_airports_list

In [None]:
fig_top_20_airports_2018 = px.bar(df_top_20_airports, x="Airport", y="Passengers")

In [None]:
fig_top_20_airports_2018

In [None]:
df_aaa = pd.read_csv('local_copy_of_airports_airlines_aircraft_2018.csv')
df_aaa

In [None]:
# Consider building a graph that lets you pivot by airline name, airport, and plane type and then display the output in both chart form and table form.

In [None]:
# You could also try creating a chart that lets you compare the presence of a given set of airlines (maybe up to 5) for a given set of airports (maybe up to 20). The charts could be either grouped bar charts or stacked bar charts (to show the airline/airport relationship).

In [None]:
## Top 20 airlines in 2018:

df_top_20_airlines = pd.read_csv('top_20_airlines_by_passengers_2018.csv')
df_top_20_airlines

Creating a list of the top 4 airlines (which will be useful for a later graph):

In [None]:
# unique() is applied before slicing because df_top_20_airlines can contain
# more than one row per airline (see the note about domestic/international
# breakdowns in the 'Top 20 US Airports by Airline Share' section). Without
# it, the first four rows could repeat the same airline.
top_airlines_list = list(df_top_20_airlines['Airline'].unique()[0:4])
top_airlines_list

In [None]:
fig_top_20_airlines_2018 = px.bar(df_top_20_airlines, x="Airline", y="Passengers")
fig_top_20_airlines_2018

In [None]:
df_airline_airport_pairs = pd.read_csv('airport_airline_pairs_2018.csv')
df_airline_airport_pairs

In [None]:
top_airline_list_as_string = ("|".join(top_airlines_list)) # Converts the airlines in the list to a string value that the following np.where statement can use to create an 'Other' category of airlines
top_airline_list_as_string

In [None]:
df_airline_airport_pairs

In [None]:
# Keeps only the rows for the top 20 airports, then relabels every airline
# outside of top_airlines_list as 'Other'.
df_top_airlines_and_airports = df_airline_airport_pairs.query("Airport in @top_20_airports_list").copy().reset_index(drop=True)
# isin() is used instead of str.contains() because str.contains() treats the
# pipe-joined airline string as a regular expression and matches substrings.
# An airline code that merely contains one of the top airlines' codes would
# therefore be kept instead of being labeled 'Other'. isin() only accepts
# exact matches.
is_top_airline = df_top_airlines_and_airports['Airline'].isin(top_airlines_list)
df_top_airlines_and_airports['Airline'] = np.where(is_top_airline, df_top_airlines_and_airports['Airline'], 'Other')
df_top_airlines_and_airports

In [None]:
df_top_airlines_and_airports = df_top_airlines_and_airports.pivot_table(index = ["Airline", "Airport"], values = "Passengers", aggfunc = "sum").reset_index()


In [None]:
df_top_airlines_and_airports['Airline'].value_counts()

In [None]:
# drop_duplicates() ensures that each airport appears only once in this
# lookup table. If an airport appeared more than once, merging these ranks
# into df_top_airlines_and_airports would repeat that airport's rows and
# inflate its passenger totals. (The later airport_ranks cell in this
# notebook applies the same step.)
airport_ranks = df_top_20_airports[['Airport', 'Rank']].drop_duplicates()
airport_ranks

In [None]:
df_top_airlines_and_airports = df_top_airlines_and_airports.merge(airport_ranks, left_on = "Airport", right_on = "Airport")


In [None]:
df_top_airlines_and_airports.sort_values("Rank", inplace = True)
df_top_airlines_and_airports

## Top 20 US Airports by Airline Share

In [None]:
df_top_20_airports

In [None]:
# Top 20 US Airports by Airline Share:

top_airlines_list = list(df_top_20_airlines['Airline'].unique()[0:5])
top_airline_list_as_string = ("|".join(top_airlines_list)) # Converts the airlines in the list to a string value that the following np.where statement can use to create an 'Other' category of airlines
# unique() tags are needed to remove duplicate entries for each airport and
# airline. (These duplicates were created through the addition of 
# domestic/international travel breakdowns for each of the top airports
# and airlines.)

top_20_airports_list = list(df_top_20_airports['Airport'].unique())
print(top_airlines_list, top_20_airports_list)

In [None]:
df_airline_airport_pairs = pd.read_csv("airport_airline_pairs_2018.csv")


# Keeps only the rows for the top 20 airports, relabels every airline outside
# of top_airlines_list as 'Other', and then totals passengers for each
# airline/airport pair.
df_top_airlines_and_airports = df_airline_airport_pairs.query("Airport in @top_20_airports_list").copy().reset_index(drop=True)
# isin() is used instead of str.contains() because str.contains() treats the
# pipe-joined airline string as a regular expression and matches substrings,
# so airline codes that merely contain a top airline's code would not be
# labeled 'Other'. isin() only accepts exact matches.
is_top_airline = df_top_airlines_and_airports['Airline'].isin(top_airlines_list)
df_top_airlines_and_airports['Airline'] = np.where(is_top_airline, df_top_airlines_and_airports['Airline'], 'Other')
df_top_airlines_and_airports = df_top_airlines_and_airports.pivot_table(index = ["Airline", "Airport"], values = "Passengers", aggfunc = "sum").reset_index()

In [None]:
list(df_airline_airport_pairs.pivot_table(index = 'Airline', values = 'Passengers', aggfunc = 'sum').sort_values('Passengers', ascending = False).index[0:10])

In [None]:
df_airline_airport_pairs

In [None]:
airlines_limit = 5
airlines_to_keep = list(df_airline_airport_pairs.pivot_table(index = 'Airline', values = 'Passengers', aggfunc = 'sum').sort_values('Passengers', ascending = False).index[0:airlines_limit])
airlines_to_keep

In [None]:
new_df_airline_airport_pairs = df_airline_airport_pairs.query("Airline in @airlines_to_keep").copy()
new_df_airline_airport_pairs

In [None]:
airport_ranks = df_top_20_airports[['Airport', 'Rank']].drop_duplicates()
airport_ranks

In [None]:
df_top_airlines_and_airports = df_top_airlines_and_airports.merge(airport_ranks, left_on = "Airport", right_on = "Airport")

In [None]:



# Sorting by airport rank so that the airports appear on the x axis in
# rank order.
df_top_airlines_and_airports = df_top_airlines_and_airports.sort_values("Rank")

# A stacked bar chart with one bar per airport and one colored segment per
# airline (including the 'Other' category):
fig_t4_airline_presence_at_t20_airports = px.bar(
    df_top_airlines_and_airports,
    x="Airport",
    y="Passengers",
    color="Airline",
    color_discrete_map=airline_color_map,
    title="Top 20 US Airports by Airline Share in 2018")
fig_t4_airline_presence_at_t20_airports

In [None]:
df_top_hubs = df_airline_airport_pairs.pivot_table(index = ['Airline', 'Airport'], values = 'Passengers', aggfunc = 'sum').reset_index().sort_values('Passengers', ascending = False)
df_top_hubs.head(20)
df_top_hubs['Hub'] = df_top_hubs['Airline'] + ' ' + df_top_hubs['Airport']
df_top_hubs

In [None]:
# The descriptive text is passed as the figure's title (as is done for the
# 'Top 20 US Airports by Airline Share' chart) rather than as the x axis
# title, so the x axis keeps its 'Hub' label.
# NOTE(review): restore title = ... within update_xaxes() if the text was
# deliberately meant to serve as an x axis caption.
fig_top_hubs = px.bar(df_top_hubs.iloc[0:20, :], x = 'Hub', y = 'Passengers', color = 'Airline', color_discrete_map=airline_color_map, title = "Top 20 US Airport Hubs in 2018 by Arriving Passengers")
fig_top_hubs.update_xaxes(categoryorder = 'total descending') # See https://plotly.com/python/categorical-axes/

## Determining the top international hubs:

In [None]:
df_top_intl_hubs = df_airline_airport_pairs.query("Route_Type == 'International' & Destination_Region == 'Domestic'").pivot_table(index = ['Airline', 'Airport'], values = 'Passengers', aggfunc = 'sum').reset_index().sort_values('Passengers', ascending = False)
# I'd only like to show US airports within this chart, so I chose to filter it to include only domestic airports.
df_top_intl_hubs['Hub'] = df_top_intl_hubs['Airline'] + ' ' + df_top_intl_hubs['Airport']
df_top_intl_hubs.head(20)

In [None]:
# The descriptive text is passed as the figure's title (as is done for the
# 'Top 20 US Airports by Airline Share' chart) rather than as the x axis
# title, so the x axis keeps its 'Hub' label.
# NOTE(review): restore title = ... within update_xaxes() if the text was
# deliberately meant to serve as an x axis caption.
fig_top_intl_hubs = px.bar(df_top_intl_hubs.iloc[0:20, :], x = 'Hub', y = 'Passengers', color = 'Airline', color_discrete_map=airline_color_map, title = "Top 20 US Airport Hubs in 2018 by Arriving International Passengers")
# fig_top_intl_hubs.update_traces(marker_line=dict(width=1,color='black'))
# See https://plotly.com/python/marker-style/
fig_top_intl_hubs.update_xaxes(categoryorder = 'total descending') # See https://plotly.com/python/categorical-axes/

# Creating a table of departures for a given airport:

In [None]:
df_dest_by_origin = pd.read_csv('dest_to_origin_2018.csv')

In [None]:
def create_departures_table(dest_airport, max_origins = 20):
    '''Creates a table of the top origins for a given airport.

    dest_airport: the value to look for within the 'Airport' column of
    df_dest_by_origin (which is read from the notebook's global scope).

    max_origins: the maximum number of individual origins to list. If the
    airport has more origins than this, the remaining ones are combined
    into a single row whose 'Origin' value is 'Other'.

    Returns a DataFrame sorted by 'Passengers' in descending order (with
    the 'Other' row, if any, placed last) along with a 'Share' column that
    shows each row's percentage of the table's total passengers. Any
    columns other than 'Airport', 'Origin', and 'Passengers' are left
    empty within the 'Other' row.'''
    df_airport_by_origin = df_dest_by_origin.query("Airport == @dest_airport").sort_values('Passengers', ascending = False).reset_index(drop=True)
    if len(df_airport_by_origin) > max_origins:
        # The 'Other' row is built by column name (rather than as a
        # positional list) so that it still lines up correctly if the
        # source table has additional columns or a different column order.
        other_row = pd.DataFrame([{
            'Airport': dest_airport,
            'Origin': 'Other',
            'Passengers': df_airport_by_origin['Passengers'].iloc[max_origins:].sum()}])
        # concat() creates a new DataFrame, which avoids writing into a
        # slice of the original table.
        df_airport_by_origin = pd.concat(
            [df_airport_by_origin.iloc[0:max_origins], other_row],
            ignore_index = True)
    pax_sum = df_airport_by_origin['Passengers'].sum()
    df_airport_by_origin['Share'] = 100*df_airport_by_origin['Passengers'] / pax_sum
    return df_airport_by_origin


In [None]:
df_origins_for_airport = create_departures_table('ABQ')
df_origins_for_airport

In [None]:
sum(df_origins_for_airport['Share'])

In [None]:
df = df_origins_for_airport
# The cell values are generated from df.columns (the same list used for the
# header) so that the header and the cells always line up, even if the
# table's columns change.
# Based on https://plotly.com/python/table/#use-a-pandas-dataframe
go.Figure(data=go.Table(
    header = dict(
        values = list(df.columns)
        ), 
    cells = dict(
        values = [df[column] for column in df.columns] )))

In [None]:
fig_top_origins_for_airport = px.bar(df_origins_for_airport, x = 'Origin', y = 'Passengers')
fig_top_origins_for_airport