In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

%matplotlib notebook
%matplotlib notebook

In [2]:
# Read FLEET's routes data - directional
old_routes_raw = pd.read_csv('./4846_Route List.csv')
columns = ['#', 'AirportA', 'Country A', 'Continent A', 'Airport A Lat', 'Airport A Lon',
           'AirportB', 'Country B', 'Continent B', 'Airport B Lat', 'Airport B Lon','Demand in 2005']

# Build the directional route key "AirportA-AirportB" and sum the 2005 demand per key.
#  - The key is built with vectorized string concatenation instead of a row-wise apply.
#  - .assign() works on a fresh frame, so no column is written onto a slice of the
#    raw data (which may raise SettingWithCopyWarning).
#  - The undirected ROUTE key that used to be computed here was thrown away by the
#    groupby and never read, so it is no longer built.
# Result: a frame indexed by DIRROUTE with a single 'Demand' column.
old_routes = (
    old_routes_raw[columns]
    .assign(DIRROUTE=lambda df: df['AirportA'] + '-' + df['AirportB'])
    .groupby('DIRROUTE')
    .agg({'Demand in 2005': 'sum'})
    .rename(columns={'Demand in 2005': 'Demand'})  # Change column name
)

old_routes

Unnamed: 0_level_0,Demand
DIRROUTE,Unnamed: 1_level_1
ABQ-ATL,160767
ABQ-BWI,43747
ABQ-CLE,676
ABQ-COS,4994
ABQ-CVG,96517
...,...
ZRH-EWR,50123
ZRH-IAD,53228
ZRH-JFK,58875
ZRH-OKC,206


In [3]:
# Load the new routes data: filtered directional routes with yearly demand,
# all airlines combined. PASSENGERS is renamed to Demand as part of the same chain.
new_routes = (
    pd.read_csv('./yearly_demand_2005.csv')
    .rename(columns={'PASSENGERS': 'Demand'})
)

new_routes

Unnamed: 0.1,Unnamed: 0,DIRROUTE,ORIGIN,ORIGIN_CITY_NAME,DEST,DEST_CITY_NAME,DEPARTURES_SCHEDULED,DEPARTURES_PERFORMED,SEATS,Demand,DISTANCE,AIR_TIME
0,0,ABQ-ATL,ABQ,"Albuquerque, NM",ATL,"Atlanta, GA",1396.0,1381.0,226904.0,161016.0,1269.0,8126.694444
1,5,ABQ-BWI,ABQ,"Albuquerque, NM",BWI,"Baltimore, MD",365.0,364.0,49868.0,43747.0,1670.0,5236.541667
2,7,ABQ-COS,ABQ,"Albuquerque, NM",COS,"Colorado Springs, CO",569.0,553.0,10889.0,4994.0,280.0,2549.916667
3,9,ABQ-CVG,ABQ,"Albuquerque, NM",CVG,"Cincinnati, OH",800.0,795.0,132522.0,96517.0,1240.0,8238.208333
4,10,ABQ-DAL,ABQ,"Albuquerque, NM",DAL,"Dallas, TX",2059.0,2059.0,275618.0,191543.0,580.0,4510.333333
...,...,...,...,...,...,...,...,...,...,...,...,...
3972,5589,ZRH-ATL,ZRH,"Zurich, Switzerland",ATL,"Atlanta, GA",365.0,360.0,73371.0,55459.0,4691.0,17860.416667
3973,5591,ZRH-DFW,ZRH,"Zurich, Switzerland",DFW,"Dallas/Fort Worth, TX",362.0,362.0,76447.0,65687.0,5240.0,19570.166667
3974,5592,ZRH-EWR,ZRH,"Zurich, Switzerland",EWR,"Newark, NJ",360.0,359.0,62952.0,50123.0,3946.0,11353.208333
3975,5593,ZRH-IAD,ZRH,"Zurich, Switzerland",IAD,"Washington, DC",366.0,365.0,70459.0,53228.0,4158.0,15436.791667


In [4]:
# Merge the two data sets by joining on their DIRROUTE keys.
#   how='outer'    -> use the union of the two sets for the comparison
#   indicator=True -> adds a `_merge` column showing which set a route belongs to:
#                     'left_only' = old set only, 'right_only' = new set only, 'both' = in both
#   suffixes       -> the Demand column present in both sets becomes Demand_old / Demand_new
comparison_df = pd.merge(
    old_routes,
    new_routes,
    how='outer',
    on='DIRROUTE',
    suffixes=('_old', '_new'),
    indicator=True,
)
comparison_df.to_csv('./compare_demand_v2.csv')
comparison_df

Unnamed: 0.1,DIRROUTE,Demand_old,Unnamed: 0,ORIGIN,ORIGIN_CITY_NAME,DEST,DEST_CITY_NAME,DEPARTURES_SCHEDULED,DEPARTURES_PERFORMED,SEATS,Demand_new,DISTANCE,AIR_TIME,_merge
0,ABQ-ATL,160767.0,0.0,ABQ,"Albuquerque, NM",ATL,"Atlanta, GA",1396.0,1381.0,226904.0,161016.0,1269.0,8126.694444,both
1,ABQ-BWI,43747.0,5.0,ABQ,"Albuquerque, NM",BWI,"Baltimore, MD",365.0,364.0,49868.0,43747.0,1670.0,5236.541667,both
2,ABQ-CLE,676.0,,,,,,,,,,,,left_only
3,ABQ-COS,4994.0,7.0,ABQ,"Albuquerque, NM",COS,"Colorado Springs, CO",569.0,553.0,10889.0,4994.0,280.0,2549.916667,both
4,ABQ-CVG,96517.0,9.0,ABQ,"Albuquerque, NM",CVG,"Cincinnati, OH",800.0,795.0,132522.0,96517.0,1240.0,8238.208333,both
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4849,CVG-DAB,,1094.0,CVG,"Cincinnati, OH",DAB,"Daytona Beach, FL",0.0,382.0,26500.0,21503.0,711.0,2498.636364,right_only
4850,DAB-CVG,,1173.0,DAB,"Daytona Beach, FL",CVG,"Cincinnati, OH",0.0,376.0,26140.0,22780.0,711.0,2994.909091,right_only
4851,EWR-OSL,,1749.0,EWR,"Newark, NJ",OSL,"Oslo, Norway",248.0,248.0,42685.0,33884.0,3697.0,8505.333333,right_only
4852,LIT-SLC,,2951.0,LIT,"Little Rock, AR",SLC,"Salt Lake City, UT",0.0,624.0,43679.0,34657.0,1156.0,8287.416667,right_only


In [5]:
# Report how many routes are shared and how many appear in only one of the datasets.
# The `_merge` indicator column holds 'both', 'left_only' (old set) or 'right_only' (new set).
status = comparison_df['_merge'].tolist()
same_routes, diff_routes_old, diff_routes_new = (
    status.count(label) for label in ('both', 'left_only', 'right_only')
)
summary = (
    f'There are {same_routes} same routes.\n'
    f'There are {diff_routes_old} routes in the original dataset but not in the new dataset.\n'
    f'There are {diff_routes_new} routes in the new dataset but not in the original dataset.'
)
print(summary)


There are 3969 same routes.
There are 877 routes in the original dataset but not in the new dataset.
There are 8 routes in the new dataset but not in the original dataset.


In [18]:
# All similar routes
# same_routes_df = comparison_df[comparison_df['_merge'] == 'both']
# same_routes_df

In [19]:
# All other routes left in the original list 
# diff_routes_old_df = comparison_df[comparison_df['_merge'] == 'left_only']
# diff_routes_old_df

In [20]:
# All other routes left in the new list 
# diff_routes_new_df = comparison_df[comparison_df['_merge'] == 'right_only']
# diff_routes_new_df

In [21]:
# Track routes in "prefilter_2005" file - routes with origin and/or destination
# within the US, before other filters.
# (Kept as a comment rather than a bare string literal: a string that is the cell's
# last expression is echoed back by Jupyter as the cell output.)

# all_2005 = pd.read_csv('./prefilter_new_routes.csv')
# all_2005['DIRROUTE'] = all_2005.apply(lambda x: x.ORIGIN+'-'+x.DEST, axis = 1)

In [22]:
# Leftover routes in the old list 
# diff_routes_old_analysis = group_2005[group_2005['DIRROUTE'].isin(diff_routes_old_df['DIRROUTE'])]
# diff_routes_old_analysis.to_csv('./diff_in_old.csv')

In [23]:
# Leftover routes in the new list but not in the old list 
# diff_routes_new_analysis = all_2005[all_2005['DIRROUTE'].isin(diff_routes_new_df['DIRROUTE'])]
# diff_routes_new_analysis.to_csv('./diff_in_new_nofilter.csv')

In [24]:
# Analyze 7 routes that match airport pairs (origin-destination) but missing demand
# Find all routes with those airport pairs in the original list (prefilter)
# diff_demand = all_2005[all_2005['DIRROUTE'].isin(['CLE-PHL', 'JFK-SNN', 'PIT-CLE',
#                                                  'HNL-OAK', 'OAK-HNL', 'PIT-RSW','RSW-PIT'])]

# No problem with the filter - demand got lost when aggregating for yearly performance, using UNIQUE CARRIER & CARRIER
# diff_demand = diff_demand[diff_demand['CARRIER_GROUP_NEW'] != 7]
# diff_demand = diff_demand[diff_demand['AIRCRAFT_CONFIG'] != 2]
# diff_demand = diff_demand[diff_demand['AIRCRAFT_CONFIG'] != 4]
# diff_demand = diff_demand[diff_demand['CLASS'].isin(['F','A','C','E'])]
# diff_demand = diff_demand[diff_demand['CARRIER_GROUP_NEW'] != 0]
# diff_demand = diff_demand[diff_demand['CARRIER_GROUP'] != 0]
# diff_demand = diff_demand[diff_demand['DEPARTURES_SCHEDULED'] > 0]
# diff_demand = diff_demand[(diff_demand['PASSENGERS'] > 0) & (diff_demand['SEATS'] > 0)]
# diff_demand = diff_demand[diff_demand['DISTANCE'] != 0]
# diff_demand.to_csv('./missing_demand_routes.csv')