In [None]:
import pandas as pd
import numpy as np
import itertools
import dask.dataframe as dd

**From OD Data to Segment Data**

Process:  

**Part 1**: Identify segments from each OD
- Need: OD data
- Output: 3 separate files for direct, one-connection, and two-connection routes

**Part 2**: Based on information entered (cities, countries, regions, etc. and year), select OD with relevant segments, then sum passengers and take average fare:
- Need: OD data with segments identified (Part 1), and Airports info
- Output: segment data with other info aggregated 

**PART 2: PULL SEGMENT DATA BASED ON REQUESTED FIELDS**

Steps: 
1. From the Airports data, build the list of all possible segments for the requested region-region, region-country, or country-country pairs, keeping only airports that were effective within the requested period
2. From data in Part 1, get segments within above list and within that period
3. Sum for passengers and average fare

In [None]:
# ENTER INFORMATION TO REQUEST HERE
# This cell holds every user-editable parameter of the request.

# Time: any time within range [2011, 2020]
# Both bounds are inclusive years (the year filter below uses range(start, end + 1)).
start = 2011
end = 2013

# Airport 1: lists of region / country / city codes for one end of the segment
# NOTE(review): in the cells visible in this notebook only region1/region2 are used
# when filtering airports; country*/city* are never referenced -- confirm before
# relying on them.
region1 = ['AF1']
country1 = []
city1 = []

# Airport 2: same fields for the other end of the segment
region2 = ['AF2']
country2 = []
city2 = []

In [None]:
# Read the airports reference table, then parse its validity-window columns as datetimes
airports = pd.read_csv('./worldwide_airports_info.CSV')

validity_cols = ['eff_from', 'eff_to']
airports[validity_cols] = airports[validity_cols].apply(pd.to_datetime)

In [None]:
# Filter for airports valid within the requested period and based in the requested regions.
# An airport is dropped only if it became effective after `end` or stopped being effective
# before `start`. The negated form is intentional: a missing date (NaT) compares False,
# so airports with an open-ended validity window are kept.
airports_period = airports[~(airports['eff_from'].dt.year > end) & ~(airports['eff_to'].dt.year < start)]

# Build each region mask from airports_period itself so that the mask's index matches the
# frame being filtered. (A mask taken from the unfiltered `airports` frame has to be
# reindexed by pandas and triggers a "Boolean Series key will be reindexed" warning.)
# NOTE(review): only region1/region2 are applied here; the country*/city* request fields
# are not used -- confirm whether they should also restrict the airports.
airport1_df = airports_period[airports_period['region'].isin(region1)]
# Unique airport codes in order of first appearance; rows without a code are skipped
# because they cannot form a segment string.
airport1_set = airport1_df['iata'].dropna().unique().tolist()

airport2_df = airports_period[airports_period['region'].isin(region2)]
airport2_set = airport2_df['iata'].dropna().unique().tolist()

In [None]:
# Every (airport 1, airport 2) pairing -- the Cartesian product of the two lists -- is a candidate segment
# NOTE: no need to remove combinations like AAA-AAA because the data would not have such segments anyway
airport_pair = list(itertools.product(airport1_set, airport2_set))
# Segment identifiers have the form '<airport 1>-<airport 2>'
req_segments = ['-'.join((origin, destination)) for origin, destination in airport_pair]

In [None]:
%%time

# Load the three OD files produced in Part 1 (direct, one-connection, two-connection routes).
# The gateway airport code columns are read as plain objects for the files that have them.
gateway_dtypes = {'gateway1_airport_code': 'object', 'gateway2_airport_code': 'object'}

direct = dd.read_csv('./OD_with_segment_direct.CSV', dtype=gateway_dtypes)
one_stop = dd.read_csv('./OD_with_segment_one.CSV', dtype=gateway_dtypes)
two_stop = dd.read_csv('./OD_with_segment_two.CSV')

In [None]:
# Keep only the OD rows whose segment is one of the requested airport pairs,
# taken from all three route files and stacked row-wise
od_sources = [direct, one_stop, two_stop]
matching_parts = [od[od['segment'].isin(req_segments)] for od in od_sources]
subset = dd.concat(matching_parts, axis = 0)

# Restrict to the requested years (the end year is included)
requested_years = range(start, end + 1)
subset = subset[subset['time_series'].isin(requested_years)]

In [None]:
# Run the dask computation defined in the previous cells and rebind `subset` to the result.
# NOTE: this cell rebinds `subset`, so re-running it on its own would call .compute()
# on the already-computed result; re-run the filter cell first.
subset = subset.compute()

In [None]:
# Replace the index inherited from the concatenated files with a fresh 0..n-1 index
subset = subset.reset_index(drop = True)

**FLEET Refresh filters needed here**

Before combining into segments, data must be filtered 
1. Routes with only X passengers for the whole year? (Note that X counts only the passengers travelling directly from A to B; other passengers may be on the same flight as part of a longer itinerary.) --> Maybe filter after combining into segments
2. km? 
3. Average fare less than 50? 

In [None]:
# Aggregate per (year, segment): passenger counts are summed, average fares are averaged
pax_columns = ['total_est._pax',
               'cabin_first',
               'cabin_business',
               'cabin_premium_economy',
               'cabin_full_y',
               'cabin_discount_economy']
fare_columns = ['avgfare_total',
                'avgfare_first',
                'avgfare_business',
                'avgfare_premium_economy',
                'avgfare_fully',
                'avgfare_discount_economy']

# Column -> aggregation mapping; passenger columns first, then fare columns
aggregations = {**{column: 'sum' for column in pax_columns},
                **{column: 'mean' for column in fare_columns}}

# reset_index turns the (time_series, segment) group keys back into ordinary columns
output = subset.groupby(['time_series', 'segment']).agg(aggregations).reset_index()

# NOTE: the fare columns are simple (unweighted) means of the per-row average fares.
# A more accurate way to calculate fare is to multiply each fare by the corresponding number of pax

In [None]:
# Show the aggregated segment table as the cell output
# (nothing is written to disk in this cell)
output