In [None]:
import pandas as pd
import numpy as np
import dask.dataframe as dd

**Cleaning Data Columns**

Note: This part **can be skipped** if column names are already standardized.
* Convert all column names to lowercase, with spaces replaced by underscores
* Fix the remaining irregular column names (e.g. remove '.')
* Create route_id

In [None]:
# Open the raw worldwide OD file as a Dask DataFrame; later cells call .compute()
# on the filtered subsets to materialise them.
OD_data = dd.read_csv('./worldwide_OD_1120.CSV')

In [None]:
# Standardise column names: spaces become underscores and everything is lowercased.
OD_data.columns = OD_data.columns.str.replace(' ','_').str.lower()

# Fix the names the generic rule above cannot handle: the unnamed leading column,
# the stray '.' in 'total_est._pax', and the missing underscore in 'avgfare_fully'.
OD_data = OD_data.rename(columns={"unnamed:_0": "route_id", "total_est._pax":"total_est_pax",
                                  "avgfare_fully":"avgfare_full_y"})

# Assign a 1-based running row number as route_id.
# The index of a Dask DataFrame produced by read_csv restarts at 0 in every
# partition, so `OD_data.index + 1` would repeat the same ids across partitions.
# A cumulative sum over a column of ones is carried across partition boundaries
# and therefore yields a unique 1..N identifier for the whole dataset.
OD_data['route_id'] = OD_data.assign(route_id=1)['route_id'].cumsum()

**From OD Data to Segment Data**

Process:  

**Part 1**: Identify segments from each OD
- Need: OD data
- Output: 3 separate files for direct, one-connection, and two-connection routes

**Part 2**: Based on information entered (cities, countries, regions, etc. and year), select OD with relevant segments, then sum passengers and take average fare:
- Create dictionary or list of segments from region-region, region-country, or country-country pairs requested
- Get entries from OD Dataset that have segments in the above dict/list
- Need: OD data with segments identified (Part 1), and Airports info
- Output: segment data with other info aggregated 

**PART 1: IDENTIFY SEGMENTS FROM ROUTES**

Note: This part **can be skipped** if dataset(s) of segments is/are already available.

Steps: 
1. Create a column with the route name (e.g. A-B-C)
2. Separate to 3 sets: direct, one connection, two connections
3. For each set, identify all segments in route (e.g. A-B-C has 2 segments A-B and B-C)
    - For one-connection routes, each route would appear twice in the dataset due to having 2 segments
    - Similarly, for two-connection routes, each route would appear 3 times in the dataset due to having 3 segments
4. Save 3 sets as 3 separate files

In [None]:
# Build the route string as DEP-GW1-GW2-ARR. Missing gateways are filled with the
# placeholder '0', which the next cell strips out to leave AAA-BBB, AAA-BBB-CCC
# or AAA-BBB-CCC-DDD.
OD_data['route'] = (
    OD_data['dep_airport_code']
    + '-' + OD_data['gateway1_airport_code'].fillna('0')
    + '-' + OD_data['gateway2_airport_code'].fillna('0')
    + '-' + OD_data['arr_airport_code']
)

In [None]:
# Remove the '0' gateway placeholders from the route string: first the case where
# both gateways are missing ('-0-0-'), then any single remaining placeholder ('-0-').
OD_data['route'] = (
    OD_data['route']
    .str.replace('-0-0-','-')
    .str.replace('-0-','-')
)

In [None]:
# Classify each route by the length of its route string. With routes of the form
# AAA-BBB / AAA-BBB-CCC / AAA-BBB-CCC-DDD this gives:
#   7 characters  -> direct
#   11 characters -> one connection
#   15 characters -> two connections
route_len = OD_data['route'].str.len()

direct = OD_data[route_len == 7]
one_stop = OD_data[route_len == 11]
two_stop = OD_data[route_len == 15]
# The row index of each subset is left as-is (it is not reset).

In [None]:
# print(len(direct.index), len(one_stop.index), len(two_stop.index))

In [None]:
# A direct route (AAA-BBB) consists of a single segment, which is the route itself.
direct = direct.assign(segment=direct['route'])

In [None]:
# Evaluate the Dask computation and keep the direct routes as an in-memory result.
direct = direct.compute()

In [None]:
# Write the direct routes (including their 'segment' column) to CSV without the index.
direct.to_csv('./OD_with_segment_direct.CSV', index = False)

In [None]:
# Split each one-connection route (AAA-BBB-CCC) into its two legs by character
# position: the first 7 characters are AAA-BBB, everything from offset 4 is BBB-CCC.
one_stop = one_stop.assign(
    segment1=one_stop['route'].str[:7],
    segment2=one_stop['route'].str[4:],
)

In [None]:
# Evaluate the Dask computation so the melt in the next cell can run with pandas.
one_stop = one_stop.compute()

In [None]:
# Reshape from wide to long: fold 'segment1' and 'segment2' into a single
# 'segment' column. Every route therefore appears twice in the result,
# once per leg, with all identifying columns repeated.

id_vars = [
    'route_id',
    'dep_airport_code', 'gateway1_airport_code',
    'gateway2_airport_code', 'arr_airport_code',
    'total_est_pax', 'km',
    'cabin_first', 'cabin_business', 'cabin_premium_economy',
    'cabin_full_y', 'cabin_discount_economy',
    'avgfare_total', 'avgfare_first', 'avgfare_business',
    'avgfare_premium_economy', 'avgfare_full_y', 'avgfare_discount_economy',
    'time_series', 'route',
]
# All columns whose name starts with 'segment' are the ones to be melted.
segment_x = [col for col in one_stop.columns if col.startswith('segment')]

one_stop = (
    one_stop
    .melt(id_vars=id_vars, value_vars=segment_x, value_name='segment')
    # 'variable' only records which source column the value came from; drop it.
    .drop(columns=['variable'])
)

In [None]:
# Write the long-format one-connection routes (one row per segment) to CSV without the index.
one_stop.to_csv('./OD_with_segment_one.CSV', index = False)

In [None]:
# Split each two-connection route (AAA-BBB-CCC-DDD) into its three legs by
# character position: [0:7] is AAA-BBB, [4:11] is BBB-CCC, [8:] is CCC-DDD.
two_stop = two_stop.assign(
    segment1=two_stop['route'].str[:7],
    segment2=two_stop['route'].str[4:11],
    segment3=two_stop['route'].str[8:],
)

In [None]:
# Evaluate the Dask computation and keep the two-connection routes as an in-memory result.
two_stop = two_stop.compute()

In [None]:
# Save the two-connection routes while still in wide form (segment1..segment3 columns);
# the file is read back in a later cell and melted there.
two_stop.to_csv('./OD_with_segment_two.CSV', index = False)

Restart the kernel here to avoid a dead kernel, then re-run the imports cell at the top before continuing (the next cell needs `pd`).

In [None]:
# Reload the wide-form two-connection routes saved earlier, this time with plain pandas.
two_stop = pd.read_csv('./OD_with_segment_two.CSV')

In [None]:
# Reshape from wide to long: fold 'segment1', 'segment2' and 'segment3' into a
# single 'segment' column. Every route therefore appears three times in the
# result, once per leg, with all identifying columns repeated.

id_vars = [
    'route_id',
    'dep_airport_code', 'gateway1_airport_code',
    'gateway2_airport_code', 'arr_airport_code',
    'total_est_pax', 'km',
    'cabin_first', 'cabin_business', 'cabin_premium_economy',
    'cabin_full_y', 'cabin_discount_economy',
    'avgfare_total', 'avgfare_first', 'avgfare_business',
    'avgfare_premium_economy', 'avgfare_full_y', 'avgfare_discount_economy',
    'time_series', 'route',
]
# All columns whose name starts with 'segment' are the ones to be melted.
segment_x = [col for col in two_stop.columns if col.startswith('segment')]

two_stop = (
    two_stop
    .melt(id_vars=id_vars, value_vars=segment_x, value_name='segment')
    # 'variable' only records which source column the value came from; drop it.
    .drop(columns=['variable'])
)

In [None]:
# Write the long-format result (one row per segment) to the same path the wide-form
# file was read from, replacing it.
two_stop.to_csv('./OD_with_segment_two.CSV', index = False)