# Preprocessing datasets

File dedicated to preprocessing datasets to preprare them for use in the final model. 

In [1]:
# importing packages
import pandas as pd
import os
import problem

## US GDP data

In [2]:
# Load the raw GDP table; cells holding the marker '(D)' are parsed as NaN.
gdp_path = os.path.join('raw_add_data', 'gdp_data.csv')
us_gdp = pd.read_csv(gdp_path, na_values='(D)')
us_gdp.head(n=5)

Unnamed: 0,GeoFips,GeoName,LineCode,Description,2011,2012,2013
0,12060,"Atlanta-Sandy Springs-Alpharetta, GA (Metropol...",1.0,All industry total,277405509.0,287570424.0,299918186.0
1,12060,"Atlanta-Sandy Springs-Alpharetta, GA (Metropol...",2.0,Private industries,250779823.0,260962774.0,273113553.0
2,12060,"Atlanta-Sandy Springs-Alpharetta, GA (Metropol...",3.0,"Agriculture, forestry, fishing and hunting",,,602096.0
3,12060,"Atlanta-Sandy Springs-Alpharetta, GA (Metropol...",6.0,"Mining, quarrying, and oil and gas extraction",267771.0,228417.0,231368.0
4,12060,"Atlanta-Sandy Springs-Alpharetta, GA (Metropol...",10.0,Utilities,3704264.0,3582854.0,


The first step is to associate the column GeoName with the list of airports from the original problem. 

In [3]:
# Distinct metropolitan-area names present in the GDP table.
cities = pd.unique(us_gdp['GeoName'])
print(f'Total cities: {len(cities)}', f'The cities are: \n{cities}', sep='\n')


Total cities: 18
The cities are: 
['Atlanta-Sandy Springs-Alpharetta, GA (Metropolitan Statistical Area)'
 'Austin-Round Rock-Georgetown, TX (Metropolitan Statistical Area)'
 'Boston-Cambridge-Newton, MA-NH (Metropolitan Statistical Area)'
 'Charlotte-Concord-Gastonia, NC-SC (Metropolitan Statistical Area)'
 'Chicago-Naperville-Elgin, IL-IN-WI (Metropolitan Statistical Area)'
 'Denver-Aurora-Lakewood, CO (Metropolitan Statistical Area) *'
 'Detroit-Warren-Dearborn, MI (Metropolitan Statistical Area)'
 'Houston-The Woodlands-Sugar Land, TX (Metropolitan Statistical Area)'
 'Las Vegas-Henderson-Paradise, NV (Metropolitan Statistical Area)'
 'Los Angeles-Long Beach-Anaheim, CA (Metropolitan Statistical Area)'
 'Miami-Fort Lauderdale-Pompano Beach, FL (Metropolitan Statistical Area)'
 'Minneapolis-St. Paul-Bloomington, MN-WI (Metropolitan Statistical Area)'
 'New York-Newark-Jersey City, NY-NJ-PA (Metropolitan Statistical Area)'
 'Orlando-Kissimmee-Sanford, FL (Metropolitan Statistical Are

In [4]:
# Retrieve the name of the first city from the GeoName column.
# GeoName values look like 'Atlanta-Sandy Springs-Alpharetta, GA (...)', so the
# text before the first '-' is kept. The vectorized .str accessor replaces the
# per-row Python lambda and yields NaN for a missing name instead of raising.
us_gdp['city'] = us_gdp['GeoName'].str.split('-', n=1).str[0]
cities = us_gdp['city'].unique()
print('The values in the column city are:\n', cities) 

The values in the column city are:
 ['Atlanta' 'Austin' 'Boston' 'Charlotte' 'Chicago' 'Denver' 'Detroit'
 'Houston' 'Las Vegas' 'Los Angeles' 'Miami' 'Minneapolis' 'New York'
 'Orlando' 'Philadelphia' 'Phoenix' 'San Francisco' 'Seattle']


We can check the correspondance between the cities and the airports from the file `city_ariport.csv`

In [5]:
# The mapping file has no header row, so the two column names are supplied
# directly while reading.
city_airport_path = os.path.join('raw_add_data', 'city_airport.csv')
city_airport = pd.read_csv(city_airport_path, header=None, names=['airport', 'city'])
print(city_airport)

   airport           city
0      ATL        Atlanta
1      BOS         Boston
2      CLT      Charlotte
3      ORD        Chicago
4      DEN         Denver
5      DTW        Detroit
6      DFW         Austin
7      IAH        Houston
8      LAS      Las Vegas
9      LAX    Los Angeles
10     MIA          Miami
11     MSP    Minneapolis
12     JFK       New York
13     LGA       New York
14     EWR       New York
15     MCO        Orlando
16     PHL   Philadelphia
17     PHX        Phoenix
18     SFO  San Francisco
19     SEA        Seattle


In [6]:
# Attach an airport code to every GDP row by joining on the extracted city name.
# A city served by several airports produces one copy of its rows per airport.
merged_us_gdp = us_gdp.merge(city_airport, on='city')
merged_us_gdp.head(n=5)

Unnamed: 0,GeoFips,GeoName,LineCode,Description,2011,2012,2013,city,airport
0,12060,"Atlanta-Sandy Springs-Alpharetta, GA (Metropol...",1.0,All industry total,277405509.0,287570424.0,299918186.0,Atlanta,ATL
1,12060,"Atlanta-Sandy Springs-Alpharetta, GA (Metropol...",2.0,Private industries,250779823.0,260962774.0,273113553.0,Atlanta,ATL
2,12060,"Atlanta-Sandy Springs-Alpharetta, GA (Metropol...",3.0,"Agriculture, forestry, fishing and hunting",,,602096.0,Atlanta,ATL
3,12060,"Atlanta-Sandy Springs-Alpharetta, GA (Metropol...",6.0,"Mining, quarrying, and oil and gas extraction",267771.0,228417.0,231368.0,Atlanta,ATL
4,12060,"Atlanta-Sandy Springs-Alpharetta, GA (Metropol...",10.0,Utilities,3704264.0,3582854.0,,Atlanta,ATL


The columns `city`, `GeoFips` and `GeoName` are no longer useful. The column `Description` is encode in `LineCode` and can also be deleted.

In [7]:
# Keep only LineCode, the yearly values and the airport code.
unused_gdp_columns = ['GeoFips', 'GeoName', 'Description', 'city']
merged_us_gdp = merged_us_gdp.drop(columns=unused_gdp_columns)


In [8]:
# Preview the remaining columns after the clean-up.
merged_us_gdp.head(n=5)

Unnamed: 0,LineCode,2011,2012,2013,airport
0,1.0,277405509.0,287570424.0,299918186.0,ATL
1,2.0,250779823.0,260962774.0,273113553.0,ATL
2,3.0,,,602096.0,ATL
3,6.0,267771.0,228417.0,231368.0,ATL
4,10.0,3704264.0,3582854.0,,ATL


In [9]:
# Reshape from wide (one column per year) to long (one row per
# LineCode / airport / year) so that the year becomes a factor.
year_columns = ['2011', '2012', '2013']
melted_us_gdp = pd.melt(
    merged_us_gdp,
    id_vars=['LineCode', 'airport'],
    value_vars=year_columns,
    var_name='year',
    value_name='gdp',
)
melted_us_gdp.head(n=5)

Unnamed: 0,LineCode,airport,year,gdp
0,1.0,ATL,2011,277405509.0
1,2.0,ATL,2011,250779823.0
2,3.0,ATL,2011,
3,6.0,ATL,2011,267771.0
4,10.0,ATL,2011,3704264.0


In [10]:
# Parse the year labels ('2011', ...) into timestamps and use them as the index.
melted_us_gdp = (
    melted_us_gdp
    .assign(year=pd.to_datetime(melted_us_gdp['year'], format='%Y'))
    .set_index('year')
)
melted_us_gdp.head(n=5)

Unnamed: 0_level_0,LineCode,airport,gdp
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2011-01-01,1.0,ATL,277405509.0
2011-01-01,2.0,ATL,250779823.0
2011-01-01,3.0,ATL,
2011-01-01,6.0,ATL,267771.0
2011-01-01,10.0,ATL,3704264.0


In [11]:
# Persist the processed GDP table (the datetime index is written as the first column).
gdp_out_path = os.path.join('processed_add_data', 'gdp_data_processed.csv')
melted_us_gdp.to_csv(gdp_out_path)

## US holiday data

In [12]:
# Read only the date and holiday name, parse the dates, and index by date.
holidays_path = os.path.join('raw_add_data', 'usholidays.csv')
us_hol = (
    pd.read_csv(holidays_path, usecols=['Date', 'Holiday'])
    .assign(Date=lambda frame: pd.to_datetime(frame['Date']))
    .set_index('Date')
)
us_hol.head(n=5)

Unnamed: 0_level_0,Holiday
Date,Unnamed: 1_level_1
2010-12-31,New Year's Day
2011-01-17,"Birthday of Martin Luther King, Jr."
2011-02-21,Washington's Birthday
2011-05-30,Memorial Day
2011-07-04,Independence Day


To select the relevant dates, we rely on the train and test datasets

In [13]:
# Load the train and test features; the second value returned by each loader
# is not needed here and is discarded.
x_train, _ = problem.get_train_data()
x_test, _ = problem.get_test_data()

# Overall departure-date range covered by the two splits together.
departure_dates = [x_test['DateOfDeparture'], x_train['DateOfDeparture']]
max_date = max(dates.max() for dates in departure_dates)
min_date = min(dates.min() for dates in departure_dates)

print(f'The max date is: {max_date}')
print(f'The min date is: {min_date}')

The max date is: 2013-03-05
The min date is: 2011-09-01


In [14]:
# Keep only the holidays that fall inside the train/test date range
# (label slicing on the date index includes both end points), then save them.
us_hol = us_hol.loc[min_date:max_date]
holidays_out_path = os.path.join('processed_add_data', 'usholidays_processed.csv')
us_hol.to_csv(holidays_out_path)

## US flights data

The data comes from the Bureau of Transportation Statistics' T-100 form. It aggregates the domestic flight infomartion monthly by Air Company in the US. 

In [15]:
# Read the three yearly flight extracts with one shared read expression.
flight_11, flight_12, flight_13 = (
    pd.read_csv(os.path.join('raw_add_data', filename))
    for filename in (
        'flight_data_2011.csv',
        'flight_data_2012.csv',
        'flight_data_2013.csv',
    )
)

In [16]:
# Stack the three yearly extracts into a single frame. Each file was read with
# its own default 0..n-1 index, so a plain concat would repeat row labels
# across years; ignore_index=True gives the combined frame unique labels.
flight = pd.concat([flight_11, flight_12, flight_13], ignore_index=True)

### Data exploration

In [17]:
# List the columns available in the combined flight table.
flight_columns = flight.columns
print(flight_columns)

Index(['DEPARTURES_SCHEDULED', 'DEPARTURES_PERFORMED', 'PAYLOAD', 'SEATS',
       'PASSENGERS', 'FREIGHT', 'MAIL', 'DISTANCE', 'RAMP_TO_RAMP', 'AIR_TIME',
       'CARRIER_NAME', 'ORIGIN', 'DEST', 'YEAR', 'MONTH', 'Unnamed: 15'],
      dtype='object')


We are not interested in the `PAYLOAD`, `FREIGHT`, `MAIL`, `RAMP_TO_RAMP`, `AIR_TIME`, and `CARRIER_NAME` variables

In [19]:
# Discard the measures that are not used in the rest of the analysis.
# NOTE(review): 'Unnamed: 15' is left in place; it shows only NaN in the preview
# and looks like a CSV parsing artifact - consider dropping it as well.
unused_flight_columns = [
    'PAYLOAD', 'FREIGHT', 'MAIL', 'RAMP_TO_RAMP', 'AIR_TIME', 'CARRIER_NAME',
]
flight = flight.drop(columns=unused_flight_columns)

In [20]:
# Confirm which columns remain after the clean-up.
remaining_columns = flight.columns
print(remaining_columns)

Index(['DEPARTURES_SCHEDULED', 'DEPARTURES_PERFORMED', 'SEATS', 'PASSENGERS',
       'DISTANCE', 'ORIGIN', 'DEST', 'YEAR', 'MONTH', 'Unnamed: 15'],
      dtype='object')


In [21]:
# Preview the first rows of the trimmed flight table.
flight.head(n=5)

Unnamed: 0,DEPARTURES_SCHEDULED,DEPARTURES_PERFORMED,SEATS,PASSENGERS,DISTANCE,ORIGIN,DEST,YEAR,MONTH,Unnamed: 15
0,0.0,2.0,0.0,0.0,0.0,LRD,LRD,2011,9,
1,0.0,1.0,0.0,0.0,0.0,LRD,LRD,2011,9,
2,0.0,1.0,0.0,0.0,855.0,LRD,MCI,2011,9,
3,0.0,1.0,0.0,0.0,1562.0,LRD,MDT,2011,9,
4,0.0,1.0,0.0,0.0,762.0,LRD,MEM,2011,9,


In [22]:
# Spot check: every row reported for LAX -> JFK in March 2011.
test1 = flight['ORIGIN'].eq('LAX')
test2 = flight['DEST'].eq('JFK')
test3 = flight['MONTH'].eq(3)
test4 = flight['YEAR'].eq(2011)
flight.loc[test1 & test2 & test3 & test4]

Unnamed: 0,DEPARTURES_SCHEDULED,DEPARTURES_PERFORMED,SEATS,PASSENGERS,DISTANCE,ORIGIN,DEST,YEAR,MONTH,Unnamed: 15
37842,0.0,26.0,6096.0,3978.0,2475.0,LAX,JFK,2011,3,
37863,0.0,1.0,0.0,0.0,2475.0,LAX,JFK,2011,3,
128734,1.0,1.0,225.0,133.0,2475.0,LAX,JFK,2011,3,
155024,2.0,2.0,376.0,355.0,2475.0,LAX,JFK,2011,3,
327119,89.0,90.0,13500.0,11681.0,2475.0,LAX,JFK,2011,3,
348607,181.0,181.0,26921.0,23399.0,2475.0,LAX,JFK,2011,3,
348691,182.0,182.0,20020.0,17479.0,2475.0,LAX,JFK,2011,3,
350167,209.0,209.0,36361.0,29610.0,2475.0,LAX,JFK,2011,3,
351998,302.0,300.0,50400.0,43643.0,2475.0,LAX,JFK,2011,3,


Since our original data was divided by air carrier, we have more than one entry per month. We need, therefore, to group the data and sum the columns `DEPARTURES_SCHEDULED`, `DEPARTURES_PERFORMED`, `SEATS`, and `PASSENGERS`

In [23]:
# Collapse the per-carrier rows: one row per month, year, origin, destination
# and distance, with every remaining column summed.
route_keys = ['MONTH', 'YEAR', 'ORIGIN', 'DEST', 'DISTANCE']
flight = flight.groupby(route_keys, as_index=False).sum()

In [24]:
# Check that the route now has a single entry for the month.
# The masks are rebuilt because the grouping replaced the frame and its index.
test1 = flight['ORIGIN'].eq('LAX')
test2 = flight['DEST'].eq('JFK')
test3 = flight['MONTH'].eq(3)
test4 = flight['YEAR'].eq(2011)
lax_jfk_march_2011 = test1 & test2 & test3 & test4
flight.loc[lax_jfk_march_2011]

Unnamed: 0,MONTH,YEAR,ORIGIN,DEST,DISTANCE,DEPARTURES_SCHEDULED,DEPARTURES_PERFORMED,SEATS,PASSENGERS,Unnamed: 15
66042,3,2011,LAX,JFK,2475.0,966.0,992.0,153899.0,130278.0,0.0


### Data Transformation

We want to calculate the total number of passengers, flights, and seats that originated from each of the airports.

In [25]:
# Total departures, seats and passengers per month for each origin airport.
# DISTANCE is dropped because summing it across destinations is meaningless.
# DEST is dropped explicitly because it is a string column: older pandas
# discarded it silently during .sum(), but pandas >= 2.0 would concatenate the
# destination codes into a spurious DEST column.
origin_totals = (
    flight
    .drop(columns=['DISTANCE', 'DEST'])
    .groupby(['MONTH', 'YEAR', 'ORIGIN'], as_index=False)
    .sum()
)

In [26]:
# Preview the per-origin monthly totals.
origin_totals.head(n=5)

Unnamed: 0,MONTH,YEAR,ORIGIN,DEPARTURES_SCHEDULED,DEPARTURES_PERFORMED,SEATS,PASSENGERS,Unnamed: 15
0,1,2011,1G4,0.0,49.0,931.0,664.0,0.0
1,1,2011,A07,0.0,1.0,0.0,0.0,0.0
2,1,2011,A23,0.0,9.0,54.0,7.0,0.0
3,1,2011,A27,0.0,4.0,36.0,8.0,0.0
4,1,2011,AA8,0.0,1.0,19.0,3.0,0.0
