# Wrangle Data Together

Script assumes 3 files:
- ORD weather data
- ORD taxi data
- ORD flight data
- Airline fleet matrices 

Weather Variables:
https://mesonet.agron.iastate.edu/request/download.phtml?network=IL_ASOS

Flight Variables:
https://www.transtats.bts.gov/Fields.asp?Table_ID=236

In [1]:
import pandas as pd
import numpy as np

## Load data

In [2]:
ORD_weather = pd.read_csv("../data/ORD_weather.txt", sep =  ",", skiprows = 5)
ORD_outbound = pd.read_csv("../data/ORD_outbound.csv")
ORD_OTP = pd.read_csv("../data/ORD_OTP.csv")
seat_data = pd.read_csv("../data/seat_counts_wiki.csv")

  interactivity=interactivity, compiler=compiler, result=result)


## Wrangle weather data

In [4]:
# Remove unnecessary columns
ORD_weather = ORD_weather.drop(columns=['station', 'lon', 'lat', 'drct', 'alti', 'mslp', 'gust', 'skyc4', 'skyl4', 'wxcodes', 'ice_accretion_1hr', 'ice_accretion_3hr', 'ice_accretion_6hr', 'peak_wind_gust', 'peak_wind_drct', 'peak_wind_time', 'metar'])
# Obtain datetime
ORD_weather = ORD_weather.rename(columns={'valid' : 'date'})
ORD_weather['date'] = pd.to_datetime(ORD_weather['date'])
ORD_weather = ORD_weather.set_index("date")
# Change missing values to nan
ORD_weather = ORD_weather.replace('M', np.nan)
# Change columns data types so that averaging can occur
numeric_weather_features = ['tmpf', 'dwpf', 'relh', 'sknt', 'p01i', 'vsby', 'skyl1', 'skyl2', 'skyl3', 'feel']
categorical_weather_features = ['skyc1', 'skyc2', 'skyc3']
ORD_weather[numeric_weather_features] = ORD_weather[numeric_weather_features].apply(pd.to_numeric)
ORD_weather[categorical_weather_features] = ORD_weather[categorical_weather_features].astype('category')

**TO DO: Figure out converting categoricals to ordinal for averaging and inclusion.**

In [5]:
# Strip out time categories
ORD_weather['year'] = ORD_weather.index.year
ORD_weather['month'] = ORD_weather.index.month
ORD_weather['day'] = ORD_weather.index.day
ORD_weather['hour'] = ORD_weather.index.hour
# Average columns by hour
ORD_weather_hourly = ORD_weather.groupby(['year', 'month', 'day', 'hour']).mean().reset_index()

## Wrangle Taxi Data

In [65]:
# Remove unnecessary columns
ORD_outbound = ORD_outbound.drop(columns=['Unnamed: 0', 'pickup_community_area'])

## Wrangle Flight Data

In [15]:
ORD_OTP = ORD_OTP.drop(columns=['Unnamed: 0'])
ORD_OTP['FL_DATE'] = pd.to_datetime(ORD_OTP['FL_DATE'])
ORD_OTP = ORD_OTP[['FL_DATE', 'OP_UNIQUE_CARRIER', 'ARR_HOUR']]

In [61]:
ORD_OTP['year'] = pd.DatetimeIndex(ORD_OTP['FL_DATE']).year
ORD_OTP['month'] = pd.DatetimeIndex(ORD_OTP['FL_DATE']).month
ORD_OTP['day'] = pd.DatetimeIndex(ORD_OTP['FL_DATE']).day
ORD_OTP = ORD_OTP.rename(columns={'ARR_HOUR':'hour'})
ORD_OTP = ORD_OTP.dropna()
ORD_OTP.astype({'hour': 'int64'})
ORD_OTP = ORD_OTP.drop(columns=["FL_DATE"])

In [62]:
ORD_OTP

Unnamed: 0,OP_UNIQUE_CARRIER,hour,year,month,day
0,AA,0.0,2013,1,7
1,AA,0.0,2013,1,14
2,AA,0.0,2013,1,21
3,AA,0.0,2013,1,28
4,AA,8.0,2013,1,7
...,...,...,...,...,...
2106102,OO,20.0,2019,11,17
2106103,OO,7.0,2019,11,17
2106104,OO,18.0,2019,11,17
2106105,OO,8.0,2019,11,17


In [3]:
plane_details = pd.read_csv("../data/plane_details.csv")

In [4]:
plane_details

Unnamed: 0.1,Unnamed: 0,tail_num,manufacturer,model
0,0,N582AA,MCDONNELL DOUGLAS,DC-9-82(MD-82)
1,1,N439AA,MCDONNELL DOUGLAS,DC-9-83(MD-83)
2,2,N553AA,MCDONNELL DOUGLAS,DC-9-82(MD-82)
3,3,N558AA,MCDONNELL DOUGLAS,DC-9-82(MD-82)
4,4,N536AA,AMERICAN AIRCRAFT INC,FALCON XP
...,...,...,...,...
4507,4507,N324DX,AIRBUS,A321-211
4508,4508,N384DN,AIRBUS,A321-211
4509,4509,N872DN,BOEING,737-900ER
4510,4510,N406AN,AIRBUS,A321-253NX


In [58]:
ORD_OTP

Unnamed: 0.1,Unnamed: 0,DAY_OF_WEEK,FL_DATE,OP_UNIQUE_CARRIER,TAIL_NUM,OP_CARRIER_FL_NUM,ORIGIN,DEST_AIRPORT_ID,DEST_AIRPORT_SEQ_ID,DEST,ARR_TIME,ARR_DELAY,ARR_HOUR
0,19,1,2013-01-07,AA,N3DEAA,618,SFO,13930,1393002,ORD,7.0,-8.0,0.0
1,20,1,2013-01-14,AA,N3DDAA,618,SFO,13930,1393002,ORD,10.0,-5.0,0.0
2,21,1,2013-01-21,AA,N3DJAA,618,SFO,13930,1393002,ORD,7.0,-8.0,0.0
3,22,1,2013-01-28,AA,N3AMAA,618,SFO,13930,1393002,ORD,19.0,4.0,0.0
4,90,1,2013-01-07,AA,N456AA,153,MCO,13930,1393002,ORD,806.0,-4.0,8.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2106102,602391,7,2019-11-17,OO,N930SW,5829,FAR,13930,1393007,ORD,2017.0,-8.0,20.0
2106103,602412,7,2019-11-17,OO,N121SY,5852,TYS,13930,1393007,ORD,714.0,21.0,7.0
2106104,602414,7,2019-11-17,OO,N203SY,5854,LGA,13930,1393007,ORD,1816.0,1.0,18.0
2106105,602441,7,2019-11-17,OO,N145SY,5887,PWM,13930,1393007,ORD,803.0,-39.0,8.0


In [78]:
tails = plane_details.tail_num.unique().tolist()

In [92]:
df = ORD_OTP[~ORD_OTP['TAIL_NUM'].isin(tails)]

In [96]:
df['OP_UNIQUE_CARRIER'].value_counts()

AA    248624
MQ    225388
EV     33555
UA     17669
OO      6454
DL      5681
F9      5532
US      3095
OH       272
VX       250
B6       199
9E       172
AS        90
YX        82
NK        24
Name: OP_UNIQUE_CARRIER, dtype: int64

In [56]:
np.sort(plane_details.model.unique())

array(['150', '150G', '150H', '150M', '152', '172E', '172K', '172M',
       '172N', '172S', '180', '180J', '185D', '206B', '208B',
       '210-5(205)', '210N', '25D', '340A', '35-33', '35-C33', '35A',
       '390', '401', '401A', '402C', '421C', '500 S', '500-B', '501',
       '510', '550', '560', '58TC', '65-A90', '717-200', '727-200',
       '727-223', '737-401', '737-4B7', '737-500', '737-700', '737-71Q',
       '737-724', '737-732', '737-73V', '737-790', '737-7V3', '737-8',
       '737-800', '737-823', '737-824', '737-832', '737-890', '737-8EH',
       '737-8FH', '737-9', '737-900ER', '737-924', '737-924ER',
       '737-932ER', '737-990', '737-990ER', '747-422', '750', '757-222',
       '757-223', '757-224', '757-231', '757-232', '757-23N', '757-251',
       '757-26D', '757-2B7', '757-2G7', '757-324', '757-33N', '767-300',
       '767-322', '767-323', '767-323ER', '767-332', '767-3CB',
       '767-424ER', '777-222', '777-223', '777-224', '777-300ER',
       '777-323ER', '787-10', '

In [18]:
df = plane_details['manufacturer'].value_counts()

In [27]:
pd.DataFrame(df).head(20)

Unnamed: 0,manufacturer
BOEING,1560
AIRBUS,765
BOMBARDIER INC,689
EMBRAER,472
AIRBUS INDUSTRIE,409
EMBRAER S A,230
MCDONNELL DOUGLAS,117
MCDONNELL DOUGLAS AIRCRAFT CO,73
CESSNA,36
EMBRAER-EMPRESA BRASILEIRA DE,19


In [50]:
df2 = plane_details['model'].value_counts()

In [53]:
pd.DataFrame(df2).head(20)

Unnamed: 0,model
CL-600-2B19,337
A320-232,318
737-823,278
ERJ 170-200 LR,227
CL-600-2C10,200
A321-231,192
EMB-145LR,164
CL-600-2D24,155
737-824,135
737-924ER,130


In [43]:
plane_details[plane_details['manufacturer'] == 'BOMBARDIER INC'].model.unique()

array(['CL-600-2D24', 'CL-600-2B19', 'CL-600-2C10', 'CL-600-2C11',
       'DHC-8-402'], dtype=object)

In [None]:
plane_details

In [14]:
seat_data.aircraft.str.split(" ", expand=True)[0].unique()

array(['Bombardier', 'Airbus', 'Boeing', 'Embraer', 'McDonnell'],
      dtype=object)

## Wrangle Seat Data

In [None]:
seat_data = seat_data.drop(columns=['Unnamed: 0'])

In [54]:
np.sort(seat_data.aircraft.unique())

array(['Airbus A220-100', 'Airbus A319-100', 'Airbus A320-200',
       'Airbus A320neo', 'Airbus A321-200', 'Airbus A321neo',
       'Airbus A330-200', 'Airbus A330-300', 'Airbus A330-900neo',
       'Airbus A350-900', 'Boeing 717-200', 'Boeing 737 MAX 8',
       'Boeing 737 MAX 9', 'Boeing 737-700', 'Boeing 737-800',
       'Boeing 737-900', 'Boeing 737-900ER', 'Boeing 757-200',
       'Boeing 757-300', 'Boeing 767-300ER', 'Boeing 767-400ER',
       'Boeing 777-200', 'Boeing 777-200ER', 'Boeing 777-200LR',
       'Boeing 777-300ER', 'Boeing 787-10', 'Boeing 787-8',
       'Boeing 787-9', 'Bombardier CRJ-100', 'Bombardier CRJ-200',
       'Bombardier CRJ-700', 'Bombardier CRJ-900', 'Bombardier CRJ100',
       'Bombardier CRJ700', 'Bombardier CRJ900', 'Embraer 175',
       'Embraer 190', 'Embraer E170', 'Embraer E175', 'Embraer ERJ-140',
       'Embraer ERJ-145', 'Embraer ERJ-145LR', 'Embraer ERJ-145XR',
       'McDonnell Douglas MD-88', 'McDonnell Douglas MD-90-30'],
      dtype=object

In [64]:
seat_data[seat_data['aircraft'] == 'Airbus A319-100']

Unnamed: 0.1,Unnamed: 0,airline,aircraft,in_service,pass_count
4,4,AA,Airbus A319-100,133,128.0
24,25,AS,Airbus A319-100,10,119.0
39,52,DL,Airbus A319-100,57,132.0
73,89,NK,Airbus A319-100,31,145.0
99,124,UA,Airbus A319-100,83,126.0
100,125,UA,Airbus A319-100,83,128.0
130,158,US,Airbus A319-100,93,124.0
138,167,VX,Airbus A319-100,10,119.0


## Merge data together

In [77]:
ORD_outbound_weather = pd.merge(ORD_outbound, ORD_weather_hourly, how='left', on=['year', 'month', 'day', 'hour'])

In [78]:
ORD_outbound_weather

Unnamed: 0.1,Unnamed: 0,pickup_community_area,year,month,day,hour,rides,tmpf,dwpf,relh,sknt,p01i,vsby,skyl1,skyl2,skyl3,feel
0,0,76,2013,1,1,0,22,24.98,17.96,74.290000,9.000000,0.0000,9.000000,1800.000000,13000.000000,19000.0,14.780000
1,1,76,2013,1,1,1,9,24.89,17.78,73.995000,9.000000,0.0000,8.000000,1800.000000,14000.000000,,14.670000
2,2,76,2013,1,1,2,11,21.20,14.00,73.320000,10.000000,0.0000,9.000000,1800.000000,11000.000000,15000.0,9.420000
3,3,76,2013,1,1,3,3,21.14,12.14,67.723333,10.666667,0.0000,9.666667,5400.000000,12000.000000,15000.0,8.976667
4,4,76,2013,1,1,4,5,19.94,10.94,67.580000,10.000000,0.0000,10.000000,2100.000000,9500.000000,,7.840000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61977,61977,76,2020,1,31,20,257,37.00,30.00,75.560000,5.076923,0.0001,7.615385,1730.384615,,,31.550000
61978,61978,76,2020,1,31,21,243,37.00,30.00,75.560000,5.153846,0.0000,7.615385,1745.384615,2500.000000,,31.550000
61979,61979,76,2020,1,31,22,154,37.00,30.00,75.560000,4.714286,0.0000,7.000000,2064.214286,2723.076923,,32.330000
61980,61980,76,2020,1,31,23,77,36.00,28.90,75.150000,5.076923,0.0000,8.615385,1976.230769,,,30.350000


In [88]:
manufacturer = ['a','a','a','b','b','b']
model = ['blues', 'blu', 'redish', 'big', 'huge', 'tiny']
description = ['blue', 'blue', 'red', 'large', 'large', 'small']
df = pd.DataFrame(list(zip(manufacturer, model, description)), columns = ['manufacturer', 'model', 'description'])

In [89]:
df

Unnamed: 0,manufacturer,model,description
0,a,blues,blue
1,a,blu,blue
2,a,redish,red
3,b,big,large
4,b,huge,large
5,b,tiny,small


In [90]:
d = {}
for i in df['manufacturer'].unique():
    d[i] = {df['model'][j] : df['description'][j] for j in range(0,df.shape[0]) if df['manufacturer'][j] == i}

In [91]:
d

{'a': {'blues': 'blue', 'blu': 'blue', 'redish': 'red'},
 'b': {'big': 'large', 'huge': 'large', 'tiny': 'small'}}

In [92]:
d_2 = {df['model'][j] : df['description'][j] for j in range(0,df.shape[0])}

In [93]:
d_2

{'blues': 'blue',
 'blu': 'blue',
 'redish': 'red',
 'big': 'large',
 'huge': 'large',
 'tiny': 'small'}

In [94]:
manufacturer_test = ['a','a','a','a','b']
model_test = ['blues', 'blues', 'blu', 'redish', 'tiny']
df_test = pd.DataFrame(list(zip(manufacturer_test, model_test)), columns = ['manufacturer', 'model'])

In [95]:
df_test

Unnamed: 0,manufacturer,model
0,a,blues
1,a,blues
2,a,blu
3,a,redish
4,b,tiny


In [115]:
df_test['description'] = df_test.apply(lambda row: d[row.manufacturer][row.model], axis=1)

In [116]:
df_test

Unnamed: 0,manufacturer,model,description
0,a,blues,blue
1,a,blues,blue
2,a,blu,blue
3,a,redish,red
4,b,tiny,small


In [102]:
df_test.apply(lambda row: d_2[row.model], axis=1)

0     blue
1     blue
2     blue
3      red
4    small
dtype: object

In [117]:
manufacturer_test2 = ['a','a','a','a','b']
model_test2 = ['afdsfasdfdss', 'blues', 'blu', 'redish', 'tiny']
df_test2 = pd.DataFrame(list(zip(manufacturer_test2, model_test2)), columns = ['manufacturer', 'model'])

In [118]:
df_test2

Unnamed: 0,manufacturer,model
0,a,afdsfasdfdss
1,a,blues
2,a,blu
3,a,redish
4,b,tiny


In [119]:
df_test2.apply(lambda row: d[row.manufacturer][row.model], axis=1)

KeyError: 'afdsfasdfdss'

In [111]:
df_test2.apply(lambda row: d_2[row.model], axis=1)

0     blue
1     blue
2     blue
3      red
4    small
dtype: object

In [126]:
plane_details

Unnamed: 0.1,Unnamed: 0,tail_num,manufacturer,model
0,0,N582AA,MCDONNELL DOUGLAS,DC-9-82(MD-82)
1,1,N439AA,MCDONNELL DOUGLAS,DC-9-83(MD-83)
2,2,N553AA,MCDONNELL DOUGLAS,DC-9-82(MD-82)
3,3,N558AA,MCDONNELL DOUGLAS,DC-9-82(MD-82)
4,4,N536AA,AMERICAN AIRCRAFT INC,FALCON XP
...,...,...,...,...
4507,4507,N324DX,AIRBUS,A321-211
4508,4508,N384DN,AIRBUS,A321-211
4509,4509,N872DN,BOEING,737-900ER
4510,4510,N406AN,AIRBUS,A321-253NX


In [137]:
plane_details.drop(columns=['Unnamed: 0', 'tail_num']).drop_duplicates().to_csv('../data/plane_names.csv')

In [9]:
aircraft_types = pd.read_csv("../data/L_AIRCRAFT_TYPE.csv_")

In [143]:
aircraft_types.sort_values('Description')

Unnamed: 0,Code,Description
276,643,1124A Westwind II
342,723,A200-100 BD-500-1A10
343,724,A220-300 BD-500-1A11
219,510,AW-650
46,103,Aero Commander (500/600 Series Excpt 680FL)
...,...,...
232,580,Vickers Viscount 700/744/745/745D
233,584,Vickers Viscount V800/810/812
214,490,Volpar Turbo 18
141,396,Westland SR-N5 (Acv)


In [144]:
plane_names = pd.read_csv("../data/plane_names.csv")

In [145]:
plane_names

Unnamed: 0,manufacturer,model,description
0,3D ROBOTICS,SOLO,
1,AERO COMMANDER,500-B,Aero Commander (500/600 Series Excpt 680FL)
2,AERO COMMANDER,500 S,Aero Commander (500/600 Series Excpt 680FL)
3,AGUSTA SPA,A109E,
4,AGUSTA SPA,A119,Agusta A-119 Koala
...,...,...,...
206,TAGUE LARRY,PIONEER 200,
207,TEXTRON AVIATION INC,208B,
208,TEXTRON AVIATION INC,172S,
209,WOBIG WAYNE R,FALCON-XP,


In [5]:
combined_ord_seats = pd.read_csv('../data/combined_ord_seats.csv')


In [10]:
combined_ord_seats.head(1)

Unnamed: 0.1,Unnamed: 0,DEPARTURES_SCHEDULED,DEPARTURES_PERFORMED,SEATS,PASSENGERS,UNIQUE_CARRIER,UNIQUE_CARRIER_NAME,ORIGIN_AIRPORT_ID,ORIGIN,DEST_AIRPORT_ID,DEST,AIRCRAFT_TYPE,MONTH,Unnamed: 12
0,2883,0.0,1.0,9.0,7.0,14Q,London Air Services Limited,12003,GTF,13930,ORD,686,1,


In [12]:
combined_ord_seats_types = pd.merge(combined_ord_seats, aircraft_types, how='left', left_on='AIRCRAFT_TYPE', right_on='Code')

In [22]:
df1 = combined_ord_seats_types[['DEPARTURES_PERFORMED', 'SEATS', 'Description']]
#df1['plane_seats'] = df1.apply(lambda row: row.SEATS/row.DEPARTURES_PERFORMED, axis = 1)

In [32]:
df_summary = df1.groupby('Description').sum()

In [34]:
df_summary['plane_seats'] = df_summary.apply(lambda row: row.SEATS/row.DEPARTURES_PERFORMED, axis = 1)

In [35]:
df_summary

Unnamed: 0_level_0,DEPARTURES_PERFORMED,SEATS,plane_seats
Description,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Airbus Industrie A300-600/R/CF/RCF,5366.0,0.0,0.000000
Airbus Industrie A300B/C/F-100/200,3.0,0.0,0.000000
Airbus Industrie A310-200C/F,15.0,0.0,0.000000
Airbus Industrie A319,122426.0,16011317.0,130.783633
Airbus Industrie A320-100/200,177059.0,27666706.0,156.256988
...,...,...,...
McDonnell Douglas DC-9-50,4224.0,506772.0,119.974432
McDonnell Douglas DC9 Super 80/MD81/82/83/88,107681.0,15024019.0,139.523398
McDonnell Douglas MD-11,3989.0,0.0,0.000000
McDonnell Douglas MD-90,8424.0,1343855.0,159.526947
