# Wrangle Data

Script assumes 8 files:
- ORD weather data
- ORD taxi data
- ORD flight data
- ORD seats data
- Airline fleet matrices (planes in service/seat counts)
- Tail number data (manufacturer, model or planes)
- FAA aircraft indexes (table that matches FAA plane index to FAA plane name)
- Plane names matching file (table matching plane models to FAA plane names)

Weather Variables:
https://mesonet.agron.iastate.edu/request/download.phtml?network=IL_ASOS

Flight Variables:
https://www.transtats.bts.gov/Fields.asp?Table_ID=236

## Load Packages

In [1]:
import pandas as pd
import numpy as np
import re
import math

## Load Data
Loaded in same order as above

In [2]:
ORD_weather = pd.read_csv("../data/ORD_weather.txt", sep =  ",", skiprows = 5)
ORD_outbound = pd.read_csv("../data/ORD_outbound.csv")
ORD_OTP = pd.read_csv("../data/ORD_OTP.csv")
ORD_seats = pd.read_csv('../data/ORD_seats.csv')
seat_counts_wiki = pd.read_csv("../data/seat_counts_wiki.csv")
plane_details = pd.read_csv("../data/plane_details.csv", usecols=['tail_num', 'manufacturer', 'model'])
aircraft_types = pd.read_csv("../data/L_AIRCRAFT_TYPE.csv_")
plane_names = pd.read_csv("../data/plane_names.csv")

  interactivity=interactivity, compiler=compiler, result=result)


## Preliminary Exploratory Data Analysis

In [3]:
tails_list = plane_details.tail_num.unique().tolist()
print("Percent of ORD_OTP flights with tail number:", round((ORD_OTP.shape[0] - ORD_OTP[~ORD_OTP['TAIL_NUM'].isin(tails_list)].shape[0])/ORD_OTP.shape[0], 3))

Percent of ORD_OTP flights with tail number: 0.74


#### ORD_OTP Airline Distribution:

In [4]:
ORD_OTP['OP_UNIQUE_CARRIER'].value_counts()

UA    483352
AA    413998
OO    361638
MQ    358550
EV    248383
DL     58473
NK     50266
YX     32262
F9     19261
US     18900
B6     17044
AS     15951
YV      9548
VX      8161
9E      7450
OH      2870
Name: OP_UNIQUE_CARRIER, dtype: int64

In [5]:
print("Unique Carriers:", ORD_OTP['OP_UNIQUE_CARRIER'].unique().shape[0])

Unique Carriers: 16


#### Plane Manufacturer Distribution:

In [6]:
pd.DataFrame(plane_details['manufacturer'].value_counts()).head(20)

Unnamed: 0,manufacturer
BOEING,1560
AIRBUS,765
BOMBARDIER INC,689
EMBRAER,472
AIRBUS INDUSTRIE,409
EMBRAER S A,230
MCDONNELL DOUGLAS,117
MCDONNELL DOUGLAS AIRCRAFT CO,73
CESSNA,36
EMBRAER-EMPRESA BRASILEIRA DE,19


In [7]:
print("Unique Manufacturers:", plane_details['manufacturer'].unique().shape[0])

Unique Manufacturers: 69


Manufacturers in Wikipedia data:

In [8]:
pd.DataFrame(seat_counts_wiki.aircraft.str.split(" ", expand=True)[0].unique())

Unnamed: 0,0
0,Bombardier
1,Airbus
2,Boeing
3,Embraer
4,McDonnell


#### Plane Model Distribution:

In [9]:
pd.DataFrame(plane_details['model'].value_counts()).head(20)

Unnamed: 0,model
CL-600-2B19,337
A320-232,318
737-823,278
ERJ 170-200 LR,227
CL-600-2C10,200
A321-231,192
EMB-145LR,164
CL-600-2D24,155
737-824,135
737-924ER,130


In [10]:
print("Unique Models:", plane_details['model'].unique().shape[0])

Unique Models: 202


In [11]:
print("Number of models in FAA:", aircraft_types.shape[0])

Number of models in FAA: 425


## Wrangle weather data

In [12]:
# Remove unnecessary columns
ORD_weather = ORD_weather.drop(columns=['station', 'lon', 'lat', 'drct', 'alti', 'mslp', 'gust', 'skyc4', 'skyl4', 'wxcodes', 'ice_accretion_1hr', 'ice_accretion_3hr', 'ice_accretion_6hr', 'peak_wind_gust', 'peak_wind_drct', 'peak_wind_time', 'metar'])
# Obtain datetime
ORD_weather = ORD_weather.rename(columns={'valid' : 'date'})
ORD_weather['date'] = pd.to_datetime(ORD_weather['date'])
ORD_weather = ORD_weather.set_index("date")
# Change missing values to nan
ORD_weather = ORD_weather.replace('M', np.nan)
# Change columns data types so that averaging can occur
numeric_weather_features = ['tmpf', 'dwpf', 'relh', 'sknt', 'p01i', 'vsby', 'skyl1', 'skyl2', 'skyl3', 'feel']
categorical_weather_features = ['skyc1', 'skyc2', 'skyc3']
ORD_weather[numeric_weather_features] = ORD_weather[numeric_weather_features].apply(pd.to_numeric)
ORD_weather[categorical_weather_features] = ORD_weather[categorical_weather_features].astype('category')

**TO DO: Figure out converting categoricals to ordinal for averaging and inclusion.**  
For the script below, when mean is taken, categoricals are removed.

In [13]:
# Strip out time categories
ORD_weather['year'] = ORD_weather.index.year
ORD_weather['month'] = ORD_weather.index.month
ORD_weather['day'] = ORD_weather.index.day
ORD_weather['hour'] = ORD_weather.index.hour
# Average columns by hour
ORD_weather_hourly = ORD_weather.groupby(['year', 'month', 'day', 'hour']).mean().reset_index()

In [14]:
ORD_weather_hourly.head(5)

Unnamed: 0,year,month,day,hour,tmpf,dwpf,relh,sknt,p01i,vsby,skyl1,skyl2,skyl3,feel
0,2013,1,1,0,24.98,17.96,74.29,9.0,0.0,9.0,1800.0,13000.0,19000.0,14.78
1,2013,1,1,1,24.89,17.78,73.995,9.0,0.0,8.0,1800.0,14000.0,,14.67
2,2013,1,1,2,21.2,14.0,73.32,10.0,0.0,9.0,1800.0,11000.0,15000.0,9.42
3,2013,1,1,3,21.14,12.14,67.723333,10.666667,0.0,9.666667,5400.0,12000.0,15000.0,8.976667
4,2013,1,1,4,19.94,10.94,67.58,10.0,0.0,10.0,2100.0,9500.0,,7.84


## Wrangle Taxi Data

In [15]:
# Remove unnecessary columns
ORD_outbound = ORD_outbound.drop(columns=['Unnamed: 0', 'pickup_community_area'])

In [16]:
ORD_outbound.head(5)

Unnamed: 0,year,month,day,hour,rides
0,2013,1,1,0,22
1,2013,1,1,1,9
2,2013,1,1,2,11
3,2013,1,1,3,3
4,2013,1,1,4,5


## Wrangle Flight Data

In [17]:
# Remove unnecessary columns
ORD_OTP = ORD_OTP.drop(columns=['Unnamed: 0'])
# Obtain datetime
ORD_OTP['FL_DATE'] = pd.to_datetime(ORD_OTP['FL_DATE'])

In [18]:
print("Number of flights without a flight time:", sum(ORD_OTP['ARR_TIME'].isna()))

Number of flights without a flight time: 63748


In [19]:
# Strip out time categories
ORD_OTP['year'] = pd.DatetimeIndex(ORD_OTP['FL_DATE']).year
ORD_OTP['month'] = pd.DatetimeIndex(ORD_OTP['FL_DATE']).month
ORD_OTP['day'] = pd.DatetimeIndex(ORD_OTP['FL_DATE']).day
# Drop out NAs in arrival time so that hour can be pulled out
ORD_OTP = ORD_OTP.dropna(subset=['ARR_TIME'])
# Extract flight hour 
ORD_OTP['hour'] = (ORD_OTP['ARR_TIME']/100).apply(math.floor)
ORD_OTP = ORD_OTP.astype({'hour': 'int64'})

In [20]:
ORD_OTP.head(5)

Unnamed: 0,DAY_OF_WEEK,FL_DATE,OP_UNIQUE_CARRIER,TAIL_NUM,OP_CARRIER_FL_NUM,ORIGIN,DEST_AIRPORT_ID,DEST_AIRPORT_SEQ_ID,DEST,ARR_TIME,ARR_DELAY,ARR_HOUR,year,month,day,hour
0,1,2013-01-07,AA,N3DEAA,618,SFO,13930,1393002,ORD,7.0,-8.0,0.0,2013,1,7,0
1,1,2013-01-14,AA,N3DDAA,618,SFO,13930,1393002,ORD,10.0,-5.0,0.0,2013,1,14,0
2,1,2013-01-21,AA,N3DJAA,618,SFO,13930,1393002,ORD,7.0,-8.0,0.0,2013,1,21,0
3,1,2013-01-28,AA,N3AMAA,618,SFO,13930,1393002,ORD,19.0,4.0,0.0,2013,1,28,0
4,1,2013-01-07,AA,N456AA,153,MCO,13930,1393002,ORD,806.0,-4.0,8.0,2013,1,7,8


**Debug missing months - Jarome Looking into**

In [21]:
ORD_OTP[(ORD_OTP['FL_DATE'] > '2017-04-01') & (ORD_OTP['FL_DATE'] < '2017-04-30')]['FL_DATE'].unique()

array([], dtype='datetime64[ns]')

In [22]:
ORD_OTP[(ORD_OTP['FL_DATE'] > '2016-01-01') & (ORD_OTP['FL_DATE'] < '2017-01-01')]['FL_DATE'].unique().shape

(244,)

In [23]:
ORD_OTP[(ORD_OTP['FL_DATE'] > '2014-01-01') & (ORD_OTP['FL_DATE'] < '2015-01-01')].month.value_counts().sort_index()

1     17358
2     18291
3     44122
5     23417
6     47730
8     49508
10    50608
12    46200
Name: month, dtype: int64

## Wrangle Seat Data - FAA

Code to create start draft of plane_names.csv:  
```
plane_details.drop(columns=['tail_num']).drop_duplicates().to_csv('../data/plane_names.csv')
````

In [24]:
plane_names.head(5)

Unnamed: 0,manufacturer,model,description
0,3D ROBOTICS,SOLO,
1,AERO COMMANDER,500 S,Aero Commander (500/600 Series Excpt 680FL)
2,AERO COMMANDER,500-B,Aero Commander (500/600 Series Excpt 680FL)
3,AGUSTA SPA,A109E,
4,AGUSTA SPA,A119,Agusta A-119 Koala


In [25]:
# Fix duplicate manufacturer names
names = ['AIRBUS', 'CANADAIR', 'DASSAULT', 'DIAMOND AIRCRAFT', 'EMBRAER', 'GATES LEARJET', 'MCDONNELL DOUGLAS', 'PIPER', 'RAYTHEON', 'ROBINSON HELICOPTER']
for name in names:
    plane_details['manufacturer'].replace(regex=True,inplace=True,to_replace=rf'.*{name}.*',value=rf'{name}')
# Add FAA descriptions
plane_details = pd.merge(plane_details, plane_names, how='left', on=['manufacturer', 'model'])

In [26]:
plane_details.head(5)

Unnamed: 0,tail_num,manufacturer,model,description
0,N582AA,MCDONNELL DOUGLAS,DC-9-82(MD-82),McDonnell Douglas DC9 Super 80/MD81/82/83/88
1,N439AA,MCDONNELL DOUGLAS,DC-9-83(MD-83),McDonnell Douglas DC9 Super 80/MD81/82/83/88
2,N553AA,MCDONNELL DOUGLAS,DC-9-82(MD-82),McDonnell Douglas DC9 Super 80/MD81/82/83/88
3,N558AA,MCDONNELL DOUGLAS,DC-9-82(MD-82),McDonnell Douglas DC9 Super 80/MD81/82/83/88
4,N536AA,AMERICAN AIRCRAFT INC,FALCON XP,


In [28]:
ORD_seats.head(5)

Unnamed: 0.1,Unnamed: 0,DEPARTURES_SCHEDULED,DEPARTURES_PERFORMED,SEATS,PASSENGERS,UNIQUE_CARRIER,UNIQUE_CARRIER_NAME,ORIGIN_AIRPORT_ID,ORIGIN,DEST_AIRPORT_ID,DEST,AIRCRAFT_TYPE,MONTH,Unnamed: 12
0,2883,0.0,1.0,9.0,7.0,14Q,London Air Services Limited,12003,GTF,13930,ORD,686,1,
1,2899,0.0,1.0,0.0,0.0,LH,Lufthansa German Airlines,10397,ATL,13930,ORD,740,1,
2,2908,0.0,18.0,0.0,0.0,CX,Cathay Pacific Airways Ltd.,10299,ANC,13930,ORD,820,1,
3,2915,0.0,12.0,0.0,0.0,CX,Cathay Pacific Airways Ltd.,12478,JFK,13930,ORD,820,1,
4,2949,0.0,1.0,235.0,48.0,09Q,"Swift Air, LLC d/b/a Eastern Air Lines d/b/a E...",11066,CMH,13930,ORD,625,1,


In [30]:
aircraft_types.head(5)

Unnamed: 0,Code,Description
0,7,Aero Commander 200
1,8,Aero Macchi AL-60
2,9,Aeronca 7-AC
3,10,Beech Bonanza 35A/C/D/E/G/H/J/K/S/V/ 36A
4,20,Bellanca CH-300


In [31]:
# Merge plane descriptions onto FAA seat data
ORD_seats_types = pd.merge(ORD_seats, aircraft_types, how='left', left_on='AIRCRAFT_TYPE', right_on='Code')

In [32]:
ORD_seats_types.head(5)

Unnamed: 0.1,Unnamed: 0,DEPARTURES_SCHEDULED,DEPARTURES_PERFORMED,SEATS,PASSENGERS,UNIQUE_CARRIER,UNIQUE_CARRIER_NAME,ORIGIN_AIRPORT_ID,ORIGIN,DEST_AIRPORT_ID,DEST,AIRCRAFT_TYPE,MONTH,Unnamed: 12,Code,Description
0,2883,0.0,1.0,9.0,7.0,14Q,London Air Services Limited,12003,GTF,13930,ORD,686,1,,686,Learjet45
1,2899,0.0,1.0,0.0,0.0,LH,Lufthansa German Airlines,10397,ATL,13930,ORD,740,1,,740,McDonnell Douglas MD-11
2,2908,0.0,18.0,0.0,0.0,CX,Cathay Pacific Airways Ltd.,10299,ANC,13930,ORD,820,1,,820,Boeing 747-400F
3,2915,0.0,12.0,0.0,0.0,CX,Cathay Pacific Airways Ltd.,12478,JFK,13930,ORD,820,1,,820,Boeing 747-400F
4,2949,0.0,1.0,235.0,48.0,09Q,"Swift Air, LLC d/b/a Eastern Air Lines d/b/a E...",11066,CMH,13930,ORD,625,1,,625,Boeing 767-200/ER/EM


Code to create L_AIRCRAFT_TYPE_with_seats.csv:  
```
aircraft_types_seats = combined_ord_seats_types[['DEPARTURES_PERFORMED', 'SEATS', 'Description']].groupby('Description').sum()  
aircraft_types_seats['plane_seats'] = aircraft_types_seats.apply(lambda row: row.SEATS/row.DEPARTURES_PERFORMED, axis = 1)
aircraft_types_seats[['plane_seats']].to_csv("../data/L_AIRCRAFT_TYPE_with_seats.csv")
```

In [33]:
# Find seat counts per carrier plane by dividing total seats of a given plane model by the number of flights it had
seat_counts_FAA = ORD_seats_types.query('DEPARTURES_PERFORMED > 0.0').groupby(['UNIQUE_CARRIER','Description']).sum().reset_index()
seat_counts_FAA['seats'] = seat_counts_FAA.apply(lambda row: row.SEATS/row.DEPARTURES_PERFORMED, axis = 1)
seat_counts_FAA = seat_counts_FAA[['UNIQUE_CARRIER', 'Description', 'seats']]

In [34]:
seat_counts_FAA.head(5)

Unnamed: 0,UNIQUE_CARRIER,Description,seats
0,04Q,Cessna Citation X Model 650/550B/550XL,8.0
1,09Q,Boeing 737-300,142.1
2,09Q,Boeing 737-400,127.513924
3,09Q,Boeing 737-800,165.6
4,09Q,Boeing 767-200/ER/EM,195.1


## Wrangle Seat Data - Wikipedia

In [35]:
seat_counts_wiki = seat_counts_wiki.drop(columns=['Unnamed: 0'])

In [36]:
seat_counts_wiki.head(5)

Unnamed: 0,airline,aircraft,in_service,pass_count
0,9E,Bombardier CRJ-200,42,50.0
1,9E,Bombardier CRJ-700,14,69.0
2,9E,Bombardier CRJ-900,3,70.0
3,9E,Bombardier CRJ-900,116,76.0
4,AA,Airbus A319-100,133,128.0


In [38]:
def wavg(data, avg_name, weight_name):
    """
    Returns weighted average
    http://stackoverflow.com/questions/10951341/pandas-dataframe-aggregate-function-using-multiple-columns
    """
    d = data[avg_name]
    w = data[weight_name]
    try:
        return (d * w).sum() / w.sum()
    except ZeroDivisionError:
        return d.mean()

In [39]:
# Apply weighted average function to grouped data
avg_airline_seats = pd.DataFrame({'seat_count': seat_counts_wiki.groupby('airline').apply(wavg, 'pass_count', 'in_service')})

In [41]:
avg_airline_seats.head(5)

Unnamed: 0_level_0,seat_count
airline,Unnamed: 1_level_1
9E,69.097143
AA,170.144491
AS,162.337607
B6,149.780303
DL,183.838424


## Merge data together

In [42]:
# Merge tail data onto flight data
ORD_OTP_with_tails = pd.merge(ORD_OTP, plane_details, how='left', left_on=['TAIL_NUM'], right_on=['tail_num'])

In [44]:
ORD_OTP_with_tails.tail(5)

Unnamed: 0,DAY_OF_WEEK,FL_DATE,OP_UNIQUE_CARRIER,TAIL_NUM,OP_CARRIER_FL_NUM,ORIGIN,DEST_AIRPORT_ID,DEST_AIRPORT_SEQ_ID,DEST,ARR_TIME,ARR_DELAY,ARR_HOUR,year,month,day,hour,tail_num,manufacturer,model,description
2042354,7,2019-11-17,OO,N930SW,5829,FAR,13930,1393007,ORD,2017.0,-8.0,20.0,2019,11,17,20,N930SW,BOMBARDIER INC,CL-600-2B19,Canadair RJ-200ER /RJ-440
2042355,7,2019-11-17,OO,N121SY,5852,TYS,13930,1393007,ORD,714.0,21.0,7.0,2019,11,17,7,N121SY,EMBRAER,ERJ 170-200 LR,Embraer-Emb-170
2042356,7,2019-11-17,OO,N203SY,5854,LGA,13930,1393007,ORD,1816.0,1.0,18.0,2019,11,17,18,N203SY,EMBRAER,ERJ 170-200 LR,Embraer-Emb-170
2042357,7,2019-11-17,OO,N145SY,5887,PWM,13930,1393007,ORD,803.0,-39.0,8.0,2019,11,17,8,N145SY,EMBRAER,ERJ 170-200 LR,Embraer-Emb-170
2042358,7,2019-11-17,OO,N786SK,5896,OKC,13930,1393007,ORD,2033.0,10.0,20.0,2019,11,17,20,N786SK,BOMBARDIER INC,CL-600-2C10,Canadair RJ-700


In [45]:
# Merge seat data onto flights that have tails
ORD_OTP_with_tails_seats = pd.merge(ORD_OTP_with_tails, seat_counts_FAA, how='left', left_on=['OP_UNIQUE_CARRIER', 'description'], right_on=['UNIQUE_CARRIER', 'Description'])

In [46]:
ORD_OTP_with_tails_seats.tail(5)

Unnamed: 0,DAY_OF_WEEK,FL_DATE,OP_UNIQUE_CARRIER,TAIL_NUM,OP_CARRIER_FL_NUM,ORIGIN,DEST_AIRPORT_ID,DEST_AIRPORT_SEQ_ID,DEST,ARR_TIME,...,month,day,hour,tail_num,manufacturer,model,description,UNIQUE_CARRIER,Description,seats
2042354,7,2019-11-17,OO,N930SW,5829,FAR,13930,1393007,ORD,2017.0,...,11,17,20,N930SW,BOMBARDIER INC,CL-600-2B19,Canadair RJ-200ER /RJ-440,OO,Canadair RJ-200ER /RJ-440,50.0
2042355,7,2019-11-17,OO,N121SY,5852,TYS,13930,1393007,ORD,714.0,...,11,17,7,N121SY,EMBRAER,ERJ 170-200 LR,Embraer-Emb-170,,,
2042356,7,2019-11-17,OO,N203SY,5854,LGA,13930,1393007,ORD,1816.0,...,11,17,18,N203SY,EMBRAER,ERJ 170-200 LR,Embraer-Emb-170,,,
2042357,7,2019-11-17,OO,N145SY,5887,PWM,13930,1393007,ORD,803.0,...,11,17,8,N145SY,EMBRAER,ERJ 170-200 LR,Embraer-Emb-170,,,
2042358,7,2019-11-17,OO,N786SK,5896,OKC,13930,1393007,ORD,2033.0,...,11,17,20,N786SK,BOMBARDIER INC,CL-600-2C10,Canadair RJ-700,OO,Canadair RJ-700,69.011412


In [59]:
print("Seat data added for:", round((-ORD_OTP_with_tails_seats.seats.isna()).sum()/ORD_OTP_with_tails_seats.shape[0],3), "% of rows")

Seat data added for: 0.684 % of rows


In [60]:
# Impute remaining seat NaNs with average for airline
ORD_OTP_with_tails_seats_and_imputation = pd.merge(ORD_OTP_with_tails_seats, avg_airline_seats.reset_index(), how='left', left_on=['OP_UNIQUE_CARRIER'], right_on=['airline'])
ORD_OTP_with_tails_seats_and_imputation['seats'] = (-ORD_OTP_with_tails_seats_and_imputation.seats.isna() * ORD_OTP_with_tails_seats_and_imputation.seats).fillna(0) + (ORD_OTP_with_tails_seats_and_imputation.seats.isna() * ORD_OTP_with_tails_seats_and_imputation.seat_count)
ORD_OTP_with_tails_seats_and_imputation = ORD_OTP_with_tails_seats_and_imputation.drop(columns=['seat_count'])

In [61]:
ORD_OTP_with_tails_seats_and_imputation.head()

Unnamed: 0,DAY_OF_WEEK,FL_DATE,OP_UNIQUE_CARRIER,TAIL_NUM,OP_CARRIER_FL_NUM,ORIGIN,DEST_AIRPORT_ID,DEST_AIRPORT_SEQ_ID,DEST,ARR_TIME,...,day,hour,tail_num,manufacturer,model,description,UNIQUE_CARRIER,Description,seats,airline
0,1,2013-01-07,AA,N3DEAA,618,SFO,13930,1393002,ORD,7.0,...,7,0,,,,,,,170.144491,AA
1,1,2013-01-14,AA,N3DDAA,618,SFO,13930,1393002,ORD,10.0,...,14,0,,,,,,,170.144491,AA
2,1,2013-01-21,AA,N3DJAA,618,SFO,13930,1393002,ORD,7.0,...,21,0,,,,,,,170.144491,AA
3,1,2013-01-28,AA,N3AMAA,618,SFO,13930,1393002,ORD,19.0,...,28,0,,,,,,,170.144491,AA
4,1,2013-01-07,AA,N456AA,153,MCO,13930,1393002,ORD,806.0,...,7,8,,,,,,,170.144491,AA


Note:  
Alternative Imputation using apply (takes 2 minutes)  
`ORD_OTP_with_tails_seats.apply(lambda x: avg_airline_seats.loc[x.OP_UNIQUE_CARRIER].seat_count if pd.isna(x.seats) else x.seats, axis=1)`

In [66]:
# Get seats and flights per hour
seats_per_hour = ORD_OTP_with_tails_seats_and_imputation.groupby(['year','month','day','hour']).sum().reset_index()[['year','month','day','hour','seats']]
flights_per_hour = ORD_OTP_with_tails_seats_and_imputation.groupby(['year','month','day','hour']).count().reset_index()[['year','month','day','hour','airline']]

In [67]:
seats_per_hour.head(5)

Unnamed: 0,year,month,day,hour,seats
0,2013,1,1,0,929.752508
1,2013,1,1,1,295.924859
2,2013,1,1,4,1694.300371
3,2013,1,1,5,2612.235611
4,2013,1,1,6,2490.161406


In [68]:
flights_per_hour.head(5)

Unnamed: 0,year,month,day,hour,airline
0,2013,1,1,0,6
1,2013,1,1,1,2
2,2013,1,1,4,8
3,2013,1,1,5,18
4,2013,1,1,6,38


In [70]:
# Merge taxi and weather data
ORD_outbound_weather = pd.merge(ORD_outbound, ORD_weather_hourly, how='left', on=['year', 'month', 'day', 'hour'])

In [71]:
ORD_outbound_weather.head(5)

Unnamed: 0,year,month,day,hour,rides,tmpf,dwpf,relh,sknt,p01i,vsby,skyl1,skyl2,skyl3,feel
0,2013,1,1,0,22,24.98,17.96,74.29,9.0,0.0,9.0,1800.0,13000.0,19000.0,14.78
1,2013,1,1,1,9,24.89,17.78,73.995,9.0,0.0,8.0,1800.0,14000.0,,14.67
2,2013,1,1,2,11,21.2,14.0,73.32,10.0,0.0,9.0,1800.0,11000.0,15000.0,9.42
3,2013,1,1,3,3,21.14,12.14,67.723333,10.666667,0.0,9.666667,5400.0,12000.0,15000.0,8.976667
4,2013,1,1,4,5,19.94,10.94,67.58,10.0,0.0,10.0,2100.0,9500.0,,7.84


In [72]:
# Merge taxi/weather to seats/flights
ORD_outbound_weather_seats = pd.merge(ORD_outbound_weather, seats_per_hour, how='left', on=['year', 'month', 'day', 'hour'])
ORD_outbound_weather_seats_flights = pd.merge(ORD_outbound_weather_seats, flights_per_hour, how='left', on=['year', 'month', 'day', 'hour'])

In [73]:
ORD_outbound_weather_seats_flights

Unnamed: 0,year,month,day,hour,rides,tmpf,dwpf,relh,sknt,p01i,vsby,skyl1,skyl2,skyl3,feel,seats,airline
0,2013,1,1,0,22,24.98,17.96,74.290000,9.000000,0.0000,9.000000,1800.000000,13000.000000,19000.0,14.780000,929.752508,6.0
1,2013,1,1,1,9,24.89,17.78,73.995000,9.000000,0.0000,8.000000,1800.000000,14000.000000,,14.670000,295.924859,2.0
2,2013,1,1,2,11,21.20,14.00,73.320000,10.000000,0.0000,9.000000,1800.000000,11000.000000,15000.0,9.420000,,
3,2013,1,1,3,3,21.14,12.14,67.723333,10.666667,0.0000,9.666667,5400.000000,12000.000000,15000.0,8.976667,,
4,2013,1,1,4,5,19.94,10.94,67.580000,10.000000,0.0000,10.000000,2100.000000,9500.000000,,7.840000,1694.300371,8.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61977,2020,1,31,20,257,37.00,30.00,75.560000,5.076923,0.0001,7.615385,1730.384615,,,31.550000,,
61978,2020,1,31,21,243,37.00,30.00,75.560000,5.153846,0.0000,7.615385,1745.384615,2500.000000,,31.550000,,
61979,2020,1,31,22,154,37.00,30.00,75.560000,4.714286,0.0000,7.000000,2064.214286,2723.076923,,32.330000,,
61980,2020,1,31,23,77,36.00,28.90,75.150000,5.076923,0.0000,8.615385,1976.230769,,,30.350000,,


## Booyah