# Wrangle Data Together

This notebook assumes three input files in `../data/`:
- ORD weather data (`ORD_weather.txt`)
- ORD taxi data (`ORD_outbound.csv`)
- ORD flight data (`ORD_OTP.csv`)

Weather Variables:
https://mesonet.agron.iastate.edu/request/download.phtml?network=IL_ASOS

Flight Variables:
https://www.transtats.bts.gov/Fields.asp?Table_ID=236

In [1]:
import pandas as pd
import numpy as np

## Load data

In [176]:
# Weather export: comma-delimited, with the first five lines of the file skipped.
# NOTE(review): the saved output of this cell still shows the tail of a warning;
# if it is a pandas DtypeWarning, consider passing explicit dtype= for the
# affected columns -- confirm which of the three reads triggers it.
ORD_weather = pd.read_csv(
    "../data/ORD_weather.txt",
    sep=",",
    skiprows=5,
)
# Taxi data
ORD_outbound = pd.read_csv("../data/ORD_outbound.csv")
# Flight data
ORD_OTP = pd.read_csv("../data/ORD_OTP.csv")

  interactivity=interactivity, compiler=compiler, result=result)


## Wrangle weather data

In [177]:
# Column groups that need a dtype change so that averaging can occur
numeric_weather_features = ['tmpf', 'dwpf', 'relh', 'sknt', 'p01i', 'vsby', 'skyl1', 'skyl2', 'skyl3', 'feel']
categorical_weather_features = ['skyc1', 'skyc2', 'skyc3']

# One pipeline: drop unneeded columns -> rename/parse the timestamp ->
# index by it -> turn the 'M' missing-value marker into NaN.
ORD_weather = (
    ORD_weather
    .drop(columns=[
        'station', 'lon', 'lat', 'drct', 'alti', 'mslp', 'gust',
        'skyc4', 'skyl4', 'wxcodes',
        'ice_accretion_1hr', 'ice_accretion_3hr', 'ice_accretion_6hr',
        'peak_wind_gust', 'peak_wind_drct', 'peak_wind_time', 'metar',
    ])
    .rename(columns={'valid': 'date'})
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
    .set_index("date")
    .replace('M', np.nan)
)

# Cast only after the 'M' markers are gone, otherwise to_numeric would fail on them
ORD_weather[numeric_weather_features] = ORD_weather[numeric_weather_features].apply(pd.to_numeric)
ORD_weather[categorical_weather_features] = ORD_weather[categorical_weather_features].astype('category')

**TODO: Figure out how to convert the categorical columns (`skyc1`, `skyc2`, `skyc3`) to ordinal values so that they can be averaged and included in the hourly data.**

In [178]:
# Strip out time categories
# The frame is indexed by the parsed 'date' timestamps; split them into
# calendar parts, which serve as the grouping keys below.
ORD_weather['year'] = ORD_weather.index.year
ORD_weather['month'] = ORD_weather.index.month
ORD_weather['day'] = ORD_weather.index.day
ORD_weather['hour'] = ORD_weather.index.hour

# Average columns by hour
# numeric_only=True is required because the frame still carries the
# categorical skyc1-3 columns: older pandas silently dropped them from the
# mean, while pandas 2.x raises a TypeError instead. Stating it explicitly
# gives the same numeric-only result on both.
ORD_weather_hourly = (
    ORD_weather
    .groupby(['year', 'month', 'day', 'hour'])
    .mean(numeric_only=True)
    .reset_index()
)

## Wrangle Taxi Data

In [179]:
# Discard the columns that are not needed
ORD_outbound = ORD_outbound.drop(
    ['Unnamed: 0', 'pickup_community_area'],
    axis='columns',
)

## Wrangle Flight Data

**TODO: Figure out how to estimate the number of arriving passengers from the number of arriving planes.**

In [180]:
# Display the flight frame as this cell's output (bare last expression);
# the rendering below is truncated to the first and last rows.
ORD_OTP

Unnamed: 0.1,Unnamed: 0,DAY_OF_WEEK,FL_DATE,OP_UNIQUE_CARRIER,TAIL_NUM,OP_CARRIER_FL_NUM,ORIGIN,DEST_AIRPORT_ID,DEST_AIRPORT_SEQ_ID,DEST,ARR_TIME,ARR_DELAY,ARR_HOUR
0,19,1,2013-01-07,AA,N3DEAA,618,SFO,13930,1393002,ORD,7.0,-8.0,0.0
1,20,1,2013-01-14,AA,N3DDAA,618,SFO,13930,1393002,ORD,10.0,-5.0,0.0
2,21,1,2013-01-21,AA,N3DJAA,618,SFO,13930,1393002,ORD,7.0,-8.0,0.0
3,22,1,2013-01-28,AA,N3AMAA,618,SFO,13930,1393002,ORD,19.0,4.0,0.0
4,90,1,2013-01-07,AA,N456AA,153,MCO,13930,1393002,ORD,806.0,-4.0,8.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2106102,602391,7,2019-11-17,OO,N930SW,5829,FAR,13930,1393007,ORD,2017.0,-8.0,20.0
2106103,602412,7,2019-11-17,OO,N121SY,5852,TYS,13930,1393007,ORD,714.0,21.0,7.0
2106104,602414,7,2019-11-17,OO,N203SY,5854,LGA,13930,1393007,ORD,1816.0,1.0,18.0
2106105,602441,7,2019-11-17,OO,N145SY,5887,PWM,13930,1393007,ORD,803.0,-39.0,8.0


## Merge data together

In [181]:
# Left join: every taxi row is kept, and the hourly weather averages are
# attached wherever the (year, month, day, hour) key matches.
ORD_outbound_and_weather = ORD_outbound.merge(
    ORD_weather_hourly,
    how='left',
    on=['year', 'month', 'day', 'hour'],
)

In [182]:
# Display the merged taxi + hourly-weather frame as this cell's output
ORD_outbound_and_weather

Unnamed: 0,year,month,day,hour,rides,tmpf,dwpf,relh,sknt,p01i,vsby,skyl1,skyl2,skyl3,feel
0,2013,1,1,0,22,24.98,17.96,74.290000,9.000000,0.0000,9.000000,1800.000000,13000.000000,19000.0,14.780000
1,2013,1,1,1,9,24.89,17.78,73.995000,9.000000,0.0000,8.000000,1800.000000,14000.000000,,14.670000
2,2013,1,1,2,11,21.20,14.00,73.320000,10.000000,0.0000,9.000000,1800.000000,11000.000000,15000.0,9.420000
3,2013,1,1,3,3,21.14,12.14,67.723333,10.666667,0.0000,9.666667,5400.000000,12000.000000,15000.0,8.976667
4,2013,1,1,4,5,19.94,10.94,67.580000,10.000000,0.0000,10.000000,2100.000000,9500.000000,,7.840000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61977,2020,1,31,20,257,37.00,30.00,75.560000,5.076923,0.0001,7.615385,1730.384615,,,31.550000
61978,2020,1,31,21,243,37.00,30.00,75.560000,5.153846,0.0000,7.615385,1745.384615,2500.000000,,31.550000
61979,2020,1,31,22,154,37.00,30.00,75.560000,4.714286,0.0000,7.000000,2064.214286,2723.076923,,32.330000
61980,2020,1,31,23,77,36.00,28.90,75.150000,5.076923,0.0000,8.615385,1976.230769,,,30.350000
