# Airports Processing

In [1]:
import pandas as pd
import numpy as np
import plotly.graph_objs as go
import seaborn as sns
import plotly.express as px
from plotly.subplots import make_subplots

In [2]:
# Show every column when a frame is displayed, then load the preprocessed flights
pd.options.display.max_columns = None
df = pd.read_parquet("flightsCleaned.parquet")

df.head()

Unnamed: 0,DATE,AIRLINE_CODE,AIRLINE,FLIGHT_NUMBER,ORIGIN_AIRPORT,DESTINATION_AIRPORT,DEPARTURE_TIME,DEPARTURE_DELAY,ELAPSED_TIME,DISTANCE,ARRIVAL_TIME,ARRIVAL_DELAY,AIR_SYSTEM_DELAY,SECURITY_DELAY,AIRLINE_DELAY,LATE_AIRCRAFT_DELAY,WEATHER_DELAY,OTHER_DELAY,ORIGIN_AIRPORT_NAME,ORIGIN_CITY,ORIGIN_STATE,ORIGIN_LATITUDE,ORIGIN_LONGITUDE,DESTINATION_AIRPORT_NAME,DESTINATION_CITY,DESTINATION_STATE,DESTINATION_LATITUDE,DESTINATION_LONGITUDE
0,2015-01-01,AS,Alaska Airlines Inc.,98,ANC,SEA,2354.0,-11.0,194.0,1448,408.0,-22.0,0.0,0.0,0.0,0.0,0.0,0.0,Ted Stevens Anchorage International Airport,Anchorage,AK,61.17432,-149.99619,Seattle-Tacoma International Airport,Seattle,WA,47.44898,-122.30931
1,2015-01-01,AA,American Airlines Inc.,2336,LAX,PBI,2.0,-8.0,279.0,2330,741.0,-9.0,0.0,0.0,0.0,0.0,0.0,0.0,Los Angeles International Airport,Los Angeles,CA,33.94254,-118.40807,Palm Beach International Airport,West Palm Beach,FL,26.68316,-80.09559
2,2015-01-01,US,US Airways Inc.,840,SFO,CLT,18.0,-2.0,293.0,2296,811.0,5.0,0.0,0.0,0.0,0.0,0.0,5.0,San Francisco International Airport,San Francisco,CA,37.619,-122.37484,Charlotte Douglas International Airport,Charlotte,NC,35.21401,-80.94313
3,2015-01-01,AA,American Airlines Inc.,258,LAX,MIA,15.0,-5.0,281.0,2342,756.0,-9.0,0.0,0.0,0.0,0.0,0.0,0.0,Los Angeles International Airport,Los Angeles,CA,33.94254,-118.40807,Miami International Airport,Miami,FL,25.79325,-80.29056
4,2015-01-01,AS,Alaska Airlines Inc.,135,SEA,ANC,24.0,-1.0,215.0,1448,259.0,-21.0,0.0,0.0,0.0,0.0,0.0,0.0,Seattle-Tacoma International Airport,Seattle,WA,47.44898,-122.30931,Ted Stevens Anchorage International Airport,Anchorage,AK,61.17432,-149.99619


### Creamos la agrupación por aeropuertos, ordenando por número de vuelos

Al agrupar por aeropuertos, hay variables propias de cada vuelo (número de vuelo y horas de salida y llegada) que no nos aportarán información y podemos borrar.

In [3]:
# Drop per-flight columns that would otherwise be averaged in the airport-level
# aggregation below. errors="ignore" keeps the cell idempotent: re-running it
# after the columns are already gone no longer raises KeyError.
df = df.drop(
    columns=["FLIGHT_NUMBER", "DEPARTURE_TIME", "ARRIVAL_TIME"],
    errors="ignore",
)

Ahora realizamos la agrupación por aeropuerto de origen, que son los aeropuertos con los que trabajaremos.

In [4]:
# First check which columns are categorical (object dtype), to make sure we
# keep the ones we want as grouping keys
df.dtypes.loc[lambda dtypes: dtypes == "object"]

AIRLINE_CODE                object
AIRLINE                     object
ORIGIN_AIRPORT              object
DESTINATION_AIRPORT         object
ORIGIN_AIRPORT_NAME         object
ORIGIN_CITY                 object
ORIGIN_STATE                object
DESTINATION_AIRPORT_NAME    object
DESTINATION_CITY            object
DESTINATION_STATE           object
dtype: object

In [5]:
# Aggregate the flights table per origin airport: mean of every numeric column,
# number of flights, number of delayed flights and the delayed share.
variables_to_group_by = ["ORIGIN_AIRPORT", "ORIGIN_AIRPORT_NAME", "ORIGIN_CITY", "ORIGIN_STATE"]
grouped_flights = df.groupby(variables_to_group_by)

# numeric_only=True: df still holds non-numeric columns (airline, destination
# names, ...) that cannot be averaged. pandas >= 2.0 raises TypeError on them
# instead of silently dropping them, so the restriction must be explicit.
airports = grouped_flights.mean(numeric_only=True)
airports["FLIGHTS"] = grouped_flights.size()

# A flight counts as delayed when ARRIVAL_DELAY > 0. Airports without any
# delayed flight are missing from the filtered count, so reindex onto the full
# set of airports with 0 instead of leaving NaN (which would also propagate
# into DELAYED_PERCENTAGE).
airports["DELAYED_FLIGHTS"] = (
    df[df["ARRIVAL_DELAY"] > 0]
    .groupby(variables_to_group_by)
    .size()
    .reindex(airports.index, fill_value=0)
)

# Stored as a fraction in [0, 1], not multiplied by 100.
airports["DELAYED_PERCENTAGE"] = airports["DELAYED_FLIGHTS"] / airports["FLIGHTS"]

# Busiest airports first; grouping keys become regular columns again.
airports = airports.sort_values("FLIGHTS", ascending=False).reset_index()
airports.head()

Unnamed: 0,ORIGIN_AIRPORT,ORIGIN_AIRPORT_NAME,ORIGIN_CITY,ORIGIN_STATE,DEPARTURE_DELAY,ELAPSED_TIME,DISTANCE,ARRIVAL_DELAY,AIR_SYSTEM_DELAY,SECURITY_DELAY,AIRLINE_DELAY,LATE_AIRCRAFT_DELAY,WEATHER_DELAY,OTHER_DELAY,ORIGIN_LATITUDE,ORIGIN_LONGITUDE,DESTINATION_LATITUDE,DESTINATION_LONGITUDE,FLIGHTS,DELAYED_FLIGHTS,DELAYED_PERCENTAGE
0,ATL,Hartsfield-Jackson Atlanta International Airport,Atlanta,GA,8.85027,114.866383,640.178136,2.416871,1.785697,0.004062,3.406679,2.878495,0.839232,1.022221,33.64044,-84.42694,35.552319,-86.109114,375903,122213,0.325118
1,ORD,Chicago O'Hare International Airport,Chicago,IL,13.20502,127.695187,721.919227,7.608336,3.284368,0.009161,4.3614,5.254136,1.396482,1.10235,41.9796,-87.90446,38.347093,-90.637218,303888,121696,0.400463
2,DFW,Dallas/Fort Worth International Airport,Dallas-Fort Worth,TX,11.057457,130.859005,788.274763,6.425772,2.247282,0.022907,4.475041,4.349548,1.121342,1.149282,32.89595,-97.0372,35.462532,-95.100926,252938,98565,0.38968
3,DEN,Denver International Airport,Denver,CO,11.259532,137.207102,877.915006,6.472926,2.666673,0.004362,3.774007,5.082969,0.446868,1.218958,39.85841,-104.667,38.010582,-100.578165,211369,86012,0.406928
4,LAX,Los Angeles International Airport,Los Angeles,CA,10.173233,179.770106,1253.742786,5.286918,2.24161,0.016639,3.588663,4.982808,0.162934,1.302307,33.94254,-118.40807,36.582438,-104.603276,209631,85590,0.408289


La misma agrupación la queremos para cada aeropuerto por día

In [6]:
# Same aggregation as for the per-airport table, but with one row per
# origin airport and DATE.
variables_to_group_by = ["ORIGIN_AIRPORT", "DATE", "ORIGIN_AIRPORT_NAME", "ORIGIN_CITY", "ORIGIN_STATE"]
grouped_flights = df.groupby(variables_to_group_by)

# numeric_only=True: df still holds non-numeric columns (airline, destination
# names, ...) that cannot be averaged. pandas >= 2.0 raises TypeError on them
# instead of silently dropping them, so the restriction must be explicit.
airports_date = grouped_flights.mean(numeric_only=True)
airports_date["FLIGHTS"] = grouped_flights.size()

# A flight counts as delayed when ARRIVAL_DELAY > 0. Airport/day groups with no
# delayed flight are missing from the filtered count and come out as NaN here.
airports_date["DELAYED_FLIGHTS"] = (
    df[df["ARRIVAL_DELAY"] > 0].groupby(variables_to_group_by).size()
)

# Any NaN that appears is expected to mean 0 flights in that category.
# NOTE(review): this fills every column, so a NaN mean (a group whose values
# are all missing) would also become 0 - confirm the input has no such gaps.
airports_date = airports_date.fillna(0)

# Stored as a fraction in [0, 1], not multiplied by 100.
airports_date["DELAYED_PERCENTAGE"] = airports_date["DELAYED_FLIGHTS"] / airports_date["FLIGHTS"]

# Chronological order within each airport; grouping keys become columns again.
airports_date = airports_date.sort_values(["ORIGIN_AIRPORT", "DATE"]).reset_index()
airports_date.head()

Unnamed: 0,ORIGIN_AIRPORT,DATE,ORIGIN_AIRPORT_NAME,ORIGIN_CITY,ORIGIN_STATE,DEPARTURE_DELAY,ELAPSED_TIME,DISTANCE,ARRIVAL_DELAY,AIR_SYSTEM_DELAY,SECURITY_DELAY,AIRLINE_DELAY,LATE_AIRCRAFT_DELAY,WEATHER_DELAY,OTHER_DELAY,ORIGIN_LATITUDE,ORIGIN_LONGITUDE,DESTINATION_LATITUDE,DESTINATION_LONGITUDE,FLIGHTS,DELAYED_FLIGHTS,DELAYED_PERCENTAGE
0,ABE,2015-01-01,Lehigh Valley International Airport,Allentown,PA,-5.0,115.333333,603.0,-20.333333,0.0,0.0,0.0,0.0,0.0,0.0,40.65236,-75.4404,36.497647,-84.067573,3,0.0,0.0
1,ABE,2015-01-02,Lehigh Valley International Airport,Allentown,PA,-3.2,128.0,577.6,-3.6,0.0,0.0,0.0,0.0,0.0,3.4,40.65236,-75.4404,38.73692,-84.691204,5,2.0,0.4
2,ABE,2015-01-03,Lehigh Valley International Airport,Allentown,PA,7.2,131.2,577.6,10.6,6.4,0.0,3.4,3.6,0.0,0.0,40.65236,-75.4404,38.73692,-84.691204,5,3.0,0.6
3,ABE,2015-01-04,Lehigh Valley International Airport,Allentown,PA,72.5,132.5,615.75,72.75,2.75,0.0,1.75,71.25,0.0,0.0,40.65236,-75.4404,37.868135,-85.026795,4,2.0,0.5
4,ABE,2015-01-05,Lehigh Valley International Airport,Allentown,PA,-0.2,127.6,531.8,2.8,3.8,0.0,0.2,0.0,0.0,2.6,40.65236,-75.4404,38.783412,-83.78008,5,3.0,0.6


In [7]:
# Persist the per-airport aggregation so the later prediction step can use it
airports.to_parquet(
    path="airportsCleaned.parquet",
    index=False,
)

In [8]:
# Persist the per-airport, per-day aggregation so the later prediction step can use it
airports_date.to_parquet(
    path="airportsDateCleaned.parquet",
    index=False,
)

In [9]:
# Preview the first rows of the per-airport, per-day table that was just saved
airports_date.head(n=5)

Unnamed: 0,ORIGIN_AIRPORT,DATE,ORIGIN_AIRPORT_NAME,ORIGIN_CITY,ORIGIN_STATE,DEPARTURE_DELAY,ELAPSED_TIME,DISTANCE,ARRIVAL_DELAY,AIR_SYSTEM_DELAY,SECURITY_DELAY,AIRLINE_DELAY,LATE_AIRCRAFT_DELAY,WEATHER_DELAY,OTHER_DELAY,ORIGIN_LATITUDE,ORIGIN_LONGITUDE,DESTINATION_LATITUDE,DESTINATION_LONGITUDE,FLIGHTS,DELAYED_FLIGHTS,DELAYED_PERCENTAGE
0,ABE,2015-01-01,Lehigh Valley International Airport,Allentown,PA,-5.0,115.333333,603.0,-20.333333,0.0,0.0,0.0,0.0,0.0,0.0,40.65236,-75.4404,36.497647,-84.067573,3,0.0,0.0
1,ABE,2015-01-02,Lehigh Valley International Airport,Allentown,PA,-3.2,128.0,577.6,-3.6,0.0,0.0,0.0,0.0,0.0,3.4,40.65236,-75.4404,38.73692,-84.691204,5,2.0,0.4
2,ABE,2015-01-03,Lehigh Valley International Airport,Allentown,PA,7.2,131.2,577.6,10.6,6.4,0.0,3.4,3.6,0.0,0.0,40.65236,-75.4404,38.73692,-84.691204,5,3.0,0.6
3,ABE,2015-01-04,Lehigh Valley International Airport,Allentown,PA,72.5,132.5,615.75,72.75,2.75,0.0,1.75,71.25,0.0,0.0,40.65236,-75.4404,37.868135,-85.026795,4,2.0,0.5
4,ABE,2015-01-05,Lehigh Valley International Airport,Allentown,PA,-0.2,127.6,531.8,2.8,3.8,0.0,0.2,0.0,0.0,2.6,40.65236,-75.4404,38.783412,-83.78008,5,3.0,0.6
