In [1]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

# Show every column when a DataFrame is displayed (no truncation of wide frames)
pd.options.display.max_columns = None

Queremos predecir los retrasos correspondientes a los 5 aeropuertos que experimentan un mayor número de retrasos. En primer lugar, debemos seleccionar dichos aeropuertos entre todos los seleccionados previamente.

## Selección de aeropuertos

In [2]:
# Read the previously cleaned flights data
cleaned_flights_path = "../Preprocessing/flightsFilteredCleaned.parquet"
df = pd.read_parquet(cleaned_flights_path)

# List the distinct origin airports present in the data
pd.unique(df['ORIGIN_AIRPORT'])

array(['LAX', 'SFO', 'SEA', 'LAS', 'DEN', 'MSP', 'PHX', 'ORD', 'MCO',
       'BOS', 'DFW', 'IAH', 'ATL', 'EWR', 'CLT', 'DTW'], dtype=object)

In [3]:
# Preview the first rows of the loaded flights data
df.head(n=5)

Unnamed: 0,DATE,AIRLINE_CODE,AIRLINE,FLIGHT_NUMBER,ORIGIN_AIRPORT,DESTINATION_AIRPORT,DEPARTURE_TIME,DEPARTURE_DELAY,ELAPSED_TIME,DISTANCE,ARRIVAL_TIME,ARRIVAL_DELAY,AIR_SYSTEM_DELAY,SECURITY_DELAY,AIRLINE_DELAY,LATE_AIRCRAFT_DELAY,WEATHER_DELAY,OTHER_DELAY,ORIGIN_AIRPORT_NAME,ORIGIN_CITY,ORIGIN_STATE,ORIGIN_LATITUDE,ORIGIN_LONGITUDE,DESTINATION_AIRPORT_NAME,DESTINATION_CITY,DESTINATION_STATE,DESTINATION_LATITUDE,DESTINATION_LONGITUDE
0,2015-01-01,AA,American Airlines Inc.,2336,LAX,PBI,2.0,-8.0,279.0,2330,741.0,-9.0,0.0,0.0,0.0,0.0,0.0,0.0,Los Angeles International Airport,Los Angeles,CA,33.94254,-118.40807,Palm Beach International Airport,West Palm Beach,FL,26.68316,-80.09559
1,2015-01-01,US,US Airways Inc.,840,SFO,CLT,18.0,-2.0,293.0,2296,811.0,5.0,0.0,0.0,0.0,0.0,0.0,5.0,San Francisco International Airport,San Francisco,CA,37.619,-122.37484,Charlotte Douglas International Airport,Charlotte,NC,35.21401,-80.94313
2,2015-01-01,AA,American Airlines Inc.,258,LAX,MIA,15.0,-5.0,281.0,2342,756.0,-9.0,0.0,0.0,0.0,0.0,0.0,0.0,Los Angeles International Airport,Los Angeles,CA,33.94254,-118.40807,Miami International Airport,Miami,FL,25.79325,-80.29056
3,2015-01-01,AS,Alaska Airlines Inc.,135,SEA,ANC,24.0,-1.0,215.0,1448,259.0,-21.0,0.0,0.0,0.0,0.0,0.0,0.0,Seattle-Tacoma International Airport,Seattle,WA,47.44898,-122.30931,Ted Stevens Anchorage International Airport,Anchorage,AK,61.17432,-149.99619
4,2015-01-01,DL,Delta Air Lines Inc.,806,SFO,MSP,20.0,-5.0,230.0,1589,610.0,8.0,0.0,0.0,0.0,0.0,0.0,8.0,San Francisco International Airport,San Francisco,CA,37.619,-122.37484,Minneapolis-Saint Paul International Airport,Minneapolis,MN,44.88055,-93.21692


In [4]:
# Rank the origin airports by their number of delayed flights.
# A flight is treated as delayed when its ARRIVAL_DELAY is positive.
delayed_flights = df[df["ARRIVAL_DELAY"] > 0]
delayed_per_airport = delayed_flights.groupby(
    ['ORIGIN_AIRPORT', 'ORIGIN_AIRPORT_NAME']
)["FLIGHT_NUMBER"].count()

# Sort by the absolute count of delayed flights (not a percentage), largest
# first, and renumber the rows from 0 so the top airports sit at positions 0..4.
airports = (
    delayed_per_airport
    .reset_index(name="DELAYED_FLIGHTS")
    .sort_values('DELAYED_FLIGHTS', ascending=False)
    .reset_index(drop=True)
)

airports.head()

Unnamed: 0,ORIGIN_AIRPORT,ORIGIN_AIRPORT_NAME,DELAYED_FLIGHTS
0,ATL,Hartsfield-Jackson Atlanta International Airport,122213
1,ORD,Chicago O'Hare International Airport,121696
2,DFW,Dallas/Fort Worth International Airport,98565
3,DEN,Denver International Airport,86012
4,LAX,Los Angeles International Airport,85590


In [5]:
# Take the codes of the first five airports of the ranking
top_5 = airports['ORIGIN_AIRPORT'].head(5).tolist()

# Keep only the flights that depart from one of those airports
is_top_5_origin = df['ORIGIN_AIRPORT'].isin(top_5)
df = df[is_top_5_origin]

In [6]:
# Save the flights of the selected airports (row index is not written)
top_5_output_path = "top_5_delay.parquet"
df.to_parquet(top_5_output_path, index=False)

## Preparamos los datos para el forecast

In [7]:
# Count the delayed flights (ARRIVAL_DELAY > 0) per date and origin airport.
# Note: the grouping keys are date and airport only; airline is not one of them.
delayed_only = df[df["ARRIVAL_DELAY"] > 0]
daily_counts = delayed_only.groupby(
    ["DATE", "ORIGIN_AIRPORT", "ORIGIN_AIRPORT_NAME"]
)["FLIGHT_NUMBER"].count()

# Turn the grouped Series into a flat frame with the count in DELAYED_FLIGHTS
data = daily_counts.reset_index(name="DELAYED_FLIGHTS")
data.head()

Unnamed: 0,DATE,ORIGIN_AIRPORT,ORIGIN_AIRPORT_NAME,DELAYED_FLIGHTS
0,2015-01-01,ATL,Hartsfield-Jackson Atlanta International Airport,162
1,2015-01-01,DEN,Denver International Airport,406
2,2015-01-01,DFW,Dallas/Fort Worth International Airport,453
3,2015-01-01,LAX,Los Angeles International Airport,236
4,2015-01-01,ORD,Chicago O'Hare International Airport,271


Puede ser que haya aeropuertos que no tengan retrasos algún día (caso raro). Para realizar el forecast necesitamos tener datos de todos los días para todos los aeropuertos, así que vamos a asegurarnos de que esto ocurra. Para ello crearemos un dataframe auxiliar que contenga todas las combinaciones de fechas y aeropuertos, pero sin el número de vuelos retrasados.

In [8]:
# Build the complete (airport, date) grid for 2015: one row per airport and day.
# The grid size is derived from its inputs rather than hard-coded "* 5" and
# "* 365" factors, so the two columns can never end up with different lengths
# if the number of airports or the date range changes.
dates = pd.date_range(start="2015-01-01", end="2015-12-31", freq="D")
airport_codes = data['ORIGIN_AIRPORT'].unique()

# from_product keeps the input order with the first level varying slowest,
# i.e. all dates of the first airport, then all dates of the second, and so on.
# A dedicated name is used for the airport codes so that the `airports` ranking
# DataFrame is not overwritten by a list (which would break re-running the
# top-5 selection cell).
aux = (
    pd.MultiIndex.from_product(
        [airport_codes, dates], names=['ORIGIN_AIRPORT', 'DATE']
    )
    .to_frame(index=False)
    .loc[:, ['DATE', 'ORIGIN_AIRPORT']]
)

aux.head()

Unnamed: 0,DATE,ORIGIN_AIRPORT
0,2015-01-01,ATL
1,2015-01-02,ATL
2,2015-01-03,ATL
3,2015-01-04,ATL
4,2015-01-05,ATL


Una vez hecho esto, vamos a unir el dataframe auxiliar con el obtenido a partir del groupby mediante un left join, de tal manera que, si no tenemos datos de algún aeropuerto en alguna fecha, exista una fila con valor nulo en el número de vuelos retrasados, el cual sustituiremos por cero.

In [10]:
# Left join the full (date, airport) grid with the daily delayed-flight counts:
# grid rows without a match in `data` get nulls in the columns coming from `data`.
forecast_data = aux.merge(data, on=['DATE', 'ORIGIN_AIRPORT'], how='left')

# A missing count means "no delayed flights that day", so only that column is
# filled with 0. Filling the whole frame with 0 would also write the integer 0
# into ORIGIN_AIRPORT_NAME, leaving a mixed str/int column.
forecast_data['DELAYED_FLIGHTS'] = forecast_data['DELAYED_FLIGHTS'].fillna(0).astype(int)

# Restore the airport name on the filled-in rows from the airport code
name_by_airport = (
    data.drop_duplicates('ORIGIN_AIRPORT')
    .set_index('ORIGIN_AIRPORT')['ORIGIN_AIRPORT_NAME']
)
forecast_data['ORIGIN_AIRPORT_NAME'] = forecast_data['ORIGIN_AIRPORT_NAME'].fillna(
    forecast_data['ORIGIN_AIRPORT'].map(name_by_airport)
)

forecast_data.head()

Unnamed: 0,DATE,ORIGIN_AIRPORT,ORIGIN_AIRPORT_NAME,DELAYED_FLIGHTS
0,2015-01-01,ATL,Hartsfield-Jackson Atlanta International Airport,162
1,2015-01-02,ATL,Hartsfield-Jackson Atlanta International Airport,445
2,2015-01-03,ATL,Hartsfield-Jackson Atlanta International Airport,547
3,2015-01-04,ATL,Hartsfield-Jackson Atlanta International Airport,653
4,2015-01-05,ATL,Hartsfield-Jackson Atlanta International Airport,317


In [11]:
# Save the forecast input table (row index is not written)
forecast_output_path = "forecast_data.parquet"
forecast_data.to_parquet(forecast_output_path, index=False)