# EDA

## Pre-análisis descriptivo

In [1]:
import pandas as pd
import numpy as np
import plotly.graph_objs as go
import seaborn as sns
import plotly.express as px
from plotly.subplots import make_subplots

In [2]:
# base de datos de vuelos
df = pd.read_parquet("../Preprocessing/flightsCleaned.parquet")

# Añadimos el FlightNum como variable categórica
df['FLIGHT_NUMBER']=df['FLIGHT_NUMBER'].astype(object) 
# Configuramos DATE como variable de tipo datetime
df['DATE'] = pd.to_datetime(df['DATE'])

pd.set_option('display.max_columns', None)
df.head()

Unnamed: 0,DATE,IATA_CODE,AIRLINE,FLIGHT_NUMBER,ORIGIN_AIRPORT,DESTINATION_AIRPORT,SCHEDULED_DEPARTURE,DEPARTURE_TIME,DEPARTURE_DELAY,TAXI_OUT,WHEELS_OFF,SCHEDULED_TIME,ELAPSED_TIME,AIR_TIME,DISTANCE,WHEELS_ON,TAXI_IN,SCHEDULED_ARRIVAL,ARRIVAL_TIME,ARRIVAL_DELAY,AIR_SYSTEM_DELAY,SECURITY_DELAY,AIRLINE_DELAY,LATE_AIRCRAFT_DELAY,WEATHER_DELAY,OTHER_DELAY,ORIGIN_CITY,ORIGIN_STATE,ORIGIN_COUNTRY,ORIGIN_LATITUDE,ORIGIN_LONGITUDE,DESTINATION_CITY,DESTINATION_STATE,DESTINATION_COUNTRY,DESTINATION_LATITUDE,DESTINATION_LONGITUDE
0,2015-01-01,AS,Alaska Airlines Inc.,98,ANC,SEA,5,2354.0,-11.0,21.0,15.0,205.0,194.0,169.0,1448,404.0,4.0,430,408.0,-22.0,0.0,0.0,0.0,0.0,0.0,0.0,Anchorage,AK,USA,61.17432,-149.99619,Seattle,WA,USA,47.44898,-122.30931
1,2015-01-01,AA,American Airlines Inc.,2336,LAX,PBI,10,2.0,-8.0,12.0,14.0,280.0,279.0,263.0,2330,737.0,4.0,750,741.0,-9.0,0.0,0.0,0.0,0.0,0.0,0.0,Los Angeles,CA,USA,33.94254,-118.40807,West Palm Beach,FL,USA,26.68316,-80.09559
2,2015-01-01,US,US Airways Inc.,840,SFO,CLT,20,18.0,-2.0,16.0,34.0,286.0,293.0,266.0,2296,800.0,11.0,806,811.0,5.0,0.0,0.0,0.0,0.0,0.0,5.0,San Francisco,CA,USA,37.619,-122.37484,Charlotte,NC,USA,35.21401,-80.94313
3,2015-01-01,AA,American Airlines Inc.,258,LAX,MIA,20,15.0,-5.0,15.0,30.0,285.0,281.0,258.0,2342,748.0,8.0,805,756.0,-9.0,0.0,0.0,0.0,0.0,0.0,0.0,Los Angeles,CA,USA,33.94254,-118.40807,Miami,FL,USA,25.79325,-80.29056
4,2015-01-01,AS,Alaska Airlines Inc.,135,SEA,ANC,25,24.0,-1.0,11.0,35.0,235.0,215.0,199.0,1448,254.0,5.0,320,259.0,-21.0,0.0,0.0,0.0,0.0,0.0,0.0,Seattle,WA,USA,47.44898,-122.30931,Anchorage,AK,USA,61.17432,-149.99619


In [3]:
# base de datos de aeropuertos
airports_date = pd.read_parquet("../Preprocessing/airportsDateCleaned.parquet")
airports_date.head()

FileNotFoundError: [Errno 2] No such file or directory: '../Preprocessing/airportsDateCleaned.parquet'

In [None]:
# Observamos el tamaño de los aeropuertos
fig = px.treemap(airports_date, path=["ORIGIN_AIRPORT"], values='FLIGHTS',
                  color='DELAYED_PERCENTAGE', hover_data=['ORIGIN_CITY'],
                  color_continuous_scale='Viridis',
                  template="plotly_dark")
fig.update_layout(title="Departure Flights by Airport",
                  margin = dict(t=75, l=25, r=25, b=25),
                  width = 1400)
fig.show()

Escogemos los 16 aeropuertos con más vuelos para hacer el análisis. De estos aeropuertos salen la mayor parte de vuelos nacionales en EEUU y por ello implantaremos las medidas necesarias en ellos como prueba beta.

In [6]:
airports = pd.read_parquet("../Preprocessing/airortsCleaned.parquet")
top16_airports = airports.head(16)["ORIGIN_AIRPORT"]

FileNotFoundError: [Errno 2] No such file or directory: '../Preprocessing/airortsCleaned.parquet'

Filtramos en los dos dataframes que usaremos por estos aeropuertos

In [None]:
df_airports = airports_date[airports_date["ORIGIN_AIRPORT"].isin(top16_airports)]
df_airports.head()

In [None]:
df_flights=df[df["ORIGIN_AIRPORT"].isin(top16_airports)]
df_flights.head()

In [None]:
# Porcentaje de vuelos con los que nos quedamos
print("{:.2%}".format(df_flights.shape[0] / df.shape[0]))

Es interesante el dato de que el 48.71% de los vuelos nacionales de EEUU salen de tan solo 16 aeropuertos. <br>
Esto reducirá el coste de implantar las recomendaciones sacadas de este análisis, ya que solo habrá que realizar cambios en 16 aeropuertos para mejorar casi el 50% de los vuelos. Es parecido a realizar un test A/B en la vida real.

In [None]:
#fig = px.bar(airports, x="FLIGHTS", y="ORIGIN_AIRPORT", orientation='h')
#fig.update_layout(yaxis={'categoryorder':'total ascending'}) # add only this line
#fig.show()

In [None]:
# Guardamos los datos ya que serán los utilizados en la mayoría nuestros análisis
df_flights.to_parquet("df_flights.parquet", index=False)
df_airports.to_parquet("df_airports.parquet", index=False)

## Análisis Descriptivo

Primero hacemos un análisis de los retrasos a lo largo del año

In [None]:
#del data
data = pd.DataFrame()
data["Total"] = df_flights.groupby('DATE')["FLIGHT_NUMBER"].count()
data["Delayed"] = df_flights[df_flights["ARRIVAL_DELAY"]>0].groupby('DATE')["FLIGHT_NUMBER"].count()
data.head()

In [None]:
np.mean(data["Delayed"])

In [None]:
fig = px.line(data, x=data.index, y=["Total", "Delayed"], color_discrete_sequence=px.colors.qualitative.Vivid)
fig.update_layout(
    title="Trend in total and delayed flights throughout the year",
    xaxis_title="Date",
    yaxis_title="Flights",
    legend_title="Number of flights",
    template="plotly_dark",
    hovermode="x unified"
)

fig.add_hline(y=np.mean(data["Total"]), line_dash="dash", line_color="white", annotation_text="mean", annotation_position="top right")
fig.add_hline(y=np.mean(data["Delayed"]), line_dash="dash", line_color="white", annotation_text="mean", annotation_position="bottom right")

fig.show()

Parece haber una tendencia semanal, vamos a confirmarlo

In [None]:
## Definimos la paleta usada con plotly express, para poder utilizarla con plotly normal
color_palette = px.colors.qualitative.Vivid

In [None]:
data = pd.DataFrame()
data["Total"] = df_flights.groupby(df_flights['DATE'].dt.day_name())["FLIGHT_NUMBER"].count()
data["Delayed"] = df_flights[df_flights["ARRIVAL_DELAY"]>0].groupby(df_flights['DATE'].dt.day_name())["FLIGHT_NUMBER"].count()
data = data.reindex(index = ['Monday','Tuesday','Wednesday','Thursday','Friday','Saturday','Sunday'])
data.head()

In [None]:
fig = go.Figure()
fig.add_trace(go.Bar(x=data.index, y=data["Total"], name="Total", marker_color = color_palette[0]))
fig.add_trace(go.Scatter(x=data.index, y=data["Delayed"], name="Delayed", line_color=color_palette[1]))
fig.update_layout(
    title="Total and delayed flights by airline",
    xaxis_title="Weekday",
    yaxis_title="Flights",
    legend_title="Number of Flights",
    template="plotly_dark",
    hovermode="x unified"
)
fig.show()

Efectivamente, los sábados hay menos vuelos

Lo siguiente será analizar cuánto se retrasan de media los vuelos y debido a qué causas, a lo largo del año

In [None]:
data = df_flights.groupby(df_flights['DATE'].dt.month_name()).mean()
ORDERED_MONTHS = ["January", "February", "March", "April", "May", "June",
      "July", "August", "September", "October", "November", "December"]
data = data.reindex(index = ORDERED_MONTHS)

In [None]:
fig = go.Figure()

fig.add_trace(go.Scatter(x=data.index, y=data["AIR_SYSTEM_DELAY"], stackgroup='one', name="Air system", line_color=color_palette[0]))
fig.add_trace(go.Scatter(x=data.index, y=data["SECURITY_DELAY"], stackgroup='one', name="Security", line_color=color_palette[1])) 
fig.add_trace(go.Scatter(x=data.index, y=data["AIRLINE_DELAY"], stackgroup='one', name="Airline", line_color=color_palette[2])) 
fig.add_trace(go.Scatter(x=data.index, y=data["LATE_AIRCRAFT_DELAY"], stackgroup='one', name="Late aircraft", line_color=color_palette[3])) 
fig.add_trace(go.Scatter(x=data.index, y=data["WEATHER_DELAY"], stackgroup='one', name="Weather", line_color=color_palette[4])) 
fig.add_trace(go.Scatter(x=data.index, y=data["OTHER_DELAY"], stackgroup='one', name="Other", line_color=color_palette[5])) 

fig.update_layout(
    title="Average delay and cause over the year",
    xaxis_title="Month",
    yaxis_title="Minutes",
    legend_title="Cause of delay",
    template="plotly_dark",
    hovermode="x unified"
)

fig.show()

¿Correlación entre distancia y retraso?

In [None]:
# fig = px.scatter(df_flights[df_flights["ARRIVAL_DELAY"]>0], x="ARRIVAL_DELAY", y="DISTANCE", color_discrete_sequence=px.colors.qualitative.Vivid)
# fig.update_layout(
#     title="Tendencia vuelos totales y retrasados a lo largo del año",
#     xaxis_title="Fecha",
#     yaxis_title="Vuelos",
#     legend_title="Leyenda",
#     template="plotly_dark"
# )
# fig.show()

No hay correlación entre la distancia y el retraso de los vuelos

In [None]:
df_airports2 = airports[airports["ORIGIN_AIRPORT"].isin(top16_airports)]
df_airports2.head()

In [None]:
# variables_to_group_by = ["ORIGIN_AIRPORT","ORIGIN_AIRPORT_NAME","ORIGIN_CITY","ORIGIN_STATE"]
# df_airports.groupby(variables_to_group_by).mean().head()

In [None]:
fig = px.scatter_geo(df_airports2, lat="ORIGIN_LATITUDE", lon = "ORIGIN_LONGITUDE",
                     size= "FLIGHTS", # size of markers
                     size_max= 30,
                     color= "DELAYED_PERCENTAGE", # which column to use to set the color of markers
                     scope="usa",
                     text = "ORIGIN_AIRPORT",
                     hover_data  = ["ORIGIN_CITY"],
                     color_continuous_scale='RdYlGn_r',
                     template="plotly_dark")
fig.update_traces(textposition="top center")
fig.update_layout(
    title="Origin airports with number of departing flights and percentage of delayed flights <br><br><sup>Size indicates the number of departing flights</sup>",
    legend_title="Causa del Retraso",
    width = 1000, 
    height = 650
)
fig.show()

Estudiamos la disvisión de los retrasos en cada aeropuerto

In [None]:
fig = make_subplots(rows=4, cols=4, subplot_titles=df_airports2["ORIGIN_AIRPORT"].values,
                    specs=[[{"type": "pie"}, {"type": "pie"}, {"type": "pie"}, {"type": "pie"}],
                           [{"type": "pie"}, {"type": "pie"}, {"type": "pie"}, {"type": "pie"}],
                           [{"type": "pie"}, {"type": "pie"}, {"type": "pie"}, {"type": "pie"}],
                           [{"type": "pie"}, {"type": "pie"}, {"type": "pie"}, {"type": "pie"}]],
                    horizontal_spacing = 0.03, vertical_spacing = 0.03)

delay_labels = ["AIR_SYSTEM_DELAY","SECURITY_DELAY","AIRLINE_DELAY","LATE_AIRCRAFT_DELAY","WEATHER_DELAY","OTHER_DELAY"]

for i in range(4):
    values1 = df_airports2[delay_labels].iloc[i]
    fig.add_trace(go.Pie(labels=delay_labels, values=values1, direction ='clockwise', marker_colors=px.colors.qualitative.Vivid, hole=.3),
              row=1, col=i+1)
    values2 = df_airports2[delay_labels].iloc[i+4]
    fig.add_trace(go.Pie(labels=delay_labels, values=values2, direction ='clockwise', marker_colors=px.colors.qualitative.Vivid, hole=.3),
              row=2, col=i+1)
    values3 = df_airports2[delay_labels].iloc[i+8]
    fig.add_trace(go.Pie(labels=delay_labels, values=values3, direction ='clockwise', marker_colors=px.colors.qualitative.Vivid, hole=.3),
              row=3, col=i+1)
    values4 = df_airports2[delay_labels].iloc[i+12]
    fig.add_trace(go.Pie(labels=delay_labels, values=values4, direction ='clockwise', marker_colors=px.colors.qualitative.Vivid, hole=.3),
              row=4, col=i+1)

fig.update_layout(title_text="Average Delay Distribution by Airport", legend_title="Delay Cause", template="plotly_dark", height=1400, width=1400,
                    legend=dict(orientation="h", y=-0.02, x =0.08))
fig.update_annotations(yshift=-150)

fig.show()

No se si hacer barras apiladas con la misma información

In [None]:
round(df["ARRIVAL_DELAY"],2)

In [None]:
fig = px.density_heatmap(df_flights, y=df_flights["DATE"].dt.month_name(), x=df_flights["ORIGIN_AIRPORT"], z='ARRIVAL_DELAY', histfunc="avg",
                         color_continuous_scale='RdYlGn_r', text_auto=".2f")

fig.update_layout(
    title="Average Delay by Airport and Month",
    xaxis_title="Month",
    yaxis_title="Airport",
    template="plotly_dark",
    width = 1200, 
    height = 650
)

fig.layout.coloraxis.colorbar.title = 'Average Delay'

fig.show()

# FIN