In [1]:
import pandas as pd
import numpy as np
import pickle

In [2]:
# Never truncate the column list when a wide DataFrame is rendered.
pd.options.display.max_columns = None

# Procesado de los datos

## Lectura de datos

In [3]:
# Load the raw flight records and drop exact duplicate rows: the same flight
# cannot be delayed more than once at the same point in time.
# The two airport columns hold both IATA strings and numeric ids, so they are
# declared as strings up front instead of letting read_csv infer a mixed
# int/str column (presumably the source of the load-time warning — confirm).
df = pd.read_csv(
    "flights.csv",
    dtype={"ORIGIN_AIRPORT": str, "DESTINATION_AIRPORT": str},
)
df = df.drop_duplicates()
df.head()

  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0,YEAR,MONTH,DAY,DAY_OF_WEEK,AIRLINE,FLIGHT_NUMBER,TAIL_NUMBER,ORIGIN_AIRPORT,DESTINATION_AIRPORT,SCHEDULED_DEPARTURE,DEPARTURE_TIME,DEPARTURE_DELAY,TAXI_OUT,WHEELS_OFF,SCHEDULED_TIME,ELAPSED_TIME,AIR_TIME,DISTANCE,WHEELS_ON,TAXI_IN,SCHEDULED_ARRIVAL,ARRIVAL_TIME,ARRIVAL_DELAY,DIVERTED,CANCELLED,CANCELLATION_REASON,AIR_SYSTEM_DELAY,SECURITY_DELAY,AIRLINE_DELAY,LATE_AIRCRAFT_DELAY,WEATHER_DELAY
0,2015,1,1,4,AS,98,N407AS,ANC,SEA,5,2354.0,-11.0,21.0,15.0,205.0,194.0,169.0,1448,404.0,4.0,430,408.0,-22.0,0,0,,,,,,
1,2015,1,1,4,AA,2336,N3KUAA,LAX,PBI,10,2.0,-8.0,12.0,14.0,280.0,279.0,263.0,2330,737.0,4.0,750,741.0,-9.0,0,0,,,,,,
2,2015,1,1,4,US,840,N171US,SFO,CLT,20,18.0,-2.0,16.0,34.0,286.0,293.0,266.0,2296,800.0,11.0,806,811.0,5.0,0,0,,,,,,
3,2015,1,1,4,AA,258,N3HYAA,LAX,MIA,20,15.0,-5.0,15.0,30.0,285.0,281.0,258.0,2342,748.0,8.0,805,756.0,-9.0,0,0,,,,,,
4,2015,1,1,4,AS,135,N527AS,SEA,ANC,25,24.0,-1.0,11.0,35.0,235.0,215.0,199.0,1448,254.0,5.0,320,259.0,-21.0,0,0,,,,,,


## Arreglo de datos

In [4]:
# Store both airport-code columns as plain strings so the .str accessor can be used on them.
for airport_col in ('ORIGIN_AIRPORT', 'DESTINATION_AIRPORT'):
    df[airport_col] = df[airport_col].astype(str)

Hay una serie de códigos de aeropuertos de origen y destino que no corresponden con el IATA_CODE asociado al aeropuerto, sino que aparece un id numérico.

In [5]:
# Count the rows whose airport code consists only of digits (a numeric id, not an IATA code).
origin_is_numeric = df['ORIGIN_AIRPORT'].str.isdigit()
destination_is_numeric = df['DESTINATION_AIRPORT'].str.isdigit()
print(f"Aeropuertos origen: {len(df[origin_is_numeric])}")
print(f"Aeropuertos destino: {len(df[destination_is_numeric])}")

Aeropuertos origen: 486165
Aeropuertos destino: 486165


Para corregir esto hacemos uso del archivo airports_dict, el cual genera un diccionario en el que se asocian las claves de aeropuerto numéricas a un código str como el que tenemos en el resto de casos en función de las rutas de los vuelos.

In [6]:
# Load the mapping used below to translate numeric airport ids into airport codes.
# NOTE(review): the file is named .json but is read with pickle; pickle.load can run
# arbitrary code, so this file must come from a trusted source — confirm provenance.
with open('dict_airport.json', 'rb') as mapping_file:
    dict_airport = pickle.load(mapping_file)

In [7]:
# Replace numeric airport ids with the airport code found in dict_airport.
# Values that are not keys of dict_airport map to NaN, and fillna puts the
# original value back for them, so those rows are left unchanged.
# (The previous version went through two temporary columns and initialised the
# destination one from ORIGIN_AIRPORT by mistake before overwriting it.)
for airport_col in ('ORIGIN_AIRPORT', 'DESTINATION_AIRPORT'):
    df[airport_col] = df[airport_col].map(dict_airport).fillna(df[airport_col])

In [8]:
# Check: how many rows still carry a purely numeric airport code after the replacement.
numeric_origin = df['ORIGIN_AIRPORT'].str.isdigit()
numeric_destination = df['DESTINATION_AIRPORT'].str.isdigit()
print(f"Aeropuertos origen: {len(df[numeric_origin])}")
print(f"Aeropuertos destino: {len(df[numeric_destination])}")

Aeropuertos origen: 1235
Aeropuertos destino: 1235


**NOTA:** vemos que hay una serie de índices que el algoritmo no consigue emparejar con un aeropuerto. No obstante, estos datos no corresponden con una muestra representativa de los datos (poco volumen).

## Selección de los datos

Trabajaremos únicamente con los datos correspondientes a los 10 aeropuertos de Estados Unidos con mayor tráfico aéreo:

In [9]:
# Number of flights per origin airport, busiest first.
airports = (
    df.groupby('ORIGIN_AIRPORT')["FLIGHT_NUMBER"]
    .count()
    .to_frame(name="TOTAL_FLIGHTS")
    .sort_values('TOTAL_FLIGHTS', ascending=False)
)
airports

Unnamed: 0_level_0,TOTAL_FLIGHTS
ORIGIN_AIRPORT,Unnamed: 1_level_1
ATL,379424
ORD,313536
DFW,260595
DEN,214191
LAX,212401
...,...
12255,53
12888,53
ITH,34
11097,31


In [10]:
# Turn the ORIGIN_AIRPORT index back into a regular column (rows get a fresh 0..n-1 index).
airports = airports.reset_index(level=0, drop=False)

# Take the first 10 rows of the table.
aux = airports.iloc[:10]

# Their airport codes as a plain list.
top_10_airports = aux['ORIGIN_AIRPORT'].tolist()
top_10_airports

['ATL', 'ORD', 'DFW', 'DEN', 'LAX', 'SFO', 'IAH', 'PHX', 'LAS', 'MSP']

In [11]:
# Keep only the flights whose origin airport is in the top-10 list.
departs_from_top_10 = df['ORIGIN_AIRPORT'].isin(top_10_airports)
df = df[departs_from_top_10]
len(df)

2130464

In [12]:
# Preview the first rows after filtering by origin airport.
df.head()

Unnamed: 0,YEAR,MONTH,DAY,DAY_OF_WEEK,AIRLINE,FLIGHT_NUMBER,TAIL_NUMBER,ORIGIN_AIRPORT,DESTINATION_AIRPORT,SCHEDULED_DEPARTURE,DEPARTURE_TIME,DEPARTURE_DELAY,TAXI_OUT,WHEELS_OFF,SCHEDULED_TIME,ELAPSED_TIME,AIR_TIME,DISTANCE,WHEELS_ON,TAXI_IN,SCHEDULED_ARRIVAL,ARRIVAL_TIME,ARRIVAL_DELAY,DIVERTED,CANCELLED,CANCELLATION_REASON,AIR_SYSTEM_DELAY,SECURITY_DELAY,AIRLINE_DELAY,LATE_AIRCRAFT_DELAY,WEATHER_DELAY
1,2015,1,1,4,AA,2336,N3KUAA,LAX,PBI,10,2.0,-8.0,12.0,14.0,280.0,279.0,263.0,2330,737.0,4.0,750,741.0,-9.0,0,0,,,,,,
2,2015,1,1,4,US,840,N171US,SFO,CLT,20,18.0,-2.0,16.0,34.0,286.0,293.0,266.0,2296,800.0,11.0,806,811.0,5.0,0,0,,,,,,
3,2015,1,1,4,AA,258,N3HYAA,LAX,MIA,20,15.0,-5.0,15.0,30.0,285.0,281.0,258.0,2342,748.0,8.0,805,756.0,-9.0,0,0,,,,,,
5,2015,1,1,4,DL,806,N3730B,SFO,MSP,25,20.0,-5.0,18.0,38.0,217.0,230.0,206.0,1589,604.0,6.0,602,610.0,8.0,0,0,,,,,,
6,2015,1,1,4,NK,612,N635NK,LAS,MSP,25,19.0,-6.0,11.0,30.0,181.0,170.0,154.0,1299,504.0,5.0,526,509.0,-17.0,0,0,,,,,,


In [13]:
# Compare the number of flights in September (MONTH == 9) and October (MONTH == 10).
september = df[df['MONTH'] == 9]
october = df[df['MONTH'] == 10]
print(f"septiembre: {len(september)}")
print(f"octubre: {len(october)}")

septiembre: 172409
octubre: 180722


Volumen de vuelos parecido en ambos meses, por lo que damos por válida la corrección del código de los aeropuertos.

## Análisis de variables

In [14]:
# List the columns currently present in the flights table.
df.columns

Index(['YEAR', 'MONTH', 'DAY', 'DAY_OF_WEEK', 'AIRLINE', 'FLIGHT_NUMBER',
       'TAIL_NUMBER', 'ORIGIN_AIRPORT', 'DESTINATION_AIRPORT',
       'SCHEDULED_DEPARTURE', 'DEPARTURE_TIME', 'DEPARTURE_DELAY', 'TAXI_OUT',
       'WHEELS_OFF', 'SCHEDULED_TIME', 'ELAPSED_TIME', 'AIR_TIME', 'DISTANCE',
       'WHEELS_ON', 'TAXI_IN', 'SCHEDULED_ARRIVAL', 'ARRIVAL_TIME',
       'ARRIVAL_DELAY', 'DIVERTED', 'CANCELLED', 'CANCELLATION_REASON',
       'AIR_SYSTEM_DELAY', 'SECURITY_DELAY', 'AIRLINE_DELAY',
       'LATE_AIRCRAFT_DELAY', 'WEATHER_DELAY'],
      dtype='object')

In [15]:
# Print the dtype of every column and the size of the table.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2130464 entries, 1 to 5819074
Data columns (total 31 columns):
 #   Column               Dtype  
---  ------               -----  
 0   YEAR                 int64  
 1   MONTH                int64  
 2   DAY                  int64  
 3   DAY_OF_WEEK          int64  
 4   AIRLINE              object 
 5   FLIGHT_NUMBER        int64  
 6   TAIL_NUMBER          object 
 7   ORIGIN_AIRPORT       object 
 8   DESTINATION_AIRPORT  object 
 9   SCHEDULED_DEPARTURE  int64  
 10  DEPARTURE_TIME       float64
 11  DEPARTURE_DELAY      float64
 12  TAXI_OUT             float64
 13  WHEELS_OFF           float64
 14  SCHEDULED_TIME       float64
 15  ELAPSED_TIME         float64
 16  AIR_TIME             float64
 17  DISTANCE             int64  
 18  WHEELS_ON            float64
 19  TAXI_IN              float64
 20  SCHEDULED_ARRIVAL    int64  
 21  ARRIVAL_TIME         float64
 22  ARRIVAL_DELAY        float64
 23  DIVERTED             int64  
 24

Columnas que se pueden eliminar:
- TAIL NUMBER: representa un ID de avión único por lo que no corresponde con una variable representativa para el análisis por lo que la eliminamos

### Debatir:
Nos importan..?
- SCHEDULED_DEPARTURE: yo diría que no, si llega en hora = OK!
- DEPARTURE_TIME: es interesante analizar los retrasos por tramo horario?
- DEPARTURE_DELAY: yo diría que no, si sale tarde pero llega en hora = OK!
- ARRIVAL_TIME: yo diría que no, solo me interesa si el vuelo llega tarde, no?
- SCHEDULED_TIME: no se a que se refiere  
- SCHEDULED_ARRIVAL: solo nos interesa saber si se retrasa o no, no la previsión

In [16]:
# TAIL_NUMBER identifies the aircraft (not the flight), so it is dropped.
df = df.drop(columns="TAIL_NUMBER")

Hay un error de formato con la variable flight number, que es un indicador del vuelo y por lo tanto una variable categórica. Lo mismo ocurre con Cancelled y Diverted

In [17]:
# FLIGHT_NUMBER, CANCELLED and DIVERTED are treated as categorical, so store them as object dtype.
for categorical_col in ('FLIGHT_NUMBER', 'CANCELLED', 'DIVERTED'):
    df[categorical_col] = df[categorical_col].astype(object)

In [18]:
# Combine YEAR, MONTH and DAY into a single datetime column named DATE.
# pd.to_datetime can assemble timestamps directly from year/month/day columns,
# which avoids building one string per row and re-parsing it with an inferred
# (and potentially ambiguous) month/day order.
date_parts = df[["YEAR", "MONTH", "DAY"]].rename(columns=str.lower)
df["DATE"] = pd.to_datetime(date_parts)

Si nos interesa en algún momento podríamos añadir las horas también 

In [19]:
# YEAR, MONTH and DAY are now contained in DATE, and DAY_OF_WEEK can be derived
# from DATE as well, so all four columns are dropped.
df = df.drop(columns=["YEAR", "MONTH", "DAY", "DAY_OF_WEEK"])

# Example: recover the weekday from DATE.
# NOTE: dayofweek is zero-based, with 0 = Monday.
first_date = df['DATE'].iloc[0]
print(f"Ejemplo: obtener dia de la semana de la fecha {first_date} --> {first_date.day_name()} (dia {first_date.dayofweek})")

Ejemplo: obtener dia de la semana de la fecha 2015-01-01 00:00:00 --> Thursday (dia 3)


In [20]:
# Reorder the columns so that the last one (DATE) becomes the first.
cols = df.columns.tolist()
cols = [cols[-1], *cols[:-1]]
df = df[cols]
df.head()

Unnamed: 0,DATE,AIRLINE,FLIGHT_NUMBER,ORIGIN_AIRPORT,DESTINATION_AIRPORT,SCHEDULED_DEPARTURE,DEPARTURE_TIME,DEPARTURE_DELAY,TAXI_OUT,WHEELS_OFF,SCHEDULED_TIME,ELAPSED_TIME,AIR_TIME,DISTANCE,WHEELS_ON,TAXI_IN,SCHEDULED_ARRIVAL,ARRIVAL_TIME,ARRIVAL_DELAY,DIVERTED,CANCELLED,CANCELLATION_REASON,AIR_SYSTEM_DELAY,SECURITY_DELAY,AIRLINE_DELAY,LATE_AIRCRAFT_DELAY,WEATHER_DELAY
1,2015-01-01,AA,2336,LAX,PBI,10,2.0,-8.0,12.0,14.0,280.0,279.0,263.0,2330,737.0,4.0,750,741.0,-9.0,0,0,,,,,,
2,2015-01-01,US,840,SFO,CLT,20,18.0,-2.0,16.0,34.0,286.0,293.0,266.0,2296,800.0,11.0,806,811.0,5.0,0,0,,,,,,
3,2015-01-01,AA,258,LAX,MIA,20,15.0,-5.0,15.0,30.0,285.0,281.0,258.0,2342,748.0,8.0,805,756.0,-9.0,0,0,,,,,,
5,2015-01-01,DL,806,SFO,MSP,25,20.0,-5.0,18.0,38.0,217.0,230.0,206.0,1589,604.0,6.0,602,610.0,8.0,0,0,,,,,,
6,2015-01-01,NK,612,LAS,MSP,25,19.0,-6.0,11.0,30.0,181.0,170.0,154.0,1299,504.0,5.0,526,509.0,-17.0,0,0,,,,,,


Hablar formato tiempos: 
    SCHEDULED_DEPARTURE, DEPARTURE_TIME, SCHEDULED_TIME
    SCHEDULED_ARRIVAL, ARRIVAL_TIME (se podrían eliminar dado que nos sirven para sacar arrival delay)

## Debatir:

Las variables SCHEDULED_DEPARTURE, DEPARTURE_TIME, DEPARTURE_DELAY, ARRIVAL_TIME y SCHEDULED_ARRIVAL nos ayudan a saber si un vuelo se ha retrasado o no, pero una vez que sabemos esto no tienen valor en sí mismas, por lo que las podemos eliminar.

#### Tipos de vuelo:
1. On time/ arrived earlier --> arrival_delay <=0
2. Delayed   --> arrival_delay > 0
3. Diverted  --> diverted == 1
4. Cancelled --> cancelled == 1

##### Razones por las que se retrasa un vuelo:
- AIR_SYSTEM_DELAY     
- SECURITY_DELAY       
- AIRLINE_DELAY        
- LATE_AIRCRAFT_DELAY 
- WEATHER_DELAY       

In [21]:
# Count the missing values in each column.
df.isna().sum()

DATE                         0
AIRLINE                      0
FLIGHT_NUMBER                0
ORIGIN_AIRPORT               0
DESTINATION_AIRPORT          0
SCHEDULED_DEPARTURE          0
DEPARTURE_TIME           27790
DEPARTURE_DELAY          27790
TAXI_OUT                 28925
WHEELS_OFF               28925
SCHEDULED_TIME               2
ELAPSED_TIME             34662
AIR_TIME                 34662
DISTANCE                     0
WHEELS_ON                30406
TAXI_IN                  30406
SCHEDULED_ARRIVAL            0
ARRIVAL_TIME             30406
ARRIVAL_DELAY            34662
DIVERTED                     0
CANCELLED                    0
CANCELLATION_REASON    2101083
AIR_SYSTEM_DELAY       1717078
SECURITY_DELAY         1717078
AIRLINE_DELAY          1717078
LATE_AIRCRAFT_DELAY    1717078
WEATHER_DELAY          1717078
dtype: int64

Las variables CANCELLATION_REASON, AIR_SYSTEM_DELAY, SECURITY_DELAY, AIRLINE_DELAY, LATE_AIRCRAFT_DELAY,WEATHER_DELAY presentan una gran cantidad de valores nulos. No obstante, todas estas variables se relacionan con vuelos cancelados o retrasados por lo que tiene sentido que sean valores nulos para aquellos vuelos que no se hayan cancelado ni retrasado. Vamos a analizar estas variables

In [22]:
# Missing values per column, restricted to cancelled flights (CANCELLED == 1).
is_cancelled = df['CANCELLED'] == 1
cancelled = df.loc[is_cancelled]
cancelled.isna().sum()

DATE                       0
AIRLINE                    0
FLIGHT_NUMBER              0
ORIGIN_AIRPORT             0
DESTINATION_AIRPORT        0
SCHEDULED_DEPARTURE        0
DEPARTURE_TIME         27790
DEPARTURE_DELAY        27790
TAXI_OUT               28925
WHEELS_OFF             28925
SCHEDULED_TIME             2
ELAPSED_TIME           29381
AIR_TIME               29381
DISTANCE                   0
WHEELS_ON              29381
TAXI_IN                29381
SCHEDULED_ARRIVAL          0
ARRIVAL_TIME           29381
ARRIVAL_DELAY          29381
DIVERTED                   0
CANCELLED                  0
CANCELLATION_REASON        0
AIR_SYSTEM_DELAY       29381
SECURITY_DELAY         29381
AIRLINE_DELAY          29381
LATE_AIRCRAFT_DELAY    29381
WEATHER_DELAY          29381
dtype: int64

Vemos cómo en este caso no existen valores nulos para la columna CANCELLATION_REASON. No obstante, nuestro análisis consiste en predecir el retraso de vuelos, por lo que no necesitamos la info de vuelos cancelados/redirigidos, así que eliminamos dichos registros.

In [23]:
# Keep only flights that were neither cancelled nor diverted, then remove the
# three columns that describe cancellations/diversions.
completed = (df['CANCELLED'] == 0) & (df['DIVERTED'] == 0)
df = df[completed].drop(columns=["DIVERTED", "CANCELLED", "CANCELLATION_REASON"])

Nos quedamos solo con los vuelos que llegan antes de lo previsto, en hora o con retraso

In [24]:
# Number of flights left after removing cancelled and diverted ones.
len(df)

2095802

In [25]:
# Missing values per column for the remaining flights.
df.isna().sum()

DATE                         0
AIRLINE                      0
FLIGHT_NUMBER                0
ORIGIN_AIRPORT               0
DESTINATION_AIRPORT          0
SCHEDULED_DEPARTURE          0
DEPARTURE_TIME               0
DEPARTURE_DELAY              0
TAXI_OUT                     0
WHEELS_OFF                   0
SCHEDULED_TIME               0
ELAPSED_TIME                 0
AIR_TIME                     0
DISTANCE                     0
WHEELS_ON                    0
TAXI_IN                      0
SCHEDULED_ARRIVAL            0
ARRIVAL_TIME                 0
ARRIVAL_DELAY                0
AIR_SYSTEM_DELAY       1682416
SECURITY_DELAY         1682416
AIRLINE_DELAY          1682416
LATE_AIRCRAFT_DELAY    1682416
WEATHER_DELAY          1682416
dtype: int64

Realizamos ahora el análisis de los valores nulos relacionados con los vuelos retrasados.

**NOTA**: consideramos que un vuelo se retrasa si llega pasada la hora prevista, independientemente de si ha tenido retraso en la hora de salida o no

In [26]:
# Delayed flights are those with ARRIVAL_DELAY > 0; count their missing values per column.
arrived_late = df['ARRIVAL_DELAY'].gt(0)
delayed = df.loc[arrived_late]
delayed.isna().sum()

DATE                        0
AIRLINE                     0
FLIGHT_NUMBER               0
ORIGIN_AIRPORT              0
DESTINATION_AIRPORT         0
SCHEDULED_DEPARTURE         0
DEPARTURE_TIME              0
DEPARTURE_DELAY             0
TAXI_OUT                    0
WHEELS_OFF                  0
SCHEDULED_TIME              0
ELAPSED_TIME                0
AIR_TIME                    0
DISTANCE                    0
WHEELS_ON                   0
TAXI_IN                     0
SCHEDULED_ARRIVAL           0
ARRIVAL_TIME                0
ARRIVAL_DELAY               0
AIR_SYSTEM_DELAY       385520
SECURITY_DELAY         385520
AIRLINE_DELAY          385520
LATE_AIRCRAFT_DELAY    385520
WEATHER_DELAY          385520
dtype: int64

Si considerábamos todos los vuelos teníamos 1682416 registros con valor nulo en las variables DELAY; cuando filtramos por vuelos retrasados tenemos 385520.
Vamos a analizar primero los valores nulos de aquellos vuelos que se han retrasado

In [27]:
# Keep only the five delay-cause columns of the delayed flights.
delay_cause_cols = [
    'AIR_SYSTEM_DELAY',
    'SECURITY_DELAY',
    'AIRLINE_DELAY',
    'LATE_AIRCRAFT_DELAY',
    'WEATHER_DELAY',
]
delayed = delayed[delay_cause_cols]
delayed.head()

Unnamed: 0,AIR_SYSTEM_DELAY,SECURITY_DELAY,AIRLINE_DELAY,LATE_AIRCRAFT_DELAY,WEATHER_DELAY
2,,,,,
5,,,,,
20,,,,,
27,25.0,0.0,0.0,0.0,0.0
29,,,,,


In [28]:
# Select the rows in which all five delay-cause columns are NaN.
all_causes_missing = delayed[
    ['AIR_SYSTEM_DELAY', 'SECURITY_DELAY', 'AIRLINE_DELAY', 'LATE_AIRCRAFT_DELAY', 'WEATHER_DELAY']
].isna().all(axis=1)
nulls = delayed.loc[all_causes_missing]
len(nulls)

385520

Vemos que TODOS los NaN se concentran en las mismas filas. Entendemos que en este caso el vuelo se ha retrasado por causa desconocida. Para indicar esto creamos una nueva columna 'OTHER_DELAY' en nuestro data frame cuyo valor sea igual al delay

In [29]:
# OTHER_DELAY starts out as a copy of ARRIVAL_DELAY.
df['OTHER_DELAY'] = df['ARRIVAL_DELAY']

# Where a cause breakdown exists (AIR_SYSTEM_DELAY is not NaN; the NaNs of the
# cause columns were seen to fall on the same rows), OTHER_DELAY becomes the
# arrival delay minus the five reported causes.
has_breakdown = df['AIR_SYSTEM_DELAY'].notna()
unexplained = df['ARRIVAL_DELAY']
for cause_col in ('AIR_SYSTEM_DELAY', 'SECURITY_DELAY', 'AIRLINE_DELAY', 'LATE_AIRCRAFT_DELAY', 'WEATHER_DELAY'):
    unexplained = unexplained - df[cause_col]
df.loc[has_breakdown, 'OTHER_DELAY'] = unexplained
df.iloc[45:50]

Unnamed: 0,DATE,AIRLINE,FLIGHT_NUMBER,ORIGIN_AIRPORT,DESTINATION_AIRPORT,SCHEDULED_DEPARTURE,DEPARTURE_TIME,DEPARTURE_DELAY,TAXI_OUT,WHEELS_OFF,SCHEDULED_TIME,ELAPSED_TIME,AIR_TIME,DISTANCE,WHEELS_ON,TAXI_IN,SCHEDULED_ARRIVAL,ARRIVAL_TIME,ARRIVAL_DELAY,AIR_SYSTEM_DELAY,SECURITY_DELAY,AIRLINE_DELAY,LATE_AIRCRAFT_DELAY,WEATHER_DELAY,OTHER_DELAY
168,2015-01-01,NK,224,ORD,LGA,556,547.0,-9.0,16.0,603.0,124.0,160.0,92.0,733,835.0,52.0,900,927.0,27.0,27.0,0.0,0.0,0.0,0.0,0.0
169,2015-01-01,UA,1457,LAX,IAH,557,555.0,-2.0,12.0,607.0,192.0,178.0,160.0,1379,1047.0,6.0,1109,1053.0,-16.0,,,,,,-16.0
172,2015-01-01,UA,210,MSP,DEN,600,556.0,-4.0,14.0,610.0,139.0,120.0,100.0,680,650.0,6.0,719,656.0,-23.0,,,,,,-23.0
174,2015-01-01,UA,247,PHX,IAH,600,751.0,111.0,31.0,822.0,148.0,160.0,118.0,1009,1120.0,11.0,928,1131.0,123.0,12.0,0.0,0.0,0.0,111.0,0.0
183,2015-01-01,UA,1016,DEN,IAH,600,552.0,-8.0,11.0,603.0,142.0,146.0,114.0,862,857.0,21.0,922,918.0,-4.0,,,,,,-4.0


In [30]:
# A negative OTHER_DELAY means the flight arrived ahead of schedule, i.e. it was
# not delayed, so negative values are raised to 0.
df["OTHER_DELAY"] = df["OTHER_DELAY"].clip(lower=0)
df.iloc[45:50]

Unnamed: 0,DATE,AIRLINE,FLIGHT_NUMBER,ORIGIN_AIRPORT,DESTINATION_AIRPORT,SCHEDULED_DEPARTURE,DEPARTURE_TIME,DEPARTURE_DELAY,TAXI_OUT,WHEELS_OFF,SCHEDULED_TIME,ELAPSED_TIME,AIR_TIME,DISTANCE,WHEELS_ON,TAXI_IN,SCHEDULED_ARRIVAL,ARRIVAL_TIME,ARRIVAL_DELAY,AIR_SYSTEM_DELAY,SECURITY_DELAY,AIRLINE_DELAY,LATE_AIRCRAFT_DELAY,WEATHER_DELAY,OTHER_DELAY
168,2015-01-01,NK,224,ORD,LGA,556,547.0,-9.0,16.0,603.0,124.0,160.0,92.0,733,835.0,52.0,900,927.0,27.0,27.0,0.0,0.0,0.0,0.0,0.0
169,2015-01-01,UA,1457,LAX,IAH,557,555.0,-2.0,12.0,607.0,192.0,178.0,160.0,1379,1047.0,6.0,1109,1053.0,-16.0,,,,,,0.0
172,2015-01-01,UA,210,MSP,DEN,600,556.0,-4.0,14.0,610.0,139.0,120.0,100.0,680,650.0,6.0,719,656.0,-23.0,,,,,,0.0
174,2015-01-01,UA,247,PHX,IAH,600,751.0,111.0,31.0,822.0,148.0,160.0,118.0,1009,1120.0,11.0,928,1131.0,123.0,12.0,0.0,0.0,0.0,111.0,0.0
183,2015-01-01,UA,1016,DEN,IAH,600,552.0,-8.0,11.0,603.0,142.0,146.0,114.0,862,857.0,21.0,922,918.0,-4.0,,,,,,0.0


In [31]:
# Flights that arrived on time or early (ARRIVAL_DELAY <= 0): how many there are
# and how many missing values each column has for them.
on_time_or_early = df['ARRIVAL_DELAY'].le(0)
early_arrival = df.loc[on_time_or_early]

print(len(early_arrival))
early_arrival.isna().sum()

1296896


DATE                         0
AIRLINE                      0
FLIGHT_NUMBER                0
ORIGIN_AIRPORT               0
DESTINATION_AIRPORT          0
SCHEDULED_DEPARTURE          0
DEPARTURE_TIME               0
DEPARTURE_DELAY              0
TAXI_OUT                     0
WHEELS_OFF                   0
SCHEDULED_TIME               0
ELAPSED_TIME                 0
AIR_TIME                     0
DISTANCE                     0
WHEELS_ON                    0
TAXI_IN                      0
SCHEDULED_ARRIVAL            0
ARRIVAL_TIME                 0
ARRIVAL_DELAY                0
AIR_SYSTEM_DELAY       1296896
SECURITY_DELAY         1296896
AIRLINE_DELAY          1296896
LATE_AIRCRAFT_DELAY    1296896
WEATHER_DELAY          1296896
OTHER_DELAY                  0
dtype: int64

El resto de valores NaN en dichas columnas corresponden a aquellos vuelos que han llegado antes de tiempo a destino, cosa que tiene sentido dado que no han experimentado ningún delay

Una vez analizado el por qué de los valores NaN presentes en el dataset parece razonable sustituir dichos valores por 0

In [32]:
# Replace every remaining NaN with 0: the delay of those flights is now carried by OTHER_DELAY.
df = df.fillna(value=0)
df.iloc[45:50]

Unnamed: 0,DATE,AIRLINE,FLIGHT_NUMBER,ORIGIN_AIRPORT,DESTINATION_AIRPORT,SCHEDULED_DEPARTURE,DEPARTURE_TIME,DEPARTURE_DELAY,TAXI_OUT,WHEELS_OFF,SCHEDULED_TIME,ELAPSED_TIME,AIR_TIME,DISTANCE,WHEELS_ON,TAXI_IN,SCHEDULED_ARRIVAL,ARRIVAL_TIME,ARRIVAL_DELAY,AIR_SYSTEM_DELAY,SECURITY_DELAY,AIRLINE_DELAY,LATE_AIRCRAFT_DELAY,WEATHER_DELAY,OTHER_DELAY
168,2015-01-01,NK,224,ORD,LGA,556,547.0,-9.0,16.0,603.0,124.0,160.0,92.0,733,835.0,52.0,900,927.0,27.0,27.0,0.0,0.0,0.0,0.0,0.0
169,2015-01-01,UA,1457,LAX,IAH,557,555.0,-2.0,12.0,607.0,192.0,178.0,160.0,1379,1047.0,6.0,1109,1053.0,-16.0,0.0,0.0,0.0,0.0,0.0,0.0
172,2015-01-01,UA,210,MSP,DEN,600,556.0,-4.0,14.0,610.0,139.0,120.0,100.0,680,650.0,6.0,719,656.0,-23.0,0.0,0.0,0.0,0.0,0.0,0.0
174,2015-01-01,UA,247,PHX,IAH,600,751.0,111.0,31.0,822.0,148.0,160.0,118.0,1009,1120.0,11.0,928,1131.0,123.0,12.0,0.0,0.0,0.0,111.0,0.0
183,2015-01-01,UA,1016,DEN,IAH,600,552.0,-8.0,11.0,603.0,142.0,146.0,114.0,862,857.0,21.0,922,918.0,-4.0,0.0,0.0,0.0,0.0,0.0,0.0


fillna() sustituye los NaN de todo el dataframe por lo que es importante destacar que podemos utilizar este método dado que los únicos campos NaN del dataframe se encuentran en las variables de delay analizadas. 

In [33]:
# Confirm that no missing values remain in any column.
df.isna().sum()

DATE                   0
AIRLINE                0
FLIGHT_NUMBER          0
ORIGIN_AIRPORT         0
DESTINATION_AIRPORT    0
SCHEDULED_DEPARTURE    0
DEPARTURE_TIME         0
DEPARTURE_DELAY        0
TAXI_OUT               0
WHEELS_OFF             0
SCHEDULED_TIME         0
ELAPSED_TIME           0
AIR_TIME               0
DISTANCE               0
WHEELS_ON              0
TAXI_IN                0
SCHEDULED_ARRIVAL      0
ARRIVAL_TIME           0
ARRIVAL_DELAY          0
AIR_SYSTEM_DELAY       0
SECURITY_DELAY         0
AIRLINE_DELAY          0
LATE_AIRCRAFT_DELAY    0
WEATHER_DELAY          0
OTHER_DELAY            0
dtype: int64

Resulta interesante saber si los vuelos pueden retrasarse por varios motivos a la vez o exclusivamente por uno. Vamos a investigarlo.

In [34]:
# Using WEATHER_DELAY as the reference cause: flights that have a weather delay
# which is not equal to their whole arrival delay.
has_weather_delay = df['WEATHER_DELAY'] > 0
weather_is_not_total = df['WEATHER_DELAY'] != df['ARRIVAL_DELAY']
weather = df.loc[weather_is_not_total & has_weather_delay]
weather.head()

Unnamed: 0,DATE,AIRLINE,FLIGHT_NUMBER,ORIGIN_AIRPORT,DESTINATION_AIRPORT,SCHEDULED_DEPARTURE,DEPARTURE_TIME,DEPARTURE_DELAY,TAXI_OUT,WHEELS_OFF,SCHEDULED_TIME,ELAPSED_TIME,AIR_TIME,DISTANCE,WHEELS_ON,TAXI_IN,SCHEDULED_ARRIVAL,ARRIVAL_TIME,ARRIVAL_DELAY,AIR_SYSTEM_DELAY,SECURITY_DELAY,AIRLINE_DELAY,LATE_AIRCRAFT_DELAY,WEATHER_DELAY,OTHER_DELAY
174,2015-01-01,UA,247,PHX,IAH,600,751.0,111.0,31.0,822.0,148.0,160.0,118.0,1009,1120.0,11.0,928,1131.0,123.0,12.0,0.0,0.0,0.0,111.0,0.0
622,2015-01-01,F9,1246,DEN,DFW,630,634.0,4.0,10.0,644.0,110.0,123.0,96.0,641,920.0,17.0,920,937.0,17.0,13.0,0.0,0.0,0.0,4.0,0.0
976,2015-01-01,UA,656,PHX,ORD,702,726.0,24.0,62.0,828.0,199.0,233.0,154.0,1440,1202.0,17.0,1121,1219.0,58.0,34.0,0.0,0.0,0.0,24.0,0.0
1633,2015-01-01,DL,786,DFW,MSP,800,858.0,58.0,35.0,933.0,142.0,139.0,100.0,852,1113.0,4.0,1022,1117.0,55.0,0.0,0.0,37.0,0.0,18.0,0.0
2758,2015-01-01,AA,1415,DFW,PIT,920,928.0,8.0,36.0,1004.0,150.0,159.0,117.0,1067,1301.0,6.0,1250,1307.0,17.0,9.0,0.0,0.0,0.0,8.0,0.0


Confirmamos: el retraso puede estar asociado a varios motivos.

## CHECKPOINT. Datos de vuelos limpios

In [35]:
# Preview of the cleaned flights table at this checkpoint.
df.head()

Unnamed: 0,DATE,AIRLINE,FLIGHT_NUMBER,ORIGIN_AIRPORT,DESTINATION_AIRPORT,SCHEDULED_DEPARTURE,DEPARTURE_TIME,DEPARTURE_DELAY,TAXI_OUT,WHEELS_OFF,SCHEDULED_TIME,ELAPSED_TIME,AIR_TIME,DISTANCE,WHEELS_ON,TAXI_IN,SCHEDULED_ARRIVAL,ARRIVAL_TIME,ARRIVAL_DELAY,AIR_SYSTEM_DELAY,SECURITY_DELAY,AIRLINE_DELAY,LATE_AIRCRAFT_DELAY,WEATHER_DELAY,OTHER_DELAY
1,2015-01-01,AA,2336,LAX,PBI,10,2.0,-8.0,12.0,14.0,280.0,279.0,263.0,2330,737.0,4.0,750,741.0,-9.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2015-01-01,US,840,SFO,CLT,20,18.0,-2.0,16.0,34.0,286.0,293.0,266.0,2296,800.0,11.0,806,811.0,5.0,0.0,0.0,0.0,0.0,0.0,5.0
3,2015-01-01,AA,258,LAX,MIA,20,15.0,-5.0,15.0,30.0,285.0,281.0,258.0,2342,748.0,8.0,805,756.0,-9.0,0.0,0.0,0.0,0.0,0.0,0.0
5,2015-01-01,DL,806,SFO,MSP,25,20.0,-5.0,18.0,38.0,217.0,230.0,206.0,1589,604.0,6.0,602,610.0,8.0,0.0,0.0,0.0,0.0,0.0,8.0
6,2015-01-01,NK,612,LAS,MSP,25,19.0,-6.0,11.0,30.0,181.0,170.0,154.0,1299,504.0,5.0,526,509.0,-17.0,0.0,0.0,0.0,0.0,0.0,0.0


## Info Aerolíneas

In [36]:
# Load the airline reference table so that airline names can be attached to the flights.
airlines = pd.read_csv("airlines.csv")
airlines.head()

Unnamed: 0,IATA_CODE,AIRLINE
0,UA,United Air Lines Inc.
1,AA,American Airlines Inc.
2,US,US Airways Inc.
3,F9,Frontier Airlines Inc.
4,B6,JetBlue Airways


In [37]:
# Count the missing values in each column of the airlines table.
airlines.isna().sum()

IATA_CODE    0
AIRLINE      0
dtype: int64

In [38]:
# Rename AIRLINE to IATA_CODE so the flights can be joined with the airlines table on that column.
df = df.rename({"AIRLINE": "IATA_CODE"}, axis="columns")
df.head()

Unnamed: 0,DATE,IATA_CODE,FLIGHT_NUMBER,ORIGIN_AIRPORT,DESTINATION_AIRPORT,SCHEDULED_DEPARTURE,DEPARTURE_TIME,DEPARTURE_DELAY,TAXI_OUT,WHEELS_OFF,SCHEDULED_TIME,ELAPSED_TIME,AIR_TIME,DISTANCE,WHEELS_ON,TAXI_IN,SCHEDULED_ARRIVAL,ARRIVAL_TIME,ARRIVAL_DELAY,AIR_SYSTEM_DELAY,SECURITY_DELAY,AIRLINE_DELAY,LATE_AIRCRAFT_DELAY,WEATHER_DELAY,OTHER_DELAY
1,2015-01-01,AA,2336,LAX,PBI,10,2.0,-8.0,12.0,14.0,280.0,279.0,263.0,2330,737.0,4.0,750,741.0,-9.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2015-01-01,US,840,SFO,CLT,20,18.0,-2.0,16.0,34.0,286.0,293.0,266.0,2296,800.0,11.0,806,811.0,5.0,0.0,0.0,0.0,0.0,0.0,5.0
3,2015-01-01,AA,258,LAX,MIA,20,15.0,-5.0,15.0,30.0,285.0,281.0,258.0,2342,748.0,8.0,805,756.0,-9.0,0.0,0.0,0.0,0.0,0.0,0.0
5,2015-01-01,DL,806,SFO,MSP,25,20.0,-5.0,18.0,38.0,217.0,230.0,206.0,1589,604.0,6.0,602,610.0,8.0,0.0,0.0,0.0,0.0,0.0,8.0
6,2015-01-01,NK,612,LAS,MSP,25,19.0,-6.0,11.0,30.0,181.0,170.0,154.0,1299,504.0,5.0,526,509.0,-17.0,0.0,0.0,0.0,0.0,0.0,0.0


In [39]:
# Left-join the flights with the airlines table on IATA_CODE.
flights = pd.merge(df, airlines, how='left', on='IATA_CODE')
flights.head()

Unnamed: 0,DATE,IATA_CODE,FLIGHT_NUMBER,ORIGIN_AIRPORT,DESTINATION_AIRPORT,SCHEDULED_DEPARTURE,DEPARTURE_TIME,DEPARTURE_DELAY,TAXI_OUT,WHEELS_OFF,SCHEDULED_TIME,ELAPSED_TIME,AIR_TIME,DISTANCE,WHEELS_ON,TAXI_IN,SCHEDULED_ARRIVAL,ARRIVAL_TIME,ARRIVAL_DELAY,AIR_SYSTEM_DELAY,SECURITY_DELAY,AIRLINE_DELAY,LATE_AIRCRAFT_DELAY,WEATHER_DELAY,OTHER_DELAY,AIRLINE
0,2015-01-01,AA,2336,LAX,PBI,10,2.0,-8.0,12.0,14.0,280.0,279.0,263.0,2330,737.0,4.0,750,741.0,-9.0,0.0,0.0,0.0,0.0,0.0,0.0,American Airlines Inc.
1,2015-01-01,US,840,SFO,CLT,20,18.0,-2.0,16.0,34.0,286.0,293.0,266.0,2296,800.0,11.0,806,811.0,5.0,0.0,0.0,0.0,0.0,0.0,5.0,US Airways Inc.
2,2015-01-01,AA,258,LAX,MIA,20,15.0,-5.0,15.0,30.0,285.0,281.0,258.0,2342,748.0,8.0,805,756.0,-9.0,0.0,0.0,0.0,0.0,0.0,0.0,American Airlines Inc.
3,2015-01-01,DL,806,SFO,MSP,25,20.0,-5.0,18.0,38.0,217.0,230.0,206.0,1589,604.0,6.0,602,610.0,8.0,0.0,0.0,0.0,0.0,0.0,8.0,Delta Air Lines Inc.
4,2015-01-01,NK,612,LAS,MSP,25,19.0,-6.0,11.0,30.0,181.0,170.0,154.0,1299,504.0,5.0,526,509.0,-17.0,0.0,0.0,0.0,0.0,0.0,0.0,Spirit Air Lines


In [40]:
# Move the last column (the airline name added by the merge) right after the first two columns.
cols = flights.columns.tolist()
cols = [*cols[:2], *cols[-1:], *cols[2:-1]]
flights = flights[cols]
flights.head()

Unnamed: 0,DATE,IATA_CODE,AIRLINE,FLIGHT_NUMBER,ORIGIN_AIRPORT,DESTINATION_AIRPORT,SCHEDULED_DEPARTURE,DEPARTURE_TIME,DEPARTURE_DELAY,TAXI_OUT,WHEELS_OFF,SCHEDULED_TIME,ELAPSED_TIME,AIR_TIME,DISTANCE,WHEELS_ON,TAXI_IN,SCHEDULED_ARRIVAL,ARRIVAL_TIME,ARRIVAL_DELAY,AIR_SYSTEM_DELAY,SECURITY_DELAY,AIRLINE_DELAY,LATE_AIRCRAFT_DELAY,WEATHER_DELAY,OTHER_DELAY
0,2015-01-01,AA,American Airlines Inc.,2336,LAX,PBI,10,2.0,-8.0,12.0,14.0,280.0,279.0,263.0,2330,737.0,4.0,750,741.0,-9.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2015-01-01,US,US Airways Inc.,840,SFO,CLT,20,18.0,-2.0,16.0,34.0,286.0,293.0,266.0,2296,800.0,11.0,806,811.0,5.0,0.0,0.0,0.0,0.0,0.0,5.0
2,2015-01-01,AA,American Airlines Inc.,258,LAX,MIA,20,15.0,-5.0,15.0,30.0,285.0,281.0,258.0,2342,748.0,8.0,805,756.0,-9.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2015-01-01,DL,Delta Air Lines Inc.,806,SFO,MSP,25,20.0,-5.0,18.0,38.0,217.0,230.0,206.0,1589,604.0,6.0,602,610.0,8.0,0.0,0.0,0.0,0.0,0.0,8.0
4,2015-01-01,NK,Spirit Air Lines,612,LAS,MSP,25,19.0,-6.0,11.0,30.0,181.0,170.0,154.0,1299,504.0,5.0,526,509.0,-17.0,0.0,0.0,0.0,0.0,0.0,0.0


## Info Aeropuertos

In [41]:
# Load the airport reference table (city, state, country and coordinates per IATA code).
airports = pd.read_csv("airports.csv")
airports.head()

Unnamed: 0,IATA_CODE,AIRPORT,CITY,STATE,COUNTRY,LATITUDE,LONGITUDE
0,ABE,Lehigh Valley International Airport,Allentown,PA,USA,40.65236,-75.4404
1,ABI,Abilene Regional Airport,Abilene,TX,USA,32.41132,-99.6819
2,ABQ,Albuquerque International Sunport,Albuquerque,NM,USA,35.04022,-106.60919
3,ABR,Aberdeen Regional Airport,Aberdeen,SD,USA,45.44906,-98.42183
4,ABY,Southwest Georgia Regional Airport,Albany,GA,USA,31.53552,-84.19447


In [42]:
# Count the missing values in each column of the airports table.
airports.isna().sum()

IATA_CODE    0
AIRPORT      0
CITY         0
STATE        0
COUNTRY      0
LATITUDE     3
LONGITUDE    3
dtype: int64

In [43]:
# Airports whose LATITUDE is missing.
latitude_missing = airports['LATITUDE'].isnull()
lat_null = airports.loc[latitude_missing]
lat_null

Unnamed: 0,IATA_CODE,AIRPORT,CITY,STATE,COUNTRY,LATITUDE,LONGITUDE
96,ECP,Northwest Florida Beaches International Airport,Panama City,FL,USA,,
234,PBG,Plattsburgh International Airport,Plattsburgh,NY,USA,,
313,UST,Northeast Florida Regional Airport (St. August...,St. Augustine,FL,USA,,


Falta información de 3 aeropuertos, así que la rellenaremos buscando sus datos en internet e introduciéndola manualmente

In [44]:
# Manually looked-up [latitude, longitude] for the three airports without coordinates.
# The longitudes are negative: these airports are in Florida and New York, and
# the other LONGITUDE values shown for this table are negative as well. Entering
# them as positive numbers would place the airports in the eastern hemisphere.
ECP_COORD = [30.3549, -85.7995]
PBG_COORD = [44.6521, -73.4679]
UST_COORD = [29.9544, -81.3429]

# Rows are selected by IATA_CODE rather than by hard-coded row labels, and .loc is
# used because .at is meant for a single scalar cell, not for two columns at once.
for iata_code, coords in (("ECP", ECP_COORD), ("PBG", PBG_COORD), ("UST", UST_COORD)):
    airports.loc[airports["IATA_CODE"] == iata_code, ["LATITUDE", "LONGITUDE"]] = coords
airports.isna().sum()

IATA_CODE    0
AIRPORT      0
CITY         0
STATE        0
COUNTRY      0
LATITUDE     0
LONGITUDE    0
dtype: int64

In [45]:
#lat_null[lat_null.IATA_CODE.isin(top_10_airports)]

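**NOTA** Ninguno de estos aeropuertos pertenece al top 10 de aeropuertos de origen. Aun así, el filtro solo se aplica al aeropuerto de origen, por lo que pueden aparecer como aeropuerto de destino y conviene completar sus coordenadas.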

In [46]:
# Drop the AIRPORT (name) column: only the airport codes are used from here on.
airports = airports.drop("AIRPORT", axis=1)

In [47]:
# Build two copies of the airports table, one with every column prefixed ORIGIN_
# and one prefixed DESTINATION_, to join against the origin and destination
# airport of each flight. IATA_CODE is first renamed to AIRPORT so the prefixed
# key columns are called ORIGIN_AIRPORT / DESTINATION_AIRPORT.
airports = airports.rename({"IATA_CODE": "AIRPORT"}, axis="columns")

origin_airports, destination_airports = (
    airports.add_prefix(prefix) for prefix in ('ORIGIN_', 'DESTINATION_')
)

origin_airports.head()

Unnamed: 0,ORIGIN_AIRPORT,ORIGIN_CITY,ORIGIN_STATE,ORIGIN_COUNTRY,ORIGIN_LATITUDE,ORIGIN_LONGITUDE
0,ABE,Allentown,PA,USA,40.65236,-75.4404
1,ABI,Abilene,TX,USA,32.41132,-99.6819
2,ABQ,Albuquerque,NM,USA,35.04022,-106.60919
3,ABR,Aberdeen,SD,USA,45.44906,-98.42183
4,ABY,Albany,GA,USA,31.53552,-84.19447


In [48]:
# Preview the flights table before joining the airport data.
flights.head()

Unnamed: 0,DATE,IATA_CODE,AIRLINE,FLIGHT_NUMBER,ORIGIN_AIRPORT,DESTINATION_AIRPORT,SCHEDULED_DEPARTURE,DEPARTURE_TIME,DEPARTURE_DELAY,TAXI_OUT,WHEELS_OFF,SCHEDULED_TIME,ELAPSED_TIME,AIR_TIME,DISTANCE,WHEELS_ON,TAXI_IN,SCHEDULED_ARRIVAL,ARRIVAL_TIME,ARRIVAL_DELAY,AIR_SYSTEM_DELAY,SECURITY_DELAY,AIRLINE_DELAY,LATE_AIRCRAFT_DELAY,WEATHER_DELAY,OTHER_DELAY
0,2015-01-01,AA,American Airlines Inc.,2336,LAX,PBI,10,2.0,-8.0,12.0,14.0,280.0,279.0,263.0,2330,737.0,4.0,750,741.0,-9.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2015-01-01,US,US Airways Inc.,840,SFO,CLT,20,18.0,-2.0,16.0,34.0,286.0,293.0,266.0,2296,800.0,11.0,806,811.0,5.0,0.0,0.0,0.0,0.0,0.0,5.0
2,2015-01-01,AA,American Airlines Inc.,258,LAX,MIA,20,15.0,-5.0,15.0,30.0,285.0,281.0,258.0,2342,748.0,8.0,805,756.0,-9.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2015-01-01,DL,Delta Air Lines Inc.,806,SFO,MSP,25,20.0,-5.0,18.0,38.0,217.0,230.0,206.0,1589,604.0,6.0,602,610.0,8.0,0.0,0.0,0.0,0.0,0.0,8.0
4,2015-01-01,NK,Spirit Air Lines,612,LAS,MSP,25,19.0,-6.0,11.0,30.0,181.0,170.0,154.0,1299,504.0,5.0,526,509.0,-17.0,0.0,0.0,0.0,0.0,0.0,0.0


In [49]:
# Left-join the flights with the origin airports, then with the destination airports.
flights_origin = pd.merge(flights, origin_airports, how='left', on='ORIGIN_AIRPORT')
flights_complete = pd.merge(flights_origin, destination_airports, how='left', on='DESTINATION_AIRPORT')
flights_complete.head()

Unnamed: 0,DATE,IATA_CODE,AIRLINE,FLIGHT_NUMBER,ORIGIN_AIRPORT,DESTINATION_AIRPORT,SCHEDULED_DEPARTURE,DEPARTURE_TIME,DEPARTURE_DELAY,TAXI_OUT,WHEELS_OFF,SCHEDULED_TIME,ELAPSED_TIME,AIR_TIME,DISTANCE,WHEELS_ON,TAXI_IN,SCHEDULED_ARRIVAL,ARRIVAL_TIME,ARRIVAL_DELAY,AIR_SYSTEM_DELAY,SECURITY_DELAY,AIRLINE_DELAY,LATE_AIRCRAFT_DELAY,WEATHER_DELAY,OTHER_DELAY,ORIGIN_CITY,ORIGIN_STATE,ORIGIN_COUNTRY,ORIGIN_LATITUDE,ORIGIN_LONGITUDE,DESTINATION_CITY,DESTINATION_STATE,DESTINATION_COUNTRY,DESTINATION_LATITUDE,DESTINATION_LONGITUDE
0,2015-01-01,AA,American Airlines Inc.,2336,LAX,PBI,10,2.0,-8.0,12.0,14.0,280.0,279.0,263.0,2330,737.0,4.0,750,741.0,-9.0,0.0,0.0,0.0,0.0,0.0,0.0,Los Angeles,CA,USA,33.94254,-118.40807,West Palm Beach,FL,USA,26.68316,-80.09559
1,2015-01-01,US,US Airways Inc.,840,SFO,CLT,20,18.0,-2.0,16.0,34.0,286.0,293.0,266.0,2296,800.0,11.0,806,811.0,5.0,0.0,0.0,0.0,0.0,0.0,5.0,San Francisco,CA,USA,37.619,-122.37484,Charlotte,NC,USA,35.21401,-80.94313
2,2015-01-01,AA,American Airlines Inc.,258,LAX,MIA,20,15.0,-5.0,15.0,30.0,285.0,281.0,258.0,2342,748.0,8.0,805,756.0,-9.0,0.0,0.0,0.0,0.0,0.0,0.0,Los Angeles,CA,USA,33.94254,-118.40807,Miami,FL,USA,25.79325,-80.29056
3,2015-01-01,DL,Delta Air Lines Inc.,806,SFO,MSP,25,20.0,-5.0,18.0,38.0,217.0,230.0,206.0,1589,604.0,6.0,602,610.0,8.0,0.0,0.0,0.0,0.0,0.0,8.0,San Francisco,CA,USA,37.619,-122.37484,Minneapolis,MN,USA,44.88055,-93.21692
4,2015-01-01,NK,Spirit Air Lines,612,LAS,MSP,25,19.0,-6.0,11.0,30.0,181.0,170.0,154.0,1299,504.0,5.0,526,509.0,-17.0,0.0,0.0,0.0,0.0,0.0,0.0,Las Vegas,NV,USA,36.08036,-115.15233,Minneapolis,MN,USA,44.88055,-93.21692


In [50]:
# Row count of the joined table.
len(flights_complete)

2095802

In [51]:
# Save the preprocessed data (without the index column) for the later prediction step.
flights_complete.to_csv("flightsCleaned.csv", index=False)