In [1]:
import os
import sys
from datetime import datetime, timedelta
import pandas as pd
# Set some Pandas options
pd.set_option('display.notebook_repr_html', True)
pd.set_option('display.max_columns', 20)
pd.set_option('display.max_rows', 50)
import numpy as np
import statsmodels as sm
from scipy.stats import chi2
import matplotlib.pyplot as plt
import matplotlib_inline.backend_inline
%config InlineBackend.figure_formats = ['svg']
%matplotlib inline
from sklearn import preprocessing

In [2]:
def get_hist(var1,var2,name_image,values_range=(0,1),nbins=20,normed=True):
    """Plot two samples as histograms in three stacked panels, save and show them.

    Panels, top to bottom:
      1. 'Default' - both samples drawn as filled bars on the same axes.
      2. 'Skinny shift - 2 at a time' - both samples passed to a single
         ``hist`` call so their bars are drawn side by side.
      3. 'With steps' - both samples drawn with ``histtype='step'``.

    Parameters
    ----------
    var1, var2 : array-like
        The two samples to compare.
    name_image : str or path-like
        Destination passed to ``savefig``.
    values_range : sequence of two numbers, default (0, 1)
        Lower and upper edge of the binning range.
    nbins : int, default 20
        Number of bins.
    normed : bool, default True
        When True the histograms are normalised to a density. The name is
        kept for existing callers; it is forwarded as matplotlib's
        ``density`` argument.
    """
    # matplotlib removed the `normed` keyword of hist() in 3.1; `density`
    # is its replacement, so the public flag is forwarded under that name.
    common_params = dict(bins=nbins, range=values_range, density=normed)
    # Separate dict for the last panel so the shared settings stay untouched.
    step_params = dict(common_params, histtype='step')

    # Draw on the current figure, as the pyplot calls did before.
    fig = plt.gcf()
    fig.subplots_adjust(hspace=.8)

    ax_default = fig.add_subplot(311)
    ax_default.set_title('Default')
    ax_default.hist(var1, **common_params)
    ax_default.hist(var2, **common_params)

    ax_grouped = fig.add_subplot(312)
    ax_grouped.set_title('Skinny shift - 2 at a time')
    ax_grouped.hist((var1, var2), **common_params)

    ax_steps = fig.add_subplot(313)
    ax_steps.set_title('With steps')
    ax_steps.hist(var1, **step_params)
    ax_steps.hist(var2, **step_params)

    fig.savefig(name_image)
    plt.show()

In [3]:
# Make the parent of the current working directory importable (added once,
# so re-running the cell does not duplicate the entry).
directory_path = os.path.abspath(os.pardir)
if sys.path.count(directory_path) == 0:
    sys.path.append(directory_path)

In [4]:
# Location of the raw flights CSV. Set FLIGHT_DELAYS_DATASET_PATH to run the
# notebook on another machine; the original author's path stays the default.
DATASET_PATH = os.environ.get(
    'FLIGHT_DELAYS_DATASET_PATH',
    '/Users/iairlinker/Documents/repos/flight_delays_challenge/data/raw/dataset_SCL.csv',
)
# low_memory=False makes pandas read the file in one pass when inferring dtypes.
df = pd.read_csv(DATASET_PATH, low_memory=False)
print(df.shape)
df.head()

(68206, 18)


Unnamed: 0,Fecha-I,Vlo-I,Ori-I,Des-I,Emp-I,Fecha-O,Vlo-O,Ori-O,Des-O,Emp-O,DIA,MES,AÑO,DIANOM,TIPOVUELO,OPERA,SIGLAORI,SIGLADES
0,2017-01-01 23:30:00,226,SCEL,KMIA,AAL,2017-01-01 23:33:00,226,SCEL,KMIA,AAL,1,1,2017,Domingo,I,American Airlines,Santiago,Miami
1,2017-01-02 23:30:00,226,SCEL,KMIA,AAL,2017-01-02 23:39:00,226,SCEL,KMIA,AAL,2,1,2017,Lunes,I,American Airlines,Santiago,Miami
2,2017-01-03 23:30:00,226,SCEL,KMIA,AAL,2017-01-03 23:39:00,226,SCEL,KMIA,AAL,3,1,2017,Martes,I,American Airlines,Santiago,Miami
3,2017-01-04 23:30:00,226,SCEL,KMIA,AAL,2017-01-04 23:33:00,226,SCEL,KMIA,AAL,4,1,2017,Miercoles,I,American Airlines,Santiago,Miami
4,2017-01-05 23:30:00,226,SCEL,KMIA,AAL,2017-01-05 23:28:00,226,SCEL,KMIA,AAL,5,1,2017,Jueves,I,American Airlines,Santiago,Miami


In [5]:
# Show the dtype pandas inferred for every column of the flights frame.
df.dtypes

Fecha-I      object
Vlo-I        object
Ori-I        object
Des-I        object
Emp-I        object
Fecha-O      object
Vlo-O        object
Ori-O        object
Des-O        object
Emp-O        object
DIA           int64
MES           int64
AÑO           int64
DIANOM       object
TIPOVUELO    object
OPERA        object
SIGLAORI     object
SIGLADES     object
dtype: object

### Generate the synthetic features requested in the instructions

In [6]:
# Parse both timestamp columns so they can be subtracted.
df['Fecha-O'] = pd.to_datetime(df['Fecha-O'])
df['Fecha-I'] = pd.to_datetime(df['Fecha-I'])

# min_diff: signed difference Fecha-O - Fecha-I in minutes (negative when
# Fecha-O is earlier than Fecha-I, 0 when both are equal).
# total_seconds() is used instead of .seconds because .seconds only holds the
# sub-day component of a Timedelta and would drop whole days from differences
# of 24 hours or more. Computing it for every row at once also gives 0 to rows
# with identical timestamps instead of leaving them as NaN.
df['min_diff'] = (df['Fecha-O'] - df['Fecha-I']).dt.total_seconds() / 60

# delay: 1 when min_diff is strictly greater than 15 minutes, otherwise 0.
df['delay'] = (df['min_diff'] > 15).astype(int)

In [7]:
def get_period_day(x):
    """Map an hour of the day to 'morning', 'afternoon' or 'night'.

    * 5 <= x < 12        -> 'morning'
    * 12 <= x < 19       -> 'afternoon'
    * x >= 19 or x < 5   -> 'night'

    A value that satisfies none of the comparisons (e.g. NaN) yields None.
    """
    if 5 <= x < 12:
        return 'morning'
    if 12 <= x < 19:
        return 'afternoon'
    if x >= 19 or x < 5:
        return 'night'
    return None
# Label every flight with the period of the day of its Fecha-I hour.
df['period_day'] = df['Fecha-I'].dt.hour.map(get_period_day)

In [8]:
def is_high_season(x):
    """Return 1 when ``x`` falls inside a high-season window, otherwise 0.

    High-season windows (both ends inclusive):
      * 15 December to 3 March (wraps around the new year)
      * 15 July to 31 July
      * 11 September to 30 September

    The comparison uses only the month and day of ``x``, so the same windows
    apply to every year.

    Parameters
    ----------
    x : datetime.date or datetime.datetime
        Any object exposing ``month`` and ``day`` attributes.

    Returns
    -------
    int
        1 for high season, 0 otherwise.
    """
    month_day = (x.month, x.day)

    # This window crosses the year boundary, hence "or" rather than a range.
    if month_day >= (12, 15) or month_day <= (3, 3):
        return 1
    # The July and September windows need an upper bound: testing only
    # "on or after the start" flags every later date in the year as well.
    if (7, 15) <= month_day <= (7, 31):
        return 1
    if (9, 11) <= month_day <= (9, 30):
        return 1
    return 0
    
# Flag every flight whose Fecha-I calendar date is in high season.
df['high_season'] = df['Fecha-I'].dt.date.map(is_high_season)

### Check airports and external data aggregation

#### We check the departure and arrival airports

First, we check how many departure airports and how many arrival airports there are, in order to understand the number of possible combinations between them. This gives an idea of the complexity this variable will add and helps to design strategies to address it.

#### We added the international airport database

Then, we add the data from the Global Airport Database, which gives the name of each airport, the country where it is located, and its latitude and longitude. These variables make it possible to create new features such as the distance between origin and destination and, if time allows, the weather at the airport for each flight.


In [9]:
# Number of distinct Des-I values (a missing value, if any, counts as one).
df['Des-I'].nunique(dropna=False)

64

In [10]:
# Check whether 'SCEL' (the only Ori-I value in this dataset) also appears as a Des-I value.
'SCEL' in df['Des-I'].unique()

False

In [11]:
# List the distinct Ori-I values.
df['Ori-I'].unique()

array(['SCEL'], dtype=object)

We can see that all the flights depart from the Arturo Merino Benitez Airport in Santiago, Chile, and arrive at 64 different airports. This limits the complexity of the problem, since we only have to understand the effect of 64 possible origin-destination combinations.

In [12]:
# Location of the Global Airport Database file (':'-separated text). Set
# FLIGHT_DELAYS_AIRPORTS_PATH to run the notebook on another machine; the
# original author's path stays the default.
AIRPORTS_PATH = os.environ.get(
    'FLIGHT_DELAYS_AIRPORTS_PATH',
    '/Users/iairlinker/Documents/repos/flight_delays_challenge/data/external/GlobalAirportDatabase.txt',
)
# NOTE(review): no header argument is passed, so pandas takes the first line
# of the file as column labels. The resulting labels (AYGA, GKA, GOROKA, ...)
# look like a data record, which suggests the file has no header row and its
# first airport is not available as a row - confirm. The labels are left
# unchanged because later cells address the columns as `AYGA` and `GKA`.
airports = pd.read_csv(AIRPORTS_PATH, sep=':')
print(airports.shape)
airports.head(2)

(9299, 16)


Unnamed: 0,AYGA,GKA,GOROKA,GOROKA.1,PAPUA NEW GUINEA,006,004,054,S,145,023,030,E,01610,-6.082,145.392
0,AYLA,LAE,,LAE,PAPUA NEW GUINEA,0,0,0,U,0,0,0,U,0,0.0,0.0
1,AYMD,MAG,MADANG,MADANG,PAPUA NEW GUINEA,5,12,25,S,145,47,19,E,7,-5.207,145.789


We check that all the origin and destination codes are in the airport database; if this is not the case, we will look for a way to reconcile the column that holds the ICAO codes between both dataframes.

In [13]:
# Print every Des-O code that has no match in the first airports column
# (labelled AYGA). The known codes are collected once into a set so each
# lookup is a hash test instead of a fresh unique() call plus array scan.
known_codes = set(airports.AYGA.unique())
for element in df['Des-O'].unique():
    if element not in known_codes:
        print(element)

SAEZ
SPJC
SCNT
SCQP
SCAT
SCPQ
SEQM


In [14]:
# Show the airport row(s) whose value in the second column (labelled GKA) is 'LIM'.
airports[airports.GKA == 'LIM']

Unnamed: 0,AYGA,GKA,GOROKA,GOROKA.1,PAPUA NEW GUINEA,006,004,054,S,145,023,030,E,01610,-6.082,145.392
7781,SPIM,LIM,JORGE CHAVEZ INTERNATIONAL,LIMA,PERU,12,1,18,S,77,6,51,W,35,-12.022,-77.114


In [15]:
# Check whether 'SCEL' is present in the first airports column (labelled AYGA).
'SCEL' in airports.AYGA.unique()

True