In [156]:
# importacion general de librerias y de visualizacion (matplotlib y seaborn)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

plt.style.use('default') # haciendo los graficos un poco mas bonitos en matplotlib
#plt.rcParams['figure.figsize'] = (20, 10)

sns.set(style="whitegrid") # seteando tipo de grid en seaborn

pd.options.display.float_format = '{:20,.2f}'.format # suprimimos la notacion cientifica en los outputs

import warnings
warnings.filterwarnings('ignore')

## Temario

Estos son algunos de los temas que intentaremos revisar a lo largo del análisis propuesto:

- Manipulando el Data Frame
   - Manejo de Índices
   - Joins

In [157]:
# Load the flights table. low_memory=False turns off pandas' chunked parsing,
# which avoids mixed-type inference within a column.
flights = pd.read_csv(
    '../data/flight-delays/flights.csv',
    low_memory=False,
)

# Load the airports table; index_col makes the IATA_CODE column the frame's index.
airports = pd.read_csv(
    '../data/flight-delays/airports.csv',
    sep=',',
    encoding='utf-8',
    index_col='IATA_CODE',
)

In [158]:
# Summarize the flights frame: index range, number of columns and each column's dtype.
flights.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5819079 entries, 0 to 5819078
Data columns (total 31 columns):
YEAR                   int64
MONTH                  int64
DAY                    int64
DAY_OF_WEEK            int64
AIRLINE                object
FLIGHT_NUMBER          int64
TAIL_NUMBER            object
ORIGIN_AIRPORT         object
DESTINATION_AIRPORT    object
SCHEDULED_DEPARTURE    int64
DEPARTURE_TIME         float64
DEPARTURE_DELAY        float64
TAXI_OUT               float64
WHEELS_OFF             float64
SCHEDULED_TIME         float64
ELAPSED_TIME           float64
AIR_TIME               float64
DISTANCE               int64
WHEELS_ON              float64
TAXI_IN                float64
SCHEDULED_ARRIVAL      int64
ARRIVAL_TIME           float64
ARRIVAL_DELAY          float64
DIVERTED               int64
CANCELLED              int64
CANCELLATION_REASON    object
AIR_SYSTEM_DELAY       float64
SECURITY_DELAY         float64
AIRLINE_DELAY          float64
LATE_AIRCRAFT

In [159]:
# Summarize the airports frame (indexed by IATA_CODE): non-null counts and dtypes per column.
airports.info()

<class 'pandas.core.frame.DataFrame'>
Index: 322 entries, ABE to YUM
Data columns (total 6 columns):
AIRPORT      322 non-null object
CITY         322 non-null object
STATE        322 non-null object
COUNTRY      322 non-null object
LATITUDE     319 non-null float64
LONGITUDE    319 non-null float64
dtypes: float64(2), object(4)
memory usage: 17.6+ KB


# Joins


In [160]:
# Attach origin-airport attributes to each flight: ORIGIN_AIRPORT is matched against
# the airports index (IATA_CODE). This is an inner join, so flights whose origin code
# is not present in `airports` are dropped from the result.
airports_with_origin = flights.merge(
    airports,
    how="inner",
    left_on="ORIGIN_AIRPORT",
    right_on="IATA_CODE",
)

In [161]:
# Preview the first rows of the flights / origin-airport join.
airports_with_origin.head()

Unnamed: 0,YEAR,MONTH,DAY,DAY_OF_WEEK,AIRLINE,FLIGHT_NUMBER,TAIL_NUMBER,ORIGIN_AIRPORT,DESTINATION_AIRPORT,SCHEDULED_DEPARTURE,...,SECURITY_DELAY,AIRLINE_DELAY,LATE_AIRCRAFT_DELAY,WEATHER_DELAY,AIRPORT,CITY,STATE,COUNTRY,LATITUDE,LONGITUDE
0,2015,1,1,4,AS,98,N407AS,ANC,SEA,5,...,,,,,Ted Stevens Anchorage International Airport,Anchorage,AK,USA,61.17,-150.0
1,2015,1,1,4,AS,108,N309AS,ANC,SEA,45,...,,,,,Ted Stevens Anchorage International Airport,Anchorage,AK,USA,61.17,-150.0
2,2015,1,1,4,DL,1560,N3743H,ANC,SEA,45,...,,,,,Ted Stevens Anchorage International Airport,Anchorage,AK,USA,61.17,-150.0
3,2015,1,1,4,AS,122,N413AS,ANC,PDX,50,...,,,,,Ted Stevens Anchorage International Airport,Anchorage,AK,USA,61.17,-150.0
4,2015,1,1,4,AS,136,N431AS,ANC,SEA,135,...,,,,,Ted Stevens Anchorage International Airport,Anchorage,AK,USA,61.17,-150.0


In [162]:
# Join the destination airport's attributes as well. No `suffixes` argument is given,
# so column names shared by both sides receive pandas' default `_x` / `_y` suffixes.
airports_with_origin_dest = airports_with_origin.merge(
    airports,
    how="inner",
    left_on="DESTINATION_AIRPORT",
    right_on="IATA_CODE",
)

In [163]:
# Inspect the row count and column list produced by the destination merge.
airports_with_origin_dest.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5332914 entries, 0 to 5332913
Data columns (total 43 columns):
YEAR                   int64
MONTH                  int64
DAY                    int64
DAY_OF_WEEK            int64
AIRLINE                object
FLIGHT_NUMBER          int64
TAIL_NUMBER            object
ORIGIN_AIRPORT         object
DESTINATION_AIRPORT    object
SCHEDULED_DEPARTURE    int64
DEPARTURE_TIME         float64
DEPARTURE_DELAY        float64
TAXI_OUT               float64
WHEELS_OFF             float64
SCHEDULED_TIME         float64
ELAPSED_TIME           float64
AIR_TIME               float64
DISTANCE               int64
WHEELS_ON              float64
TAXI_IN                float64
SCHEDULED_ARRIVAL      int64
ARRIVAL_TIME           float64
ARRIVAL_DELAY          float64
DIVERTED               int64
CANCELLED              int64
CANCELLATION_REASON    object
AIR_SYSTEM_DELAY       float64
SECURITY_DELAY         float64
AIRLINE_DELAY          float64
LATE_AIRCRAFT

In [164]:
# Destination join with explicit suffixes for the overlapping airport columns:
# `_ORIGIN` for the ones already on the left frame, `_DESTINATION` for the newly joined ones.
airports_with_origin_dest = airports_with_origin.merge(
    airports,
    how="inner",
    left_on="DESTINATION_AIRPORT",
    right_on="IATA_CODE",
    suffixes=("_ORIGIN", "_DESTINATION"),
)

In [165]:
# Preview the first rows; the airport columns now carry _ORIGIN / _DESTINATION suffixes.
airports_with_origin_dest.head()

Unnamed: 0,YEAR,MONTH,DAY,DAY_OF_WEEK,AIRLINE,FLIGHT_NUMBER,TAIL_NUMBER,ORIGIN_AIRPORT,DESTINATION_AIRPORT,SCHEDULED_DEPARTURE,...,STATE_ORIGIN,COUNTRY_ORIGIN,LATITUDE_ORIGIN,LONGITUDE_ORIGIN,AIRPORT_DESTINATION,CITY_DESTINATION,STATE_DESTINATION,COUNTRY_DESTINATION,LATITUDE_DESTINATION,LONGITUDE_DESTINATION
0,2015,1,1,4,AS,98,N407AS,ANC,SEA,5,...,AK,USA,61.17,-150.0,Seattle-Tacoma International Airport,Seattle,WA,USA,47.45,-122.31
1,2015,1,1,4,AS,108,N309AS,ANC,SEA,45,...,AK,USA,61.17,-150.0,Seattle-Tacoma International Airport,Seattle,WA,USA,47.45,-122.31
2,2015,1,1,4,DL,1560,N3743H,ANC,SEA,45,...,AK,USA,61.17,-150.0,Seattle-Tacoma International Airport,Seattle,WA,USA,47.45,-122.31
3,2015,1,1,4,AS,136,N431AS,ANC,SEA,135,...,AK,USA,61.17,-150.0,Seattle-Tacoma International Airport,Seattle,WA,USA,47.45,-122.31
4,2015,1,1,4,AS,134,N464AS,ANC,SEA,155,...,AK,USA,61.17,-150.0,Seattle-Tacoma International Airport,Seattle,WA,USA,47.45,-122.31


In [166]:
# Check the row count and column names of the merge that used explicit suffixes.
airports_with_origin_dest.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5332914 entries, 0 to 5332913
Data columns (total 43 columns):
YEAR                     int64
MONTH                    int64
DAY                      int64
DAY_OF_WEEK              int64
AIRLINE                  object
FLIGHT_NUMBER            int64
TAIL_NUMBER              object
ORIGIN_AIRPORT           object
DESTINATION_AIRPORT      object
SCHEDULED_DEPARTURE      int64
DEPARTURE_TIME           float64
DEPARTURE_DELAY          float64
TAXI_OUT                 float64
WHEELS_OFF               float64
SCHEDULED_TIME           float64
ELAPSED_TIME             float64
AIR_TIME                 float64
DISTANCE                 int64
WHEELS_ON                float64
TAXI_IN                  float64
SCHEDULED_ARRIVAL        int64
ARRIVAL_TIME             float64
ARRIVAL_DELAY            float64
DIVERTED                 int64
CANCELLED                int64
CANCELLATION_REASON      object
AIR_SYSTEM_DELAY         float64
SECURITY_DELAY       