airport_codes_and_geolocations

In [1]:
import numpy as np
import pandas as pd
import re
import seaborn as sns
import matplotlib.pyplot as plt

from pygeocoder import Geocoder
from mpl_toolkits.basemap import Basemap

%matplotlib inline

In [2]:
# where this data came from: https://www.transtats.bts.gov/OT_Delay/OT_DelayCause1.asp?pn=1
data = pd.read_csv('./project_datasets/682464398_102017_1651_airline_delay_causes.csv')

In [3]:
data.columns

Index([u'year', u' month', u'carrier', u'carrier_name', u'airport',
       u'airport_name', u'arr_flights', u'arr_del15', u'carrier_ct',
       u' weather_ct', u'nas_ct', u'security_ct', u'late_aircraft_ct',
       u'arr_cancelled', u'arr_diverted', u' arr_delay', u' carrier_delay',
       u'weather_delay', u'nas_delay', u'security_delay',
       u'late_aircraft_delay', u'Unnamed: 21'],
      dtype='object')

In [4]:
data = data.drop('Unnamed: 21', axis=1)

In [5]:
columns = []
for i, col in enumerate(data.columns):
    global columns
    col = re.sub(r'\s+', '', col)
    columns.append(col)

In [6]:
# fix the column index so there are no extraneous spaces... and can use bracket notation or dot notation
data.columns = columns
data.columns

Index([u'year', u'month', u'carrier', u'carrier_name', u'airport',
       u'airport_name', u'arr_flights', u'arr_del15', u'carrier_ct',
       u'weather_ct', u'nas_ct', u'security_ct', u'late_aircraft_ct',
       u'arr_cancelled', u'arr_diverted', u'arr_delay', u'carrier_delay',
       u'weather_delay', u'nas_delay', u'security_delay',
       u'late_aircraft_delay'],
      dtype='object')

In [7]:
data = data.dropna(axis=0)

In [8]:
data.airport_name.value_counts()

Los Angeles, CA: Los Angeles International                             121
Las Vegas, NV: McCarran International                                  121
Portland, OR: Portland International                                   121
San Diego, CA: San Diego International                                 121
Seattle, WA: Seattle/Tacoma International                              119
Phoenix, AZ: Phoenix Sky Harbor International                          113
Chicago, IL: Chicago O'Hare International                              110
San Francisco, CA: San Francisco International                         110
Austin, TX: Austin - Bergstrom International                           110
New Orleans, LA: Louis Armstrong New Orleans International             110
Washington, DC: Ronald Reagan Washington National                      110
Detroit, MI: Detroit Metro Wayne County                                110
Philadelphia, PA: Philadelphia International                           109
New York, NY: LaGuardia  

In [9]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11173 entries, 0 to 11192
Data columns (total 21 columns):
year                   11173 non-null int64
month                  11173 non-null int64
carrier                11173 non-null object
carrier_name           11173 non-null object
airport                11173 non-null object
airport_name           11173 non-null object
arr_flights            11173 non-null float64
arr_del15              11173 non-null float64
carrier_ct             11173 non-null float64
weather_ct             11173 non-null float64
nas_ct                 11173 non-null float64
security_ct            11173 non-null float64
late_aircraft_ct       11173 non-null float64
arr_cancelled          11173 non-null float64
arr_diverted           11173 non-null float64
arr_delay              11173 non-null float64
carrier_delay          11173 non-null float64
weather_delay          11173 non-null float64
nas_delay              11173 non-null float64
security_delay         11

In [25]:
full_airports = data.airport.unique()
# full_airports = data.airport_name.unique()
full_airports

array(['DFW', 'DTW', 'SEA', 'JFK', 'SJC', 'ORD', 'PHX', 'STL', 'LAX',
       'MCO', 'DEN', 'MIA', 'KOA', 'IAH', 'AUS', 'LAS', 'SLC', 'TUS',
       'STT', 'BOS', 'FLL', 'SFO', 'OGG', 'TPA', 'SNA', 'OKC', 'HNL',
       'PHL', 'LGA', 'RDU', 'DCA', 'RIC', 'ATL', 'LBB', 'CLT', 'ELP',
       'SAN', 'BNA', 'JAC', 'SMF', 'EWR', 'IAD', 'LIH', 'SJU', 'ABQ',
       'ORF', 'JAX', 'MSY', 'SAT', 'MCI', 'GUC', 'IND', 'PDX', 'BWI',
       'MSP', 'MKE', 'TUL', 'ONT', 'RSW', 'RNO', 'DSM', 'MFE', 'PSP',
       'OMA', 'EGE', 'PBI', 'SDF', 'PIT', 'FAT', 'DAY', 'STX', 'COS',
       'CMH', 'MTJ', 'HDN', 'BDL', 'MEM', 'CLE', 'HOU', 'BOI', 'OAK',
       'GEG', 'ANC', 'BUF', 'SYR', 'ALB', 'PVD', 'ROC', 'ILM', 'ICT',
       'PWM', 'GSO', 'CHS', 'MDT', 'BHM', 'ADQ', 'BET', 'BRW', 'SCC',
       'FAI', 'JNU', 'KTN', 'YAK', 'CDV', 'SIT', 'PSG', 'WRG', 'OME',
       'OTZ', 'BUR', 'BLI', 'ADK', 'SWF', 'LGB', 'PSE', 'BQN', 'HPN',
       'SAV', 'SRQ', 'BTV', 'ORH', 'DAB', 'CVG', 'BIS', 'AVL', 'GRR',
       'FNT', 'MYR',

Replace all the double quotes with single quotes makes the data easier to parse

In [26]:
full_airports_good = []
for good_string in full_airports:
    good_string = good_string.replace("\"", '\'')
    full_airports_good.append(good_string)
full_airports_good

['DFW',
 'DTW',
 'SEA',
 'JFK',
 'SJC',
 'ORD',
 'PHX',
 'STL',
 'LAX',
 'MCO',
 'DEN',
 'MIA',
 'KOA',
 'IAH',
 'AUS',
 'LAS',
 'SLC',
 'TUS',
 'STT',
 'BOS',
 'FLL',
 'SFO',
 'OGG',
 'TPA',
 'SNA',
 'OKC',
 'HNL',
 'PHL',
 'LGA',
 'RDU',
 'DCA',
 'RIC',
 'ATL',
 'LBB',
 'CLT',
 'ELP',
 'SAN',
 'BNA',
 'JAC',
 'SMF',
 'EWR',
 'IAD',
 'LIH',
 'SJU',
 'ABQ',
 'ORF',
 'JAX',
 'MSY',
 'SAT',
 'MCI',
 'GUC',
 'IND',
 'PDX',
 'BWI',
 'MSP',
 'MKE',
 'TUL',
 'ONT',
 'RSW',
 'RNO',
 'DSM',
 'MFE',
 'PSP',
 'OMA',
 'EGE',
 'PBI',
 'SDF',
 'PIT',
 'FAT',
 'DAY',
 'STX',
 'COS',
 'CMH',
 'MTJ',
 'HDN',
 'BDL',
 'MEM',
 'CLE',
 'HOU',
 'BOI',
 'OAK',
 'GEG',
 'ANC',
 'BUF',
 'SYR',
 'ALB',
 'PVD',
 'ROC',
 'ILM',
 'ICT',
 'PWM',
 'GSO',
 'CHS',
 'MDT',
 'BHM',
 'ADQ',
 'BET',
 'BRW',
 'SCC',
 'FAI',
 'JNU',
 'KTN',
 'YAK',
 'CDV',
 'SIT',
 'PSG',
 'WRG',
 'OME',
 'OTZ',
 'BUR',
 'BLI',
 'ADK',
 'SWF',
 'LGB',
 'PSE',
 'BQN',
 'HPN',
 'SAV',
 'SRQ',
 'BTV',
 'ORH',
 'DAB',
 'CVG',
 'BIS',
 'AVL',


We need all the unique airports so we can loop through them and get their geolocation

In [27]:
airports = data.airport_name.unique()
airports[0:10]

array(['Dallas/Fort Worth, TX: Dallas/Fort Worth International',
       'Detroit, MI: Detroit Metro Wayne County',
       'Seattle, WA: Seattle/Tacoma International',
       'New York, NY: John F. Kennedy International',
       'San Jose, CA: Norman Y. Mineta San Jose International',
       "Chicago, IL: Chicago O'Hare International",
       'Phoenix, AZ: Phoenix Sky Harbor International',
       'St. Louis, MO: St Louis Lambert International',
       'Los Angeles, CA: Los Angeles International',
       'Orlando, FL: Orlando International'], dtype=object)

This is the main function that gets the airport abbreviation, its name and geolocation

In [28]:
def get_airport_code(airport_name):
    """function takes in a loosely typed name of city and or airport and 
    returns the airports' three digit code, full name, and coordinates
    It's good to specify the airport if a city has two, such as Dallas Love Field
    and Dallas Fort Worth """
    g = Geocoder.geocode(airport_name)
    proto_code = g.formatted_address.encode('utf-8')    
    airport_code = re.search('\(([^)]+)', proto_code).group(1)
    return [airport_code, str(g.airport), g.coordinates]
#     return [airport_code, g.coordinates]

In [29]:
for full_port in full_airports_good[0:10]:
    try:
        print (get_airport_code(full_port))
    except:
        print full_port
#         full_port =  full_port.split(' ')[0]        
#         full_port = full_port.rstrip(',')
#         print get_airport_code(ppp + 'Airport')

['DFW', 'Dallas/Fort Worth International Airport', (32.8998091, -97.0403352)]
['DTW', 'Detroit Metropolitan Wayne County Airport', (42.2161722, -83.3553842)]
SEA
['JFK', 'John F. Kennedy International Airport', (40.6413111, -73.77813909999999)]
['SJC', 'Norman Y. Mineta San Jose International Airport', (37.3639472, -121.9289375)]
['ORD', "O'Hare International Airport", (41.9741625, -87.9073214)]
['PHX', 'Phoenix Sky Harbor International Airport', (33.4372686, -112.0077881)]
STL
['LAX', 'Los Angeles International Airport', (33.9415889, -118.40853)]
['MCO', 'Orlando International Airport', (28.4311577, -81.308083)]


In [30]:
short1 = full_airports_good[0:10]

In [31]:
for full_port in short1:
    try:
        get_airport_code(full_port)
    except:
        ppp = full_port
        print get_airport_code(ppp + 'Airport')

['SEA', 'Seattle-Tacoma International Airport', (47.4502499, -122.3088165)]
['STL', 'St. Louis Lambert International Airport', (38.7503222, -90.37545209999999)]


In [17]:
for full_port in full_airports_good:
    try:
        print (get_airport_code(full_port))
    except:
        error_string =  full_port.split(',')[0]
#         error_string = error_string.rstrip(',')
        print get_airport_code(error_string + 'Airport')

['DFW', 'Dallas/Fort Worth International Airport', (32.8998091, -97.0403352)]
['DTW', 'Detroit Metropolitan Wayne County Airport', (42.2161722, -83.3553842)]
['SEA', 'Seattle-Tacoma International Airport', (47.4502499, -122.3088165)]
['JFK', 'John F. Kennedy International Airport', (40.6413111, -73.77813909999999)]
['SJC', 'Norman Y. Mineta San Jose International Airport', (37.3639472, -121.9289375)]
['ORD', "O'Hare International Airport", (41.9741625, -87.9073214)]
['PHX', 'Phoenix Sky Harbor International Airport', (33.4372686, -112.0077881)]
['STL', 'St. Louis Lambert International Airport', (38.7503222, -90.37545209999999)]
['LAX', 'Los Angeles International Airport', (33.9415889, -118.40853)]
['MCO', 'Orlando International Airport', (28.4311577, -81.308083)]
['DEN', 'Denver International Airport', (39.8560963, -104.6737376)]
['MIA', 'Miami International Airport', (25.795865, -80.2870457)]
['KOA', 'Kona International Airport', (19.736916, -156.0429246)]
['IAH', 'George Bush Interco

AttributeError: 'NoneType' object has no attribute 'group'

In [18]:
for full_port in full_airports_good:
    get_airport_code(full_port)

AttributeError: 'NoneType' object has no attribute 'group'

In [44]:
def get_airport_code2(airport_name):
    """function takes in a loosely typed name of city and or airport and 
    returns the airports' three digit code, full name, and coordinates
    It's good to specify the airport if a city has two, such as Dallas Love Field
    and Dallas Fort Worth """
    g = Geocoder.geocode(airport_name)
    proto_code = g.formatted_address.encode('utf-8')    
    airport_code = proto_code[proto_code.find("(")+1:proto_code.find(")")]
    return [airport_code, proto_code, g.coordinates]

In [45]:
get_airport_code2('EGE')

['Eagle County Regional Airport, 217 Eldon Wilson Rd, Gypsum, CO 81637, US',
 'Eagle County Regional Airport, 217 Eldon Wilson Rd, Gypsum, CO 81637, USA',
 (39.6401478, -106.9136058)]

In [24]:
for full_port in full_airports_good:
    try:
        print (get_airport_code2(full_port))
    except:
        error_string =  full_port.split(',')[0]
#         error_string = error_string.rstrip(',')
        print get_airport_code(error_string + 'Airport')

['DFW International Airport (DFW), 2400 Aviation Drive North, DFW Airport, TX 75261, USA', (32.8998091, -97.0403352)]
['Detroit Metro Airport, Detroit, MI 48242, USA', (42.2253146, -83.3477632)]
['Seattle-Tacoma International Airport (SEA), 17801 International Blvd, Seattle, WA 98158, USA', (47.4502499, -122.3088165)]
['John F. Kennedy International Airport (JFK), Queens, NY 11430, USA', (40.6413111, -73.77813909999999)]
['Norman Y. Mineta San Jose International Airport (SJC), 1701 Airport Blvd, San Jose, CA 95110, USA', (37.3639472, -121.9289375)]
["O'Hare International Airport (ORD), 10000 W O'Hare Ave, Chicago, IL 60666, USA", (41.9741625, -87.9073214)]
['Phoenix Sky Harbor International Airport (PHX), 3400 E Sky Harbor Blvd, Phoenix, AZ 85034, USA', (33.4372686, -112.0077881)]
['St. Louis Lambert International Airport (STL), 10701 Lambert International Blvd, St. Louis, MO 63145, USA', (38.7503222, -90.37545209999999)]
['Los Angeles International Airport (LAX), 1 World Way, Los Ange

['Montrose Regional Airport, 2100 Airport Rd, Montrose, CO 81401, USA', (38.5026155, -107.89635)]
['Yampa Valley Regional Airport - HDN, 11005 Co Rd 51A, Hayden, CO 81639, USA', (40.4814042, -107.2166138)]
['Bradley International Airport (BDL), Schoephoester Rd, Windsor Locks, CT 06096, USA', (41.9388735, -72.68603139999999)]
['Memphis International Airport (MEM), 2491 Winchester Rd, Memphis, TN 38116, USA', (35.0420679, -89.9791729)]
['Cleveland Hopkins International Airport (CLE), 5300 Riverside Dr, Cleveland, OH 44135, USA', (41.4124339, -81.84799249999999)]
['William P. Hobby Airport (HOU), 7800 Airport Blvd, Houston, TX 77061, USA', (29.6459109, -95.2768859)]
['Boise Airport (BOI), 3201 W Airport Way #1000, Boise, ID 83705, USA', (43.5658231, -116.2223159)]
['Oakland International Airport (OAK), 1 Airport Dr, Oakland, CA 94621, USA', (37.7125689, -122.2197428)]
['Spokane International Airport (GEG), 9000 W Airport Dr, Spokane, WA 99224, USA', (47.62174779999999, -117.534812)]
['Te

AttributeError: 'NoneType' object has no attribute 'group'