In [18]:
# imports
import time
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import plotly.plotly as py
%matplotlib inline
fig_size = [12,9]
plt.rcParams["figure.figsize"] = fig_size

plt.style.use("ggplot")



In [2]:
airports = pd.read_csv("../data/flight-delays/airports.csv")
airlines = pd.read_csv("../data/flight-delays/airlines.csv")
flights = pd.read_csv("../data/flight-delays/flights.csv")


Columns (7,8) have mixed types. Specify dtype option on import or set low_memory=False.



In [3]:
# Segment the data

df = flights.loc[(flights["YEAR"]==2015) & (flights["MONTH"] == 1) ,:]
df.shape

(469968, 31)

In [None]:
# Plot the airports

data = [
    dict(
    type = "scattergeo",
    locationmode = "USA-states",
    lon = airports["LONGITUDE"],
    lat = airports["LATITUDE"],
    text = airports["AIRPORT"],
    marker = dict(
        size = 8,
        opacity = 0.8,
        reversescale = True,
        autocolorscale = False,
        symbol = "square",
        line = dict(
            width = 1,
            color = 'rgba(102, 102, 102)'
        )
    )
    )
]

layout = dict(
    title = "US Airports",
    colorbar = True,
    geo = dict(
        scope = "usa",
        projection = dict ( type = "albers usa"),
        showland = True,
        landcolor = "rgb(250, 250, 250)",
        subunitcolor = "rgb(217, 217, 217)",
        countrycolor = "rgb(217, 217, 217)",
        countrywidth = 0.5,
        subunitwidth = 0.5
    ),
)

fig = dict( data=data, layout=layout)

py.iplot( fig, validate = False, filename = "d3-airport-s")

In [None]:
# Now plot the flights between the airports for January
df.head()

In [None]:
# space

%timeit flights.loc[(flights["ORIGIN_AIRPORT"] == airports["IATA_CODE"][176]),]


In [7]:
# space

january_flights = flights.loc[(flights["MONTH"] == 1),:]

routes = ["{},{}".format(a,b) for a,b in zip(january_flights["ORIGIN_AIRPORT"],january_flights["DESTINATION_AIRPORT"])]

print(len(routes))

ndf = pd.DataFrame({
    "ORIGIN_LONGITUDE": np.nan,
    "ORIGIN_LATITUDE": np.nan,
    "ORIGIN_AIRPORT": [x.split(",")[0] for x in routes],
    "DEPARTURE_LONGITUDE": np.nan,
    "DEPARTURE_LATITUDE": np.nan,
    "DEPARTURE_AIRPORT": [x.split(",")[1] for x in routes],
    "AVERAGE_DELAY": np.nan
})

# %timeit np.mean(flights.loc[(flights["ORIGIN_AIRPORT"] == airports["IATA_CODE"][176]) & (flights["ORIGIN_AIRPORT"] == airports["IATA_CODE"][278]) & (flights["MONTH"] == 1),].apply(lambda x: x["ARRIVAL_DELAY"] + x["DEPARTURE_DELAY"],axis=1))



469968


In [None]:
print(airports["IATA_CODE"][278])

In [29]:
def fillgeo(df):
    df["DEPARTURE_LATITUDE"] = df["DEPARTURE_AIRPORT"].apply(lambda x: airports.loc[airports["IATA_CODE"] == x,"LATITUDE"].values[0])
    print("Finished departure lats")
    df["DEPARTURE_LONGITUDE"] = df["DEPARTURE_AIRPORT"].apply(lambda x: airports.loc[airports["IATA_CODE"] == x,"LONGITUDE"].values[0])
    print("Finished departure longs")
    df["ORIGIN_LATITUDE"] = df["ORIGIN_AIRPORT"].apply(lambda x: airports.loc[airports["IATA_CODE"] == x,"LATITUDE"].values[0])
    print("Finished origin lats")
    df["ORIGIN_LONGITUDE"] = df["ORIGIN_AIRPORT"].apply(lambda x: airports.loc[airports["IATA_CODE"] == x,"LONGITUDE"].values[0])
    print("Finished origin longs")
    return df

def fill_average_delay(df,flights):
    start = time.time()
    # get relevant flights
#     mylist = []
#     for a,b in zip(df["ORIGIN_AIRPORT"],df["DEPARTURE_AIRPORT"]):
#         mylist.append(flights.loc[(flights["ORIGIN_AIRPORT"] == a) & (flights["ORIGIN_AIRPORT"] == b), ["ARRIVAL_DELAY","DEPARTURE_DELAY"]])
    # temp_flights = flights.loc[(flights["ORIGIN_AIRPORT"] == df["ORIGIN_AIRPORT"]) & (flights["ORIGIN_AIRPORT"] == df["ORIGIN_AIRPORT"]) ,]
    #values = df.apply(lambda x: np.mean((flights.loc[(flights["ORIGIN_AIRPORT"] == x["ORIGIN_AIRPORT"]) & (flights["ORIGIN_AIRPORT"] == x["ORIGIN_AIRPORT"]) ,["ARRIVAL_DELAY","DEPARTURE_DELAY"]]).apply(lambda x: sum(x),axis=1) ,axis=1))
    df["AVERAGE_DELAY"] = df.apply(lambda x: np.mean((flights.loc[(flights["ORIGIN_AIRPORT"] == x["ORIGIN_AIRPORT"]) & (flights["ORIGIN_AIRPORT"] == x["ORIGIN_AIRPORT"]), ["ARRIVAL_DELAY","DEPARTURE_DELAY"]]).apply(lambda y: y["ARRIVAL_DELAY"] + y["DEPARTURE_DELAY"],axis=1)) ,axis=1)
    end = time.time()
    print(end - start)
    return df


In [17]:
ndf = fillgeo(ndf)
ndf.head()

Finished departure lats
Finished departure longs
Finished departure lats
Finished departure longs


Unnamed: 0,AVERAGE_DELAY,DEPARTURE_AIRPORT,DEPARTURE_LATITUDE,DEPARTURE_LONGITUDE,ORIGIN_AIRPORT,ORIGIN_LATITUDE,ORIGIN_LONGITUDE
0,,SEA,47.44898,-122.30931,ANC,61.17432,-149.99619
1,,PBI,26.68316,-80.09559,LAX,33.94254,-118.40807
2,,CLT,35.21401,-80.94313,SFO,37.619,-122.37484
3,,MIA,25.79325,-80.29056,LAX,33.94254,-118.40807
4,,ANC,61.17432,-149.99619,SEA,47.44898,-122.30931


In [13]:
airports.head()

Unnamed: 0,IATA_CODE,AIRPORT,CITY,STATE,COUNTRY,LATITUDE,LONGITUDE
0,ABE,Lehigh Valley International Airport,Allentown,PA,USA,40.65236,-75.4404
1,ABI,Abilene Regional Airport,Abilene,TX,USA,32.41132,-99.6819
2,ABQ,Albuquerque International Sunport,Albuquerque,NM,USA,35.04022,-106.60919
3,ABR,Aberdeen Regional Airport,Aberdeen,SD,USA,45.44906,-98.42183
4,ABY,Southwest Georgia Regional Airport,Albany,GA,USA,31.53552,-84.19447


In [30]:
ndf = fill_average_delay(ndf,january_flights)

KeyboardInterrupt: 

In [24]:
january_flights.head()

Unnamed: 0,YEAR,MONTH,DAY,DAY_OF_WEEK,AIRLINE,FLIGHT_NUMBER,TAIL_NUMBER,ORIGIN_AIRPORT,DESTINATION_AIRPORT,SCHEDULED_DEPARTURE,...,ARRIVAL_TIME,ARRIVAL_DELAY,DIVERTED,CANCELLED,CANCELLATION_REASON,AIR_SYSTEM_DELAY,SECURITY_DELAY,AIRLINE_DELAY,LATE_AIRCRAFT_DELAY,WEATHER_DELAY
0,2015,1,1,4,AS,98,N407AS,ANC,SEA,5,...,408.0,-22.0,0,0,,,,,,
1,2015,1,1,4,AA,2336,N3KUAA,LAX,PBI,10,...,741.0,-9.0,0,0,,,,,,
2,2015,1,1,4,US,840,N171US,SFO,CLT,20,...,811.0,5.0,0,0,,,,,,
3,2015,1,1,4,AA,258,N3HYAA,LAX,MIA,20,...,756.0,-9.0,0,0,,,,,,
4,2015,1,1,4,AS,135,N527AS,SEA,ANC,25,...,259.0,-21.0,0,0,,,,,,
