## Second exercise: top 10 arrival airports in the world in 2013 (using the bookings file)

**Problem solved using pandas**

**The corresponding airport name, city and country are provided for each airport code**

In [2]:
# Peek at the header row and the first data record of bookings.csv to identify
# the column names and the field delimiter.
from itertools import islice
with open("bookings.csv") as f:
    print([*islice(f, 2)])

['act_date           ^source^pos_ctry^pos_iata^pos_oid  ^rloc          ^cre_date           ^duration^distance^dep_port^dep_city^dep_ctry^arr_port^arr_city^arr_ctry^lst_port^lst_city^lst_ctry^brd_port^brd_city^brd_ctry^off_port^off_city^off_ctry^mkt_port^mkt_city^mkt_ctry^intl^route          ^carrier^bkg_class^cab_class^brd_time           ^off_time           ^pax^year^month^oid      \n', '2013-03-05 00:00:00^1A    ^DE      ^a68dd7ae953c8acfb187a1af2dcbe123^1a11ae49fcbf545fd2afc1a24d88d2b7^ea65900e72d71f4626378e2ebd298267^2013-02-22 00:00:00^1708^0^ZRH     ^ZRH     ^CH      ^LHR     ^LON     ^GB      ^ZRH     ^ZRH     ^CH      ^LHR     ^LON     ^GB      ^ZRH     ^ZRH     ^CH      ^LHRZRH  ^LONZRH  ^CHGB    ^1^LHRZRH         ^VI^T        ^Y        ^2013-03-07 08:50:00^2013-03-07 11:33:37^-1^2013^3^NULL     \n']


In [3]:
import pandas as pd

# Load only the three columns needed for the ranking; the file is '^'-delimited.
bookings = pd.read_csv('bookings.csv', sep='^', usecols=['arr_port', 'pax', 'year'])

# Airport codes are space-padded in the file, so strip them before they are used
# as merge keys. The vectorised .str accessor leaves missing values as NaN,
# whereas map(str.strip) raises a TypeError on any non-string cell.
bookings['arr_port'] = bookings['arr_port'].str.strip()

# Keep only records whose year is 2013 (mostly a safeguard: the original author
# noted that only one NaN record was found), then drop the now-constant column.
# drop(columns=...) replaces the positional axis argument, which was deprecated
# and is rejected by pandas 2.x.
bookings = bookings[bookings.year == 2013]
bookings = bookings.drop(columns='year')

# Sum passengers per arrival airport and keep the 10 largest totals.
top10 = bookings.groupby(by='arr_port').sum().sort_values(by='pax', ascending=False)[:10]
top10

Unnamed: 0_level_0,pax
arr_port,Unnamed: 1_level_1
LHR,88809.0
MCO,70930.0
LAX,70530.0
LAS,69630.0
JFK,66270.0
CDG,64490.0
BKK,59460.0
MIA,58150.0
SFO,58000.0
DXB,55590.0


In [4]:
# Load the airport reference table from airports.csv, keeping only the IATA/FAA
# code and the descriptive columns that get attached to the ranking.
airports_list = pd.read_csv(
    'airports.csv',
    usecols={'Airport_Name', 'City', 'Country', 'IATA/FAA'},
)
airports_list.head(3)

Unnamed: 0,Airport_Name,City,Country,IATA/FAA
0,Goroka,Goroka,Papua New Guinea,GKA
1,Madang,Madang,Papua New Guinea,MAG
2,Mount Hagen,Mount Hagen,Papua New Guinea,HGU


In [5]:
# Attach airport name, city and country to each ranked airport code.
# top10 is indexed by arr_port, so its index is matched against the IATA/FAA
# column; the left join keeps every ranked airport even when the reference
# table has no entry for it.
top10_with_Cities = pd.merge(top10, airports_list, how='left', left_index=True, right_on='IATA/FAA')

# Number the rows from 1 so the index reads as a rank. The length is taken from
# the merged frame instead of being hard-coded to 10, so the cell still runs
# when the merge returns a different number of rows (fewer ranked airports, or
# a code listed more than once in the reference table).
top10_with_Cities.index = pd.RangeIndex(1, len(top10_with_Cities) + 1)
top10_with_Cities['pax'] = top10_with_Cities['pax'].astype(int)
top10_with_Cities

Unnamed: 0,pax,Airport_Name,City,Country,IATA/FAA
1,88809,Heathrow,London,United Kingdom,LHR
2,70930,Orlando Intl,Orlando,United States,MCO
3,70530,Los Angeles Intl,Los Angeles,United States,LAX
4,69630,Mc Carran Intl,Las Vegas,United States,LAS
5,66270,John F Kennedy Intl,New York,United States,JFK
6,64490,Charles De Gaulle,Paris,France,CDG
7,59460,Suvarnabhumi Intl,Bangkok,Thailand,BKK
8,58150,Miami Intl,Miami,United States,MIA
9,58000,San Francisco Intl,San Francisco,United States,SFO
10,55590,Dubai Intl,Dubai,United Arab Emirates,DXB
