# Import dependancies

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import bz2

import neobase as nb

import os
os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"

In [2]:
# windows file paths
#bookings_path = r'../../../data/challenge/bookings.csv.bz2'
#searches_path = r'../../../data/challenge/searches.csv.bz2'
# linux file paths
bookings_path = '../../data/challenge/bookings.csv.bz2'
searches_path = '../../data/challenge/searches.csv.bz2'

Since bookings file is 4.4Gb, we could use a sample to explore the data.

In [3]:
sample = pd.read_csv(bookings_path, compression='bz2', sep= '^', header=0, nrows = 100_000)
sample.to_csv('../../data/challenge/sample.csv', index=False)

# Explore the data available

In [4]:
sample_path = '../../data/challenge/sample.csv'
sample = pd.read_csv(sample_path)
sample.columns

Index(['act_date           ', 'source', 'pos_ctry', 'pos_iata', 'pos_oid  ',
       'rloc          ', 'cre_date           ', 'duration', 'distance',
       'dep_port', 'dep_city', 'dep_ctry', 'arr_port', 'arr_city', 'arr_ctry',
       'lst_port', 'lst_city', 'lst_ctry', 'brd_port', 'brd_city', 'brd_ctry',
       'off_port', 'off_city', 'off_ctry', 'mkt_port', 'mkt_city', 'mkt_ctry',
       'intl', 'route          ', 'carrier', 'bkg_class', 'cab_class',
       'brd_time           ', 'off_time           ', 'pax', 'year', 'month',
       'oid      '],
      dtype='object')

Some columns seem to have a indefinite amount of white spaces.

In [5]:
sample.columns = [col.strip() for col in sample.columns]
sample.columns

Index(['act_date', 'source', 'pos_ctry', 'pos_iata', 'pos_oid', 'rloc',
       'cre_date', 'duration', 'distance', 'dep_port', 'dep_city', 'dep_ctry',
       'arr_port', 'arr_city', 'arr_ctry', 'lst_port', 'lst_city', 'lst_ctry',
       'brd_port', 'brd_city', 'brd_ctry', 'off_port', 'off_city', 'off_ctry',
       'mkt_port', 'mkt_city', 'mkt_ctry', 'intl', 'route', 'carrier',
       'bkg_class', 'cab_class', 'brd_time', 'off_time', 'pax', 'year',
       'month', 'oid'],
      dtype='object')

We could also get rid of he spaces in some columns using loc

In [6]:
sample.loc[:,'arr_port'] = sample['arr_port'].str.strip()

Or we could rename all columns using a dictionary.

In [7]:
new_cols = {}
for col in sample.columns:
    new_cols[col] = col.strip()
sample = sample.rename(new_cols, axis=1)

# Exercise 1

Count the number of lines in python for each file

In order to do it as fast as possible we can select one column to count all lines.

bookings_lenght = len(pd.read_csv(
                                  bookings_path,
                                  compression='bz2', 
                                  sep= '^', 
                                  header=0, 
                                  usecols=['pax'], 
                                  ))
print(f'Bookings.csv has {bookings_lenght} lines.')

searches = pd.read_csv(
                       searches_path,
                       compression='bz2', 
                       sep= '^', 
                       header=0,
                       nrows=10
                       )
searches.columns

searches_lenght = len(pd.read_csv(
                                searches_path,
                                compression='bz2', 
                                sep= '^', 
                                header=0, 
                                usecols=['Date'],
                                ))
print(f'searches.csv has {searches_lenght} lines.')

# Exercise 2

Top 10 arrival airports in the world in 2013 (using the bookings file)

• Arrival airport is the column **arr_port**. It is the IATA code for the airport

• To get the total number of passengers for an airport, you can *sum* the column
**pax**, grouping by **arr_port**. Note that there is negative pax. That corresponds to
cancelations. So to get the total number of passengers that have actually
booked, you should sum including the negatives (that will remove the canceled
bookings).

• Print the top 10 arrival airports in the standard output, including the number of
passengers.

• Bonus point: Get the name of the city or airport corresponding to that airport
(programatically, we suggest to have a look at GeoBases in Github)

• Bonus point: Solve this problem using pandas (instead of any other approach)

In [8]:
df_full = pd.read_csv(
                     bookings_path,
                     compression='bz2', 
                     sep= '^', 
                     header=0, 
                     usecols=['arr_port', 'pax', 'year'], 
                     #nrows = 100000
                     chunksize=100_000 # con este parametro se convierte en un iterador
                    )

In [9]:
with df_full as df_iter:
    for chunk in df_full:
        print(len(chunk))
        break

100000


Once we have a valid loop over the full data set, and a way to look for the top airports let's look at the full data.

In [None]:
top_airports = []
i=0
with df_full as df_iter:
    for chunk in df_full:
        top_chunk = chunk.groupby('arr_port')['pax'].sum().reset_index().sort_values(by='pax', ascending=False)
        top_airports = top_airports.append(top_chunk)

: 

: 

In [68]:
top_airports

[]

In [None]:
chunk.groupby('arr_port')['pax'].sum().reset_index().sort_values(by='pax', ascending=False).head(10)

In [10]:
len(top_airports)

101

In [7]:
top_airports.groupby('arr_port')['pax'].sum().reset_index().sort_values(by='pax', ascending=False).head(10)

AttributeError: 'list' object has no attribute 'groupby'

https://pypi.org/project/GeoBases/

# Exercise 3

Plot the monthly number of searches for flights arriving at Málaga, Madrid or Barcelona

• For the arriving airport, you can use the
Destination column in the searches file.

• Plot a curve for Málaga, another one for
Madrid, and another one for Barcelona, in
the same figure.

• Bonus point: Solving this problem using
pandas (instead of any other approach)

# Exercise 4
Match searches with bookings

• For every search in the searches file, find out whether
the search ended up in a booking or not (using the info
in the bookings file). For instance, search and booking
origin and destination should match.

• For the bookings file, origin and destination are the
columns dep_port and arr_port, respectively.

• Generate a CSV file with the search data, and an
additional field, containing 1 if the search ended up in
a booking, and 0 otherwise.

# Exercise 5

Write a Web Service

• Wrap the output of the second exercise in a
web service that returns the data in JSON
format (instead of printing to the standard
output).

• The web service should accept a parameter
n>0. For the top 10 airports, n is 10. For the X
top airports, n is X 