In [1]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Let's assume that you combined the code from the previous 2 exercises with code
from the lesson on how to build requests, and downloaded all the data locally.
The files are in a directory "data", named after the carrier and airport:
"{}-{}.html".format(carrier, airport), for example "FL-ATL.html".

The table with flight info has a table class="dataTDRight". Your task is to
use 'process_file()' to extract the flight data from that table as a list of
dictionaries, each dictionary containing relevant data from the file and table
row. This is an example of the data structure you should return:

data = [{"courier": "FL",
         "airport": "ATL",
         "year": 2012,
         "month": 12,
         "flights": {"domestic": 100,
                     "international": 100}
        },
         {"courier": "..."}
]

Note - year, month, and the flight data should be integers.
You should skip the rows that contain the TOTAL data for a year.

There are couple of helper functions to deal with the data files.
Please do not change them for grading purposes.
All your changes should be in the 'process_file()' function.

The 'data/FL-ATL.html' file in the tab above is only a part of the full data,
covering data through 2003. The test() code will be run on the full table, but
the given file should provide an example of what you will get.
"""
from bs4 import BeautifulSoup
from zipfile import ZipFile
import os

datadir = "data"


def open_zip(datadir):
    with ZipFile('{0}.zip'.format(datadir), 'r') as myzip:
        myzip.extractall()


def process_all(datadir):
    files = os.listdir(datadir)
    return files




In [None]:
def process_file(f):
    """
    This function extracts data from the file given as the function argument in
    a list of dictionaries. This is example of the data structure you should
    return:

    data = [{"courier": "FL",
             "airport": "ATL",
             "year": 2012,
             "month": 12,
             "flights": {"domestic": 100,
                         "international": 100}
            },
            {"courier": "..."}
    ]


    Note - year, month, and the flight data should be integers.
    You should skip the rows that contain the TOTAL data for a year.
    """
    data = []
    info = {}
    info["courier"], info["airport"] = f[:6].split("-")
    # Note: create a new dictionary for each entry in the output data list.
    # If you use the info dictionary defined here each element in the list 
    # will be a reference to the same info dictionary.
    with open("{}/{}".format(datadir, f), "r") as html:

        soup = BeautifulSoup(html)

    return data




In [36]:
html_f = open('./data/FL-ATL.html')
data = []
info = {}
soup = BeautifulSoup(html_f,"html.parser")

In [37]:
table = soup.find("table", attrs={"class":"dataTDRight"})
headings = [th.get_text() for th in table.find("tr").find_all("td")]
headings

[u'Year', u'Month', u'DOMESTIC', u'INTERNATIONAL', u'TOTAL']

In [56]:
for row in table.find_all("tr")[1:]:

    row_info = [td.get_text() for td in row.find_all("td")]
    if row_info[1] != 'TOTAL':
        info['year'] = int(row_info[0])
        info['month'] = int(row_info[1])
        info['flights']={}
        info['flights']['domestic'] = int(row_info[2].replace(',', ''))
        info['flights']['international'] = int(row_info[3].replace(',', ''))
        data.append(info)



In [57]:
data

[{'flights': {'domestic': 798879, 'international': 97094},
  'month': 12,
  'year': 2003},
 {'flights': {'domestic': 798879, 'international': 97094},
  'month': 12,
  'year': 2003},
 {'flights': {'domestic': 798879, 'international': 97094},
  'month': 12,
  'year': 2003},
 {'flights': {'domestic': 798879, 'international': 97094},
  'month': 12,
  'year': 2003},
 {'flights': {'domestic': 798879, 'international': 97094},
  'month': 12,
  'year': 2003},
 {'flights': {'domestic': 798879, 'international': 97094},
  'month': 12,
  'year': 2003},
 {'flights': {'domestic': 798879, 'international': 97094},
  'month': 12,
  'year': 2003},
 {'flights': {'domestic': 798879, 'international': 97094},
  'month': 12,
  'year': 2003},
 {'flights': {'domestic': 798879, 'international': 97094},
  'month': 12,
  'year': 2003},
 {'flights': {'domestic': 798879, 'international': 97094},
  'month': 12,
  'year': 2003},
 {'flights': {'domestic': 798879, 'international': 97094},
  'month': 12,
  'year': 2003},

In [22]:
soup.find(id='DataGrid1').find_all('td')

[<td>Year</td>,
 <td>Month</td>,
 <td>DOMESTIC</td>,
 <td>INTERNATIONAL</td>,
 <td>TOTAL</td>,
 <td style="font-family: Verdana, Geneva, Arial, Helvetica, sans-serif;">2002</td>,
 <td style="font-family: Verdana, Geneva, Arial, Helvetica, sans-serif;">10</td>,
 <td style="font-family: Verdana, Geneva, Arial, Helvetica, sans-serif;">815,489</td>,
 <td style="font-family: Verdana, Geneva, Arial, Helvetica, sans-serif;">92,565</td>,
 <td style="font-family: Verdana, Geneva, Arial, Helvetica, sans-serif;">908,054</td>,
 <td style="font-family: Verdana, Geneva, Arial, Helvetica, sans-serif;">2002</td>,
 <td style="font-family: Verdana, Geneva, Arial, Helvetica, sans-serif;">11</td>,
 <td style="font-family: Verdana, Geneva, Arial, Helvetica, sans-serif;">766,775</td>,
 <td style="font-family: Verdana, Geneva, Arial, Helvetica, sans-serif;">91,342</td>,
 <td style="font-family: Verdana, Geneva, Arial, Helvetica, sans-serif;">858,117</td>,
 <td style="font-family: Verdana, Geneva, Arial, Helv

In [None]:
def test():
    print "Running a simple test..."
    open_zip(datadir)
    files = process_all(datadir)
    data = []
    # Test will loop over three data files.
    for f in files:
        data += process_file(f)
        
    assert len(data) == 399  # Total number of rows
    for entry in data[:3]:
        assert type(entry["year"]) == int
        assert type(entry["month"]) == int
        assert type(entry["flights"]["domestic"]) == int
        assert len(entry["airport"]) == 3
        assert len(entry["courier"]) == 2
    assert data[0]["courier"] == 'FL'
    assert data[0]["month"] == 10
    assert data[-1]["airport"] == "ATL"
    assert data[-1]["flights"] == {'international': 108289, 'domestic': 701425}
    
    print "... success!"

if __name__ == "__main__":
    test()