# Pull Data out of HTML and XML files
An example showing how to extract data from the [AirTrans website](https://www.transtats.bts.gov/Data_Elements.aspx?Data=2) using the Python library BeautifulSoup.

In [1]:
# Import the required libraries.
from bs4 import BeautifulSoup
import requests
import os
from pprint import pprint

# Name of the downloaded local HTML file that main() parses.
fname = 'virgin_and_logan_airport.html'

# Collect the codes (carriers or airports) offered by one drop-down list of the page.
def options(soup, id):
    """Return the option values found under the element with the given HTML id.

    Args:
        soup: Parsed document exposing ``find(id=...)``.
        id: HTML id of the list element to read, e.g. 'CarrierList'.

    Returns:
        A list with the ``value`` attribute of every ``option`` element, in
        document order, leaving out any value that contains 'All'.
    """
    dropdown = soup.find(id=id)
    # 'All' is a substring of the combination entries (All, AllUS,
    # AllForeign), so a single containment check filters all of them out.
    return [
        entry['value']
        for entry in dropdown.find_all('option')
        if 'All' not in entry['value']
    ]

# Print a labelled listing of codes, one code per line.
def print_list(label, codes):
    """Print a blank line, the label followed by a colon, then each code.

    Args:
        label: Heading shown above the codes.
        codes: Iterable of codes; each one is printed on its own line.
    """
    lines = ['\n%s:' % label]
    lines.extend(str(code) for code in codes)
    print('\n'.join(lines))

# Parse the downloaded local HTML file and print the carrier and airport codes it lists.
def main():
    """Read the file named by ``fname`` and print its carrier and airport codes.

    The codes come from the elements with ids 'CarrierList' and 'AirportList'.
    """
    # Open the file in a context manager so the handle is closed as soon as
    # the document has been parsed instead of being left open.
    with open(fname, encoding="utf8") as html_file:
        soup = BeautifulSoup(html_file, 'lxml')
    codes = options(soup, 'CarrierList')
    print_list('Carriers', codes)
    codes = options(soup, 'AirportList')
    print_list('Airports', codes)

# Call the function to find the Carrier and Airport codes.
# The constant-False guard keeps this call disabled; change the condition to run main().
if False:
    main()

In [2]:
# Both requests below target the same page, so keep its address in one place.
DATA_URL = 'https://www.transtats.bts.gov/Data_Elements.aspx?Data=2'

# Seconds to wait for the server before giving up; without a timeout a
# stalled connection would block the script indefinitely.
REQUEST_TIMEOUT = 30

# Create a persistent session. Persist cookies across requests.
s = requests.Session()

# Get the webpage and fail early on an HTTP error, so that an error page is
# never parsed as if it were the form.
r = s.get(DATA_URL, timeout=REQUEST_TIMEOUT)
r.raise_for_status()

# Parse the webpage using the BeautifulSoup library.
soup = BeautifulSoup(r.text, 'lxml')

# Read the hidden form fields that have to be sent back with the form
# (found with the web browser's developer tools, "inspect element").
viewstate_element = soup.find(id='__VIEWSTATE')
viewstate = viewstate_element['value']
eventvalidation_element = soup.find(id='__EVENTVALIDATION')
eventvalidation = eventvalidation_element['value']
viewstategenerator_element = soup.find(id='__VIEWSTATEGENERATOR')
viewstategenerator = viewstategenerator_element['value']

# Make an HTTP POST request (submit the form) for one carrier/airport pair.
# The sample codes come from the previously generated list.
r = s.post(DATA_URL,
           data=(
                   ("__EVENTTARGET", ""),
                   ("__EVENTARGUMENT", ""),
                   ("__VIEWSTATE", viewstate),
                   ("__VIEWSTATEGENERATOR", viewstategenerator),
                   ("__EVENTVALIDATION", eventvalidation),
                   ("CarrierList", "VX"),
                   ("AirportList", "BOS"),
                   ("Submit", "Submit"),
                   ),
           timeout=REQUEST_TIMEOUT)
# Stop here on an HTTP error rather than carrying an error page forward.
r.raise_for_status()

# Write the body of the response, decoded to unicode text, to a local HTML file.
def export_to_file():
    """Save ``r.text`` to 'VX-BOS.html' and return the file object.

    The file name is built from the carrier and airport codes 'VX' and 'BOS'.

    Returns:
        The file object that was written. It is already closed, so the text
        is guaranteed to be on disk when this function returns.
    """
    # The context manager closes (and therefore flushes) the output file.
    # Nothing is read from the file named by ``fname``, so it is not opened
    # here and the export no longer fails when that file is missing.
    with open('{0}-{1}.html'.format('VX', 'BOS'), 'w') as outfile:
        outfile.write(r.text)
    return outfile

# Write the response text to 'VX-BOS.html'.
export_to_file()

# Rename file.
# NOTE(review): export_to_file() already writes to 'VX-BOS.html' directly,
# which is presumably why this rename is disabled - confirm before removing.
# new_fname = 'VX-BOS.html'
# os.rename(fname, new_fname)

<_io.TextIOWrapper name='VX-BOS.html' mode='w' encoding='UTF-8'>

Extract and process the flight data from the local file.

In [3]:
def process_file(filename):
    """Extract the per-month rows from a saved carrier/airport results page.

    Args:
        filename: Path of an HTML file whose base name has the form
            '<carrier>-<airport>.<ext>', e.g. 'VX-BOS.html'.

    Returns:
        A list of dictionaries, one per table row that is not a 'TOTAL' row,
        each with the keys 'courier', 'airport', 'year' and 'month'. The year
        and month are kept as the text found in the page.

    Raises:
        ValueError: If the base name does not split into exactly two parts
            on '-'.
    """
    data = []
    info = {}
    # Take the codes from the base name without its extension instead of a
    # fixed six-character slice, so directory prefixes and codes of other
    # lengths are handled too.
    stem = os.path.splitext(os.path.basename(filename))[0]
    info['courier'], info['airport'] = stem.split('-')

    with open(filename, 'r') as f:
        soup = BeautifulSoup(f, 'lxml')

    for row in soup.find_all('tr', 'dataTDRight'):
        tds = row.find_all('td')
        # Skip rows that cannot hold a year and a month, and summary rows.
        if len(tds) < 2 or tds[1].text == 'TOTAL':
            continue
        info['year'] = tds[0].text
        info['month'] = tds[1].text
        # Flight counts are not extracted yet:
#         info['flights'] = {'domestic': int(tds[2].text.replace(',', '')),
#                            'international': int(tds[3].text.replace(',', ''))}
        # Append a copy: ``info`` is reused for every row, so appending the
        # dictionary itself would make all entries share the same values.
        data.append(info.copy())

    return data

#          # Alternative solution
#         table = soup.find("table", "dataTDRight")
#         for i, tr in enumerate(table.find_all("tr")[1:]) :
#             td_list = []
#             flights = {}
#             for td in tr.find_all("td"):
#                 td_list.append(td.get_text())
#                 print(td_list)
#             if td_list[0] == "TOTAL" or td_list[1] == "TOTAL":
#                 continue 
#             else:
#                 info['year'] = int(td_list[0])
#                 info['month'] = int(td_list[1])
#                 # flights["domestic"]= int(td_list[2].replace(",",""))      
#                 # flights["international"] = int(td_list[3].replace(",","")) 
#                 info['flights'] = flights
#                 data.append(info.copy())
            
#         return data

# Disabled by the constant-False guard; when enabled it pretty-prints the
# records that process_file() returns for 'VX-BOS.html'.
if False:
    pprint(process_file('VX-BOS.html'))

* Create a function to split XML files.

In [4]:
def outfile_generator(fname):
    """Yield freshly opened output files named '<fname>-0', '<fname>-1', ...

    Args:
        fname: Base name (or path) that every output file name starts with.

    Yields:
        A new file object opened in write mode on each iteration. The caller
        is responsible for closing every file it receives.
    """
    count = 0
    while True:
        # Curly braces are the replacement fields of str.format; with square
        # brackets every file would literally be called '[0]-[1]'.
        yield open('{0}-{1}'.format(fname, count), 'w')
        count += 1

def split_file(fname, pattern='<?xml'):
    """Split a file into several files, starting a new one at each marker line.

    Every line that contains ``pattern`` starts a new output file; the output
    files are obtained from ``outfile_generator(fname)``.

    Args:
        fname: Path of the file to split.
        pattern: Text that marks the first line of each part. The default is
            the start of an XML declaration. An empty string matches every
            line and therefore puts each line into its own file.
    """
    # Create the iterator that hands out the output files.
    outfile_iterator = outfile_generator(fname)
    outfile = None

    try:
        with open(fname, 'r') as initial_file:
            for line in initial_file:
                # Start a new file at each marker line. Lines that come
                # before the first marker also need a file to go to, so one
                # is opened for them instead of failing on an undefined name.
                if outfile is None or pattern in line:
                    if outfile is not None:
                        outfile.close()
                    outfile = next(outfile_iterator)
                # Write the line.
                outfile.write(line)
    finally:
        # Close the last output file even if reading or writing failed.
        if outfile is not None:
            outfile.close()