# Lesson 1 - Data Extraction Fundamentals

## Beatles Diskography

In [52]:
# Your task is to read the input DATAFILE line by line, and for the first 10 lines (not including the header)
# split each line on "," and then for each line, create a dictionary
# where the key is the header title of the field, and the value is the value of that field in the row.
# The function parse_file should return a list of dictionaries,
# each data line in the file being a single list entry.
# Field names and values should not contain extra whitespace, like spaces or newline characters.
# You can use the Python string method strip() to remove the extra whitespace.
# You have to parse only the first 10 data lines in this exercise,
# so the returned list should have 10 entries!
import os

DATADIR = "data"
DATAFILE = "beatles-diskography.csv"


def parse_file(datafile):
    data = []
    rows = []
    with open(datafile, "r") as f:
        for line in f:        
            line = line.split(',')
            for i in range(len(line)):
                line[i] = line[i].strip()
            rows.append(line)
        for j in range(10):
            data.append(dict(zip(rows[0],rows[j+1])))
        
        print data[0]
        print data[9]
    return data


def test():
    # a simple test of your implemetation
    datafile = os.path.join(DATADIR, DATAFILE)
    d = parse_file(datafile)
    #print d
    firstline = {'Title': 'Please Please Me', 'UK Chart Position': '1', 'Label': 'Parlophone(UK)', 'Released': '22 March 1963', 'US Chart Position': '-', 'RIAA Certification': 'Platinum', 'BPI Certification': 'Gold'}
    tenthline = {'Title': '', 'UK Chart Position': '1', 'Label': 'Parlophone(UK)', 'Released': '10 July 1964', 'US Chart Position': '-', 'RIAA Certification': '', 'BPI Certification': 'Gold'}

    assert d[0] == firstline
    assert d[9] == tenthline
    
test()

{'Title': 'Please Please Me', 'UK Chart Position': '1', 'Label': 'Parlophone(UK)', 'Released': '22 March 1963', 'US Chart Position': '-', 'RIAA Certification': 'Platinum', 'BPI Certification': 'Gold'}
{'Title': '', 'UK Chart Position': '1', 'Label': 'Parlophone(UK)', 'Released': '10 July 1964', 'US Chart Position': '-', 'RIAA Certification': '', 'BPI Certification': 'Gold'}


## ERCOT Hourly Load

In [66]:
#!/usr/bin/env python
"""
Your task is as follows:
- read the provided Excel file
- find and return the min, max and average values for the COAST region
- find and return the time value for the min and max entries
- the time values should be returned as Python tuples

Please see the test function for the expected return format

"""

import xlrd
import numpy as np
import pandas as pd
from zipfile import ZipFile
datafile = "./data/2013_ERCOT_Hourly_Load_Data.xls"


def open_zip(datafile):
    with ZipFile(datafile, 'r') as myzip:
        myzip.extractall()


def parse_file(datafile):
    workbook = xlrd.open_workbook(datafile)
    sheet = workbook.sheet_by_index(0)

    ### example on how you can get the data
    sheet_data = pd.DataFrame([[sheet.cell_value(r, col) for col in range(sheet.ncols)] for r in range(sheet.nrows)])

    maxcoast = sheet_data[1][1:].max()
    print maxcoast
    maxcoastidx = sheet_data[1][1:].idxmax()
    print maxcoastidx
    maxtime = xlrd.xldate_as_tuple(sheet.cell_value(maxcoastidx, 0), 0)
    print maxtime
    
    mincoast = sheet_data[1][1:].min()
    print mincoast
    mincoastidx = sheet_data[1][1:].idxmin()
    mintime = xlrd.xldate_as_tuple(sheet.cell_value(mincoastidx, 0), 0)
    print mintime
    
    meancoast = sheet_data[1][1:].mean()
    print meancoast

    ### other useful methods:
    #print "\nROWS, COLUMNS, and CELLS:"
    #print "Number of rows in the sheet:", 
    #print sheet.nrows
    #print "Type of data in cell (row 3, col 2):", 
    #print sheet.cell_type(3, 2)
    #print "Value in cell (row 3, col 2):", 
    #print sheet.cell_value(3, 2)
    #print "Get a slice of values in column 3, from rows 1-3:"
    #print sheet.col_values(3, start_rowx=1, end_rowx=4)

    # print "\nDATES:"
    # print "Type of data in cell (row 1, col 0):", 
    # print sheet.cell_type(1, 0)
    # exceltime = sheet.cell_value(1, 0)
    # print "Time in Excel format:",
    # print exceltime
    # print "Convert time to a Python datetime tuple, from the Excel float:",
    # print xlrd.xldate_as_tuple(exceltime, 0)
    
    
    data = {
            'maxtime': maxtime,
            'maxvalue': maxcoast,
            'mintime': mintime,
            'minvalue': mincoast,
            'avgcoast': meancoast
    }
    return data


def test():
    #open_zip(datafile)
    data = parse_file(datafile)

    assert data['maxtime'] == (2013, 8, 13, 17, 0, 0)
    assert round(data['maxvalue'], 10) == round(18779.02551, 10)


test()

18779.02551
5392
(2013, 8, 13, 17, 0, 0)
6602.113899
(2013, 2, 3, 4, 0, 0)
10976.9334607


## Musicbrainz

In [85]:
"""
To experiment with this code freely you will have to run this code locally.
Take a look at the main() function for an example of how to use the code. We
have provided example json output in the other code editor tabs for you to look
at, but you will not be able to run any queries through our UI.
"""
import json
import requests

BASE_URL = "http://musicbrainz.org/ws/2/"
ARTIST_URL = BASE_URL + "artist/"


# query parameters are given to the requests.get function as a dictionary; this
# variable contains some starter parameters.
query_type = {  "simple": {},
                "atr": {"inc": "aliases+tags+ratings"},
                "aliases": {"inc": "aliases"},
                "releases": {"inc": "releases"}}


def query_site(url, params, uid="", fmt="json"):
    """
    This is the main function for making queries to the musicbrainz API. The
    query should return a json document.
    """
    params["fmt"] = fmt
    r = requests.get(url + uid, params=params)
    print "requesting", r.url

    if r.status_code == requests.codes.ok:
        return r.json()
    else:
        r.raise_for_status()


def query_by_name(url, params, name):
    """
    This adds an artist name to the query parameters before making an API call
    to the function above.
    """
    params["query"] = "artist:" + name
    return query_site(url, params)


def pretty_print(data, indent=4):
    """
    After we get our output, we can use this function to format it to be more
    readable.
    """
    if type(data) == dict:
        print json.dumps(data, indent=indent, sort_keys=True)
    else:
        print data


def main():
    """
    Below is an example investigation to help you get started in your
    exploration. Modify the function calls and indexing below to answer the
    questions on the next quiz.

    HINT: Note how the output we get from the site is a multi-level JSON
    document, so try making print statements to step through the structure one
    level at a time or copy the output to a separate output file. Experimenting
    and iteration will be key to understand the structure of the data!
    """

    # Query for information in the database about bands named Nirvana
    results = query_by_name(ARTIST_URL, query_type["simple"], "Nirvana")
    #pretty_print(results)

    # Isolate information from the 4th band returned (index 3)
    #print "\nARTIST:"
    #pretty_print(results["artists"][4])

    # Query for releases from that band using the artist_id
    artist_id = results["artists"][4]["id"]
    artist_data = query_site(ARTIST_URL, query_type["releases"], artist_id)
    releases = artist_data["releases"]

    # Print information about releases from the selected band
    #print "\nONE RELEASE:"
    #pretty_print(releases[6], indent=2)

    release_titles = [r["title"] for r in releases]
    print "\nALL TITLES:"
    for t in release_titles:
        print t

if __name__ == '__main__':
    main()

requesting http://musicbrainz.org/ws/2/artist/?query=artist%3ANirvana&fmt=json
requesting http://musicbrainz.org/ws/2/artist/5b11f4ce-a62d-471e-81fc-a69a8278c7da?fmt=json&inc=releases

ALL TITLES:
Never Mind the Bollocks Here’s Nirvana
Blew
Sliver
Bleach
Sliver
Love Buzz / Big Cheese
Smells Like Teen Spirit
Smells Like Teen Spirit
Here She Comes Now / Venus in Furs
Smells Like Teen Spirit
Smells Like Teen Spirit
Bleach
Sliver
Smells Like Teen Spirit
Blew
Candy / Molly’s Lips
Bleach
Sliver
Smells Like Teen Spirit
Nevermind
Bleach
Smells Like Teen Spirit
Bleach
Sliver
Nevermind


### Quiz

In [139]:
"""
To experiment with this code freely you will have to run this code locally.
Take a look at the main() function for an example of how to use the code. We
have provided example json output in the other code editor tabs for you to look
at, but you will not be able to run any queries through our UI.
"""
import json
import requests

BASE_URL = "http://musicbrainz.org/ws/2/"
ARTIST_URL = BASE_URL + "artist/"


# query parameters are given to the requests.get function as a dictionary; this
# variable contains some starter parameters.
query_type = {  "simple": {},
                "atr": {"inc": "aliases+tags+ratings"},
                "aliases": {"inc": "aliases"},
                "releases": {"inc": "releases"}}


def query_site(url, params, uid="", fmt="json"):
    """
    This is the main function for making queries to the musicbrainz API. The
    query should return a json document.
    """
    params["fmt"] = fmt
    r = requests.get(url + uid, params=params)
    print "requesting", r.url

    if r.status_code == requests.codes.ok:
        return r.json()
    else:
        r.raise_for_status()


def query_by_name(url, params, name):
    """
    This adds an artist name to the query parameters before making an API call
    to the function above.
    """
    params["query"] = "artist:" + name
    return query_site(url, params)


def pretty_print(data, indent=4):
    """
    After we get our output, we can use this function to format it to be more
    readable.
    """
    if type(data) == dict:
        print json.dumps(data, indent=indent, sort_keys=True)
    else:
        print data


def main():
    """
    Below is an example investigation to help you get started in your
    exploration. Modify the function calls and indexing below to answer the
    questions on the next quiz.

    HINT: Note how the output we get from the site is a multi-level JSON
    document, so try making print statements to step through the structure one
    level at a time or copy the output to a separate output file. Experimenting
    and iteration will be key to understand the structure of the data!
    """

    # Query for information in the database about bands named First Aid Kit, Queen, Beatles, Nirvana One Direction
    results = query_by_name(ARTIST_URL, query_type["simple"], "One Direction")
    #pretty_print(results)

    # Determine number of bands named First Aid Kit
    #print "\nNumber of artists named First Aid Kit:"
    #nFAK = 0
    #for n in results["artists"]:
    #    if n['name'] == 'First Aid Kit':
    #        nFAK += 1
    #print nFAK
    
    # Determine begin-area name for Queen
    #print results["artists"][2]["begin-area"]["name"]
    
    # Determine Spanish alias for Beatles
    #for n in results["artists"][8]['aliases']:
    #    if n['locale'] == 'es':
    #        print n['name']
    
    # Determine disambiguation for Nirvana
    #print results['artists'][4]['disambiguation']
    
    # Determine, when One Direction was formed
    #print results['artists'][0]['life-span']['begin']
    pretty_print(results['artists'][0])
    
    
    


if __name__ == '__main__':
    main()

requesting http://musicbrainz.org/ws/2/artist/?query=artist%3AOne+Direction&fmt=json
{
    "aliases": [
        {
            "begin-date": null, 
            "end-date": null, 
            "locale": null, 
            "name": "1D", 
            "primary": null, 
            "sort-name": "1D", 
            "type": "Artist name"
        }
    ], 
    "area": {
        "id": "8a754a16-0027-3a29-b6d7-2b40ea0481ed", 
        "name": "United Kingdom", 
        "sort-name": "United Kingdom"
    }, 
    "begin-area": {
        "id": "f03d09b3-39dc-4083-afd6-159e3f0d462f", 
        "name": "London", 
        "sort-name": "London"
    }, 
    "country": "GB", 
    "id": "1a425bbd-cca4-4b2c-aeb7-71cb176c828a", 
    "life-span": {
        "begin": "2010-07", 
        "ended": null
    }, 
    "name": "One Direction", 
    "score": "100", 
    "sort-name": "One Direction", 
    "tags": [
        {
            "count": 2, 
            "name": "pop"
        }, 
        {
            "count": 1, 
   

# Lesson 2 - Problem Set: Data Extraction Fundamentals
## Solar and Wind Energy

In [56]:
#!/usr/bin/env python
"""
Your task is to process the supplied file and use the csv module to extract data from it.
The data comes from NREL (National Renewable Energy Laboratory) website. Each file
contains information from one meteorological station, in particular - about amount of
solar and wind energy for each hour of day.

Note that the first line of the datafile is neither data entry, nor header. It is a line
describing the data source. You should extract the name of the station from it.

The data should be returned as a list of lists (not dictionaries).
You can use the csv modules "reader" method to get data in such format.
Another useful method is next() - to get the next line from the iterator.
You should only change the parse_file function.
"""
import csv
import os

DATADIR = "data"
DATAFILE = "745090.csv"


def parse_file(datafile):
    name = ""
    data = []
    with open(datafile,'rb') as f:
        reader = csv.reader(f, delimiter=',')
        for row in reader:
            data.append(row)
        #    print row
    name = data[0][1]
    data = data[2:]
    
    # Do not change the line below
    return (name, data)


def test():
    datafile = os.path.join(DATADIR, DATAFILE)
    name, data = parse_file(datafile)

    assert name == "MOUNTAIN VIEW MOFFETT FLD NAS"
    assert data[0][1] == "01:00"
    assert data[2][0] == "01/01/2005"
    assert data[2][5] == "2"


if __name__ == "__main__":
    test()

## Time and Value

In [50]:
# -*- coding: utf-8 -*-
'''
Find the time and value of max load for each of the regions
COAST, EAST, FAR_WEST, NORTH, NORTH_C, SOUTHERN, SOUTH_C, WEST
and write the result out in a csv file, using pipe character | as the delimiter.

An example output can be seen in the "example.csv" file.
'''

import xlrd
import os
import csv
from zipfile import ZipFile

datafile = "./data/2013_ERCOT_Hourly_Load_Data.xls"
outfile = "./data/2013_Max_Loads.csv"


def open_zip(datafile):
    with ZipFile('{0}.zip'.format(datafile), 'r') as myzip:
        myzip.extractall()


def parse_file(datafile):
    workbook = xlrd.open_workbook(datafile)
    sheet = workbook.sheet_by_index(0)
    data = []
    
    # YOUR CODE HERE
    # Remember that you can use xlrd.xldate_as_tuple(sometime, 0) to convert
    # Excel date to Python tuple of (year, month, day, hour, minute, second)

    num_cols = sheet.ncols   # Number of columns
    for col_idx in range(1, num_cols-1):    # Iterate through columns
        col_max = 0
        for row_idx in range(1, sheet.nrows):  # Iterate through rows
            cell_obj = sheet.cell(row_idx, col_idx)  # Get cell object by row, col
            if cell_obj.value > col_max:
                col_max = cell_obj.value
                max_time = xlrd.xldate_as_tuple(sheet.cell(row_idx, 0).value, 0)
                #print max_time
        data.append((max_time,col_max))
    print data
    return data

def save_file(data, filename):
    # YOUR CODE HERE
    with open(filename, 'wb') as csvfile:
        writer = csv.writer(csvfile, delimiter='|', lineterminator='\n')
        writer.writerows(data)
    
def test():
    #open_zip(datafile)
    data = parse_file(datafile)
    save_file(data, outfile)

    number_of_rows = 0
    stations = []

    ans = {'FAR_WEST': {'Max Load': '2281.2722140000024',
                        'Year': '2013',
                        'Month': '6',
                        'Day': '26',
                        'Hour': '17'}}
    correct_stations = ['COAST', 'EAST', 'FAR_WEST', 'NORTH',
                        'NORTH_C', 'SOUTHERN', 'SOUTH_C', 'WEST']
    fields = ['Year', 'Month', 'Day', 'Hour', 'Max Load']

    with open(outfile) as of:
        csvfile = csv.DictReader(of, delimiter="|")
        for line in csvfile:
            station = line['Station']
            if station == 'FAR_WEST':
                for field in fields:
                    # Check if 'Max Load' is within .1 of answer
                    if field == 'Max Load':
                        max_answer = round(float(ans[station][field]), 1)
                        max_line = round(float(line[field]), 1)
                        assert max_answer == max_line

                    # Otherwise check for equality
                    else:
                        assert ans[station][field] == line[field]

            number_of_rows += 1
            stations.append(station)

        # Output should be 8 lines not including header
        assert number_of_rows == 8

        # Check Station Names
        assert set(stations) == set(correct_stations)

        
if __name__ == "__main__":
    test()

[((2013, 8, 13, 17, 0, 0), 18779.025510000003), ((2013, 8, 5, 17, 0, 0), 2380.1654089999956), ((2013, 6, 26, 17, 0, 0), 2281.2722140000024), ((2013, 8, 7, 17, 0, 0), 1544.7707140000005), ((2013, 8, 7, 18, 0, 0), 24415.570226999993), ((2013, 8, 8, 16, 0, 0), 5494.157645), ((2013, 8, 8, 18, 0, 0), 11433.30491600001), ((2013, 8, 7, 17, 0, 0), 1862.6137649999998)]


KeyError: 'Station'

# Lesson 3 - Data in More Complex Formats
## Biomed

In [6]:
#!/usr/bin/env python
# Your task here is to extract data from xml on authors of an article
# and add it to a list, one item for an author.
# See the provided data structure for the expected format.
# The tags for first name, surname and email should map directly
# to the dictionary keys
import xml.etree.ElementTree as ET

article_file = "./data/exampleResearchArticle.xml"


def get_root(fname):
    tree = ET.parse(fname)
    return tree.getroot()


def get_authors(root):
    authors = []
    for author in root.findall('./fm/bibl/aug/au'):
        data = {
                "fnm": None,
                "snm": None,
                "email": None
        }

        # YOUR CODE HERE
        data["fnm"] = author.find('fnm').text
        data["snm"] = author.find('snm').text
        data["email"] = author.find('email').text
        
        print data

        authors.append(data)

    return authors


def test():
    solution = [{'fnm': 'Omer', 'snm': 'Mei-Dan', 'email': 'omer@extremegate.com'}, {'fnm': 'Mike', 'snm': 'Carmont', 'email': 'mcarmont@hotmail.com'}, {'fnm': 'Lior', 'snm': 'Laver', 'email': 'laver17@gmail.com'}, {'fnm': 'Meir', 'snm': 'Nyska', 'email': 'nyska@internet-zahav.net'}, {'fnm': 'Hagay', 'snm': 'Kammar', 'email': 'kammarh@gmail.com'}, {'fnm': 'Gideon', 'snm': 'Mann', 'email': 'gideon.mann.md@gmail.com'}, {'fnm': 'Barnaby', 'snm': 'Clarck', 'email': 'barns.nz@gmail.com'}, {'fnm': 'Eugene', 'snm': 'Kots', 'email': 'eukots@gmail.com'}]
    
    root = get_root(article_file)
    data = get_authors(root)

    assert data[0] == solution[0]
    assert data[1]["fnm"] == solution[1]["fnm"]


test()

{'fnm': 'Omer', 'snm': 'Mei-Dan', 'email': 'omer@extremegate.com'}
{'fnm': 'Mike', 'snm': 'Carmont', 'email': 'mcarmont@hotmail.com'}
{'fnm': 'Lior', 'snm': 'Laver', 'email': 'laver17@gmail.com'}
{'fnm': 'Meir', 'snm': 'Nyska', 'email': 'nyska@internet-zahav.net'}
{'fnm': 'Hagay', 'snm': 'Kammar', 'email': 'kammarh@gmail.com'}
{'fnm': 'Gideon', 'snm': 'Mann', 'email': 'gideon.mann.md@gmail.com'}
{'fnm': 'Barnaby', 'snm': 'Clarck', 'email': 'barns.nz@gmail.com'}
{'fnm': 'Eugene', 'snm': 'Kots', 'email': 'eukots@gmail.com'}


## Biomed iid fields

In [39]:
#!/usr/bin/env python
# Your task here is to extract data from xml on authors of an article
# and add it to a list, one item for an author.
# See the provided data structure for the expected format.
# The tags for first name, surname and email should map directly
# to the dictionary keys, but you have to extract the attributes from the "insr" tag
# and add them to the list for the dictionary key "insr"
import xml.etree.ElementTree as ET

article_file = "./data/exampleResearchArticle.xml"


def get_root(fname):
    tree = ET.parse(fname)
    return tree.getroot()


def get_authors(root):
    authors = []
    for author in root.findall('./fm/bibl/aug/au'):
        data = {
                "fnm": None,
                "snm": None,
                "email": None,
                "insr": []
        }

        # YOUR CODE HERE
        data["fnm"] = author.find('fnm').text
        data["snm"] = author.find('snm').text
        data["email"] = author.find('email').text
        
        data["insr"] = []
        insrlist = author.findall('insr')
        
        for item in insrlist:
            data["insr"].append(item.attrib['iid'])
        
        print data

        authors.append(data)

    return authors


def test():
    solution = [{'insr': ['I1'], 'fnm': 'Omer', 'snm': 'Mei-Dan', 'email': 'omer@extremegate.com'},
                {'insr': ['I2'], 'fnm': 'Mike', 'snm': 'Carmont', 'email': 'mcarmont@hotmail.com'},
                {'insr': ['I3', 'I4'], 'fnm': 'Lior', 'snm': 'Laver', 'email': 'laver17@gmail.com'},
                {'insr': ['I3'], 'fnm': 'Meir', 'snm': 'Nyska', 'email': 'nyska@internet-zahav.net'},
                {'insr': ['I8'], 'fnm': 'Hagay', 'snm': 'Kammar', 'email': 'kammarh@gmail.com'},
                {'insr': ['I3', 'I5'], 'fnm': 'Gideon', 'snm': 'Mann', 'email': 'gideon.mann.md@gmail.com'},
                {'insr': ['I6'], 'fnm': 'Barnaby', 'snm': 'Clarck', 'email': 'barns.nz@gmail.com'},
                {'insr': ['I7'], 'fnm': 'Eugene', 'snm': 'Kots', 'email': 'eukots@gmail.com'}]

    root = get_root(article_file)
    data = get_authors(root)

    assert data[0] == solution[0]
    assert data[1]["insr"] == solution[1]["insr"]


test()

{'insr': ['I1'], 'fnm': 'Omer', 'snm': 'Mei-Dan', 'email': 'omer@extremegate.com'}
{'insr': ['I2'], 'fnm': 'Mike', 'snm': 'Carmont', 'email': 'mcarmont@hotmail.com'}
{'insr': ['I3', 'I4'], 'fnm': 'Lior', 'snm': 'Laver', 'email': 'laver17@gmail.com'}
{'insr': ['I3'], 'fnm': 'Meir', 'snm': 'Nyska', 'email': 'nyska@internet-zahav.net'}
{'insr': ['I8'], 'fnm': 'Hagay', 'snm': 'Kammar', 'email': 'kammarh@gmail.com'}
{'insr': ['I3', 'I5'], 'fnm': 'Gideon', 'snm': 'Mann', 'email': 'gideon.mann.md@gmail.com'}
{'insr': ['I6'], 'fnm': 'Barnaby', 'snm': 'Clarck', 'email': 'barns.nz@gmail.com'}
{'insr': ['I7'], 'fnm': 'Eugene', 'snm': 'Kots', 'email': 'eukots@gmail.com'}


## Transtats - BeautifulSoup

In [77]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Please note that the function 'make_request' is provided for your reference only.
# You will not be able to to actually use it from within the Udacity web UI.
# Your task is to process the HTML using BeautifulSoup, extract the hidden
# form field values for "__EVENTVALIDATION" and "__VIEWSTATE" and set the appropriate
# values in the data dictionary.
# All your changes should be in the 'extract_data' function
from bs4 import BeautifulSoup
import requests
import json

html_page = "./data/page_source.html"

def extract_data(page):
    data = {"eventvalidation": "",
            "viewstate": ""}
    with open(page, "r") as html:
        # do something here to find the necessary values
        soup = BeautifulSoup(html, "lxml")
        data["viewstate"] = soup.find(id="__VIEWSTATE")['value']
        data["eventvalidation"] = soup.find(id="__EVENTVALIDATION")['value']
    print 'Eventvalidation: '+data["eventvalidation"][:10]
    print 'Viewstate: '+data["viewstate"][:10]
    return data


def make_request(data):
    eventvalidation = data["eventvalidation"]
    viewstate = data["viewstate"]

    r = requests.post("http://www.transtats.bts.gov/Data_Elements.aspx?Data=2",
                    data={'AirportList': "BOS",
                          'CarrierList': "VX",
                          'Submit': 'Submit',
                          "__EVENTTARGET": "",
                          "__EVENTARGUMENT": "",
                          "__EVENTVALIDATION": eventvalidation,
                          "__VIEWSTATE": viewstate
                    })

    return r.text


def test():
    data = extract_data(html_page)
    assert data["eventvalidation"] != ""
    assert data["eventvalidation"].startswith("/wEWjAkCoIj1ng0")
    assert data["viewstate"].startswith("/wEPDwUKLTI")
    
test()

Eventvalidation: /wEWjAkCoI
Viewstate: /wEPDwUKLT


# Lesson 4 - Problem Set: Data in More Complex Formats
## Carrier List

In [94]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Your task in this exercise is to modify 'extract_carrier()` to get a list of
all airlines. Exclude all of the combination values like "All U.S. Carriers"
from the data that you return. You should return a list of codes for the
carriers.

All your changes should be in the 'extract_carrier()' function. The
'options.html' file in the tab above is a stripped down version of what is
actually on the website, but should provide an example of what you should get
from the full file.

Please note that the function 'make_request()' is provided for your reference
only. You will not be able to to actually use it from within the Udacity web UI.
"""

from bs4 import BeautifulSoup
html_page = "./data/options.html"


def extract_carriers(page):
    data = []

    with open(page, "r") as html:
        # do something here to find the necessary values
        soup = BeautifulSoup(html, "lxml")
        carriers = soup.find(id="CarrierList").find_all('option')
        for carrier in carriers:
            if not 'All' in carrier['value']:
                data.append(carrier['value'])
                #print carrier['value']
    print data
    return data


def make_request(data):
    eventvalidation = data["eventvalidation"]
    viewstate = data["viewstate"]
    airport = data["airport"]
    carrier = data["carrier"]

    r = s.post("https://www.transtats.bts.gov/Data_Elements.aspx?Data=2",
               data = (("__EVENTTARGET", ""),
                       ("__EVENTARGUMENT", ""),
                       ("__VIEWSTATE", viewstate),
                       ("__VIEWSTATEGENERATOR",viewstategenerator),
                       ("__EVENTVALIDATION", eventvalidation),
                       ("CarrierList", carrier),
                       ("AirportList", airport),
                       ("Submit", "Submit")))

    return r.text

def test():
    data = extract_carriers(html_page)
    assert len(data) == 16
    assert "FL" in data
    assert "NK" in data

if __name__ == "__main__":
    test()

['FL', 'AS', 'AA', 'MQ', '5Y', 'DL', 'EV', 'F9', 'HA', 'B6', 'OO', 'WN', 'NK', 'US', 'UA', 'VX']


## Airport List

In [95]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Complete the 'extract_airports()' function so that it returns a list of airport
codes, excluding any combinations like "All".

Refer to the 'options.html' file in the tab above for a stripped down version
of what is actually on the website. The test() assertions are based on the
given file.
"""

from bs4 import BeautifulSoup
html_page = "./data/options.html"


def extract_airports(page):
    data = []
    with open(page, "r") as html:
        # do something here to find the necessary values
        soup = BeautifulSoup(html, "lxml")
        airports = soup.find(id="AirportList").find_all('option')
        for airport in airports:
            if not 'All' in airport['value']:
                data.append(airport['value'])
    print data
    return data


def test():
    data = extract_airports(html_page)
    assert len(data) == 15
    assert "ATL" in data
    assert "ABR" in data

if __name__ == "__main__":
    test()

['ATL', 'BWI', 'BOS', 'CLT', 'MDW', 'ORD', 'DFW', 'DEN', 'DTW', 'FLL', 'IAH', 'LAS', 'LAX', 'ABR', 'ABI']


## Processing All

In [113]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Let's assume that you combined the code from the previous 2 exercises with code
from the lesson on how to build requests, and downloaded all the data locally.
The files are in a directory "data", named after the carrier and airport:
"{}-{}.html".format(carrier, airport), for example "FL-ATL.html".

The table with flight info has a table class="dataTDRight". Your task is to
use 'process_file()' to extract the flight data from that table as a list of
dictionaries, each dictionary containing relevant data from the file and table
row. This is an example of the data structure you should return:

data = [{"courier": "FL",
         "airport": "ATL",
         "year": 2012,
         "month": 12,
         "flights": {"domestic": 100,
                     "international": 100}
        },
         {"courier": "..."}
]

Note - year, month, and the flight data should be integers.
You should skip the rows that contain the TOTAL data for a year.

There are couple of helper functions to deal with the data files.
Please do not change them for grading purposes.
All your changes should be in the 'process_file()' function.

The 'data/FL-ATL.html' file in the tab above is only a part of the full data,
covering data through 2003. The test() code will be run on the full table, but
the given file should provide an example of what you will get.
"""
from bs4 import BeautifulSoup
from zipfile import ZipFile
import os

datadir = "data_airport"


def open_zip(datadir):
    with ZipFile('{0}.zip'.format(datadir), 'r') as myzip:
        myzip.extractall()


def process_all(datadir):
    files = os.listdir(datadir)
    return files


def process_file(f):
    """
    This function extracts data from the file given as the function argument in
    a list of dictionaries. This is example of the data structure you should
    return:

    data = [{"courier": "FL",
             "airport": "ATL",
             "year": 2012,
             "month": 12,
             "flights": {"domestic": 100,
                         "international": 100}
            },
            {"courier": "..."}
    ]


    Note - year, month, and the flight data should be integers.
    You should skip the rows that contain the TOTAL data for a year.
    """
    data = []
    info = {}
    info["courier"], info["airport"] = f[:6].split("-")
    # Note: create a new dictionary for each entry in the output data list.
    # If you use the info dictionary defined here each element in the list 
    # will be a reference to the same info dictionary.
    with open("{}/{}".format(datadir, f), "r") as html:

        soup = BeautifulSoup(html, "lxml")
        table = soup.find("table", id="DataGrid1")
        #items = table.find_all("td")
        #for item in items:
        #    print item.find("td")
        rawdata = []
        for node in table.findAll('td'):
            rawdata.append(''.join(node.findAll(text=True)))
        #print rawdata
              
        for i in range(5, len(rawdata), 5):
            data_dict = {"courier": info["courier"],
                 "airport": info["airport"],
                 "year": None,
                 "month": None,
                 "flights": {"domestic": None,
                             "international": None}
                }
            if not rawdata[i+1] == 'TOTAL':
                data_dict["year"] = int(rawdata[i])                
                data_dict["month"] = int(rawdata[i+1])
                #print int(rawdata[i+2].replace(',',''))
                data_dict["flights"]["domestic"] = int(rawdata[i+2].replace(',',''))
                data_dict["flights"]["international"] = int(rawdata[i+3].replace(',',''))
                
                #print data_dict
                data.append(data_dict)
        
    print data

    return data


def test():
    print "Running a simple test..."
    #open_zip(datadir)
    files = process_all(datadir)
    data = []
    # Test will loop over three data files.
    for f in files:
        data += process_file(f)
    
    #assert len(data) == 399  # Total number of rows
    for entry in data[:3]:
        #print type(entry["year"])
        assert type(entry["year"]) == int
        assert type(entry["month"]) == int
        assert type(entry["flights"]["domestic"]) == int
        assert len(entry["airport"]) == 3
        assert len(entry["courier"]) == 2
    assert data[0]["courier"] == 'FL'
    assert data[0]["month"] == 10
    assert data[-1]["airport"] == "ATL"
    assert data[-1]["flights"] == {'international': 108289, 'domestic': 701425}
    
    print "... success!"

if __name__ == "__main__":
    test()

Running a simple test...
[{'airport': 'BWI', 'month': 10, 'flights': {'international': 92565, 'domestic': 815489}, 'courier': 'AS', 'year': 2002}, {'airport': 'BWI', 'month': 11, 'flights': {'international': 91342, 'domestic': 766775}, 'courier': 'AS', 'year': 2002}, {'airport': 'BWI', 'month': 12, 'flights': {'international': 96881, 'domestic': 782175}, 'courier': 'AS', 'year': 2002}, {'airport': 'BWI', 'month': 1, 'flights': {'international': 98053, 'domestic': 785651}, 'courier': 'AS', 'year': 2003}, {'airport': 'BWI', 'month': 2, 'flights': {'international': 85965, 'domestic': 690750}, 'courier': 'AS', 'year': 2003}, {'airport': 'BWI', 'month': 3, 'flights': {'international': 97929, 'domestic': 797634}, 'courier': 'AS', 'year': 2003}, {'airport': 'BWI', 'month': 4, 'flights': {'international': 89398, 'domestic': 766639}, 'courier': 'AS', 'year': 2003}, {'airport': 'BWI', 'month': 5, 'flights': {'international': 87671, 'domestic': 789857}, 'courier': 'AS', 'year': 2003}, {'airport':

AssertionError: 

## Patent Database

In [141]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
This and the following exercise are using US Patent database. The patent.data
file is a small excerpt of much larger datafiles that are available for
download from US Patent website. These files are pretty large ( >100 MB each).
The original file is ~600MB large, you might not be able to open it in a text
editor.

The data itself is in XML, however there is a problem with how it's formatted.
Please run this script and observe the error. Then find the line that is
causing the error. You can do that by just looking at the datafile in the web
UI, or programmatically. For quiz purposes it does not matter, but as an
exercise we suggest that you try to do it programmatically.

NOTE: You do not need to correct the error - for now, just find where the error
is occurring.
"""

import xml.etree.ElementTree as ET

PATENTS = './data/patent.data'

def get_root(fname):
    #tree = []
    #line = 0
    #for event, element in ET.iterparse(fname):
    #    line += 1
    #    print line
    #    tree.append(element)
    
    try:
        tree = ET.parse(fname)
    except ET.ParseError as err:
        lineno, column = err.position
        #print lineno
    
    return lineno


line = get_root(PATENTS)
print line

657


## Processing Patents - Split .xml files

In [152]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# So, the problem is that the gigantic file is actually not a valid XML, because
# it has several root elements, and XML declarations.
# It is, a matter of fact, a collection of a lot of concatenated XML documents.
# So, one solution would be to split the file into separate documents,
# so that you can process the resulting files as valid XML documents.

import xml.etree.ElementTree as ET

PATENTS = './data/patent.data'

def get_root(fname):
    tree = ET.parse(fname)
    return tree.getroot()


def split_file(filename):
    """
    Split the input file into separate files, each containing a single patent.
    As a hint - each patent declaration starts with the same line that was
    causing the error found in the previous exercises.
    
    The new files should be saved with filename in the following format:
    "{}-{}".format(filename, n) where n is a counter, starting from 0.
    """

    # Open file to read
    with open(filename, "r") as r:

        # Counter, initial value -1 in order to have 0 as first file index
        n=-1

        # Start reading file line by line
        for i, line in enumerate(r):

            # If '?xml' is found in line, increase counter n
            if '?xml' in line:
                n+=1              

            # Write lines to file    
            with open("{}-{}".format(PATENTS, n), "a") as f:
                f.write(line)

def test():
    split_file(PATENTS)
    for n in range(4):
        try:
            fname = "{}-{}".format(PATENTS, n)
            f = open(fname, "r")
            if not f.readline().startswith("<?xml"):
                print "You have not split the file {} in the correct boundary!".format(fname)
            f.close()
        except:
            print "Could not find file {}. Check if the filename is correct!".format(fname)


test()


# Lesson 5  - Data Quality
## Example - Cleaning Blueprint

In [None]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import xml.etree.cElementTree as ET
from collections import defaultdict
import re

osm_file = open("chicago.osm", "r")

street_type_re = re.compile(r'\S+\.?$', re.IGNORECASE)
street_types = defaultdict(int)

def audit_street_type(street_types, street_name):
    m = street_type_re.search(street_name)
    if m:
        street_type = m.group()

        street_types[street_type] += 1

def print_sorted_dict(d):
    keys = d.keys()
    keys = sorted(keys, key=lambda s: s.lower())
    for k in keys:
        v = d[k]
        print "%s: %d" % (k, v) 

def is_street_name(elem):
    return (elem.tag == "tag") and (elem.attrib['k'] == "addr:street")

def audit():
    for event, elem in ET.iterparse(osm_file):
        if is_street_name(elem):
            audit_street_type(street_types, elem.attrib['v'])    
    print_sorted_dict(street_types)    

if __name__ == '__main__':
    audit()

## Correcting Validity

In [None]:
"""
Your task is to check the "productionStartYear" of the DBPedia autos datafile for valid values.
The following things should be done:
- check if the field "productionStartYear" contains a year
- check if the year is in range 1886-2014
- convert the value of the field to be just a year (not full datetime)
- the rest of the fields and values should stay the same
- if the value of the field is a valid year in the range as described above,
  write that line to the output_good file
- if the value of the field is not a valid year as described above, 
  write that line to the output_bad file
- discard rows (neither write to good nor bad) if the URI is not from dbpedia.org
- you should use the provided way of reading and writing data (DictReader and DictWriter)
  They will take care of dealing with the header.

You can write helper functions for checking the data and writing the files, but we will call only the 
'process_file' with 3 arguments (inputfile, output_good, output_bad).
"""
import csv
import pprint

INPUT_FILE = 'autos.csv'
OUTPUT_GOOD = 'autos-valid.csv'
OUTPUT_BAD = 'FIXME-autos.csv'

def process_file(input_file, output_good, output_bad):
    # store data into lists for output
    data_good = []
    data_bad = []
    with open(input_file, "r") as f:
        reader = csv.DictReader(f)
        header = reader.fieldnames
        for row in reader:
            # validate URI value
            if row['URI'].find("dbpedia.org") < 0:
                continue

            ps_year = row['productionStartYear'][:4]
            try: # use try/except to filter valid items
                ps_year = int(ps_year)
                row['productionStartYear'] = ps_year
                if (ps_year >= 1886) and (ps_year <= 2014):
                    data_good.append(row)
                else:
                    data_bad.append(row)
            except ValueError: # non-numeric strings caught by exception
                if ps_year == 'NULL':
                    data_bad.append(row)

    # Write processed data to output files
    with open(output_good, "w") as good:
        writer = csv.DictWriter(good, delimiter=",", fieldnames= header)
        writer.writeheader()
        for row in data_good:
            writer.writerow(row)

    with open(output_bad, "w") as bad:
        writer = csv.DictWriter(bad, delimiter=",", fieldnames= header)
        writer.writeheader()
        for row in data_bad:
            writer.writerow(row)

def test():

    process_file(INPUT_FILE, OUTPUT_GOOD, OUTPUT_BAD)


if __name__ == "__main__":
    test()

# Lesson 6 - Problem Set: Data Quality
## Auditing Data Quality

In [10]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
In this problem set you work with cities infobox data, audit it, come up with a
cleaning idea and then clean it up. In the first exercise we want you to audit
the datatypes that can be found in some particular fields in the dataset.
The possible types of values can be:
- NoneType if the value is a string "NULL" or an empty string ""
- list, if the value starts with "{"
- int, if the value can be cast to int
- float, if the value can be cast to float, but CANNOT be cast to int.
   For example, '3.23e+07' should be considered a float because it can be cast
   as float but int('3.23e+07') will throw a ValueError
- 'str', for all other values

The audit_file function should return a dictionary containing fieldnames and a 
SET of the types that can be found in the field. e.g.
{"field1": set([type(float()), type(int()), type(str())]),
 "field2": set([type(str())]),
  ....
}
The type() function returns a type object describing the argument given to the 
function. You can also use examples of objects to create type objects, e.g.
type(1.1) for a float: see the test function below for examples.

Note that the first three rows (after the header row) in the cities.csv file
are not actual data points. The contents of these rows should note be included
when processing data types. Be sure to include functionality in your code to
skip over or detect these rows.
"""
import codecs
import csv
import json
import pprint

CITIES = './data/cities.csv'

FIELDS = ["name", "timeZone_label", "utcOffset", "homepage", "governmentType_label",
          "isPartOf_label", "areaCode", "populationTotal", "elevation",
          "maximumElevation", "minimumElevation", "populationDensity",
          "wgs84_pos#lat", "wgs84_pos#long", "areaLand", "areaMetro", "areaUrban"]

def audit_file(filename, fields):
    fieldtypes = {}

    with open(filename, "r") as f:
        reader = csv.DictReader(f)
        header = reader.fieldnames
        for row in reader:
            # validate URI value
            if row['URI'].find("dbpedia.org") < 0:
                continue
            
            # audit only required fields
            for field in fields:
                row[field] = row[field].strip()
                if row[field]

        #    ps_year = row['productionStartYear'][:4]
        #    try: # use try/except to filter valid items
        #        ps_year = int(ps_year)
        #        row['productionStartYear'] = ps_year
        #        if (ps_year >= 1886) and (ps_year <= 2014):
        #            data_good.append(row)
        #        else:
        #            data_bad.append(row)
        #    except ValueError: # non-numeric strings caught by exception
        #        if ps_year == 'NULL':
        #            data_bad.append(row)
        
    
    return header


def test():
    fieldtypes = audit_file(CITIES, FIELDS)

    pprint.pprint(fieldtypes)

    assert fieldtypes["areaLand"] == set([type(1.1), type([]), type(None)])
    assert fieldtypes['areaMetro'] == set([type(1.1), type(None)])
    
if __name__ == "__main__":
    test()

-1
-1
-1
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7

TypeError: list indices must be integers, not str

In [6]:
fieldtypes = audit_file(CITIES, FIELDS)

['URI', 'rdf-schema#label', 'rdf-schema#comment', 'administrativeDistrict_label', 'administrativeDistrict', 'anthem_label', 'anthem', 'area', 'areaCode', 'areaLand', 'areaMetro', 'areaRural', 'areaTotal', 'areaUrban', 'areaWater', 'city_label', 'city', 'code', 'country_label', 'country', 'daylightSavingTimeZone_label', 'daylightSavingTimeZone', 'district_label', 'district', 'division_label', 'division', 'elevation', 'federalState_label', 'federalState', 'foundingDate', 'foundingPerson_label', 'foundingPerson', 'foundingYear', 'governingBody_label', 'governingBody', 'government_label', 'government', 'governmentType_label', 'governmentType', 'isPartOf_label', 'isPartOf', 'isoCodeRegion_label', 'isoCodeRegion', 'leader_label', 'leader', 'leaderName_label', 'leaderName', 'leaderParty_label', 'leaderParty', 'leaderTitle', 'location_label', 'location', 'maximumElevation', 'mayor_label', 'mayor', 'minimumElevation', 'motto', 'municipality_label', 'municipality', 'part_label', 'part', 'percent

In [7]:
fieldtypes['URI']

TypeError: list indices must be integers, not str

# Lesson 12 - Problem Set: Deeper Into SQL
## DB API Playground

In [41]:
##  Here's a playground to help you get a little more comfortable using DB API
##  Nothing here is graded, you've done a ton of work so far and I think 
##  it'd be great to relax and play a bit!

##  Connect to the Chinook database and try a few of your own ideas.
##  This is also a great place to try experiments building queries 
##  for the next few quizzes!

import sqlite3

# Fetch records from either chinook.db
db = sqlite3.connect("chinook.db")
c = db.cursor()
# How many Pop songs have an MPEG audio file format?
#QUERY = "SELECT count(*) FROM Track, Genre, MediaType WHERE Genre.Name = 'Pop' AND MediaType.Name = 'MPEG audio file';" #--> Why doesn't this work?
QUERY = "SELECT count(*) FROM Track JOIN Genre JOIN MediaType on Track.GenreId = Genre.GenreId and Track.MediaTypeId = MediaType.MediaTypeId WHERE Genre.Name = 'Pop' AND MediaType.Name = 'MPEG audio file';"
# How many unique customers have purchased Jazz?
QUERY = "SELECT Genre.Name, count(DISTINCT Customer.Email) FROM Customer JOIN Invoice JOIN InvoiceLine JOIN Genre JOIN Track ON Track.GenreId = Genre.GenreId AND Track.TrackId = InvoiceLine.TrackId AND InvoiceLine.InvoiceId = Invoice.InvoiceId AND Customer.CustomerId = Invoice.InvoiceId WHERE Genre.Name = 'Jazz';"
# Genre with most songs below average song length
QUERY = "SELECT Genre.Name, count(Genre.Name) FROM Genre JOIN Track JOIN (SELECT avg(Milliseconds) AS average FROM Track) AS subq ON Track.GenreId = Genre.GenreId WHERE Track.Milliseconds<average GROUP BY Genre.Name ORDER BY count(Genre.Name) DESC;"
c.execute(QUERY)
rows = c.fetchall()

'''Uncomment to see your query in python'''
#print "Row data:"
#print rows

'''Uncomment to print your query by row'''
#print "your output:"
#for row in rows:
#  print "  ", row[0:]

'''Uncomment to see your query as a pandas dataframe.
This is similar to the output you've been seeing throughout this course
You can learn more about pandas dataframes in our Intro to Data Analysis course!'''

import pandas as pd    
df = pd.DataFrame(rows)
print df

db.close()


                     0     1
0                 Rock  1162
1                Latin   569
2   Alternative & Punk   320
3                Metal   300
4                 Jazz   116
5                Blues    72
6            Classical    60
7             R&B/Soul    60
8               Reggae    58
9                  Pop    46
10          Soundtrack    43
11         Alternative    38
12         Hip Hop/Rap    34
13   Electronica/Dance    28
14               World    28
15      Easy Listening    24
16         Heavy Metal    23
17          Bossa Nova    14
18       Rock And Roll    12
19               Drama     1
20               Opera     1


# Lesson 13 - Case Study: Open Street Map Data
## Iterative Parsing

In [9]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Your task is to use the iterative parsing to process the map file and
find out not only what tags are there, but also how many, to get the
feeling on how much of which data you can expect to have in the map.
Fill out the count_tags function. It should return a dictionary with the 
tag name as the key and number of times this tag can be encountered in 
the map as value.

Note that your code will be tested with a different data file than the 'example.osm'
"""
import xml.etree.cElementTree as ET
from collections import defaultdict
import pprint

osm_tags = defaultdict(int)

def count_tags(filename):
    osm_file = open(filename, "r")
    for event, elem in ET.iterparse(osm_file):
        osm_tag = elem.tag
        osm_tags[osm_tag] += 1
        #print "Element: " + str(elem.tag)
    return osm_tags


def test():

    tags = count_tags('./osm_case_study/example.osm')
    pprint.pprint(tags)
    assert tags == {'bounds': 1,
                     'member': 3,
                     'nd': 4,
                     'node': 20,
                     'osm': 1,
                     'relation': 1,
                     'tag': 7,
                     'way': 1}

    

if __name__ == "__main__":
    test()

defaultdict(<type 'int'>, {'node': 20, 'nd': 4, 'bounds': 1, 'member': 3, 'tag': 7, 'relation': 1, 'way': 1, 'osm': 1})


## Tag Types

In [16]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import xml.etree.cElementTree as ET
import pprint
import re
"""
Your task is to explore the data a bit more.
Before you process the data and add it into your database, you should check the
"k" value for each "<tag>" and see if there are any potential problems.

We have provided you with 3 regular expressions to check for certain patterns
in the tags. As we saw in the quiz earlier, we would like to change the data
model and expand the "addr:street" type of keys to a dictionary like this:
{"address": {"street": "Some value"}}
So, we have to see if we have such tags, and if we have any tags with
problematic characters.

Please complete the function 'key_type', such that we have a count of each of
four tag categories in a dictionary:
  "lower", for tags that contain only lowercase letters and are valid,
  "lower_colon", for otherwise valid tags with a colon in their names,
  "problemchars", for tags with problematic characters, and
  "other", for other tags that do not fall into the other three categories.
See the 'process_map' and 'test' functions for examples of the expected format.
"""


lower = re.compile(r'^([a-z]|_)*$')
lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')
problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')


def key_type(element, keys):
    if element.tag == "tag":
        if lower.search(element.attrib['k']):
            keys['lower'] += 1
            print "lower: " + element.attrib['k']
        elif lower_colon.search(element.attrib['k']):
            keys['lower_colon'] += 1
            print "lower_colon: " + element.attrib['k']
        elif problemchars.search(element.attrib['k']):
            keys['problemchars'] += 1
            print "problemchars: " + element.attrib['k']
        else:
            keys['other'] += 1
            print "other: " + element.attrib['k']
    return keys



def process_map(filename):
    keys = {"lower": 0, "lower_colon": 0, "problemchars": 0, "other": 0}
    for _, element in ET.iterparse(filename):
        keys = key_type(element, keys)

    return keys



def test():
    # You can use another testfile 'map.osm' to look at your solution
    # Note that the assertion below will be incorrect then.
    # Note as well that the test function here is only used in the Test Run;
    # when you submit, your code will be checked against a different dataset.
    keys = process_map('./osm_case_study/example_tag_types.osm')
    pprint.pprint(keys)
    assert keys == {'lower': 5, 'lower_colon': 0, 'other': 1, 'problemchars': 1}


if __name__ == "__main__":
    test()

lower: highway
problemchars: amenity?
lower: cuisine
other: NAME
lower: highway
lower: restriction
lower: type
{'lower': 5, 'lower_colon': 0, 'other': 1, 'problemchars': 1}


## Exploring Users

In [35]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import xml.etree.cElementTree as ET
import pprint
import re
"""
Your task is to explore the data a bit more.
The first task is a fun one - find out how many unique users
have contributed to the map in this particular area!

The function process_map should return a set of unique user IDs ("uid")
"""

def get_user(element):
    if any(element.tag in s for s in ["node", "way", "relation"]):
        user = element.attrib['user']
        return user
    else:
        pass


def process_map(filename):
    users = set()
    for _, element in ET.iterparse(filename):
        if not get_user(element) == None:
            users.add(get_user(element))

    return users


def test():

    users = process_map('./osm_case_study/example_tag_types.osm')
    pprint.pprint(users)
    assert len(users) == 6



if __name__ == "__main__":
    test()

set(['Umbugbene',
     'bbmiller',
     'fredr',
     'linuxUser16',
     'uboot',
     'woodpeck_fixbot'])


## Auditing Street Names

In [None]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import xml.etree.cElementTree as ET
from collections import defaultdict
import re
import pprint

#osm_file = open("./osm_project/schleswig-holstein-latest.osm", "r")
osm_file = open("./osm_case_study/example.osm", "r")

street_type_re = re.compile(r'\b\S+\.?$', re.IGNORECASE)
street_types = defaultdict(set)

expected = ["Street", "Avenue", "Boulevard", "Drive", "Court", "Place", "Square", "Lane", "Road", "Trail", "Parkway", "Commons"]

def audit_street_type(street_types, street_name):
    m = street_type_re.search(street_name)
    if m:
        street_type = m.group()
        if street_type not in expected:
            street_types[street_type].add(street_name)

def print_sorted_dict(d):
    keys = d.keys()
    keys = sorted(keys, key=lambda s: s.lower())
    for k in keys:
        v = d[k]
        print "%s: %d" % (k, v) 

def is_street_name(elem):
    return (elem.attrib['k'] == "addr:street")

def audit():
    for event, elem in ET.iterparse(osm_file, events=("start",)):
        if elem.tag == "node" or elem.tag == "way":
            for tag in elem.iter("tag"):
                if is_street_name(tag):
                    audit_street_type(street_types, tag.attrib['v'])
    pprint.pprint(dict(street_types))
    #print_sorted_dict(street_types)    

if __name__ == '__main__':
    audit()

## Improving Street Names

In [46]:
"""
Your task in this exercise has two steps:

- audit the OSMFILE and change the variable 'mapping' to reflect the changes needed to fix 
    the unexpected street types to the appropriate ones in the expected list.
    You have to add mappings only for the actual problems you find in this OSMFILE,
    not a generalized solution, since that may and will depend on the particular area you are auditing.
- write the update_name function, to actually fix the street name.
    The function takes a string with street name as an argument and should return the fixed name
    We have provided a simple test so that you see what exactly is expected
"""
import xml.etree.cElementTree as ET
from collections import defaultdict
import re
import pprint

OSMFILE = "./osm_case_study/example_improving_street_names.osm"
street_type_re = re.compile(r'\b\S+\.?$', re.IGNORECASE)


expected = ["Street", "Avenue", "Boulevard", "Drive", "Court", "Place", "Square", "Lane", "Road", 
            "Trail", "Parkway", "Commons"]

# UPDATE THIS VARIABLE
mapping = { "St": "Street",
            "St.": "Street",
            "Ave": "Avenue",
            "Rd.": "Road"
            }


def audit_street_type(street_types, street_name):
    m = street_type_re.search(street_name)
    if m:
        street_type = m.group()
        if street_type not in expected:
            street_types[street_type].add(street_name)


def is_street_name(elem):
    return (elem.attrib['k'] == "addr:street")


def audit(osmfile):
    osm_file = open(osmfile, "r")
    street_types = defaultdict(set)
    for event, elem in ET.iterparse(osm_file, events=("start",)):

        if elem.tag == "node" or elem.tag == "way":
            for tag in elem.iter("tag"):
                if is_street_name(tag):
                    audit_street_type(street_types, tag.attrib['v'])
    osm_file.close()
    return street_types


def update_name(name, mapping):
    m = street_type_re.search(name) 
    if m: 
        street_type = m.group() 
        if street_type not in expected: 
            name = re.sub(street_type_re, mapping[street_type], name)
    return name


def test():
    st_types = audit(OSMFILE)
    assert len(st_types) == 3
    pprint.pprint(dict(st_types))

    for st_type, ways in st_types.iteritems():
        for name in ways:
            better_name = update_name(name, mapping)
            print name, "=>", better_name
            if name == "West Lexington St.":
                assert better_name == "West Lexington Street"
            if name == "Baldwin Rd.":
                assert better_name == "Baldwin Road"


if __name__ == '__main__':
    test()

{'Ave': set(['N. Lincoln Ave', 'North Lincoln Ave']),
 'Rd.': set(['Baldwin Rd.']),
 'St.': set(['West Lexington St.'])}
N. Lincoln Ave => N. Lincoln Avenue
North Lincoln Ave => North Lincoln Avenue
West Lexington St. => West Lexington Street
Baldwin Rd. => Baldwin Road


## Preparing for Database SQL

In [56]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
After auditing is complete the next step is to prepare the data to be inserted into a SQL database.
To do so you will parse the elements in the OSM XML file, transforming them from document format to
tabular format, thus making it possible to write to .csv files.  These csv files can then easily be
imported to a SQL database as tables.

The process for this transformation is as follows:
- Use iterparse to iteratively step through each top level element in the XML
- Shape each element into several data structures using a custom function
- Utilize a schema and validation library to ensure the transformed data is in the correct format
- Write each data structure to the appropriate .csv files

We've already provided the code needed to load the data, perform iterative parsing and write the
output to csv files. Your task is to complete the shape_element function that will transform each
element into the correct format. To make this process easier we've already defined a schema (see
the schema.py file in the last code tab) for the .csv files and the eventual tables. Using the 
cerberus library we can validate the output against this schema to ensure it is correct.

## Shape Element Function
The function should take as input an iterparse Element object and return a dictionary.

### If the element top level tag is "node":
The dictionary returned should have the format {"node": .., "node_tags": ...}

The "node" field should hold a dictionary of the following top level node attributes:
- id
- user
- uid
- version
- lat
- lon
- timestamp
- changeset
All other attributes can be ignored

The "node_tags" field should hold a list of dictionaries, one per secondary tag. Secondary tags are
child tags of node which have the tag name/type: "tag". Each dictionary should have the following
fields from the secondary tag attributes:
- id: the top level node id attribute value
- key: the full tag "k" attribute value if no colon is present or the characters after the colon if one is.
- value: the tag "v" attribute value
- type: either the characters before the colon in the tag "k" value or "regular" if a colon
        is not present.

Additionally,

- if the tag "k" value contains problematic characters, the tag should be ignored
- if the tag "k" value contains a ":" the characters before the ":" should be set as the tag type
  and characters after the ":" should be set as the tag key
- if there are additional ":" in the "k" value they and they should be ignored and kept as part of
  the tag key. For example:

  <tag k="addr:street:name" v="Lincoln"/>
  should be turned into
  {'id': 12345, 'key': 'street:name', 'value': 'Lincoln', 'type': 'addr'}

- If a node has no secondary tags then the "node_tags" field should just contain an empty list.

The final return value for a "node" element should look something like:

{'node': {'id': 757860928,
          'user': 'uboot',
          'uid': 26299,
       'version': '2',
          'lat': 41.9747374,
          'lon': -87.6920102,
          'timestamp': '2010-07-22T16:16:51Z',
      'changeset': 5288876},
 'node_tags': [{'id': 757860928,
                'key': 'amenity',
                'value': 'fast_food',
                'type': 'regular'},
               {'id': 757860928,
                'key': 'cuisine',
                'value': 'sausage',
                'type': 'regular'},
               {'id': 757860928,
                'key': 'name',
                'value': "Shelly's Tasty Freeze",
                'type': 'regular'}]}

### If the element top level tag is "way":
The dictionary should have the format {"way": ..., "way_tags": ..., "way_nodes": ...}

The "way" field should hold a dictionary of the following top level way attributes:
- id
-  user
- uid
- version
- timestamp
- changeset

All other attributes can be ignored

The "way_tags" field should again hold a list of dictionaries, following the exact same rules as
for "node_tags".

Additionally, the dictionary should have a field "way_nodes". "way_nodes" should hold a list of
dictionaries, one for each nd child tag.  Each dictionary should have the fields:
- id: the top level element (way) id
- node_id: the ref attribute value of the nd tag
- position: the index starting at 0 of the nd tag i.e. what order the nd tag appears within
            the way element

The final return value for a "way" element should look something like:

{'way': {'id': 209809850,
         'user': 'chicago-buildings',
         'uid': 674454,
         'version': '1',
         'timestamp': '2013-03-13T15:58:04Z',
         'changeset': 15353317},
 'way_nodes': [{'id': 209809850, 'node_id': 2199822281, 'position': 0},
               {'id': 209809850, 'node_id': 2199822390, 'position': 1},
               {'id': 209809850, 'node_id': 2199822392, 'position': 2},
               {'id': 209809850, 'node_id': 2199822369, 'position': 3},
               {'id': 209809850, 'node_id': 2199822370, 'position': 4},
               {'id': 209809850, 'node_id': 2199822284, 'position': 5},
               {'id': 209809850, 'node_id': 2199822281, 'position': 6}],
 'way_tags': [{'id': 209809850,
               'key': 'housenumber',
               'type': 'addr',
               'value': '1412'},
              {'id': 209809850,
               'key': 'street',
               'type': 'addr',
               'value': 'West Lexington St.'},
              {'id': 209809850,
               'key': 'street:name',
               'type': 'addr',
               'value': 'Lexington'},
              {'id': '209809850',
               'key': 'street:prefix',
               'type': 'addr',
               'value': 'West'},
              {'id': 209809850,
               'key': 'street:type',
               'type': 'addr',
               'value': 'Street'},
              {'id': 209809850,
               'key': 'building',
               'type': 'regular',
               'value': 'yes'},
              {'id': 209809850,
               'key': 'levels',
               'type': 'building',
               'value': '1'},
              {'id': 209809850,
               'key': 'building_id',
               'type': 'chicago',
               'value': '366409'}]}
"""

import csv
import codecs
import pprint
import re
import xml.etree.cElementTree as ET

#import cerberus

#import schema

OSM_PATH = "./prepare_for_db/example.osm"

NODES_PATH = "nodes.csv"
NODE_TAGS_PATH = "nodes_tags.csv"
WAYS_PATH = "ways.csv"
WAY_NODES_PATH = "ways_nodes.csv"
WAY_TAGS_PATH = "ways_tags.csv"

LOWER_COLON = re.compile(r'^([a-z]|_)+:([a-z]|_)+')
PROBLEMCHARS = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')

#SCHEMA = schema.schema

# Make sure the fields order in the csvs matches the column order in the sql table schema
NODE_FIELDS = ['id', 'lat', 'lon', 'user', 'uid', 'version', 'changeset', 'timestamp']
NODE_TAGS_FIELDS = ['id', 'key', 'value', 'type']
WAY_FIELDS = ['id', 'user', 'uid', 'version', 'changeset', 'timestamp']
WAY_TAGS_FIELDS = ['id', 'key', 'value', 'type']
WAY_NODES_FIELDS = ['id', 'node_id', 'position']


def shape_element(element, node_attr_fields=NODE_FIELDS, way_attr_fields=WAY_FIELDS,
                  problem_chars=PROBLEMCHARS, default_tag_type='regular'):
    """Clean and shape node or way XML element to Python dict"""
	
    node_attribs = {}
    way_attribs = {}
    way_nodes = []
    tags = []  # Handle secondary tags the same way for both node and way elements
    tag_type = default_tag_type
    nd_pos = 0

    # parse tags for nodes and ways
    tags_dict = {}
    for tag in element.iter('tag'):
        if not problem_chars.search(tag.attrib['k']):
            tag_key = tag.attrib['k']
            if ':' in tag.attrib['k']:
                tag_type = re.split(':',tag.attrib['k'])[0]
                tag_key = re.split(':',tag.attrib['k'], maxsplit=1)[1]
            tags_dict = { \
                         'id': element.attrib['id'], \
                         'key': tag_key, \
                         'value': tag.attrib['v'], \
                         'type': tag_type \
                         }
            tags.append(tags_dict)

    # parse nd tags for ways
    nd_dict = {}
    for tag in element.iter('nd'):
        nd_dict = { \
                   'id': element.attrib['id'], \
                   'node_id': tag.attrib['ref'], \
                   'position': nd_pos \
                   }
        way_nodes.append(nd_dict)
        nd_pos += 1

    # return dictionries for nodes and ways	
    if element.tag == 'node':
        for node_field in node_attr_fields:
            node_attribs[node_field] = element.attrib[node_field]
        #print tags
        return {'node': node_attribs, 'node_tags': tags}
    elif element.tag == 'way':
        for way_field in way_attr_fields:
            way_attribs[way_field] = element.attrib[way_field]        
        return {'way': way_attribs, 'way_nodes': way_nodes, 'way_tags': tags}


# ================================================== #
#               Helper Functions                     #
# ================================================== #
def get_element(osm_file, tags=('node', 'way', 'relation')):
    """Yield element if it is the right type of tag"""

    context = ET.iterparse(osm_file, events=('start', 'end'))
    _, root = next(context)
    for event, elem in context:
        if event == 'end' and elem.tag in tags:
            yield elem
            root.clear()


#def validate_element(element, validator, schema=SCHEMA):
#    """Raise ValidationError if element does not match schema"""
#    if validator.validate(element, schema) is not True:
#        field, errors = next(validator.errors.iteritems())
#        message_string = "\nElement of type '{0}' has the following errors:\n{1}"
#        error_string = pprint.pformat(errors)
#        
#        raise Exception(message_string.format(field, error_string))


class UnicodeDictWriter(csv.DictWriter, object):
    """Extend csv.DictWriter to handle Unicode input"""

    def writerow(self, row):
        super(UnicodeDictWriter, self).writerow({
            k: (v.encode('utf-8') if isinstance(v, unicode) else v) for k, v in row.iteritems()
        })

    def writerows(self, rows):
        for row in rows:
            self.writerow(row)


# ================================================== #
#               Main Function                        #
# ================================================== #
def process_map(file_in, validate):
    """Iteratively process each XML element and write to csv(s)"""

    with codecs.open(NODES_PATH, 'w') as nodes_file, \
         codecs.open(NODE_TAGS_PATH, 'w') as nodes_tags_file, \
         codecs.open(WAYS_PATH, 'w') as ways_file, \
         codecs.open(WAY_NODES_PATH, 'w') as way_nodes_file, \
         codecs.open(WAY_TAGS_PATH, 'w') as way_tags_file:

        nodes_writer = UnicodeDictWriter(nodes_file, NODE_FIELDS)
        node_tags_writer = UnicodeDictWriter(nodes_tags_file, NODE_TAGS_FIELDS)
        ways_writer = UnicodeDictWriter(ways_file, WAY_FIELDS)
        way_nodes_writer = UnicodeDictWriter(way_nodes_file, WAY_NODES_FIELDS)
        way_tags_writer = UnicodeDictWriter(way_tags_file, WAY_TAGS_FIELDS)

        nodes_writer.writeheader()
        node_tags_writer.writeheader()
        ways_writer.writeheader()
        way_nodes_writer.writeheader()
        way_tags_writer.writeheader()

        if validate is True:
			validator = cerberus.Validator()

        for element in get_element(file_in, tags=('node', 'way')):
            el = shape_element(element)
            if el:
                if validate is True:
                    validate_element(el, validator)

                if element.tag == 'node':
                    nodes_writer.writerow(el['node'])
                    node_tags_writer.writerows(el['node_tags'])
                elif element.tag == 'way':
                    ways_writer.writerow(el['way'])
                    way_nodes_writer.writerows(el['way_nodes'])
                    way_tags_writer.writerows(el['way_tags'])


if __name__ == '__main__':
    # Note: Validation is ~ 10X slower. For the project consider using a small
    # sample of the map when validating.
    process_map(OSM_PATH, validate=False)