## LESSON 3 NOTES

Measures of Data Quality:
- **Validity**: Conforms to a schema
    - determing what the constraints are on individual fields and checking to ensure the field values conform to those restraints.
- **Accuracy**: conforms to a gold standard
- **Completness**: All Records??
- **Consistency**: Matches other data?
- **Uniformity**: Same Units

Blueprint for Cleaning Data:
1. Audit the Data: programatically check data, possibly use statistical means to check for outliers etc. 
2. Create a data cleaning plan: Use info from audit to:
    - Identify Causes
    - Define Causes
    - Test
3. Execute Plan: i.e. run a script to clean the data. 
4. Manually Correct: If needed.

Go back and Audit the data again... Do a few iterations to build confidence in the data.

### LESSON 2.a - 

In [None]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Please note that the function 'make_request' is provided for your reference only.
# You will not be able to to actually use it from within the Udacity web UI.
# Your task is to process the HTML using BeautifulSoup, extract the hidden
# form field values for "__EVENTVALIDATION" and "__VIEWSTATE" and set the appropriate
# values in the data dictionary.
# All your changes should be in the 'extract_data' function
from bs4 import BeautifulSoup
import requests
import json

html_page = "page_source.html"

def extract_data(page):
    data = {"eventvalidation": "",
            "viewstate": ""}
    with open(page, "r") as html:
        soup = BeautifulSoup(html, "lxml")
        ev = soup.find(id="__EVENTVALIDATION")
        data["eventvalidation"] = ev["value"]

        vs = soup.find(id="__VIEWSTATE")
        data["viewstate"] = vs["value"]

    return data

def make_request(data):
    eventvalidation = data["eventvalidation"]
    viewstate = data["viewstate"]

    r = requests.post("http://www.transtats.bts.gov/Data_Elements.aspx?Data=2",
                    data={'AirportList': "BOS",
                          'CarrierList': "VX",
                          'Submit': 'Submit',
                          "__EVENTTARGET": "",
                          "__EVENTARGUMENT": "",
                          "__EVENTVALIDATION": eventvalidation,
                          "__VIEWSTATE": viewstate
                    })

    return r.text


def test():
    data = extract_data(html_page)
    assert data["eventvalidation"] != ""
    assert data["eventvalidation"].startswith("/wEWjAkCoIj1ng0")
    assert data["viewstate"].startswith("/wEPDwUKLTI")

    
test()

## LESSON 3 Exercises

In [44]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
In this problem set you work with another type of infobox data, audit it, clean it, 
come up with a data model, insert it into a MongoDB and then run some queries against your database.
The set contains data about Arachnid class.
Your task in this exercise is to parse the file, process only the fields that are listed in the
FIELDS dictionary as keys, and return a list of dictionaries of cleaned values. 

The following things should be done:
- keys of the dictionary changed according to the mapping in FIELDS dictionary
- trim out redundant description in parenthesis from the 'rdf-schema#label' field, like "(spider)"
- if 'name' is "NULL" or contains non-alphanumeric characters, set it to the same value as 'label'.
- if a value of a field is "NULL", convert it to None
- if there is a value in 'synonym', it should be converted to an array (list)
  by stripping the "{}" characters and splitting the string on "|". Rest of the cleanup is up to you,
  eg removing "*" prefixes etc. If there is a singular synonym, the value should still be formatted
  in a list.
- strip leading and ending whitespace from all fields, if there is any
- the output structure should be as follows:
{ 'label': 'Argiope',
  'uri': 'http://dbpedia.org/resource/Argiope_(spider)',
  'description': 'The genus Argiope includes rather large and spectacular spiders that often ...',
  'name': 'Argiope',
  'synonym': ["One", "Two"],
  'classification': {
                    'family': 'Orb-weaver spider',
                    'class': 'Arachnid',
                    'phylum': 'Arthropod',
                    'order': 'Spider',
                    'kingdom': 'Animal',
                    'genus': None
                    }
}
  * Note that the value associated with the classification key is a dictionary with
    taxonomic labels.
"""
import codecs
import csv
import json
import pprint
import re

DATAFILE = 'arachnid.csv'
FIELDS ={'rdf-schema#label': 'label',
         'URI': 'uri',
         'rdf-schema#comment': 'description',
         'synonym': 'synonym',
         'name': 'name',
         'family_label': 'family',
         'class_label': 'class',
         'phylum_label': 'phylum',
         'order_label': 'order',
         'kingdom_label': 'kingdom',
         'genus_label': 'genus'}

def process_file(filename, fields):

    
    process_fields = fields.keys()
    data = []
    with open(filename, "r") as f:
        reader = csv.DictReader(f)
        for i in range(3):
            l = reader.next()
        count = 0
        for line in reader:
            for col in line:
                if col in FIELDS.keys():
                    col = FIELDS.get(col)
                    
                data.append(line)
        print data[0]

                
                
                
            #FIELDS[] = line.pop[]
            # YOUR CODE HERE

    #return data
process_file(DATAFILE, FIELDS)

def parse_array(v):
    if (v[0] == "{") and (v[-1] == "}"):
        v = v.lstrip("{")
        v = v.rstrip("}")
        v_array = v.split("|")
        v_array = [i.strip() for i in v_array]
        return v_array
    return [v]


# def test():
#     data = process_file(DATAFILE, FIELDS)
#     print "Your first entry:"
#     pprint.pprint(data[0])
#     first_entry = {
#         "synonym": None, 
#         "name": "Argiope", 
#         "classification": {
#             "kingdom": "Animal", 
#             "family": "Orb-weaver spider", 
#             "order": "Spider", 
#             "phylum": "Arthropod", 
#             "genus": None, 
#             "class": "Arachnid"
#         }, 
#         "uri": "http://dbpedia.org/resource/Argiope_(spider)", 
#         "label": "Argiope", 
#         "description": "The genus Argiope includes rather large and spectacular spiders that often have a strikingly coloured abdomen. These spiders are distributed throughout the world. Most countries in tropical or temperate climates host one or more species that are similar in appearance. The etymology of the name is from a Greek name meaning silver-faced."
#     }

#     assert len(data) == 76
#     assert data[0] == first_entry
#     assert data[17]["name"] == "Ogdenia"
#     assert data[48]["label"] == "Hydrachnidiae"
#     assert data[14]["synonym"] == ["Cyrene Peckham & Peckham"]

# if __name__ == "__main__":
#     test()

{'22-rdf-syntax-ns#type_label': '{animal|arachnid|eukaryote|species|owl#Thing}', 'family': 'http://dbpedia.org/resource/Orb-weaver_spider', 'conservationStatusSystem': 'NULL', 'family_label': 'Orb-weaver spider', 'depiction': 'http://upload.wikimedia.org/wikipedia/commons/f/fd/Argiope_sp.jpg', 'phylum': 'http://dbpedia.org/resource/Arthropod', 'thumbnail_label': '200px-Argiope_sp.jpg', 'conservationStatus': 'NULL', 'species': 'NULL', 'rdf-schema#label': 'Argiope (spider)', 'order_label': 'Spider', 'binomialAuthority': 'NULL', 'division_label': 'NULL', 'kingdom_label': 'Animal', 'binomialAuthority_label': 'NULL', 'thumbnail': 'http://upload.wikimedia.org/wikipedia/commons/thumb/f/fd/Argiope_sp.jpg/200px-Argiope_sp.jpg', 'kingdom': 'http://dbpedia.org/resource/Animal', 'division': 'NULL', 'class_label': 'Arachnid', 'phylum_label': 'Arthropod', 'URI': 'http://dbpedia.org/resource/Argiope_(spider)', '22-rdf-syntax-ns#type': '{http://dbpedia.org/ontology/Animal|http://dbpedia.org/ontology/A