# P3 Notes

* Be wary of the accuracy of data
* Some basic steps in assessing data
    * Test assumptions about values (what values does the data have?), data types and shape of data
    * Identify errors or outliers
    * Find missing values

Tabular data:
* Row (items) is a whole row
* Field is a column
* Value is a column meeting a row (single cell)

### Parsing CSV Files Exercise

##### Starting Code

In [None]:
# Your task is to read the input DATAFILE line by line, and for the first 10 lines (not including the header)
# split each line on "," and then for each line, create a dictionary
# where the key is the header title of the field, and the value is the value of that field in the row.
# The function parse_file should return a list of dictionaries,
# each data line in the file being a single list entry.
# Field names and values should not contain extra whitespace, like spaces or newline characters.
# You can use the Python string method strip() to remove the extra whitespace.
# You have to parse only the first 10 data lines in this exercise,
# so the returned list should have 10 entries!

import os

DATADIR = ""
DATAFILE = "beatles-diskography.csv"

def parse_file(datafile):
    """Starter stub: echo every line of `datafile` and return an empty list.

    Nothing is parsed yet, so the returned list never gets any entries.
    """
    with open(datafile, "r") as handle:
        current = handle.readline()
        while current:
            # each line keeps its trailing newline, so the output is double-spaced
            print(current)
            current = handle.readline()

    return []


def test():
    """Check the first and tenth parsed rows against known album data."""
    # a simple test of your implementation
    parsed = parse_file(os.path.join(DATADIR, DATAFILE))

    expected_first = {
        'Title': 'Please Please Me',
        'UK Chart Position': '1',
        'Label': 'Parlophone(UK)',
        'Released': '22 March 1963',
        'US Chart Position': '-',
        'RIAA Certification': 'Platinum',
        'BPI Certification': 'Gold',
    }
    expected_tenth = {
        'Title': '',
        'UK Chart Position': '1',
        'Label': 'Parlophone(UK)',
        'Released': '10 July 1964',
        'US Chart Position': '-',
        'RIAA Certification': '',
        'BPI Certification': 'Gold',
    }

    assert parsed[0] == expected_first
    assert parsed[9] == expected_tenth

    
test()

##### Answer

Notes:
* Should end up with 10 dicts (not including header)

In [None]:
import os

# strings for directory and data file
DATADIR = "supporting-files/"
DATAFILE = "beatles-diskography.csv"

# function to parse the data file
def parse_file(datafile):
    """Parse the first 10 data lines of a comma-separated file into dicts.

    The first line of the file is the header. Every following line (up to
    10 of them) is split on "," and becomes a dict that maps the stripped
    header titles to the stripped field values.

    Args:
        datafile: path of the text file to read.

    Returns:
        A list with at most 10 dicts, one per data line, in file order.
    """
    max_rows = 10
    data = []
    # "r" opens the file in text mode
    with open(datafile, "r") as f:
        # .readline() reads the header line; the titles are stripped once
        # here instead of once per data row
        header = [title.strip() for title in f.readline().split(",")]

        # enumerate supplies the running row count, replacing a manual counter
        for row_number, line in enumerate(f):
            if row_number == max_rows:
                break

            # .strip() removes leading/trailing whitespace, including the newline
            values = [value.strip() for value in line.split(",")]

            # zip pairs titles with values and stops at the shorter sequence,
            # so a row with more fields than the header no longer raises IndexError
            data.append(dict(zip(header, values)))

    return data

In [None]:
def test():
    """Compare rows 1 and 10 returned by parse_file with the expected dicts."""
    # a simple test of your implementation
    rows = parse_file(os.path.join(DATADIR, DATAFILE))

    wanted = {
        0: {'Title': 'Please Please Me', 'UK Chart Position': '1',
            'Label': 'Parlophone(UK)', 'Released': '22 March 1963',
            'US Chart Position': '-', 'RIAA Certification': 'Platinum',
            'BPI Certification': 'Gold'},
        9: {'Title': '', 'UK Chart Position': '1',
            'Label': 'Parlophone(UK)', 'Released': '10 July 1964',
            'US Chart Position': '-', 'RIAA Certification': '',
            'BPI Certification': 'Gold'},
    }

    # first line, then tenth line
    assert rows[0] == wanted[0]
    assert rows[9] == wanted[9]

    
test()

### Intro to XLRD

In [None]:
# import declarations
import xlrd

datafile = "supporting-files/2013_ERCOT_Hourly_Load_Data.xls"


def parse_file(datafile):
    """Load the first sheet of an Excel workbook and print a tour of xlrd calls.

    Args:
        datafile: path of the workbook to open with xlrd.

    Returns:
        A list of lists with every cell value of the first sheet: one inner
        list per sheet row, in row order.
    """
    workbook = xlrd.open_workbook(datafile)
    sheet = workbook.sheet_by_index(0)

    # Nested list comprehension: the outer loop walks the row indexes and the
    # inner loop walks the column indexes for each of those rows, so the
    # result is laid out like [[0,0 0,1 0,2 ...], [1,0 1,1 1,2 ...], ...].
    # sheet.cell_value(r, col) returns the value of the cell at that position.
    data = [[sheet.cell_value(r, col) for col in range(sheet.ncols)]
            for r in range(sheet.nrows)]

    # Label and value are passed to one print call so they share a line; the
    # former trailing commas were Python 2 leftovers with no effect in Python 3.
    print("\nList Comprehension")
    print("data[3][2]:", data[3][2])

    print("\nCells in a nested loop:")
    # print row 50 on one line, without visiting every other row first
    demo_row = 50
    if demo_row < sheet.nrows:
        for col in range(sheet.ncols):
            print(sheet.cell_value(demo_row, col), end=" ")
        print()

    ### other useful methods:
    print("\nROWS, COLUMNS, and CELLS:")
    print("Number of rows in the sheet:", sheet.nrows)
    print("Type of data in cell (row 3, col 2):", sheet.cell_type(3, 2))
    print("Value in cell (row 3, col 2):", sheet.cell_value(3, 2))
    print("Get a slice of values in column 3, from rows 1-3:")
    print(sheet.col_values(3, start_rowx=1, end_rowx=4))

    print("\nDATES:")
    print("Type of data in cell (row 1, col 0):", sheet.cell_type(1, 0))
    exceltime = sheet.cell_value(1, 0)
    print("Time in Excel format:", exceltime)
    print("Convert time to a Python datetime tuple, from the Excel float:",
          xlrd.xldate_as_tuple(exceltime, 0))

    return data

data = parse_file(datafile)

##### Challenge

In [None]:
#!/usr/bin/env python
"""
Your task is as follows:
- read the provided Excel file
# should be in memory as some kind of dataframe or list or dict
- find and return the min, max and average values for the COAST region
- find and return the time value for the min and max entries
- the time values should be returned as Python tuples

Please see the test function for the expected return format
"""

import xlrd
from zipfile import ZipFile
datafile = "supporting-files/2013_ERCOT_Hourly_Load_Data.zip"

def open_zip(datafile):
    """Extract the zip archive belonging to `datafile` into the current directory.

    The archive is looked up as "<datafile>.zip" first. If that file does not
    exist and `datafile` itself already ends in ".zip", `datafile` is opened
    as the archive instead, so passing the archive path directly also works.

    Args:
        datafile: path of the data file (archive name minus ".zip") or of
            the archive itself.

    Raises:
        FileNotFoundError: if no matching archive exists.
    """
    archive_path = '{0}.zip'.format(datafile)
    try:
        archive = ZipFile(archive_path, 'r')
    except FileNotFoundError:
        # only fall back when the caller already handed over a .zip path
        if not str(datafile).lower().endswith('.zip'):
            raise
        archive = ZipFile(datafile, 'r')

    with archive:
        archive.extractall()


def parse_file(datafile):
    """Summarise the COAST load column of the first sheet in `datafile`.

    Args:
        datafile: path of the Excel workbook to open with xlrd.

    Returns:
        A dict with the keys 'maxtime', 'maxvalue', 'mintime', 'minvalue'
        and 'avgcoast'. The time entries are tuples produced by
        xlrd.xldate_as_tuple. If the sheet has no data rows, every entry
        keeps its zero placeholder.
    """
    workbook = xlrd.open_workbook(datafile)
    sheet = workbook.sheet_by_index(0)

    data = {
            'maxtime': (0, 0, 0, 0, 0, 0),
            'maxvalue': 0,
            'mintime': (0, 0, 0, 0, 0, 0),
            'minvalue': 0,
            'avgcoast': 0
    }

    # NOTE(review): assumes column 0 holds the Excel timestamp and column 1
    # is the COAST region, with row 0 as the header - confirm against the sheet.
    time_col = 0
    coast_col = 1

    # every COAST value below the header row
    loads = sheet.col_values(coast_col, start_rowx=1, end_rowx=None)
    if not loads:
        return data

    max_value = max(loads)
    min_value = min(loads)

    # +1 converts a position in `loads` back to a sheet row, because the
    # header row was skipped when the column was sliced
    max_row = loads.index(max_value) + 1
    min_row = loads.index(min_value) + 1

    data['maxvalue'] = max_value
    data['minvalue'] = min_value
    # xldate_as_tuple turns the Excel float into (year, month, day, hour, minute, second)
    data['maxtime'] = xlrd.xldate_as_tuple(sheet.cell_value(max_row, time_col), 0)
    data['mintime'] = xlrd.xldate_as_tuple(sheet.cell_value(min_row, time_col), 0)
    data['avgcoast'] = sum(loads) / len(loads)
    return data


def test():
    """Extract the archive, parse it and check the maximum COAST entry."""
    # NOTE(review): the module-level datafile ends in ".zip" and is passed
    # both to open_zip and to parse_file (which opens it with xlrd) -
    # confirm which path each call is meant to receive.
    open_zip(datafile)
    summary = parse_file(datafile)

    expected_time = (2013, 8, 13, 17, 0, 0)
    expected_value = round(18779.02551, 10)

    assert summary['maxtime'] == expected_time
    assert round(summary['maxvalue'], 10) == expected_value


test()

### JSON Playground

##### Use XML to obtain a JSON object

In [None]:
# import declarations
import json
import requests


BASE_URL = "http://musicbrainz.org/ws/2/"
ARTIST_URL = BASE_URL + "artist/"

# query parameters are given to the requests.get function as a dictionary
# this variable contains some starter parameters, and is a dict with some nested dicts
# Q: How to know what query parameters to use as key and values?
# Q: why are some a nested dict?
query_type = {  "simple": {},
                "atr": {"inc": "aliases+tags+ratings"},
                "aliases": {"inc": "aliases"},
                "releases": {"inc": "releases"}}

# This is the main function for making queries to the musicbrainz API.
# Return type is a json object
    # 

# note, all params with = are optional
# url: of type string
# params: of type dict
# uid="": initialized and optional - of type string
# fmt="json": initialized and optional - of type string
def query_site(url, params, uid="", fmt="json"):
    """Send a GET request to `url + uid` and return the decoded JSON body.

    Args:
        url: base URL string.
        params: dict of query parameters. It is copied, so the caller's
            dict (for example an entry of query_type) is left untouched.
        uid: optional string appended to `url`.
        fmt: value sent as the "fmt" query parameter.

    Returns:
        The parsed JSON when the status code is requests.codes.ok. Any
        other status that raise_for_status() does not treat as an error
        yields None.

    Raises:
        requests.HTTPError: raised by raise_for_status() for error statuses.
    """
    # Work on a copy: writing "fmt" into the caller's dict would permanently
    # change the shared query_type entries between calls.
    request_params = dict(params or {})
    request_params["fmt"] = fmt

    # requests builds the query string from the dict, e.g.
    # {"inc": "releases", "fmt": "json"}
    r = requests.get(url + uid, params=request_params)

    # r is a requests.Response object, so its properties can be accessed
    print("Requesting From URL: ", r.url)

    # return JSON object if the request succeeded, or raise an error if not
    if r.status_code == requests.codes.ok:
        return r.json()

    r.raise_for_status()
    return None


def query_by_name(url, params, name):
    """Search for an artist by name through query_site.

    Args:
        url: base URL string passed on to query_site.
        params: dict of query parameters. It is copied before the search
            term is added, so shared dicts such as query_type["simple"]
            do not keep a stale "query" entry after the call.
        name: artist name to search for.

    Returns:
        Whatever query_site returns for the request.
    """
    search_params = dict(params or {})
    search_params["query"] = "artist:" + name
    return query_site(url, search_params)


def pretty_print(data, indent=4):
    """Print `data` in a readable form.

    Dicts, including dict subclasses such as OrderedDict, are printed as
    indented JSON with sorted keys; anything else is printed as-is.

    Args:
        data: the object to print.
        indent: number of spaces per JSON nesting level.
    """
    # isinstance also accepts dict subclasses, unlike an exact type comparison
    if isinstance(data, dict):
        print(json.dumps(data, indent=indent, sort_keys=True))
    else:
        print(data)


def main():
    '''
    Modify the function calls and indexing below to answer the questions on
    the next quiz. HINT: Note how the output we get from the site is a
    multi-level JSON document, so try making print statements to step through
    the structure one level at a time or copy the output to a separate output
    file.
    '''
    # search by name and dump the whole response
    search = query_by_name(ARTIST_URL, query_type["simple"], "First And Kit")
    pretty_print(search)

    # the second search hit is the artist that gets inspected further
    chosen = search["artists"][1]
    chosen_id = chosen["id"]
    print("\nARTIST:")
    pretty_print(chosen)
    # NOTE(review): this prints the size of the top-level response object,
    # not a count of releases - confirm the intended value.
    print("RELEASES: ", len(search))

    # fetch the releases of that artist
    release_list = query_site(ARTIST_URL, query_type["releases"], chosen_id)["releases"]
    print("\nONE RELEASE:")
    pretty_print(release_list[0], indent=2)
    titles = [release["title"] for release in release_list]

    print("\nALL TITLES:")
    for title in titles:
        print(title)


# this is the top-level code block, first to be executed after import declarations
if __name__ == '__main__':
    main()

##### Problem Set: Using CSV Module

In [None]:
"""
Your task is to process the supplied file and use the csv module to extract data from it.
The data comes from NREL (National Renewable Energy Laboratory) website. Each file
contains information from one meteorological station, in particular - about amount of
solar and wind energy for each hour of day.

Note that the first line of the datafile is neither data entry, nor header. It is a line
describing the data source. You should extract the name of the station from it.

The data should be returned as a list of lists (not dictionaries).
You can use the csv modules "reader" method to get data in such format.
Another useful method is next() - to get the next line from the iterator.
You should only change the parse_file function.
"""
import csv
import os

DATADIR = ""
DATAFILE = "supporting-files/745090.csv"


def parse_file(datafile):
    """Read a station data file with the csv module.

    The first line of the file describes the data source; its second
    comma-separated field is the station name wrapped in one character on
    each side (stripped by the slice at the end). The second line is the
    column header and is skipped. All remaining lines are data rows.

    Args:
        datafile: path of the CSV file to read.

    Returns:
        A tuple (name, data) where `name` is the station name and `data`
        is a list of lists, one inner list of strings per data row.

    Raises:
        IndexError: if the first line has fewer than two fields.
    """
    name = ""
    data = []
    # newline='' hands line-ending handling to the csv module, as its
    # documentation asks for file objects passed to csv.reader
    with open(datafile, 'r', newline='') as f:
        source_fields = f.readline().split(',')
        name = source_fields[1]

        # create in_reader object for the rest of the file
        in_reader = csv.reader(f)

        # skip the column-header row; the default avoids StopIteration when
        # the file ends after the source line
        next(in_reader, None)

        # every remaining row becomes one list entry
        data = list(in_reader)
    # Do not change the line below
    name = name[1:-1]
    return (name, data)

parse_file(DATAFILE)

def test():
    """Check the station name and a few sample cells of the parsed file."""
    station_name, rows = parse_file(os.path.join(DATADIR, DATAFILE))

    assert station_name == "MOUNTAIN VIEW MOFFETT FLD NAS"

    # spot checks on individual cells: rows[row][column]
    first_row, third_row = rows[0], rows[2]
    assert first_row[1] == "01:00"
    assert third_row[0] == "01/01/2005"
    assert third_row[5] == "2"


if __name__ == "__main__":
    test()

##### Problem Set: Excel to CSV

In [None]:
'''
Algo:
LOAD FILE INTO
EXPORT AS CSV '''
# -*- coding: utf-8 -*-
'''
Find the time and value of max load for each of the regions
COAST, EAST, FAR_WEST, NORTH, NORTH_C, SOUTHERN, SOUTH_C, WEST
and write the result out in a csv file, using pipe character | as the delimiter.

An example output can be seen in the "example.csv" file.
'''

import xlrd
import os
import csv
from zipfile import ZipFile

datafile = "2013_ERCOT_Hourly_Load_Data.xls"
outfile = "2013_Max_Loads.csv"


def open_zip(datafile):
    """Extract "<datafile>.zip" into the current working directory."""
    archive_name = '{0}.zip'.format(datafile)
    archive = ZipFile(archive_name, 'r')
    try:
        archive.extractall()
    finally:
        # always release the archive handle, even if extraction fails
        archive.close()


def parse_file(datafile):
    """Exercise scaffold: open the first sheet of `datafile`.

    Nothing is extracted yet, so the function returns None.
    """
    sheet = xlrd.open_workbook(datafile).sheet_by_index(0)
    # YOUR CODE HERE
    # Remember that you can use xlrd.xldate_as_tuple(sometime, 0) to convert
    # Excel date to Python tuple of (year, month, day, hour, minute, second)
    return None

def save_file(data, filename):
    """Exercise scaffold for writing `data` to `filename`.

    A function whose body is only a comment is a syntax error, so the
    placeholder raises until the exercise is filled in.

    Raises:
        NotImplementedError: always, until an implementation is written.
    """
    # YOUR CODE HERE
    raise NotImplementedError("save_file has not been implemented yet")

    
def test():
    """Run the Excel-to-CSV pipeline and validate the pipe-delimited output."""
    open_zip(datafile)
    save_file(parse_file(datafile), outfile)

    expected = {'FAR_WEST': {'Max Load': '2281.2722140000024',
                             'Year': '2013',
                             'Month': '6',
                             'Day': '26',
                             'Hour': '17'}}
    expected_stations = ['COAST', 'EAST', 'FAR_WEST', 'NORTH',
                         'NORTH_C', 'SOUTHERN', 'SOUTH_C', 'WEST']
    checked_fields = ['Year', 'Month', 'Day', 'Hour', 'Max Load']

    # one entry per data line, so its length doubles as the row count
    seen_stations = []

    with open(outfile) as of:
        for row in csv.DictReader(of, delimiter="|"):
            station = row['Station']
            if station == 'FAR_WEST':
                for field in checked_fields:
                    if field != 'Max Load':
                        # plain fields must match exactly
                        assert expected[station][field] == row[field]
                        continue

                    # 'Max Load' is compared after rounding to one decimal
                    wanted_load = round(float(expected[station][field]), 1)
                    actual_load = round(float(row[field]), 1)
                    assert wanted_load == actual_load

            seen_stations.append(station)

        # Output should be 8 lines not including header
        assert len(seen_stations) == 8

        # Check Station Names
        assert set(seen_stations) == set(expected_stations)

        
if __name__ == "__main__":
    test()


### Extracting Data from XML Quiz

In [None]:
#!/usr/bin/env python
# Your task here is to extract data from xml on authors of an article
# and add it to a list, one item for an author.
# See the provided data structure for the expected format.
# The tags for first name, surname and email should map directly
# to the dictionary keys

# note, format is a list of dicts

import xml.etree.ElementTree as ET

article_file = "supporting-files/exampleResearchArticle.xml"


def get_root(fname):
    """Parse the XML file `fname` and return the root element of its tree."""
    return ET.parse(fname).getroot()


# function takes in a root object
def get_authors(root):
    """Collect the authors listed under ./fm/bibl/aug/au.

    Args:
        root: the root element of the parsed article.

    Returns:
        A list of dicts, one per <au> element in document order, with the
        keys "fnm", "snm" and "email". A key stays None when the author
        has no child element with that tag.
    """
    # list holds the dicts
    authors = []

    # root.findall returns every element matching the au path
    for author in root.findall('./fm/bibl/aug/au'):
        data = {
                "fnm": None,
                "snm": None,
                "email": None
        }

        # The tag names map directly to the dict keys. A missing tag keeps
        # its None default instead of raising AttributeError on None.text.
        for tag in ("fnm", "snm", "email"):
            node = author.find(tag)
            if node is not None:
                data[tag] = node.text

        authors.append(data)

    return authors


def test():
    """Compare the first two extracted authors with the expected records."""
    solution = [
        {'fnm': 'Omer', 'snm': 'Mei-Dan', 'email': 'omer@extremegate.com'},
        {'fnm': 'Mike', 'snm': 'Carmont', 'email': 'mcarmont@hotmail.com'},
        {'fnm': 'Lior', 'snm': 'Laver', 'email': 'laver17@gmail.com'},
        {'fnm': 'Meir', 'snm': 'Nyska', 'email': 'nyska@internet-zahav.net'},
        {'fnm': 'Hagay', 'snm': 'Kammar', 'email': 'kammarh@gmail.com'},
        {'fnm': 'Gideon', 'snm': 'Mann', 'email': 'gideon.mann.md@gmail.com'},
        {'fnm': 'Barnaby', 'snm': 'Clarck', 'email': 'barns.nz@gmail.com'},
        {'fnm': 'Eugene', 'snm': 'Kots', 'email': 'eukots@gmail.com'},
    ]

    # load the root element, then hand it to get_authors
    extracted = get_authors(get_root(article_file))

    first_expected, second_expected = solution[0], solution[1]
    assert extracted[0] == first_expected
    assert extracted[1]["fnm"] == second_expected["fnm"]


test()

##### NEED: complete XML and JSON challenges

In [None]:
import xml.etree.ElementTree as ET
import pprint

# The file name uses the same casing as the article_file paths in the quiz
# cells; the all-lowercase spelling only resolves on case-insensitive
# filesystems.
tree = ET.parse('supporting-files/exampleResearchArticle.xml')
root = tree.getroot()

# list the tag of every direct child of the root element
print("\nChildren of root")
for child in root:
    print(child.tag)

### Wrangling JSON Quiz

Handling Attributes Quiz

In [None]:
#!/usr/bin/env python
# Your task here is to extract data from xml on authors of an article
# and add it to a list, one item for an author.
# See the provided data structure for the expected format.
# The tags for first name, surname and email should map directly
# to the dictionary keys, but you have to extract the attributes from the "insr" tag
# and add them to the list for the dictionary key "insr"
import xml.etree.ElementTree as ET

article_file = "supporting-files/exampleResearchArticle.xml"


def get_root(fname):
    """Return the root element of the XML document stored in `fname`."""
    document = ET.parse(fname)
    root_element = document.getroot()
    return root_element


def get_authors(root):
    """Collect the authors under ./fm/bibl/aug/au, including their insr ids.

    Args:
        root: the root element of the parsed article.

    Returns:
        A list of dicts, one per <au> element in document order, with the
        keys "fnm", "snm", "email" and "insr". The text keys stay None when
        the author has no child with that tag; "insr" is the list of "iid"
        attribute values of the author's <insr> children.
    """
    authors = []
    for author in root.findall('./fm/bibl/aug/au'):
        data = {
                "fnm": None,
                "snm": None,
                "email": None,
                "insr": []
        }

        # The tag names map directly to the dict keys. A missing tag keeps
        # its None default instead of raising AttributeError on None.text.
        for tag in ("fnm", "snm", "email"):
            node = author.find(tag)
            if node is not None:
                data[tag] = node.text

        # an <insr> element without an "iid" attribute is skipped rather
        # than raising KeyError
        for insr in author.findall('./insr'):
            iid = insr.attrib.get('iid')
            if iid is not None:
                data['insr'].append(iid)

        authors.append(data)

    return authors


def test():
    """Check the first author record and the insr list of the second one."""
    def expected(insr, fnm, snm, email):
        # builds one expected author record
        return {'insr': insr, 'fnm': fnm, 'snm': snm, 'email': email}

    solution = [
        expected(['I1'], 'Omer', 'Mei-Dan', 'omer@extremegate.com'),
        expected(['I2'], 'Mike', 'Carmont', 'mcarmont@hotmail.com'),
        expected(['I3', 'I4'], 'Lior', 'Laver', 'laver17@gmail.com'),
        expected(['I3'], 'Meir', 'Nyska', 'nyska@internet-zahav.net'),
        expected(['I8'], 'Hagay', 'Kammar', 'kammarh@gmail.com'),
        expected(['I3', 'I5'], 'Gideon', 'Mann', 'gideon.mann.md@gmail.com'),
        expected(['I6'], 'Barnaby', 'Clarck', 'barns.nz@gmail.com'),
        expected(['I7'], 'Eugene', 'Kots', 'eukots@gmail.com'),
    ]

    extracted = get_authors(get_root(article_file))

    assert extracted[0] == solution[0]
    assert extracted[1]["insr"] == solution[1]["insr"]


test()

In [None]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Please note that the function 'make_request' is provided for your reference only.
# You will not be able to to actually use it from within the Udacity web UI.
# Your task is to process the HTML using BeautifulSoup, extract the hidden
# form field values for "__EVENTVALIDATION" and "__VIEWSTATE" and set the appropriate
# values in the data dictionary.
# All your changes should be in the 'extract_data' function
from bs4 import BeautifulSoup
import requests
import json

html_page = "supporting-files/page_source.html"


def extract_data(page):
    """Read the hidden form field values from a saved HTML page.

    Args:
        page: path of the HTML file to parse.

    Returns:
        A dict with the keys "eventvalidation" and "viewstate", holding the
        "value" attribute of the elements whose ids are "__EVENTVALIDATION"
        and "__VIEWSTATE". A key stays "" when its element, or that
        element's value attribute, is missing.
    """
    data = {"eventvalidation": "",
            "viewstate": ""}
    # result key -> id of the hidden form field that carries the value
    field_ids = {"eventvalidation": "__EVENTVALIDATION",
                 "viewstate": "__VIEWSTATE"}

    with open(page, "r") as html:
        # "html.parser" is the parser bundled with Python, so no extra
        # parser package is needed
        soup = BeautifulSoup(html, "html.parser")

    for key, field_id in field_ids.items():
        field = soup.find(id=field_id)
        if field is not None:
            data[key] = field.get("value", "")

    return data


def make_request(data):
    """POST the airport/carrier form and return the response body as text.

    `data` must provide the "eventvalidation" and "viewstate" values that
    are sent along as the hidden form fields.
    """
    target = "http://www.transtats.bts.gov/Data_Elements.aspx?Data=2"
    form_fields = {'AirportList': "BOS",
                   'CarrierList': "VX",
                   'Submit': 'Submit',
                   "__EVENTTARGET": "",
                   "__EVENTARGUMENT": "",
                   "__EVENTVALIDATION": data["eventvalidation"],
                   "__VIEWSTATE": data["viewstate"]}

    response = requests.post(target, data=form_fields)
    return response.text


def test():
    """Check that both hidden field values were extracted from the page."""
    extracted = extract_data(html_page)
    event_validation = extracted["eventvalidation"]

    assert event_validation != ""
    assert event_validation.startswith("/wEWjAkCoIj1ng0")
    assert extracted["viewstate"].startswith("/wEPDwUKLTI")

    
test()

Hidden phrase 1:
You got it!


Here's your first HIDDEN PHRASE.


HIDDEN PHRASE 1 of 3: 


"SELECT sql, statement FROM Udacious WHERE queryId = 35;"

 After unlocking all the phrases you'll be ready to collect your gift at the end of the course :)
 
 
 
 
 
 
 
THAT IS CORRECT!!


Here's your first HIDDEN PHRASE.


HIDDEN PHRASE 2 of 3: 


Table Name:
Udacious 


Columns:
problemSet INTEGER, node INTEGER,  queryId INTEGER, title TEXT, sql TEXT, statement TEXT"






WOW! You made it...Congratulations on all of your hard work.


HIDDEN MESSAGE #3 = Awesome.db


Head to the Gift page and plug in your hidden messages!

animals
This table lists individual animals in the zoo. Each animal has only one row. There may be multiple animals with the same name, or even multiple animals with the same name and species.
name — the animal's name (example: 'George')
species — the animal's species (example: 'gorilla')
birthdate — the animal's date of birth (example: '1998-05-18')
diet
This table matches up species with the foods they eat. Every species in the zoo eats at least one sort of food, and many eat more than one. If a species eats more than one food, there will be more than one row for that species.
species — the name of a species (example: 'hyena')
food — the name of a food that species eats (example: 'meat')
taxonomy
This table gives the (partial) biological taxonomic names for each species in the zoo. It can be used to find which species are more closely related to each other evolutionarily.
name — the common name of the species (e.g. 'jackal')
species — the taxonomic species name (e.g. 'aureus')
genus — the taxonomic genus name (e.g. 'Canis')
family — the taxonomic family name (e.g. 'Canidae')
t_order — the taxonomic order name (e.g. 'Carnivora')
If you've never heard of this classification, don't worry about it; the details won't be necessary for this course. But if you're curious, Wikipedia articles Taxonomy and Biological classification may help.

ordernames
This table gives the common names for each of the taxonomic orders in the taxonomy table.
t_order — the taxonomic order name (e.g. 'Cetacea')
name — the common name (e.g. 'whales and dolphins')

#
# Uncomment one of these QUERY variables at a time by deleting the #.
# Use "Test Run" to run it.
# You'll see the results below.  Then try your own queries as well!
#

QUERY = "select max(name) from animals;"

QUERY = "select * from animals limit 10;"

QUERY = "select * from animals where species = 'orangutan' order by birthdate;"

#QUERY = "select name from animals where species = 'orangutan' order by birthdate desc;"

#QUERY = "select name, birthdate from animals order by name limit 10 offset 20;"

#QUERY = "select species, min(birthdate) from animals group by species;"

#QUERY = '''
#select name, count(*) as num from animals
#group by name
#order by num desc
#limit 5;
#'''



SELECT Email, FirstName, LastName, Total FROM Customer LEFT JOIN Invoice ON Customer.CustomerId = Invoice.CustomerId;

SELECT CustomerId, SUM(Total) FROM Invoice GROUP BY CustomerId;

SELECT Customer.CustomerId, Customer.Email, Customer.FirstName, Customer.LastName, SUM(Invoice.Total) AS Total FROM Customer LEFT JOIN Invoice ON Customer.CustomerId = Invoice.CustomerId GROUP BY Customer.CustomerId;

SELECT Invoice.CustomerId, Customer.Email, Customer.FirstName, Customer.LastName, SUM(Total) AS Total FROM Invoice LEFT JOIN Customer On Invoice.CustomerId = Customer.CustomerId GROUP BY FirstName;

In [None]:
##  Rock Music Lives on!  After the success of your recent email campaign,  
##  you're interested in targeting your long standing Rock Music audience!
##  You'll need to collect a list of emails containing each of your Rock Music listeners.

##  Use your query to return the email, first name, last name, and Genre of all Rock Music listeners!
##  Return your list ordered alphabetically by email address starting with A.
##  Can you find a way to deal with duplicate email addresses so no one receives multiple emails?


QUERY ='''
SELECT ...
'''

'''
---VISUAL GUIDE---

Before query...

##############     ###############     #################     ############      ###########
#  Customer  #     #  Invoice    #     #  InvoiceLine  #     #  Track   #      #  Genre  # 
##############     ###############     #################     ############      ###########
| CustomerId | --> | CustomerId  |     |  TrackId      | --> | TrackId  |      |  Name   |
+============+     +=============+     +===============+     +==========+      +=========+
|  Email     |     |  InvoiceId  | --> |  InvoiceId    |     | GenreId  | -->  | GenreId |
+============+     +=============+     +===============+     +==========+      +=========+
|  FirstName |                                                  
+============+
|  LastName  |                                                              
+============+

After query...

###############################################
#                 CustomerGenre               #   <-----RESULT!
###############################################
|  Email  |  FirstName  |  LastName  | Genre  |
+=========+=============+============+========+
'''

##### Challenges

In [52]:
# command = '''
# SELECT 
# Customer.Email, 
# Customer.FirstName, 
# Customer.LastName, 
# Genre.Name AS Genre 
# FROM Customer 
#     JOIN Invoice 
#         ON Customer.CustomerId=Invoice.CustomerId
#     JOIN InvoiceLine 
#         ON Invoice.InvoiceId=InvoiceLine.InvoiceId
#     JOIN Track
#         ON InvoiceLine.TrackId=Track.TrackId 
#     JOIN Genre
#         ON Track.GenreId=Genre.GenreId
# WHERE Genre.GenreId = 1 
# GROUP BY Customer.CustomerId 
# ORDER BY Customer.Email ASC
# '''


# command = '''
# SELECT 
# BillingCity, SUM(Total ) 
# FROM Invoice 
# GROUP BY BillingCity 
# ORDER BY SUM(Total) DESC
# limit 1
# '''


# command = '''
# SELECT 
# Invoice.BillingCity,
# COUNT(Genre.Name),
# Genre.Name
# FROM Invoice 
#     JOIN InvoiceLine
#         ON Invoice.InvoiceId=InvoiceLine.InvoiceId
#     JOIN Track
#         ON InvoiceLine.TrackId=Track.TrackID
#     JOIN Genre
#         ON Track.GenreId=Genre.GenreId
# WHERE Invoice.BillingCity = 'Prague'
# GROUP BY Genre.Name
# ORDER BY COUNT(Genre.Name) DESC
# limit 3
# '''


# command = '''
# SELECT 
# Artist.Name as Artist, 
# COUNT(Genre.Name) as count 
# FROM Genre
#     JOIN Track
#         ON Genre.GenreId=Track.GenreId
#     JOIN Album
#         ON Track.AlbumId=Album.AlbumId
#     JOIN Artist
#         ON Album.ArtistId=Artist.ArtistId
# WHERE Genre.GenreId = 1
# GROUP BY Artist.Name
# ORDER BY COUNT(Genre.Name) DESC
# limit 10
# '''


# command = '''
# SELECT
# Invoice.BillingCity,
# Count(Genre.Name) AS NumTracks
# FROM Invoice
#     JOIN InvoiceLine
#         ON Invoice.InvoiceId=InvoiceLine.InvoiceId
#     JOIN Track
#         ON InvoiceLine.TrackId=Track.TrackId
#     JOIN Genre
#         ON Track.GenreId=Genre.GenreId
# WHERE Genre.GenreId = 4 and Invoice.BillingCountry = 'France'
# GROUP BY Invoice.BillingCity
# ORDER BY NumTracks DESC
# '''


# command = '''
# SELECT SUM(Total)
# FROM 
# (SELECT COUNT(*) AS Total
# FROM Invoice
# GROUP BY BillingCountry
# ORDER BY Total DESC
# LIMIT 5);
# '''


# command = '''
# SELECT BillingCity, BillingState, BillingCountry, Total
# FROM Invoice,
# (SELECT AVG(Total) AS Average
# FROM Invoice) as Subquery
# WHERE Total > average;
# '''


# command = '''
# SELECT FirstName, LastName, BillingCity, BillingState, BillingCountry, Total
# FROM Invoice
#     JOIN Customer
#         JOIN (SELECT avg(Total) AS Average FROM Invoice) AS Subquery
# WHERE Total > Average;
# '''


# command = '''
# SELECT FirstName, LastName, BillingCity, BillingState, BillingCountry, Total
# FROM Invoice
#     JOIN Customer
#         JOIN (SELECT avg(Total) AS Average FROM Invoice) AS Subquery
# WHERE Total > Average;
# '''


# command = '''
# SELECT COUNT(DISTINCT Invoice.CustomerId), Invoice.CustomerId, Invoice.InvoiceId, InvoiceLine.InvoiceLineId, InvoiceLine.TrackId, Track.GenreId
# FROM Invoice
#     JOIN InvoiceLine
#         ON Invoice.InvoiceId=InvoiceLine.InvoiceId
#     JOIN Track
#         ON InvoiceLine.TrackId=Track.TrackId
# WHERE Track.GenreId=2
# '''

In [3]:
import sqlite3
import pandas as pd
# stop truncated columns on a DataFrame
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 6)

conn = sqlite3.connect('supporting-files/chinook.db')
c = conn.cursor()

In [31]:
# query to find average song length:
# SELECT AVG(Track.Milliseconds) AverageSongLenth FROM Track

# test query: join each track against the average song length
command = '''
SELECT *, COUNT(Track.GenreId)
FROM Track
    JOIN (SELECT AVG(Track.Milliseconds) AverageSongLength FROM Track) AS Subquery
WHERE Track.Milliseconds < Subquery.AverageSongLength
GROUP BY Track.GenreId
'''


# command = '''
# SELECT BillingCity, BillingState, BillingCountry, Total
# FROM Invoice,
# (SELECT AVG(Total) AS Average
# FROM Invoice) as Subquery
# WHERE Total > average;
# '''

c.execute(command)

<sqlite3.Cursor at 0x111eb0f80>

In [32]:
df = pd.read_sql(command, conn)
df

Unnamed: 0,TrackId,Name,AlbumId,MediaTypeId,GenreId,Composer,Milliseconds,Bytes,UnitPrice,AverageSongLength,COUNT(Track.GenreId)
0,3355,Love Comes,265,5,1,"Darius ""Take One"" Minwalla/Jon Auer/Ken String...",199923,3240609,0.99,393599.212104,1162
1,3357,OAM's Blues,267,5,2,Aaron Goldberg,266936,4292028,0.99,393599.212104,116
2,3145,Sweet Lady Luck,141,1,3,Vandenberg,273737,8919163,0.99,393599.212104,300
...,...,...,...,...,...,...,...,...,...,...,...
18,3478,Slowness,323,2,23,,215386,3644793,0.99,393599.212104,38
19,3502,"Quintet for Horn, Violin, 2 Violas, and Cello ...",346,2,24,Wolfgang Amadeus Mozart,221331,3665114,0.99,393599.212104,60
20,3451,"Die Zauberflöte, K.620: ""Der Hölle Rache Kocht...",317,2,25,Wolfgang Amadeus Mozart,174813,2861468,0.99,393599.212104,1


In [11]:
# -*- coding: utf-8 -*-
'''
Find the time and value of max load for each of the regions
COAST, EAST, FAR_WEST, NORTH, NORTH_C, SOUTHERN, SOUTH_C, WEST
and write the result out in a csv file, using pipe character | as the delimiter.

An example output can be seen in the "example.csv" file.
'''

import xlrd
import os
import csv
# from zipfile import ZipFile

datafile = "supporting-files/2013_ERCOT_Hourly_Load_Data.xls"
outfile = "2013_Max_Loads.csv"


# def open_zip(datafile):
#     with ZipFile('{0}.zip'.format(datafile), 'r') as myzip:
#         myzip.extractall()


def parse_file(datafile):
    """Find the largest value, and when it occurred, for each load column.

    Reads the first sheet of the Excel workbook at ``datafile``. Row 0 is used
    as the header row, column 0 as the timestamp column, and columns 1-8 are
    read as the per-station load columns.

    Returns a dict keyed by column header. Each value is a dict with
    ``"maxval"`` (the largest value found in that column) and ``"maxtime"``
    (the timestamp of that row as a ``(year, month, day, hour, minute,
    second)`` tuple).

    The resulting dict is also printed before it is returned.
    """
    workbook = xlrd.open_workbook(datafile)
    sheet = workbook.sheet_by_index(0)

    data = {}

    # columns 1..8 are read as the station columns; column 0 is the timestamp
    for col in range(1, 9):
        # header cell of the column (EAST, FAR_WEST, etc.)
        station = sheet.cell_value(0, col)

        # every value of the column below the header row
        loads = sheet.col_values(col, start_rowx=1, end_rowx=None)
        maxval = max(loads)

        # loads starts at sheet row 1, so shift the list index back to a sheet row
        max_row = loads.index(maxval) + 1
        raw_time = sheet.cell_value(max_row, 0)

        # convert with the workbook's own date system (1900- or 1904-based)
        # rather than assuming the 1900-based one
        maxtime = xlrd.xldate_as_tuple(raw_time, workbook.datemode)

        data[station] = {"maxval": maxval,
                         "maxtime": maxtime}

    print(data)
    return data

def save_file(data, filename):
    """Write the per-station maxima to ``filename`` as pipe-delimited CSV.

    ``data`` maps a station name to a dict with the keys ``"maxtime"`` (a
    six-item tuple whose first four items are year, month, day and hour) and
    ``"maxval"``. The file gets a header row followed by one row per station,
    in the iteration order of ``data``. Returns None.
    """
    # newline="" leaves line endings to the csv module; without it, platforms
    # that translate "\n" end up with an extra blank line after every row
    with open(filename, "w", newline="") as f:
        w = csv.writer(f, delimiter='|')
        w.writerow(["Station", "Year", "Month", "Day", "Hour", "Max Load"])

        for station, record in data.items():
            # only the date and the hour are written; minutes and seconds are dropped
            year, month, day, hour, _, _ = record["maxtime"]
            w.writerow([station, year, month, day, hour, record["maxval"]])

    
def test():
    """Run the parse/save round trip and check the CSV file it produces.

    The FAR_WEST row is compared field by field with the known answer, and the
    file must hold exactly one row for each of the eight expected stations.
    """
    save_file(parse_file(datafile), outfile)

    expected = {'FAR_WEST': {'Max Load': '2281.2722140000024',
                             'Year': '2013',
                             'Month': '6',
                             'Day': '26',
                             'Hour': '17'}}
    expected_stations = ['COAST', 'EAST', 'FAR_WEST', 'NORTH',
                         'NORTH_C', 'SOUTHERN', 'SOUTH_C', 'WEST']
    checked_fields = ['Year', 'Month', 'Day', 'Hour', 'Max Load']

    seen_stations = []

    with open(outfile) as of:
        for row in csv.DictReader(of, delimiter="|"):
            name = row['Station']
            seen_stations.append(name)

            # only FAR_WEST has a reference answer to compare against
            if name != 'FAR_WEST':
                continue

            for field in checked_fields:
                if field == 'Max Load':
                    # the load is compared after rounding both sides to one decimal
                    assert (round(float(expected[name][field]), 1)
                            == round(float(row[field]), 1))
                else:
                    # everything else must match as text
                    assert expected[name][field] == row[field]

        # Output should be 8 lines not including header
        assert len(seen_stations) == 8

        # Check Station Names
        assert set(seen_stations) == set(expected_stations)


if __name__ == "__main__":
    test()


{'COAST': {'maxval': 18779.025510000003, 'maxtime': (2013, 8, 13, 17, 0, 0)}, 'SOUTH_C': {'maxval': 11433.30491600001, 'maxtime': (2013, 8, 8, 18, 0, 0)}, 'NORTH': {'maxval': 1544.7707140000005, 'maxtime': (2013, 8, 7, 17, 0, 0)}, 'NORTH_C': {'maxval': 24415.570226999993, 'maxtime': (2013, 8, 7, 18, 0, 0)}, 'WEST': {'maxval': 1862.6137649999998, 'maxtime': (2013, 8, 7, 17, 0, 0)}, 'SOUTHERN': {'maxval': 5494.157645, 'maxtime': (2013, 8, 8, 16, 0, 0)}, 'EAST': {'maxval': 2380.1654089999956, 'maxtime': (2013, 8, 5, 17, 0, 0)}, 'FAR_WEST': {'maxval': 2281.2722140000024, 'maxtime': (2013, 6, 26, 17, 0, 0)}}


In [14]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Your task in this exercise is to modify 'extract_carriers()' to get a list of
all airlines. Exclude all of the combination values like "All U.S. Carriers"
from the data that you return. You should return a list of codes for the
carriers.

All your changes should be in the 'extract_carriers()' function. The
'options.html' file in the tab above is a stripped down version of what is
actually on the website, but should provide an example of what you should get
from the full file.

Please note that the function 'make_request()' is provided for your reference
only. You will not be able to actually use it from within the Udacity web UI.
"""

from bs4 import BeautifulSoup
html_page = "options.html"


def extract_carriers(page):
    """Return the codes of the individual carriers offered by the carrier list.

    Parses the HTML file at ``page`` and collects the ``value`` attribute of
    every option in the carrier dropdown, leaving out the combination entries
    (such as "All U.S. Carriers"). Returns a list of strings; the list is
    empty when the page has no carrier dropdown.
    """
    with open(page, "r") as html:
        soup = BeautifulSoup(html, "lxml")

    # "CarrierList" is the form field make_request() submits the carrier under.
    # NOTE(review): assumes the dropdown is a <select name="CarrierList"> --
    # confirm against options.html.
    carrier_select = soup.find("select", attrs={"name": "CarrierList"})
    if carrier_select is None:
        return []

    data = []
    for option in carrier_select.find_all("option"):
        code = option.get("value", "")
        # NOTE(review): assumes every combination entry has a value starting
        # with "All", while real carrier codes never do -- confirm against
        # options.html.
        if code and not code.startswith("All"):
            data.append(code)

    return data


def make_request(data):
    """POST the carrier/airport selection form and return the response text.

    ``data`` must provide the keys "eventvalidation", "viewstate", "airport"
    and "carrier"; their values are sent as the matching form fields.

    NOTE(review): ``s`` and ``viewstategenerator`` are not defined inside this
    function -- presumably an HTTP session object and a value scraped from the
    page, both expected to exist as globals. Confirm before calling.
    """
    eventvalidation, viewstate, airport, carrier = (
        data[key]
        for key in ("eventvalidation", "viewstate", "airport", "carrier")
    )

    # look up the session method before assembling the form, which is the
    # order a direct s.post(...) call resolves its parts in
    post = s.post
    form_fields = (
        ("__EVENTTARGET", ""),
        ("__EVENTARGUMENT", ""),
        ("__VIEWSTATE", viewstate),
        ("__VIEWSTATEGENERATOR", viewstategenerator),
        ("__EVENTVALIDATION", eventvalidation),
        ("CarrierList", carrier),
        ("AirportList", airport),
        ("Submit", "Submit"),
    )

    response = post("https://www.transtats.bts.gov/Data_Elements.aspx?Data=2",
                    data=form_fields)
    return response.text


def test():
    """Check that extract_carriers finds 16 codes, including "FL" and "NK"."""
    carriers = extract_carriers(html_page)
    assert len(carriers) == 16
    for code in ("FL", "NK"):
        assert code in carriers


if __name__ == "__main__":
    test()

FileNotFoundError: [Errno 2] No such file or directory: 'popular-viewed-1.json'

In [106]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# So, the problem is that the gigantic file is actually not a valid XML, because
# it has several root elements, and XML declarations.
# It is, as a matter of fact, a collection of a lot of concatenated XML documents.
# So, one solution would be to split the file into separate documents,
# so that you can process the resulting files as valid XML documents.

"""
    Split the input file into separate files, each containing a single patent.
    As a hint - each patent declaration starts with the same line that was
    causing the error found in the previous exercises.
    
    The new files should be saved with filename in the following format:
    "{}-{}".format(filename, n) where n is a counter, starting from 0.
"""


import xml.etree.ElementTree as ET
PATENTS = 'supporting-files/patent.data'

def get_root(fname):
    """Parse the XML file ``fname`` and return the root element of its tree."""
    return ET.parse(fname).getroot()


def split_file(filename):
    """Split a file of concatenated XML documents into one file per document.

    A new document starts at every line that is exactly the XML declaration
    ``<?xml version="1.0" encoding="UTF-8"?>``. Document number ``n`` (counting
    from 0) is written to ``"{}-{}".format(filename, n)`` and runs up to, but
    not including, the next declaration line. Lines that appear before the
    first declaration are not written anywhere. Returns None.

    The input is streamed line by line, so the whole file is never held in
    memory. Both input and output use UTF-8, the encoding the declaration
    line announces, instead of the platform default.
    """
    xml_header = '<?xml version="1.0" encoding="UTF-8"?>\n'

    out = None       # file of the document currently being written, if any
    doc_count = 0    # number of documents started so far

    try:
        with open(filename, 'r', encoding='utf-8') as source:
            for line in source:
                if line == xml_header:
                    # a declaration closes the previous document and opens the next
                    if out is not None:
                        out.close()
                    out = open("{}-{}".format(filename, doc_count), 'w',
                               encoding='utf-8')
                    doc_count += 1

                if out is not None:
                    out.write(line)
    finally:
        # close the last document even when reading or writing failed part way
        if out is not None:
            out.close()

# split the concatenated patent data into one file per XML document
# (patent.data-0, patent.data-1, ...) next to the input file
split_file(PATENTS)

In [73]:
    # REQUIREMENTS
        # input a .data file (concatenated XML files)
        # output four XML docs
            # patent.data-0
            # patent.data-1
            # ...
    # EXECUTION
        # with-open doc
            # throw contents into a list
        # split into multiple lists
            # REQUIREMENTS: one list must be sliced based on another list
            # RULES: list 1 with n items must be sliced at second list:
            # COMPUTATION:
                # find indices of all xml headers
                # indices list appended with length of document
                # new list comprises old list cut at i to i+1 from reference list
                    # performed one less than the length of the reference list

In [82]:
# scratch check: write every string of a list on its own line of a new file
str_list = ["string 1 ", "string 2"]

with open('newfile.txt', 'w') as w:
    for string in str_list:
        # print appends the trailing newline itself
        print(string, file=w)

In [65]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Your task is to use the iterative parsing to process the map file and
find out not only what tags are there, but also how many, to get the
feeling on how much of which data you can expect to have in the map.
Fill out the count_tags function. It should return a dictionary with the 
tag name as the key and number of times this tag can be encountered in 
the map as value.

osm_check = {'tag':number_instances,...}

Note that your code will be tested with a different data file than the 'example.osm'
"""
import pprint
try:
    import xml.etree.cElementTree as ET
except ImportError:  # the cElementTree alias was removed in Python 3.9
    import xml.etree.ElementTree as ET

# REQS:
    # make dict
# EX:
    # REQS:
        # make dict
    # RULES:
        # node must be uniquely identified
        # node must be added to a dict and not repeated
        # use xml package to make root
    # COMP:
        # iterate over each line
            # if does not contain '/'
                # add to dict
                    # if exists already, add and increment value by 1
                    # else, just add
    # DECOMP:
        # Iterate over each line, add node name to dict with value 1
            # if item is not in dict already, add and make value 1
            # if in dict already, increment that key's value by 1
    
def count_tags(filename):
    """Count how often each tag name occurs in the XML file ``filename``.

    The file is parsed iteratively, as the exercise asks, instead of being
    loaded into a complete tree first. Returns a dict that maps each tag name
    to its number of occurrences; keys are inserted in the order in which the
    tags first open in the document.
    """
    tags_dict = {}

    for event, element in ET.iterparse(filename, events=("start", "end")):
        if event == "start":
            # count on the opening tag so keys follow document order
            tags_dict[element.tag] = tags_dict.get(element.tag, 0) + 1
        else:
            # the element is complete and already counted: drop its contents
            # so memory use does not grow with the size of the file
            element.clear()

    return tags_dict

def test():
    """Print the tag counts of the example map and compare them with the known totals."""
    expected = {'bounds': 1,
                'member': 3,
                'nd': 4,
                'node': 20,
                'osm': 1,
                'relation': 1,
                'tag': 7,
                'way': 1}

    tags = count_tags('supporting-files/example.osm')
    pprint.pprint(tags)
    assert tags == expected


if __name__ == "__main__":
    test()

{'bounds': 1,
 'member': 3,
 'nd': 4,
 'node': 20,
 'osm': 1,
 'relation': 1,
 'tag': 7,
 'way': 1}
