## LESSON 1 NOTES

### LESSON 1.a - Dealing with csv files


In [1]:
# Your task is to read the input DATAFILE line by line, and for the first
# 10 lines (not including the header)
# split each line on "," and then for each line, create a dictionary
# where the key is the header title of the field, and the value is the 
# value of that field in the row.

# The function parse_file should return a list of dictionaries,
# each data line in the file being a single list entry.
# Field names and values should not contain extra whitespace, like spaces 
# or newline characters.
# You can use the Python string method strip() to remove the extra 
# whitespace.
# You have to parse only the first 10 data lines in this exercise,
# so the returned list should have 10 entries!
import os

DATADIR = ""
DATAFILE = "beatles-diskography.csv"

def parse_file(datafile):
    data = []
    with open(datafile, "r") as f:
        count = 0
        header = f.readline().split(",") # read 1st line and split strings
        for line in f:
            if count == 10:
                break
            fields = line.split(",") # for every line in f split strings
            entry = {} # create new dictionary
            
            for i, value in enumerate(fields):
                entry[header[i].strip()] = value.strip() #strip pulls out white space
            data.append(entry)
            count += 1
    print data

parse_file(DATAFILE)


[{'Title': 'Please Please Me', 'UK Chart Position': '1', 'Label': 'Parlophone(UK)', 'Released': '22 March 1963', 'US Chart Position': '\xe2\x80\x94', 'RIAA Certification': 'Platinum', 'BPI Certification': 'Gold'}, {'Title': 'With the Beatles', 'UK Chart Position': '1', 'Label': 'Parlophone(UK)', 'Released': '22 November 1963', 'US Chart Position': '\xe2\x80\x94', 'RIAA Certification': 'Gold', 'BPI Certification': 'Platinum'}, {'Title': 'Beatlemania! With the Beatles', 'UK Chart Position': '\xe2\x80\x94', 'Label': 'Capitol(CAN)', 'Released': '25 November 1963', 'US Chart Position': '\xe2\x80\x94', 'RIAA Certification': '', 'BPI Certification': ''}, {'Title': 'Introducing... The Beatles', 'UK Chart Position': '\xe2\x80\x94', 'Label': 'Vee-Jay(US)', 'Released': '10 January 1964', 'US Chart Position': '2', 'RIAA Certification': '', 'BPI Certification': ''}, {'Title': 'Meet the Beatles!', 'UK Chart Position': '\xe2\x80\x94', 'Label': 'Capitol(US)', 'Released': '20 January 1964', 'US Chart P

### LESSON 1.b - problematic line
The problem lay in a row that had a comma in a cell: Parphor(NZ), Capitol(US) this value would hault our above function.


### LESSON 1.c - Reading & Parsing  Excel File


In [2]:
import xlrd

datafile = "2013_ERCOT_Hourly_Load_Data.xls"


def parse_file(datafile):
    workbook = xlrd.open_workbook(datafile)
    sheet = workbook.sheet_by_index(0)

    data = [[sheet.cell_value(r, col) # reading all workbook data into a list
                for col in range(sheet.ncols)] 
                    for r in range(sheet.nrows)]

    print "\nList Comprehension for data[3][2]:", data[3][2]

    print "\nCells in a nested loop:"    
    for row in range(sheet.nrows):
        for col in range(sheet.ncols):
            if row == 50:
                print sheet.cell_value(row, col),


    ### other useful methods:
    print "\nROWS, COLUMNS, and CELLS:"
    print "Number of rows in the sheet:", 
    print sheet.nrows
    print "Type of data in cell (row 3, col 2):", 
    print sheet.cell_type(3, 2)
    print "Value in cell (row 3, col 2):", 
    print sheet.cell_value(3, 2)
    print "Get a slice of values in column 3, from rows 1-3:"
    print sheet.col_values(3, start_rowx=1, end_rowx=4)

    print "\nDATES:"
    print "Type of data in cell (row 1, col 0):", 
    print sheet.cell_type(1, 0)
    exceltime = sheet.cell_value(1, 0)
    print "Time in Excel format:",
    print exceltime
    print "Convert time to a Python datetime tuple, from the Excel float:",
    print xlrd.xldate_as_tuple(exceltime, 0)

    return data

data = parse_file(datafile)


List Comprehension for data[3][2]: 1036.088697

Cells in a nested loop:
41277.0833333 9238.73731 1438.20528 1565.442856 916.708348 14010.903488 3027.98334 6165.211119 1157.741663 37520.933404 
ROWS, COLUMNS, and CELLS:
Number of rows in the sheet: 7296
Type of data in cell (row 3, col 2): 2
Value in cell (row 3, col 2): 1036.088697
Get a slice of values in column 3, from rows 1-3:
[1411.7505669999982, 1403.4722870000019, 1395.053150000001]

DATES:
Type of data in cell (row 1, col 0): 3
Time in Excel format: 41275.0416667
Convert time to a Python datetime tuple, from the Excel float: (2013, 1, 1, 1, 0, 0)


### LESSON 1.d - Wrangling Excel using xlrd

In [28]:
#!/usr/bin/env python
"""
Your task is as follows:
- read the provided Excel file
- find and return the min, max and average values for the COAST region
- find and return the time value for the min and max entries
- the time values should be returned as Python tuples

Please see the test function for the expected return format
"""

import xlrd
from zipfile import ZipFile
datafile = "2013_ERCOT_Hourly_Load_Data.xls"

# def open_zip(datafile):
#     with ZipFile('{0}.zip'.format(datafile), 'r') as myzip:
#         myzip.extractall()

def parse_file(datafile):
    workbook = xlrd.open_workbook(datafile)
    sheet = workbook.sheet_by_index(0)

    ### example on how you can get the data
    data = [[sheet.cell_value(r, col) for col in range(sheet.ncols)] for r in range(sheet.nrows)]
    coast_col = sheet.col_values(1, start_rowx=1, end_rowx=None)
    
    maxvalue = max(coast_col)
    minvalue = min(coast_col)

    maxpos = coast_col.index(maxvalue) + 1 # get row index position
    minpos = coast_col.index(minvalue) + 1 # add 1 due to row and col starts at 0
    
    maxtime = sheet.cell_value(maxpos, 0)
    real_max_time = xlrd.xldate_as_tuple(maxtime, 0)
    mintime = sheet.cell_value(minpos, 0)
    real_min_time = xlrd.xldate_as_tuple(mintime, 0)
    
    data = {
            'maxtime': real_max_time,
            'maxvalue': maxvalue,
            'mintime': real_min_time,
            'minvalue': minvalue,
            'avgcoast': sum(coast_col) / float(len(coast_col))
    }
    return data
parse_file(datafile)

# def test():
#     open_zip(datafile)
#     data = parse_file(datafile)

#     assert data['maxtime'] == (2013, 8, 13, 17, 0, 0)
#     assert round(data['maxvalue'], 10) == round(18779.02551, 10)


# test()

{'avgcoast': 10976.933460679751,
 'maxtime': (2013, 8, 13, 17, 0, 0),
 'maxvalue': 18779.025510000003,
 'mintime': (2013, 2, 3, 4, 0, 0),
 'minvalue': 6602.113898999982}

### LESSON 1.e Wrangling JSON

In [67]:
# To experiment with this code freely you will have to run this code locally.
# Take a look at the main() function for an example of how to use the code.
# We have provided example json output in the other code editor tabs for you to
# look at, but you will not be able to run any queries through our UI.
import json
import requests


BASE_URL = "http://musicbrainz.org/ws/2/"
ARTIST_URL = BASE_URL + "artist/"

# query parameters are given to the requests.get function as a dictionary; this
# variable contains some starter parameters.
query_type = {  "simple": {},
                "atr": {"inc": "aliases+tags+ratings"},
                "aliases": {"inc": "aliases"},
                "releases": {"inc": "releases"}}


def query_site(url, params, uid="", fmt="json"):
    # This is the main function for making queries to the musicbrainz API.
    # A json document should be returned by the query.
    params["fmt"] = fmt
    r = requests.get(url + uid, params=params)
    print "requesting", r.url

    if r.status_code == requests.codes.ok:
        return r.json()
    else:
        r.raise_for_status()


def query_by_name(url, params, name):
    # This adds an artist name to the query parameters before making
    # an API call to the function above.
    params["query"] = "artist:" + name
    return query_site(url, params)


def pretty_print(data, indent=4):
    # After we get our output, we can format it to be more readable
    # by using this function.
    if type(data) == dict:
        print json.dumps(data, indent=indent, sort_keys=True)
    else:
        print data


def main():
    '''
    Modify the function calls and indexing below to answer the questions on
    the next quiz. HINT: Note how the output we get from the site is a
    multi-level JSON document, so try making print statements to step through
    the structure one level at a time or copy the output to a separate output
    file.
    '''
    results = query_by_name(ARTIST_URL, query_type["simple"], "Queen")
    pretty_print(results)
    
    artist_id = results["artists"][1]["id"]
    print "\nARTIST:"
    pretty_print(results["artists"][1])

    artist_data = query_site(ARTIST_URL, query_type["releases"], artist_id)
    releases = artist_data["releases"]
    print "\nONE RELEASE:"
    pretty_print(releases[0], indent=2)
    release_titles = [r["title"] for r in releases]

    print "\nALL TITLES:"
    for t in release_titles:
        print t


if __name__ == '__main__':
    main()


requesting http://musicbrainz.org/ws/2/artist/?query=artist%3AQueen&fmt=json
{
    "artists": [
        {
            "aliases": [
                {
                    "begin-date": "2011", 
                    "end-date": null, 
                    "locale": null, 
                    "name": "Queen + Adam Lambert", 
                    "primary": null, 
                    "sort-name": "Queen + Adam Lambert", 
                    "type": null
                }
            ], 
            "area": {
                "id": "8a754a16-0027-3a29-b6d7-2b40ea0481ed", 
                "name": "United Kingdom", 
                "sort-name": "United Kingdom"
            }, 
            "begin-area": {
                "id": "f03d09b3-39dc-4083-afd6-159e3f0d462f", 
                "name": "London", 
                "sort-name": "London"
            }, 
            "country": "GB", 
            "disambiguation": "UK rock group", 
            "id": "0383dadf-2a4e-4d10-a46a-e9e041da8eb3", 
         

IndexError: list index out of range

## LESSON 1 EXERCISES

In [135]:
#!/usr/bin/env python
"""
The data should be returned as a list of lists (not dictionaries).
You can use the csv modules "reader" method to get data in such format.
Another useful method is next() - to get the next line from the iterator.
You should only change the parse_file function.
"""
import csv
import os

DATADIR = ""
DATAFILE = "745090.csv"

def parse_file(datafile):
    name = ""
    data = []
    with open(datafile,'rb') as f:
        # create a reader object that is the csv module which parses the data
        reader = csv.reader(f, delimiter=',')
        # this gives us the first line of the list which next passes, we want the 1st item
        name = next(reader)[1]
        # next passes the next line as a list, in this case the whole header.
        header = next(reader)
        # to create a lists of lists from the data
        data = list(list(rec) for rec in csv.reader(f, delimiter=','))
    return (name, data)

parse_file(DATAFILE)

# better solution:..................
def parse_file(datafile):
    name = ""
    data = []
    with open(datafile,'rb') as f:
        r = csv.reader(f)
        name = r.next()[1]
        header = r.next()
        data = [row for row in r]

    return (name, data)

('MOUNTAIN VIEW MOFFETT FLD NAS',
 [['01/01/2005',
   '01:00',
   '0',
   '0',
   '0',
   '2',
   '0',
   '0',
   '2',
   '0',
   '0',
   '2',
   '0',
   '0',
   '2',
   '0',
   '0',
   '2',
   '0',
   '0',
   '2',
   '0',
   '0',
   '2',
   '0',
   '3',
   'E',
   '9',
   '3',
   'E',
   '9',
   '8.0',
   'A',
   '7',
   '6.0',
   'A',
   '7',
   '87',
   'A',
   '7',
   '1013',
   'A',
   '7',
   '150',
   'A',
   '7',
   '2.1',
   'A',
   '7',
   '16100',
   'A',
   '7',
   '77777',
   'A',
   '7',
   '1.1',
   'E',
   '8',
   '0.099',
   'F',
   '8',
   '0.160',
   'F',
   '8',
   '0',
   '1',
   'A',
   '7'],
  ['01/01/2005',
   '02:00',
   '0',
   '0',
   '0',
   '2',
   '0',
   '0',
   '2',
   '0',
   '0',
   '2',
   '0',
   '0',
   '2',
   '0',
   '0',
   '2',
   '0',
   '0',
   '2',
   '0',
   '0',
   '2',
   '0',
   '10',
   'E',
   '9',
   '10',
   'E',
   '9',
   '8.0',
   'A',
   '7',
   '7.0',
   'A',
   '7',
   '93',
   'A',
   '7',
   '1013',
   'A',
   '7',
   '0',
   

In [203]:
# -*- coding: utf-8 -*-
# Find the time and value of max load for each of the regions
# COAST, EAST, FAR_WEST, NORTH, NORTH_C, SOUTHERN, SOUTH_C, WEST
# and write the result out in a csv file, using pipe character | as the delimiter.
# An example output can be seen in the "example.csv" file.

import xlrd
import os
import csv
from zipfile import ZipFile

datafile = "2013_ERCOT_Hourly_Load_Data.xls"
outfile = "2013_Max_Loads.csv"

def parse_file(datafile):
    data=[]
    workbook = xlrd.open_workbook(datafile)
    sheet = workbook.sheet_by_index(0)
    data1 = [[sheet.cell_value(row, col) for col in range(sheet.ncols-1)] for row in range(sheet.nrows)]
    
    data = {}
    # process all rows that contain station data
    for n in range (1, 9):
        station = sheet.cell_value(0, n)
        cv = sheet.col_values(n, start_rowx=1, end_rowx=None)
        maxval = max(cv)
        
        maxpos = cv.index(maxval) + 1
        maxtime = sheet.cell_value(maxpos, 0)
        realtime = xlrd.xldate_as_tuple(maxtime, 0)
        data[station] = {"maxval": maxval,
                         "maxtime": realtime}

def save_file(data, filename):
    with open(filename, "w") as f:
        w = csv.writer(f, delimiter='|')
        w.writerow(["Station", "Year", "Month", "Day", "Hour", "Max Load"])
        for s in data:
            year, month, day, hour, _ , _= data[s]["maxtime"]
            w.writerow([s, year, month, day, hour, data[s]["maxval"]])
    

parse_file(datafile)
save_file(data, outfile)

{u'FAR_WEST': {'maxval': 2281.2722140000024, 'maxtime': (2013, 6, 26, 17, 0, 0)}, u'NORTH': {'maxval': 1544.7707140000005, 'maxtime': (2013, 8, 7, 17, 0, 0)}, u'WEST': {'maxval': 1862.6137649999998, 'maxtime': (2013, 8, 7, 17, 0, 0)}, u'SOUTHERN': {'maxval': 5494.157645, 'maxtime': (2013, 8, 8, 16, 0, 0)}, u'SOUTH_C': {'maxval': 11433.30491600001, 'maxtime': (2013, 8, 8, 18, 0, 0)}, u'COAST': {'maxval': 18779.025510000003, 'maxtime': (2013, 8, 13, 17, 0, 0)}, u'NORTH_C': {'maxval': 24415.570226999993, 'maxtime': (2013, 8, 7, 18, 0, 0)}, u'EAST': {'maxval': 2380.1654089999956, 'maxtime': (2013, 8, 5, 17, 0, 0)}}


TypeError: list indices must be integers, not list

In [None]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
This exercise shows some important concepts that you should be aware about:
- using codecs module to write unicode files
- using authentication with web APIs
- using offset when accessing web APIs

Your task is to process the saved file that represents the most popular (by view count)
articles in the last day, and return the following data:
- list of dictionaries, where the dictionary key is "section" and value is "title"
- list of URLs for all media entries with "format": "Standard Thumbnail"
"""
import json
import codecs
import requests

URL_MAIN = "http://api.nytimes.com/svc/"
URL_POPULAR = URL_MAIN + "mostpopular/v2/"
API_KEY = { "popular": "",
            "article": ""}


def get_from_file(kind, period):
    filename = "popular-{0}-{1}.json".format(kind, period)
    with open(filename, "r") as f:
        return json.loads(f.read())


def article_overview(kind, period):
    data = get_from_file(kind, period)
    titles = []
    urls =[]
    # YOUR CODE HERE

    return (titles, urls)


def query_site(url, target, offset):
    # This will set up the query with the API key and offset
    # Web services often use offset paramter to return data in small chunks
    # NYTimes returns 20 articles per request, if you want the next 20
    # You have to provide the offset parameter
    if API_KEY["popular"] == "" or API_KEY["article"] == "":
        print "You need to register for NYTimes Developer account to run this program."
        print "See Intructor notes for information"
        return False
    params = {"api-key": API_KEY[target], "offset": offset}
    r = requests.get(url, params = params)

    if r.status_code == requests.codes.ok:
        return r.json()
    else:
        r.raise_for_status()


def get_popular(url, kind, days, section="all-sections", offset=0):
    # This function will construct the query according to the requirements of the site
    # and return the data, or print an error message if called incorrectly
    if days not in [1,7,30]:
        print "Time period can be 1,7, 30 days only"
        return False
    if kind not in ["viewed", "shared", "emailed"]:
        print "kind can be only one of viewed/shared/emailed"
        return False

    url += "most{0}/{1}/{2}.json".format(kind, section, days)
    data = query_site(url, "popular", offset)

    return data


def save_file(kind, period):
    # This will process all results, by calling the API repeatedly with supplied offset value,
    # combine the data and then write all results in a file.
    data = get_popular(URL_POPULAR, "viewed", 1)
    num_results = data["num_results"]
    full_data = []
    with codecs.open("popular-{0}-{1}.json".format(kind, period), encoding='utf-8', mode='w') as v:
        for offset in range(0, num_results, 20):        
            data = get_popular(URL_POPULAR, kind, period, offset=offset)
            full_data += data["results"]
        
        v.write(json.dumps(full_data, indent=2))


## LESSON 2 NOTES

### LESSON 2.a Extracting data from XML (proper used XML)

In [79]:
#!/usr/bin/env python
# Your task here is to extract data from xml on authors of an article
# and add it to a list, one item for an author.
# See the provided data structure for the expected format.
# The tags for first name, surname and email should map directly
# to the dictionary keys
import xml.etree.ElementTree as ET

article_file = "exampleResearchArticle.xml"

def get_root(fname):
    tree = ET.parse(fname)
    return tree.getroot()

def get_authors(root):
    authors = []
    for author in root.findall('./fm/bibl/aug/au'):
        data = {
                "fnm": None,
                "snm": None,
                "email": None
        }

        # YOUR CODE HERE
        data["fnm"] = author.find('./fnm').text
        data["snm"] = author.find('./snm').text
        data["email"] = author.find('./email').text
        
        authors.append(data)

    return authors

AttributeError: 'str' object has no attribute 'findall'

### LESSON 2.b Handling XML Attributes

In [82]:
#!/usr/bin/env python
# Your task here is to extract data from xml on authors of an article
# and add it to a list, one item for an author.
# See the provided data structure for the expected format.
# The tags for first name, surname and email should map directly
# to the dictionary keys, but you have to extract the attributes from the "insr" tag
# and add them to the list for the dictionary key "insr"
import xml.etree.ElementTree as ET

article_file = "exampleResearchArticle.xml"


def get_root(fname):
    tree = ET.parse(fname)
    return tree.getroot()


def get_authors(root):
    authors = []
    for author in root.findall('./fm/bibl/aug/au'):
        data = {
                "fnm": None,
                "snm": None,
                "email": None,
                "insr": []
        }

        # YOUR CODE HERE
        data["fnm"] = author.find('./fnm').text
        data["snm"] = author.find('./snm').text
        data["email"] = author.find('./email').text
        insr = author.findall('./insr')
        for item in insr:
            data["insr"].append(item.attrib["iid"])
 
        authors.append(data)

    print authors

### LESSON 2.c Web Scraping

In [None]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
import requests
import json

html_page = "page_source.html"

def extract_data(page):
    data = {"eventvalidation": "",
            "viewstate": ""}
    # do something here....
    with open(page, "r") as html:
        soup = BeautifulSoup(html, "lxml")
        ev = soup.find(id="__EVENTVALIDATION")
        data["eventvalidation"] = ev["value"]

        vs = soup.find(id="__VIEWSTATE")
        data["viewstate"] = vs["value"]

    return data

def make_request(data):
    eventvalidation = data["eventvalidation"]
    viewstate = data["viewstate"]

    r = requests.post("http://www.transtats.bts.gov/Data_Elements.aspx?Data=2",
                    data={'AirportList': "BOS",
                          'CarrierList': "VX",
                          'Submit': 'Submit',
                          "__EVENTTARGET": "",
                          "__EVENTARGUMENT": "",
                          "__EVENTVALIDATION": eventvalidation,
                          "__VIEWSTATE": viewstate
                    })

    return r.text


## LESSON 2 EXERCISES

In [None]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
html_page = "options.html"


def extract_carriers(page):
    data = []

    with open(page, "r") as html:
        print type(page)
        print type(html)
            # Create soup parser. soup is now our html file parsed.
        soup = BeautifulSoup(html, "lxml")
            # after looking at the html file I see that 
            # I want data located in the <option></option> tag so this 
            # finds all data with that tag.
        options =  soup.find_all('option')
            # now I will loop through the data in the tag.
        for code in options:
                # I know that the data I want is all len('str') == 2
            if len((code.get('value'))) == 2:
                data.append((code.get('value')))
    return data

In [None]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from bs4 import BeautifulSoup
html_page = "options.html"


def extract_airports(page):
    data = []
    all_term = "all"
    with open(page, "r") as html:
        # do something here to find the necessary values
        soup = BeautifulSoup(html, "lxml")
        options = soup.find_all('option')
        #print options
        for option in options:
            if len(option.get('value')) == 3:
                data.append(option.get('value'))
        del data[0:2]       

    return data